elpa-2016.05.001/0000755000312500001440000000000012717541041010104 500000000000000elpa-2016.05.001/Makefile.am0000644000312500001440000003640312717516040012067 00000000000000## Process this file with automake to produce Makefile.in ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4 AM_FCFLAGS = $(SCALAPACK_FCFLAGS) @FC_MODINC@modules @FC_MODOUT@modules AM_LDFLAGS = $(SCALAPACK_LDFLAGS) # libelpa lib_LTLIBRARIES = libelpa@SUFFIX@.la libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION) -lstdc++ libelpa@SUFFIX@_la_SOURCES = \ src/mod_precision.f90 \ src/mod_mpi.F90 \ src/mod_mpi_stubs.F90 \ src/elpa2_kernels/mod_fortran_interfaces.F90 \ src/elpa_utilities.F90 \ src/elpa1_compute.F90 \ src/elpa1.F90 \ src/elpa2_utilities.F90 \ src/mod_pack_unpack_real.F90 \ src/elpa2_kernels/mod_single_hh_trafo_real.F90 \ src/mod_compute_hh_trafo_real.F90 \ src/mod_compute_hh_trafo_complex.F90 \ src/mod_pack_unpack_complex.F90 \ src/aligned_mem.F90 \ src/elpa2_compute.F90 \ src/elpa2.F90 \ src/elpa_c_interface.F90 \ src/elpa_qr/qr_utils.F90 \ src/elpa_qr/elpa_qrkernels.f90 \ src/elpa_qr/elpa_pdlarfb.F90 \ src/elpa_qr/elpa_pdgeqrf.F90 EXTRA_libelpa@SUFFIX@_la_DEPENDENCIES = \ src/elpa_reduce_add_vectors.X90 \ src/elpa_transpose_vectors.X90 \ src/redist_band.X90 if HAVE_DETAILED_TIMINGS libelpa@SUFFIX@_la_SOURCES += \ src/timer.F90 \ src/ftimings/ftimings.F90 \ src/ftimings/ftimings_type.F90 \ src/ftimings/ftimings_value.F90 \ src/ftimings/highwater_mark.c \ src/ftimings/resident_set_size.c \ src/ftimings/time.c \ src/ftimings/virtual_memory.c \ src/ftimings/papi.c endif if !WITH_MPI libelpa@SUFFIX@_la_SOURCES += src/mod_time_c.F90 if !HAVE_DETAILED_TIMINGS libelpa@SUFFIX@_la_SOURCES += src/ftimings/time.c endif endif if WITH_REAL_GENERIC_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.F90 endif if WITH_COMPLEX_GENERIC_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.F90 endif if WITH_REAL_GENERIC_SIMPLE_KERNEL 
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_simple.F90 endif if WITH_COMPLEX_GENERIC_SIMPLE_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.F90 endif if WITH_REAL_BGP_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90 endif if WITH_REAL_BGQ_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90 endif if WITH_REAL_SSE_ASSEMBLY_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s else if WITH_COMPLEX_SSE_ASSEMBLY_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s endif endif if WITH_REAL_SSE_BLOCK2_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c endif if WITH_REAL_AVX_BLOCK2_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c endif if WITH_REAL_SSE_BLOCK4_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c endif if WITH_REAL_AVX_BLOCK4_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c endif if WITH_REAL_SSE_BLOCK6_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c endif if WITH_REAL_AVX_BLOCK6_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c endif if WITH_COMPLEX_SSE_BLOCK1_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.c endif if WITH_COMPLEX_AVX_BLOCK1_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.c endif if WITH_COMPLEX_SSE_BLOCK2_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.c endif if WITH_COMPLEX_AVX_BLOCK2_KERNEL libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.c endif include generated_headers.am BUILT_SOURCES = $(generated_headers) # install any .mod files in the include/ dir elpa_includedir = 
$(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@ nobase_elpa_include_HEADERS = $(wildcard modules/*) nobase_elpa_include_HEADERS += elpa/elpa.h elpa/elpa_kernel_constants.h elpa/elpa_generated.h dist_man_MANS = \ man/solve_evp_real.3 \ man/solve_evp_real_1stage.3 \ man/solve_evp_complex.3 \ man/solve_evp_complex_1stage.3 \ man/solve_evp_real_2stage.3 \ man/solve_evp_complex_2stage.3 \ man/get_elpa_row_col_comms.3 \ man/get_elpa_communicators.3 \ man/elpa2_print_kernels.1 # other files to distribute filesdir = $(docdir)/examples dist_files_DATA = \ test/fortran_test_programs/read_real.F90 \ test/fortran_test_programs/test_complex2.F90 \ test/fortran_test_programs/test_complex2_default_kernel.F90 \ test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 \ test/fortran_test_programs/test_complex.F90 \ test/fortran_test_programs/test_real2.F90 \ test/fortran_test_programs/test_real2_default_kernel.F90 \ test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 \ test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 \ test/fortran_test_programs/test_real.F90 \ test/fortran_test_programs/test_real_with_c.F90 \ src/elpa2_print_kernels.F90 dist_doc_DATA = README.md USERS_GUIDE.md INSTALL.md CONTRIBUTING.md LICENSE Changelog COPYING/COPYING COPYING/gpl.txt COPYING/lgpl.txt # pkg-config stuff pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = @PKG_CONFIG_FILE@ # programs bin_PROGRAMS = \ elpa2_print_kernels@SUFFIX@ noinst_PROGRAMS = \ elpa1_test_real@SUFFIX@ \ elpa1_test_complex@SUFFIX@ \ elpa2_test_real@SUFFIX@ \ elpa2_test_complex@SUFFIX@ \ elpa2_test_real_default_kernel@SUFFIX@ \ elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@ \ elpa2_test_complex_default_kernel@SUFFIX@ \ elpa2_test_real_choose_kernel_with_api@SUFFIX@ \ elpa2_test_complex_choose_kernel_with_api@SUFFIX@ \ elpa1_test_real_with_c@SUFFIX@ if !WITH_OPENMP noinst_PROGRAMS += \ elpa1_test_real_c_version@SUFFIX@ \ elpa1_test_complex_c_version@SUFFIX@ \ 
elpa2_test_real_c_version@SUFFIX@ \ elpa2_test_complex_c_version@SUFFIX@ endif build_lib = libelpa@SUFFIX@.la if HAVE_REDIRECT redirect_sources = test/shared_sources/redir.c test/shared_sources/redirect.F90 else redirect_sources = endif #test/shared_sources/mod_precision_created.f90: src/mod_precision.f90 # cp $(top_srcdir)/src/mod_precision.f90 $(top_srcdir)/test/shared_sources/mod_precision_created.f90 shared_sources = test/shared_sources/util.F90 test/shared_sources/read_input_parameters.F90 \ test/shared_sources/check_correctnes.F90 test/shared_sources/setup_mpi.F90 \ test/shared_sources/blacs_infrastructure.F90 test/shared_sources/prepare_matrix.F90 \ test/shared_sources/mod_output_types.F90 if !WITH_OPENMP elpa1_test_real_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa1_test_real_c_version.c $(shared_sources) $(redirect_sources) elpa1_test_real_c_version@SUFFIX@_LDADD = $(build_lib) elpa1_test_real_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) EXTRA_elpa1_test_real_c_version@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa1_test_complex_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa1_test_complex_c_version.c $(shared_sources) $(redirect_sources) elpa1_test_complex_c_version@SUFFIX@_LDADD = $(build_lib) elpa1_test_complex_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) EXTRA_elpa1_test_complex_c_version@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa2_test_real_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa2_test_real_c_version.c $(shared_sources) $(redirect_sources) elpa2_test_real_c_version@SUFFIX@_LDADD = $(build_lib) elpa2_test_real_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) EXTRA_elpa2_test_real_c_version@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa2_test_complex_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa2_test_complex_c_version.c $(shared_sources) $(redirect_sources) 
elpa2_test_complex_c_version@SUFFIX@_LDADD = $(build_lib) elpa2_test_complex_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) EXTRA_elpa2_test_complex_c_version@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 endif elpa1_test_real@SUFFIX@_SOURCES = test/fortran_test_programs/test_real.F90 $(shared_sources) $(redirect_sources) elpa1_test_real@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa1_test_real@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa1_test_real_with_c@SUFFIX@_SOURCES = test/fortran_test_programs/test_real_with_c.F90 test/shared_sources/mod_from_c.F90 \ test/shared_sources/call_elpa1.c $(shared_sources) $(redirect_sources) elpa1_test_real_with_c@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa1_test_real_with_c@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 #elpa1_test_complex_with_c@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex_with_c.F90 test/shared_sources/mod_from_c.F90 test/shared_sources/call_elpa1.c $(shared_sources) $(redirect_sources) #elpa1_test_complex_with_c@SUFFIX@_LDADD = $(build_lib) #EXTRA_elpa1_test_complex_with_c@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa2_test_real@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2.F90 $(shared_sources) $(redirect_sources) elpa2_test_real@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa2_test_real@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa2_test_real_default_kernel@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel.F90 $(shared_sources) $(redirect_sources) elpa2_test_real_default_kernel@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa2_test_real_default_kernel@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 \ 
$(shared_sources) $(redirect_sources) elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa2_test_real_choose_kernel_with_api@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 \ $(shared_sources) $(redirect_sources) elpa2_test_real_choose_kernel_with_api@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa2_test_real_choose_kernel_with_api@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa1_test_complex@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex.F90 $(shared_sources) $(redirect_sources) elpa1_test_complex@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa1_test_complex@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa2_test_complex@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2.F90 $(shared_sources) $(redirect_sources) elpa2_test_complex@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa2_test_complex@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa2_test_complex_default_kernel@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_default_kernel.F90 $(shared_sources) $(redirect_sources) elpa2_test_complex_default_kernel@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa2_test_complex_default_kernel@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa2_test_complex_choose_kernel_with_api@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 \ $(shared_sources) $(redirect_sources) elpa2_test_complex_choose_kernel_with_api@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa2_test_complex_choose_kernel_with_api@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa2_print_kernels@SUFFIX@_SOURCES = src/elpa2_print_kernels.F90 $(shared_sources) $(redirect_sources) 
elpa2_print_kernels@SUFFIX@_LDADD = $(build_lib)

# Everything "make check" runs: one generated launcher script per test
# program, plus the kernel-listing binary itself.
check_SCRIPTS = \
  elpa1_test_real@SUFFIX@.sh \
  elpa1_test_real_with_c@SUFFIX@.sh \
  elpa2_test_real@SUFFIX@.sh \
  elpa2_test_real_default_kernel@SUFFIX@.sh \
  elpa1_test_complex@SUFFIX@.sh \
  elpa2_test_complex@SUFFIX@.sh \
  elpa2_test_complex_default_kernel@SUFFIX@.sh \
  elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh \
  elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh \
  elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh \
  elpa2_print_kernels@SUFFIX@

if !WITH_OPENMP
check_SCRIPTS += \
  elpa1_test_real_c_version@SUFFIX@.sh \
  elpa1_test_complex_c_version@SUFFIX@.sh \
  elpa2_test_real_c_version@SUFFIX@.sh \
  elpa2_test_complex_c_version@SUFFIX@.sh
endif

# test scripts
#
# "wrapper" is the launcher prefix written into every generated script.
# It must NOT be quoted here: make keeps quote characters as part of the
# value, and the script would then contain
#     "mpiexec -n 2 "./prog $TEST_FLAGS
# which the shell treats as one single (non-existent) command name.
if WITH_MPI
wrapper = mpiexec -n 2
else
wrapper =
endif
TESTS = $(check_SCRIPTS)

# Generate <program>.sh, which runs <program> (through the wrapper, if any)
# with whatever $TEST_FLAGS holds when the script is executed.
# $$ defers the expansion of TEST_FLAGS from make to the script's shell.
%.sh: %
	echo '$(wrapper) ./$< $$TEST_FLAGS' > $@
	chmod +x $@

## this one does not want any arguments
#elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh:
#	echo '$(wrapper) ./elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@' > $@
#	chmod +x $@

#elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@.sh:
#	echo '$(wrapper) ./elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@' > $@
#	chmod +x $@

# Preprocessed files (just used for manual inspection)
elpa2_utilities.i: $(top_srcdir)/src/elpa2_utilities.F90
	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $< -o $@

elpa2.i: $(top_srcdir)/src/elpa2.F90
	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $< -o $@

elpa1.i: $(top_srcdir)/src/elpa1.F90
	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $< -o $@

elpa2_kernels_real.i: $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90
	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $< -o $@

# NOTE(review): the recipe below is cut by the source line break; its
# operands follow on the next source line, hence the explicit continuation.
mod_compute_hh_trafo_real.i: $(top_srcdir)/src/mod_compute_hh_trafo_real.F90
	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c \
$(top_srcdir)/src/mod_compute_hh_trafo_real.F90 -o $@ mod_compute_hh_trafo_complex.i: $(top_srcdir)/src/mod_compute_hh_trafo_complex.F90 $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/mod_compute_hh_trafo_complex.F90 -o $@ include doxygen.am CLEANFILES = \ elpa-generated.h \ elpa1_test* \ elpa2_test*\ *.i clean-local: -rm -rf modules/* .fortran_dependencies/* -rm -rf $(generated_headers) distclean-local: -rm config-f90.h -rm -rf ./src/elpa2_kernels/.deps -rm -rf ./src/.deps -rm -rf ./test/.deps -rmdir ./src/elpa2_kernels/ -rmdir ./src -rmdir ./test -rmdir ./m4 -rmdir modules/ -rmdir .fortran_dependencies/ EXTRA_DIST = \ fdep/fortran_dependencies.pl \ fdep/fortran_dependencies.mk \ test/fortran_test_programs/elpa_test_programs_print_headers.X90 \ src/elpa_reduce_add_vectors.X90 \ src/elpa_transpose_vectors.X90 \ src/redist_band.X90 \ elpa.spec LIBTOOL_DEPS = @LIBTOOL_DEPS@ libtool: $(LIBTOOL_DEPS) $(SHELL) ./config.status libtool @FORTRAN_MODULE_DEPS@ # Fortran module dependencies only work within each target, # specify that the test programs need a finished library before # one can compile them # $1 Object name define require_elpa_lib $1: libelpa@SUFFIX@.la endef $(foreach p,$(bin_PROGRAMS) $(noinst_PROGRAMS),$(foreach o,$($p_OBJECTS),$(eval $(call require_elpa_lib,$o)))) elpa-2016.05.001/USERS_GUIDE.md0000644000312500001440000005410312717537604012201 00000000000000## Users guide for the ELPA library ## This document provides the guide for using the *ELPA* library in user applications. ### Online and local documentation ### Local documentation (via man pages) should be available (if *ELPA* has been installed with the documentation): For example "man get_elpa_communicators" should provide the documentation for the *ELPA* function which sets the necessary communicators. Also a [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2016.05.001/html/index.html) for each *ELPA* release is available. 
### General concept of the *ELPA* library ### The *ELPA* library consists of two main parts: - *ELPA 1stage* solver - *ELPA 2stage* solver Both variants of the *ELPA* solvers are available for real or complex valued matrices. Thus *ELPA* provides the following user functions (see man pages or [online](http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2016.05.001/html/index.html) for details): - get_elpa_communicators : set the row / column communicators for *ELPA* - solve_evp_complex_1stage : solve a complex valued eigenvalue problem with the *ELPA 1stage* solver - solve_evp_real_1stage : solve a real valued eigenvalue problem with the *ELPA 1stage* solver - solve_evp_complex_2stage : solve a complex valued eigenvalue problem with the *ELPA 2stage* solver - solve_evp_real_2stage : solve a real valued eigenvalue problem with the *ELPA 2stage* solver Furthermore *ELPA* provides the utility binary "elpa2_print_kernels": it tells the user which *ELPA 2stage* compute kernels have been installed and which default kernels are set. If you want to solve an eigenvalue problem with *ELPA*, you have to decide whether you want to use the *ELPA 1stage* or the *ELPA 2stage* solver. Normally, *ELPA 2stage* is the better choice since it is faster, but there are matrix dimensions where *ELPA 1stage* is superior. Independent of the choice of the solver, the concept of calling *ELPA* is always the same: #### MPI version of *ELPA* #### In this case, *ELPA* relies on a BLACS distributed matrix. To solve an eigenvalue problem of this matrix with *ELPA*, one has 1. to include the *ELPA* header (C case) or module (Fortran) 2. to create row and column MPI communicators for ELPA (with "get_elpa_communicators") 3. to call *ELPA 1stage* or *ELPA 2stage* for the matrix. Here is a very simple MPI code snippet for using *ELPA 1stage*: For the definition of all variables please have a look at the man pages and/or the online documentation (see above). 
A full version of a simple example program can be found in ./test_project/src. ! All ELPA routines need MPI communicators for communicating within ! rows or columns of processes, these are set in get_elpa_communicators success = get_elpa_communicators(mpi_comm_world, my_prow, my_pcol, & mpi_comm_rows, mpi_comm_cols) if (myid==0) then print '(a)','| Past split communicator setup for rows and columns.' end if ! Determine the necessary size of the distributed matrices, ! we use the Scalapack tools routine NUMROC for that. na_rows = numroc(na, nblk, my_prow, 0, np_rows) na_cols = numroc(na, nblk, my_pcol, 0, np_cols) !------------------------------------------------------------------------------- ! Calculate eigenvalues/eigenvectors if (myid==0) then print '(a)','| Entering one-step ELPA solver ... ' print * end if success = solve_evp_real_1stage(na, nev, a, na_rows, ev, z, na_rows, nblk, & matrixCols, mpi_comm_rows, mpi_comm_cols) if (myid==0) then print '(a)','| One-step ELPA solver complete.' print * end if #### Shared-memory version of *ELPA* #### If the *ELPA* library has been compiled with the configure option "--with-mpi=0", no MPI will be used. Still the **same** call sequence as in the MPI case can be used (see above). 
#### Setting the row and column communicators #### SYNOPSIS FORTRAN INTERFACE use elpa1 success = get_elpa_communicators (mpi_comm_global, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols) integer, intent(in) mpi_comm_global: global communicator for the calculation integer, intent(in) my_prow: row coordinate of the calling process in the process grid integer, intent(in) my_pcol: column coordinate of the calling process in the process grid integer, intent(out) mpi_comm_rows: communicator for communication within rows of processes integer, intent(out) mpi_comm_cols: communicator for communication within columns of processes integer success: return value indicating success or failure of the underlying MPI_COMM_SPLIT function C INTERFACE #include "elpa_generated.h" success = get_elpa_communicators (int mpi_comm_global, int my_prow, int my_pcol, int *mpi_comm_rows, int *mpi_comm_cols); int mpi_comm_global: global communicator for the calculation int my_prow: row coordinate of the calling process in the process grid int my_pcol: column coordinate of the calling process in the process grid int *mpi_comm_rows: pointer to the communicator for communication within rows of processes int *mpi_comm_cols: pointer to the communicator for communication within columns of processes int success: return value indicating success or failure of the underlying MPI_COMM_SPLIT function #### Using *ELPA 1stage* #### After setting up the *ELPA* row and column communicators (by calling get_elpa_communicators), only the real or complex valued solver has to be called: SYNOPSIS FORTRAN INTERFACE use elpa1 success = solve_evp_real_1stage (na, nev, a(lda,matrixCols), lda, ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) With the definitions of the input and output variables: integer, intent(in) na: global dimension of quadratic matrix a to solve integer, intent(in) nev: number of eigenvalues to be computed; the first nev eigenvalues are calculated real*8, intent(inout) a: 
locally distributed part of the matrix a. The local dimensions are lda x matrixCols integer, intent(in) lda: leading dimension of locally distributed matrix a real*8, intent(inout) ev: on output the first nev computed eigenvalues real*8, intent(inout) q: on output the first nev computed eigenvectors integer, intent(in) ldq: leading dimension of matrix q which stores the eigenvectors integer, intent(in) nblk: blocksize of block cyclic distribution, must be the same in both directions integer, intent(in) matrixCols: number of columns of locally distributed matrices a and q integer, intent(in) mpi_comm_rows: communicator for communication in rows. Constructed with get_elpa_communicators(3) integer, intent(in) mpi_comm_cols: communicator for communication in columns. Constructed with get_elpa_communicators(3) logical success: return value indicating success or failure C INTERFACE #include "elpa.h" success = solve_evp_real_1stage (int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols); With the definitions of the input and output variables: int na: global dimension of quadratic matrix a to solve int nev: number of eigenvalues to be computed; the first nev eigenvalues are calculated double *a: pointer to locally distributed part of the matrix a. The local dimensions are lda x matrixCols int lda: leading dimension of locally distributed matrix a double *ev: pointer to memory containing on output the first nev computed eigenvalues double *q: pointer to memory containing on output the first nev computed eigenvectors int ldq: leading dimension of matrix q which stores the eigenvectors int nblk: blocksize of block cyclic distribution, must be the same in both directions int matrixCols: number of columns of locally distributed matrices a and q int mpi_comm_rows: communicator for communication in rows. 
Constructed with get_elpa_communicators(3) int mpi_comm_cols: communicator for communication in columns. Constructed with get_elpa_communicators(3) int success: return value indicating success (1) or failure (0) DESCRIPTION Solve the real eigenvalue problem with the 1-stage solver. The ELPA communicators mpi_comm_rows and mpi_comm_cols are obtained with the get_elpa_communicators(3) function. The distributed quadratic matrix a has global dimensions na x na, and a local size lda x matrixCols. The solver will compute the first nev eigenvalues, which will be stored on exit in ev. The eigenvectors corresponding to the eigenvalues will be stored in q. All memory of the arguments must be allocated outside the call to the solver. FORTRAN INTERFACE use elpa1 success = solve_evp_complex_1stage (na, nev, a(lda,matrixCols), lda, ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) With the definitions of the input and output variables: integer, intent(in) na: global dimension of quadratic matrix a to solve integer, intent(in) nev: number of eigenvalues to be computed; the first nev eigenvalues are calculated complex*16, intent(inout) a: locally distributed part of the matrix a. The local dimensions are lda x matrixCols integer, intent(in) lda: leading dimension of locally distributed matrix a real*8, intent(inout) ev: on output the first nev computed eigenvalues complex*16, intent(inout) q: on output the first nev computed eigenvectors integer, intent(in) ldq: leading dimension of matrix q which stores the eigenvectors integer, intent(in) nblk: blocksize of block cyclic distribution, must be the same in both directions integer, intent(in) matrixCols: number of columns of locally distributed matrices a and q integer, intent(in) mpi_comm_rows: communicator for communication in rows. Constructed with get_elpa_communicators(3) integer, intent(in) mpi_comm_cols: communicator for communication in columns. 
Constructed with get_elpa_communicators(3) logical success: return value indicating success or failure C INTERFACE #include "elpa.h" #include <complex.h> success = solve_evp_complex_1stage (int na, int nev, double complex *a, int lda, double *ev, double complex *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols); With the definitions of the input and output variables: int na: global dimension of quadratic matrix a to solve int nev: number of eigenvalues to be computed; the first nev eigenvalues are calculated double complex *a: pointer to locally distributed part of the matrix a. The local dimensions are lda x matrixCols int lda: leading dimension of locally distributed matrix a double *ev: pointer to memory containing on output the first nev computed eigenvalues double complex *q: pointer to memory containing on output the first nev computed eigenvectors int ldq: leading dimension of matrix q which stores the eigenvectors int nblk: blocksize of block cyclic distribution, must be the same in both directions int matrixCols: number of columns of locally distributed matrices a and q int mpi_comm_rows: communicator for communication in rows. Constructed with get_elpa_communicators(3) int mpi_comm_cols: communicator for communication in columns. Constructed with get_elpa_communicators(3) int success: return value indicating success (1) or failure (0) DESCRIPTION Solve the complex eigenvalue problem with the 1-stage solver. The ELPA communicators mpi_comm_rows and mpi_comm_cols are obtained with the get_elpa_communicators(3) function. The distributed quadratic matrix a has global dimensions na x na, and a local size lda x matrixCols. The solver will compute the first nev eigenvalues, which will be stored on exit in ev. The eigenvectors corresponding to the eigenvalues will be stored in q. All memory of the arguments must be allocated outside the call to the solver. 
The *ELPA 1stage* solver does not need or accept any parameters other than those in the above specification. #### Using *ELPA 2stage* #### The *ELPA 2stage* solver can be used in the same manner as the *ELPA 1stage* solver. However, the 2stage solver can be used with different compute kernels, which offers more possibilities for configuration. It is recommended to first call the utility program elpa2_print_kernels which will list all the compute kernels that can be used with *ELPA 2stage*. It will also tell whether a kernel can be set via environment variables. ##### Using the default kernels ##### If no kernel is set either via an environment variable or the *ELPA 2stage API* then the default kernels will be set. ##### Setting the *ELPA 2stage* compute kernels ##### If the *ELPA* installation allows setting the compute kernels with environment variables, setting the variables "REAL_ELPA_KERNEL" and "COMPLEX_ELPA_KERNEL" will set the compute kernels. The environment variable setting will take precedence over all other settings! It is also possible to set the *ELPA 2stage* compute kernels via the API. SYNOPSIS FORTRAN INTERFACE use elpa1 use elpa2 success = solve_evp_real_2stage (na, nev, a(lda,matrixCols), lda, ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQr=useQR) With the definitions of the input and output variables: integer, intent(in) na: global dimension of quadratic matrix a to solve integer, intent(in) nev: number of eigenvalues to be computed; the first nev eigenvalues are calculated real*8, intent(inout) a: locally distributed part of the matrix a. 
The local dimensions are lda x matrixCols integer, intent(in) lda: leading dimension of locally distributed matrix a real*8, intent(inout) ev: on output the first nev computed eigenvalues real*8, intent(inout) q: on output the first nev computed eigenvectors integer, intent(in) ldq: leading dimension of matrix q which stores the eigenvectors integer, intent(in) nblk: blocksize of block cyclic distribution, must be the same in both directions integer, intent(in) matrixCols: number of columns of locally distributed matrices a and q integer, intent(in) mpi_comm_rows: communicator for communication in rows. Constructed with get_elpa_communicators(3) integer, intent(in) mpi_comm_cols: communicator for communication in columns. Constructed with get_elpa_communicators(3) integer, intent(in) mpi_comm_all: communicator for all processes in the processor set involved in ELPA logical, intent(in), optional useQR: optional argument; switches to QR-decomposition if set to .true. logical success: return value indicating success or failure C INTERFACE #include "elpa.h" success = solve_evp_real_2stage (int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_REAL_ELPA_KERNEL, int useQR); With the definitions of the input and output variables: int na: global dimension of quadratic matrix a to solve int nev: number of eigenvalues to be computed; the first nev eigenvalues are calculated double *a: pointer to locally distributed part of the matrix a. 
The local dimensions are lda x matrixCols int lda: leading dimension of locally distributed matrix a double *ev: pointer to memory containing on output the first nev computed eigenvalues double *q: pointer to memory containing on output the first nev computed eigenvectors int ldq: leading dimension of matrix q which stores the eigenvectors int nblk: blocksize of block cyclic distribution, must be the same in both directions int matrixCols: number of columns of locally distributed matrices a and q int mpi_comm_rows: communicator for communication in rows. Constructed with get_elpa_communicators(3) int mpi_comm_cols: communicator for communication in columns. Constructed with get_elpa_communicators(3) int mpi_comm_all: communicator for all processes in the processor set involved in ELPA int useQR: if set to 1 switch to QR-decomposition int success: return value indicating success (1) or failure (0) DESCRIPTION Solve the real eigenvalue problem with the 2-stage solver. The ELPA communicators mpi_comm_rows and mpi_comm_cols are obtained with the get_elpa_communicators(3) function. The distributed quadratic matrix a has global dimensions na x na, and a local size lda x matrixCols. The solver will compute the first nev eigenvalues, which will be stored on exit in ev. The eigenvectors corresponding to the eigenvalues will be stored in q. All memory of the arguments must be allocated outside the call to the solver. SYNOPSIS FORTRAN INTERFACE use elpa1 use elpa2 success = solve_evp_complex_2stage (na, nev, a(lda,matrixCols), lda, ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL) With the definitions of the input and output variables: integer, intent(in) na: global dimension of quadratic matrix a to solve integer, intent(in) nev: number of eigenvalues to be computed; the first nev eigenvalues are calculated complex*16, intent(inout) a: locally distributed part of the matrix a. 
The local dimensions are lda x matrixCols integer, intent(in) lda: leading dimension of locally distributed matrix a real*8, intent(inout) ev: on output the first nev computed eigenvalues complex*16, intent(inout) q: on output the first nev computed eigenvectors integer, intent(in) ldq: leading dimension of matrix q which stores the eigenvectors integer, intent(in) nblk: blocksize of block cyclic distribution, must be the same in both directions integer, intent(in) matrixCols: number of columns of locally distributed matrices a and q integer, intent(in) mpi_comm_rows: communicator for communication in rows. Constructed with get_elpa_communicators(3) integer, intent(in) mpi_comm_cols: communicator for communication in columns. Constructed with get_elpa_communicators(3) integer, intent(in) mpi_comm_all: communicator for all processes in the processor set involved in ELPA logical success: return value indicating success or failure C INTERFACE #include "elpa.h" #include <complex.h> success = solve_evp_complex_2stage (int na, int nev, double complex *a, int lda, double *ev, double complex *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_COMPLEX_ELPA_KERNEL); With the definitions of the input and output variables: int na: global dimension of quadratic matrix a to solve int nev: number of eigenvalues to be computed; the first nev eigenvalues are calculated double complex *a: pointer to locally distributed part of the matrix a. 
The local dimensions are lda x matrixCols int lda: leading dimension of locally distributed matrix a double *ev: pointer to memory containing on output the first nev computed eigenvalues double complex *q: pointer to memory containing on output the first nev computed eigenvectors int ldq: leading dimension of matrix q which stores the eigenvectors int nblk: blocksize of block cyclic distribution, must be the same in both directions int matrixCols: number of columns of locally distributed matrices a and q int mpi_comm_rows: communicator for communication in rows. Constructed with get_elpa_communicators(3) int mpi_comm_cols: communicator for communication in columns. Constructed with get_elpa_communicators(3) int mpi_comm_all: communicator for all processes in the processor set involved in ELPA int success: return value indicating success (1) or failure (0) DESCRIPTION Solve the complex eigenvalue problem with the 2-stage solver. The ELPA communicators mpi_comm_rows and mpi_comm_cols are obtained with the get_elpa_communicators(3) function. The distributed quadratic matrix a has global dimensions na x na, and a local size lda x matrixCols. The solver will compute the first nev eigenvalues, which will be stored on exit in ev. The eigenvectors corresponding to the eigenvalues will be stored in q. All memory of the arguments must be allocated outside the call to the solver. elpa-2016.05.001/ltmain.sh0000755000312500001440000117077112717533401011666 00000000000000#! /bin/sh ## DO NOT EDIT - This file generated from ./build-aux/ltmain.in ## by inline-source v2014-01-03.01 # libtool (GNU libtool) 2.4.6 # Provide generalized library-building support services. # Written by Gordon Matzigkeit, 1996 # Copyright (C) 1996-2015 Free Software Foundation, Inc. # This is free software; see the source for copying conditions. There is NO # warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
# GNU Libtool is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # As a special exception to the GNU General Public License, # if you distribute this file as part of a program or library that # is built using GNU Libtool, you may include this file under the # same distribution terms that you use for the rest of that program. # # GNU Libtool is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . PROGRAM=libtool PACKAGE=libtool VERSION=2.4.6 package_revision=2.4.6 ## ------ ## ## Usage. ## ## ------ ## # Run './libtool --help' for help with using this script from the # command line. ## ------------------------------- ## ## User overridable command paths. ## ## ------------------------------- ## # After configure completes, it has a better idea of some of the # shell tools we need than the defaults used by the functions shared # with bootstrap, so set those here where they can still be over- # ridden by the user, but otherwise take precedence. : ${AUTOCONF="autoconf"} : ${AUTOMAKE="automake"} ## -------------------------- ## ## Source external libraries. ## ## -------------------------- ## # Much of our low-level functionality needs to be sourced from external # libraries, which are installed to $pkgauxdir. # Set a version string for this script. scriptversion=2015-01-20.17; # UTC # General shell script boiler plate, and helper functions. # Written by Gary V. Vaughan, 2004 # Copyright (C) 2004-2015 Free Software Foundation, Inc. # This is free software; see the source for copying conditions. 
# There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.

# As a special exception to the GNU General Public License, if you distribute
# this file as part of a program or library that is built using GNU Libtool,
# you may include this file under the same distribution terms that you use
# for the rest of that program.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# Please report bugs or propose patches to gary@gnu.org.


## ------ ##
## Usage. ##
## ------ ##

# Evaluate this file near the top of your script to gain access to
# the functions and variables defined here:
#
#   . `echo "$0" | ${SED-sed} 's|[^/]*$||'`/build-aux/funclib.sh
#
# If you need to override any of the default environment variable
# settings, do that before evaluating this file.


## -------------------- ##
## Shell normalisation. ##
## -------------------- ##

# Some shells need a little help to be as Bourne compatible as possible.
# Before doing anything else, make sure all that help has been provided!

DUALCASE=1; export DUALCASE # for MKS sh
if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then :
  emulate sh
  NULLCMD=:
  # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
  # is contrary to our usage.  Disable this feature.
  alias -g '${1+"$@"}'='"$@"'
  setopt NO_GLOB_SUBST
else
  case `(set -o) 2>/dev/null` in *posix*) set -o posix ;; esac
fi

# NLS nuisances: We save the old values in case they are required later.
# For every locale variable that is set, remember its value in save_VAR,
# force it to C, and extend two command strings: _G_user_locale restores
# the user's settings, _G_safe_locale re-applies the C locale.
_G_user_locale=
_G_safe_locale=
for _G_var in LANG LANGUAGE LC_ALL LC_CTYPE LC_COLLATE LC_MESSAGES
do
  eval "if test set = \"\${$_G_var+set}\"; then
          save_$_G_var=\$$_G_var
          $_G_var=C
          export $_G_var
          _G_user_locale=\"$_G_var=\\\$save_\$_G_var; \$_G_user_locale\"
          _G_safe_locale=\"$_G_var=C; \$_G_safe_locale\"
        fi"
done

# CDPATH.
(unset CDPATH) >/dev/null 2>&1 && unset CDPATH

# Make sure IFS has a sensible default (space, TAB, newline).
# '$nl' must hold a real newline: later code splits messages on it.
sp=' '
nl='
'
IFS="$sp	$nl"

# There are apparently some systems that use ';' as a PATH separator!
if test "${PATH_SEPARATOR+set}" != set; then
  PATH_SEPARATOR=:
  (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
    (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
      PATH_SEPARATOR=';'
  }
fi



## ------------------------- ##
## Locate command utilities. ##
## ------------------------- ##


# func_executable_p FILE
# ----------------------
# Check that FILE is an executable regular file.
func_executable_p ()
{
    test -f "$1" && test -x "$1"
}


# func_path_progs PROGS_LIST CHECK_FUNC [PATH]
# --------------------------------------------
# Search for either a program that responds to --version with output
# containing "GNU", or else returned by CHECK_FUNC otherwise, by
# trying all the directories in PATH with each of the elements of
# PROGS_LIST.
#
# CHECK_FUNC should accept the path to a candidate program, and
# set $func_check_prog_result if it truncates its output less than
# $_G_path_prog_max characters.
#
# The winner is left in $func_path_progs_result.  If no candidate is
# acceptable, a diagnostic is printed and the script exits with status 1.
func_path_progs ()
{
    _G_progs_list=$1
    _G_check_func=$2
    _G_PATH=${3-"$PATH"}

    _G_path_prog_max=0
    _G_path_prog_found=false
    # Start from a clean slate so that the outcome of a previous search
    # (e.g. for sed) can never be reported as the result of this one.
    func_path_progs_result=
    func_check_prog_result=
    _G_save_IFS=$IFS; IFS=${PATH_SEPARATOR-:}
    for _G_dir in $_G_PATH; do
      IFS=$_G_save_IFS
      test -z "$_G_dir" && _G_dir=.
      for _G_prog_name in $_G_progs_list; do
        for _exeext in '' .EXE; do
          _G_path_prog=$_G_dir/$_G_prog_name$_exeext
          func_executable_p "$_G_path_prog" || continue
          case `"$_G_path_prog" --version 2>&1` in
            *GNU*)
              # A GNU implementation wins outright.
              func_path_progs_result=$_G_path_prog
              _G_path_prog_found=:
              ;;
            *)
              # Otherwise let CHECK_FUNC score the candidate; it keeps
              # the best one seen so far in func_check_prog_result.
              $_G_check_func "$_G_path_prog"
              func_path_progs_result=$func_check_prog_result
              ;;
          esac
          $_G_path_prog_found && break 3
        done
      done
    done
    IFS=$_G_save_IFS
    test -z "$func_path_progs_result" && {
      echo "no acceptable program among '$_G_progs_list' could be found in \$PATH" >&2
      exit 1
    }
}


# We want to be able to use the functions in this file before configure
# has figured out where the best binaries are kept, which means we have
# to search for them ourselves - except when the results are already set
# where we skip the searches.

# Unless the user overrides by setting SED, search the path for either GNU
# sed, or the sed that truncates its output the least.
test -z "$SED" && {
  # Build a 99-line sed script of long substitutions for the probe.
  _G_sed_script=s/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb/
  for _G_i in 1 2 3 4 5 6 7; do
    _G_sed_script=$_G_sed_script$nl$_G_sed_script
  done
  echo "$_G_sed_script" 2>/dev/null | sed 99q >conftest.sed
  _G_sed_script=

  # Score candidate sed $1 by how many doublings of the input line it
  # can pass through unmodified (capped at 11 rounds).
  func_check_prog_sed ()
  {
    _G_path_prog=$1

    _G_count=0
    printf 0123456789 >conftest.in
    while :
    do
      cat conftest.in conftest.in >conftest.tmp
      mv conftest.tmp conftest.in
      cp conftest.in conftest.nl
      echo '' >> conftest.nl
      # Feed the probe line on stdin and capture what comes out.
      "$_G_path_prog" -f conftest.sed <conftest.nl >conftest.out 2>/dev/null || break
      diff conftest.out conftest.nl >/dev/null 2>&1 || break
      _G_count=`expr $_G_count + 1`
      if test "$_G_count" -gt "$_G_path_prog_max"; then
        # Best one so far, save it but keep looking for a better one
        func_check_prog_result=$_G_path_prog
        _G_path_prog_max=$_G_count
      fi
      # 10*(2^10) chars as input seems more than enough
      test 10 -lt "$_G_count" && break
    done
    rm -f conftest.in conftest.tmp conftest.nl conftest.out
  }

  func_path_progs "sed gsed" func_check_prog_sed $PATH:/usr/xpg4/bin
  rm -f conftest.sed
  SED=$func_path_progs_result
}


# Unless the user overrides by setting
# GREP, search the path for either GNU
# grep, or the grep that truncates its output the least.
test -z "$GREP" && {
  # Score candidate grep $1 by how many doublings of the input line it
  # can still match in full (capped at 11 rounds).
  func_check_prog_grep ()
  {
    _G_path_prog=$1

    _G_count=0
    _G_path_prog_max=0
    printf 0123456789 >conftest.in
    while :
    do
      cat conftest.in conftest.in >conftest.tmp
      mv conftest.tmp conftest.in
      cp conftest.in conftest.nl
      echo 'GREP' >> conftest.nl
      # Feed the probe line on stdin and capture what comes out.
      "$_G_path_prog" -e 'GREP$' -e '-(cannot match)-' <conftest.nl >conftest.out 2>/dev/null || break
      diff conftest.out conftest.nl >/dev/null 2>&1 || break
      _G_count=`expr $_G_count + 1`
      if test "$_G_count" -gt "$_G_path_prog_max"; then
        # Best one so far, save it but keep looking for a better one
        func_check_prog_result=$_G_path_prog
        _G_path_prog_max=$_G_count
      fi
      # 10*(2^10) chars as input seems more than enough
      test 10 -lt "$_G_count" && break
    done
    rm -f conftest.in conftest.tmp conftest.nl conftest.out
  }

  func_path_progs "grep ggrep" func_check_prog_grep $PATH:/usr/xpg4/bin
  GREP=$func_path_progs_result
}


## ------------------------------- ##
## User overridable command paths. ##
## ------------------------------- ##

# All uppercase variable names are used for environment variables.  These
# variables can be overridden by the user before calling a script that
# uses them if a suitable command of that name is not already available
# in the command search PATH.

: ${CP="cp -f"}
: ${ECHO="printf %s\n"}
: ${EGREP="$GREP -E"}
: ${FGREP="$GREP -F"}
: ${LN_S="ln -s"}
: ${MAKE="make"}
: ${MKDIR="mkdir"}
: ${MV="mv -f"}
: ${RM="rm -f"}
: ${SHELL="${CONFIG_SHELL-/bin/sh}"}


## -------------------- ##
## Useful sed snippets. ##
## -------------------- ##

# Strip the last path component / everything up to the last slash.
sed_dirname='s|/[^/]*$||'
sed_basename='s|^.*/||'

# Sed substitution that helps us do robust quoting.  It backslashifies
# metacharacters that are still active within double-quoted strings.
sed_quote_subst='s|\([`"$\\]\)|\\\1|g'

# Same as above, but do not quote variable references.
sed_double_quote_subst='s/\(["`\\]\)/\\\1/g'

# Sed substitution that turns a string into a regex matching for the
# string literally.
sed_make_literal_regex='s|[].[^$\\*\/]|\\&|g'

# Sed substitution that converts a w32 file name or path
# that contains forward slashes, into one that contains
# (escaped) backslashes.  A very naive implementation.
sed_naive_backslashify='s|\\\\*|\\|g;s|/|\\|g;s|\\|\\\\|g'

# Re-'\' parameter expansions in output of sed_double_quote_subst that
# were '\'-ed in input to the same.  If an odd number of '\' preceded a
# '$' in input to sed_double_quote_subst, that '$' was protected from
# expansion.  Since each input '\' is now two '\'s, look for any number
# of runs of four '\'s followed by two '\'s and then a '$'.  '\' that '$'.
#
# The first command inserts a real newline after each run of four
# backslashes and the last command removes those newlines again, so the
# line break inside the string below is significant.
_G_bs='\\'
_G_bs2='\\\\'
_G_bs4='\\\\\\\\'
_G_dollar='\$'
sed_double_backslash="\
  s/$_G_bs4/&\\
/g
  s/^$_G_bs2$_G_dollar/$_G_bs&/
  s/\\([^$_G_bs]\\)$_G_bs2$_G_dollar/\\1$_G_bs2$_G_bs$_G_dollar/g
  s/\n//g"


## ----------------- ##
## Global variables. ##
## ----------------- ##

# Except for the global variables explicitly listed below, the following
# functions in the '^func_' namespace, and the '^require_' namespace
# variables initialised in the 'Resource management' section, sourcing
# this file will not pollute your global namespace with anything
# else. There's no portable way to scope variables in Bourne shell
# though, so actually running these functions will sometimes place
# results into a variable named after the function, and often use
# temporary variables in the '^_G_' namespace. If you are careful to
# avoid using those namespaces casually in your sourcing script, things
# should continue to work as you expect. And, of course, you can freely
# overwrite any of the functions or variables defined here before
# calling anything to customize them.

EXIT_SUCCESS=0
EXIT_FAILURE=1
EXIT_MISMATCH=63  # $? = 63 is used to indicate version mismatch to missing.
EXIT_SKIP=77      # $?
# = 77 is used to indicate a skipped test to automake.

# Hook executed at the top of every function that starts with
# '$debug_cmd'.  It defaults to the no-op ':' unless the caller has
# already set it, for instance to have bash print a call trace:
#
#   debug_cmd='eval echo "${FUNCNAME[0]} $*" >&2' bash your-script-name
debug_cmd=${debug_cmd-":"}
exit_cmd=:

# Scripts are expected to finish with
#
#   exit $exit_status
#
# so that a failure can be recorded by setting exit_status to a
# non-zero value without bailing out at the point of failure.
exit_status=$EXIT_SUCCESS

# Capture $0 at top level.  On IRIX 6.4+ 'sh' is really ksh, and unless
# the _XPG environment variable equals 1 the positional parameter $0
# inside a function call is the name of that function.
progpath=$0

# The name of this program.
progname=`$ECHO "$progpath" |$SED "$sed_basename"`

# Make sure we have an absolute progpath for reexecution:
case $progpath in
  [\\/]*|[A-Za-z]:\\*)
    # Already absolute (POSIX or drive-letter form).
    ;;
  *[\\/]*)
    # Relative path with a directory part: resolve that directory.
    progdir=`$ECHO "$progpath" |$SED "$sed_dirname"`
    progdir=`cd "$progdir" && pwd`
    progpath=$progdir/$progname
    ;;
  *)
    # Bare command name: take the first PATH entry holding an
    # executable of that name.
    _G_IFS=$IFS
    IFS=${PATH_SEPARATOR-:}
    for progdir in $PATH; do
      IFS=$_G_IFS
      test -x "$progdir/$progname" && break
    done
    IFS=$_G_IFS
    test -n "$progdir" || progdir=`pwd`
    progpath=$progdir/$progname
    ;;
esac


## ----------------- ##
## Standard options. ##
## ----------------- ##

# The following options affect the operation of the functions defined
# below, and should be set appropriately depending on run-time para-
# meters passed on the command line.

opt_dry_run=false
opt_quiet=false
opt_verbose=false

# Categories 'all' and 'none' are always available.  Append any others
# you will pass as the first argument to func_warning from your own
# code.
warning_categories=

# By default, display warnings according to 'opt_warning_types'.
# Set 'warning_func' to ':' to elide all warnings, or func_fatal_error to
# treat the next displayed warning as a fatal error.
warning_func=func_warn_and_continue

# Set to 'all' to display all warnings, 'none' to suppress all
# warnings, or a space delimited list of some subset of
# 'warning_categories' to display only the listed warnings.
opt_warning_types=all


## -------------------- ##
## Resource management. ##
## -------------------- ##

# This section contains definitions for functions that each ensure a
# particular resource (a file, or a non-empty configuration variable for
# example) is available, and if appropriate to extract default values
# from pertinent package files. Call them using their associated
# 'require_*' variable to ensure that they are executed, at most, once.
#
# It's entirely deliberate that calling these functions can set
# variables that don't obey the namespace limitations obeyed by the rest
# of this file, in order that that they be as useful as possible to
# callers.


# require_term_colors
# -------------------
# Allow display of bold text on terminals that support it.
# Sets tc_reset, tc_bold, tc_standout, tc_red, tc_green, tc_blue and
# tc_cyan when stdout is a terminal; leaves them untouched otherwise.
require_term_colors=func_require_term_colors
func_require_term_colors ()
{
    $debug_cmd

    test -t 1 && {
      # COLORTERM and USE_ANSI_COLORS environment variables take
      # precedence, because most terminfo databases neglect to describe
      # whether color sequences are supported.
      test -n "${COLORTERM+set}" && : ${USE_ANSI_COLORS="1"}

      if test 1 = "$USE_ANSI_COLORS"; then
        # Standard ANSI escape sequences, generated with printf so that
        # no raw ESC byte has to live in this file.
        tc_reset=`printf '\033[0m'`
        tc_bold=`printf '\033[1m'`; tc_standout=`printf '\033[7m'`
        tc_red=`printf '\033[31m'`; tc_green=`printf '\033[32m'`
        tc_blue=`printf '\033[34m'`; tc_cyan=`printf '\033[36m'`
      else
        # Otherwise trust the terminfo database after all.
        test -n "`tput sgr0 2>/dev/null`" && {
          tc_reset=`tput sgr0`
          test -n "`tput bold 2>/dev/null`" && tc_bold=`tput bold`
          tc_standout=$tc_bold
          test -n "`tput smso 2>/dev/null`" && tc_standout=`tput smso`
          test -n "`tput setaf 1 2>/dev/null`" && tc_red=`tput setaf 1`
          test -n "`tput setaf 2 2>/dev/null`" && tc_green=`tput setaf 2`
          test -n "`tput setaf 4 2>/dev/null`" && tc_blue=`tput setaf 4`
          # ANSI colour 6 is cyan (5 would be magenta).
          test -n "`tput setaf 6 2>/dev/null`" && tc_cyan=`tput setaf 6`
        }
      fi
    }

    # Run at most once: later '$require_term_colors' calls are no-ops.
    require_term_colors=:
}


## ----------------- ##
## Function library. ##
## ----------------- ##

# This section contains a variety of useful functions to call in your
# scripts. Take note of the portable wrappers for features provided by
# some modern shells, which will fall back to slower equivalents on
# less featureful shells.


# func_append VAR VALUE
# ---------------------
# Append VALUE onto the existing contents of VAR.

  # We should try to minimise forks, especially on Windows where they are
  # unreasonably slow, so skip the feature probes when bash or zsh are
  # being used:
  if test set = "${BASH_VERSION+set}${ZSH_VERSION+set}"; then
    : ${_G_HAVE_ARITH_OP="yes"}
    : ${_G_HAVE_XSI_OPS="yes"}
    # The += operator was introduced in bash 3.1
    case $BASH_VERSION in
      [12].* | 3.0 | 3.0*) ;;
      *)
        : ${_G_HAVE_PLUSEQ_OP="yes"}
        ;;
    esac
  fi

  # _G_HAVE_PLUSEQ_OP
  # Can be empty, in which case the shell is probed, "yes" if += is
  # useable or anything else if it does not work.
  test -z "$_G_HAVE_PLUSEQ_OP" \
    && (eval 'x=a; x+=" b"; test "a b" = "$x"') 2>/dev/null \
    && _G_HAVE_PLUSEQ_OP=yes

if test yes = "$_G_HAVE_PLUSEQ_OP"
then
  # The shell understands '+=': define the function inside eval so that
  # shells without the operator never have to parse it.
  eval 'func_append ()
  {
    $debug_cmd

    eval "$1+=\$2"
  }'
else
  # ...otherwise fall back to plain reassignment.
  func_append ()
  {
    $debug_cmd

    eval "$1=\$$1\$2"
  }
fi


# func_append_quoted VAR VALUE
# ----------------------------
# Quote VALUE and append to the end of shell variable VAR, separated
# by a space.
if test yes = "$_G_HAVE_PLUSEQ_OP"; then
  eval 'func_append_quoted ()
  {
    $debug_cmd

    func_quote_for_eval "$2"
    eval "$1+=\\ \$func_quote_for_eval_result"
  }'
else
  func_append_quoted ()
  {
    $debug_cmd

    func_quote_for_eval "$2"
    eval "$1=\$$1\\ \$func_quote_for_eval_result"
  }
fi


# func_append_uniq VAR VALUE
# --------------------------
# Append unique VALUE onto the existing contents of VAR, assuming
# entries are delimited by the first character of VALUE.  For example:
#
#   func_append_uniq options " --another-option option-argument"
#
# will only append to $options if " --another-option option-argument "
# is not already present somewhere in $options already (note spaces at
# each end implied by leading space in second argument).
func_append_uniq ()
{
    $debug_cmd

    eval _G_current_value='`$ECHO $'$1'`'
    _G_delim=`expr "$2" : '\(.\)'`

    case $_G_delim$_G_current_value$_G_delim in
      *"$2$_G_delim"*) ;;
      *) func_append "$@" ;;
    esac
}


# func_arith TERM...
# ------------------
# Set func_arith_result to the result of evaluating TERMs.
  test -z "$_G_HAVE_ARITH_OP" \
    && (eval 'test 2 = $(( 1 + 1 ))') 2>/dev/null \
    && _G_HAVE_ARITH_OP=yes

if test yes = "$_G_HAVE_ARITH_OP"; then
  eval 'func_arith ()
  {
    $debug_cmd

    func_arith_result=$(( $* ))
  }'
else
  func_arith ()
  {
    $debug_cmd

    func_arith_result=`expr "$@"`
  }
fi


# func_basename FILE
# ------------------
# Set func_basename_result to FILE with everything up to and including
# the last / stripped.
if test yes = "$_G_HAVE_XSI_OPS"; then
  # If this shell supports suffix pattern removal, then use it to avoid
  # forking. Hide the definitions single quotes in case the shell chokes
  # on unsupported syntax...
  _b='func_basename_result=${1##*/}'
  _d='case $1 in
        */*) func_dirname_result=${1%/*}$2 ;;
        *  ) func_dirname_result=$3        ;;
      esac'

else
  # ...otherwise fall back to using sed.  The line breaks inside '_d'
  # separate shell statements and must be kept.
  _b='func_basename_result=`$ECHO "$1" |$SED "$sed_basename"`'
  _d='func_dirname_result=`$ECHO "$1"  |$SED "$sed_dirname"`
      if test "X$func_dirname_result" = "X$1"; then
        func_dirname_result=$3
      else
        func_append func_dirname_result "$2"
      fi'
fi

eval 'func_basename ()
{
    $debug_cmd

    '"$_b"'
}'


# func_dirname FILE APPEND NONDIR_REPLACEMENT
# -------------------------------------------
# Compute the dirname of FILE.  If nonempty, add APPEND to the result,
# otherwise set result to NONDIR_REPLACEMENT.
eval 'func_dirname ()
{
    $debug_cmd

    '"$_d"'
}'


# func_dirname_and_basename FILE APPEND NONDIR_REPLACEMENT
# --------------------------------------------------------
# Perform func_basename and func_dirname in a single function
# call:
#   dirname:  Compute the dirname of FILE.  If nonempty,
#             add APPEND to the result, otherwise set result
#             to NONDIR_REPLACEMENT.
#             value returned in "$func_dirname_result"
#   basename: Compute filename of FILE.
#             value returned in "$func_basename_result"
# For efficiency, we do not delegate to the functions above but instead
# duplicate the functionality here.
eval 'func_dirname_and_basename ()
{
    $debug_cmd

    '"$_b"'
    '"$_d"'
}'


# func_echo ARG...
# ----------------
# Echo program name prefixed message.
func_echo ()
{
    $debug_cmd

    _G_message=$*

    # Split the message on newlines only, restoring IFS for each line.
    func_echo_IFS=$IFS
    IFS=$nl
    for _G_line in $_G_message; do
      IFS=$func_echo_IFS
      $ECHO "$progname: $_G_line"
    done
    IFS=$func_echo_IFS
}


# func_echo_all ARG...
# --------------------
# Invoke $ECHO with all args, space-separated.
func_echo_all ()
{
    $ECHO "$*"
}


# func_echo_infix_1 INFIX ARG...
# ------------------------------
# Echo program name, followed by INFIX on the first line, with any
# additional lines not showing INFIX.
func_echo_infix_1 ()
{
    $debug_cmd

    $require_term_colors

    _G_infix=$1; shift
    _G_indent=$_G_infix
    _G_prefix="$progname: $_G_infix: "
    _G_message=$*

    # Strip color escape sequences before counting printable length
    for _G_tc in "$tc_reset" "$tc_bold" "$tc_standout" "$tc_red" "$tc_green" "$tc_blue" "$tc_cyan"
    do
      test -n "$_G_tc" && {
        _G_esc_tc=`$ECHO "$_G_tc" | $SED "$sed_make_literal_regex"`
        _G_indent=`$ECHO "$_G_indent" | $SED "s|$_G_esc_tc||g"`
      }
    done
    # Blank out the infix and add two spaces in place of the ': ' that
    # follows it, so continuation lines line up with the first line.
    _G_indent="$progname: "`echo "$_G_indent" | $SED 's|.| |g'`"  " ## exclude from sc_prohibit_nested_quotes

    func_echo_infix_1_IFS=$IFS
    IFS=$nl
    for _G_line in $_G_message; do
      IFS=$func_echo_infix_1_IFS
      $ECHO "$_G_prefix$tc_bold$_G_line$tc_reset" >&2
      _G_prefix=$_G_indent
    done
    IFS=$func_echo_infix_1_IFS
}


# func_error ARG...
# -----------------
# Echo program name prefixed message to standard error.
func_error ()
{
    $debug_cmd

    $require_term_colors

    # Two leading spaces pad 'error' to the width of 'warning'.
    func_echo_infix_1 "  $tc_standout${tc_red}error$tc_reset" "$*" >&2
}


# func_fatal_error ARG...
# -----------------------
# Echo program name prefixed message to standard error, and exit.
func_fatal_error ()
{
    $debug_cmd

    func_error "$*"
    exit $EXIT_FAILURE
}


# func_grep EXPRESSION FILENAME
# -----------------------------
# Check whether EXPRESSION matches any line of FILENAME, without output.
func_grep ()
{
    $debug_cmd

    $GREP "$1" "$2" >/dev/null 2>&1
}


# func_len STRING
# ---------------
# Set func_len_result to the length of STRING. STRING may not
# start with a hyphen.
  test -z "$_G_HAVE_XSI_OPS" \
    && (eval 'x=a/b/c;
      test 5aa/bb/cc = "${#x}${x%%/*}${x%/*}${x#*/}${x##*/}"') 2>/dev/null \
    && _G_HAVE_XSI_OPS=yes

if test yes = "$_G_HAVE_XSI_OPS"; then
  eval 'func_len ()
  {
    $debug_cmd

    func_len_result=${#1}
  }'
else
  func_len ()
  {
    $debug_cmd

    # If expr fails, report $max_cmd_len instead.
    func_len_result=`expr "$1" : ".*" 2>/dev/null || echo $max_cmd_len`
  }
fi


# func_mkdir_p DIRECTORY-PATH
# ---------------------------
# Make sure the entire path to DIRECTORY-PATH is available.
func_mkdir_p ()
{
    $debug_cmd

    _G_directory_path=$1
    _G_dir_list=

    # Nothing to do for an empty argument or in dry-run mode.
    if test -n "$_G_directory_path" && test : != "$opt_dry_run"; then

      # Protect directory names starting with '-'
      case $_G_directory_path in
        -*) _G_directory_path=./$_G_directory_path ;;
      esac

      # Remember the full requested path: the loop below shortens
      # _G_directory_path to its nearest existing ancestor, which is
      # useless for the final verification.
      _G_target_path=$_G_directory_path

      # While some portion of DIR does not yet exist...
      while test ! -d "$_G_directory_path"; do
        # ...make a list in topmost first order.  Use a colon delimited
        # list in case some portion of path contains whitespace.
        _G_dir_list=$_G_directory_path:$_G_dir_list

        # If the last portion added has no slash in it, the list is done
        case $_G_directory_path in */*) ;; *) break ;; esac

        # ...otherwise throw away the child directory and loop
        _G_directory_path=`$ECHO "$_G_directory_path" | $SED -e "$sed_dirname"`
      done
      _G_dir_list=`$ECHO "$_G_dir_list" | $SED 's|:*$||'`

      func_mkdir_p_IFS=$IFS; IFS=:
      for _G_dir in $_G_dir_list; do
        IFS=$func_mkdir_p_IFS
        # mkdir can fail with a 'File exist' error if two processes
        # try to create one of the directories concurrently.  Don't
        # stop in that case!
        $MKDIR "$_G_dir" 2>/dev/null || :
      done
      IFS=$func_mkdir_p_IFS

      # Bail out if we (or some other process) failed to create a directory.
      test -d "$_G_target_path" || \
        func_fatal_error "Failed to create '$1'"
    fi
}


# func_mktempdir [BASENAME]
# -------------------------
# Make a temporary directory that won't clash with other running
# libtool processes, and avoids race conditions if possible.  If
# given, BASENAME is the basename for that directory.
# The directory name is written to standard output.
func_mktempdir ()
{
    $debug_cmd

    _G_template=${TMPDIR-/tmp}/${1-$progname}

    if test : = "$opt_dry_run"; then
      # Return a directory name, but don't create it in dry-run mode
      _G_tmpdir=$_G_template-$$
    else

      # If mktemp works, use that first and foremost
      _G_tmpdir=`mktemp -d "$_G_template-XXXXXXXX" 2>/dev/null`

      if test ! -d "$_G_tmpdir"; then
        # Failing that, at least try and use $RANDOM to avoid a race
        _G_tmpdir=$_G_template-${RANDOM-0}$$

        # Create it private to the current user, then restore the umask.
        func_mktempdir_umask=`umask`
        umask 0077
        $MKDIR "$_G_tmpdir"
        umask $func_mktempdir_umask
      fi

      # If we're not in dry-run mode, bomb out on failure
      test -d "$_G_tmpdir" || \
        func_fatal_error "cannot create temporary directory '$_G_tmpdir'"
    fi

    $ECHO "$_G_tmpdir"
}


# func_normal_abspath PATH
# ------------------------
# Remove doubled-up and trailing slashes, "." path components,
# and cancel out any ".." path components in PATH after making
# it an absolute path.
# The result is stored in func_normal_abspath_result.
func_normal_abspath ()
{
    $debug_cmd

    # These SED scripts presuppose an absolute path with a trailing slash.
    # '_G_removedotparts' is a multi-command script with a label and a
    # branch, so its line breaks are significant.
    _G_pathcar='s|^/\([^/]*\).*$|\1|'
    _G_pathcdr='s|^/[^/]*||'
    _G_removedotparts=':dotsl
		s|/\./|/|g
		t dotsl
		s|/\.$|/|'
    _G_collapseslashes='s|/\{1,\}|/|g'
    _G_finalslash='s|/*$|/|'

    # Start from root dir and reassemble the path.
    func_normal_abspath_result=
    func_normal_abspath_tpath=$1
    func_normal_abspath_altnamespace=
    case $func_normal_abspath_tpath in
      "")
        # Empty path, that just means $cwd.
        func_stripname '' '/' "`pwd`"
        func_normal_abspath_result=$func_stripname_result
        return
        ;;
      # The next three entries are used to spot a run of precisely
      # two leading slashes without using negated character classes;
      # we take advantage of case's first-match behaviour.
      ///*)
        # Unusual form of absolute path, do nothing.
        ;;
      //*)
        # Not necessarily an ordinary path; POSIX reserves leading '//'
        # and for example Cygwin uses it to access remote file shares
        # over CIFS/SMB, so we conserve a leading double slash if found.
        func_normal_abspath_altnamespace=/
        ;;
      /*)
        # Absolute path, do nothing.
        ;;
      *)
        # Relative path, prepend $cwd.
        func_normal_abspath_tpath=`pwd`/$func_normal_abspath_tpath
        ;;
    esac

    # Cancel out all the simple stuff to save iterations.  We also want
    # the path to end with a slash for ease of parsing, so make sure
    # there is one (and only one) here.
    func_normal_abspath_tpath=`$ECHO "$func_normal_abspath_tpath" | $SED \
          -e "$_G_removedotparts" -e "$_G_collapseslashes" -e "$_G_finalslash"`
    while :; do
      # Processed it all yet?
      if test / = "$func_normal_abspath_tpath"; then
        # If we ascended to the root using ".." the result may be empty now.
        if test -z "$func_normal_abspath_result"; then
          func_normal_abspath_result=/
        fi
        break
      fi
      # Split off the leading component (car) from the remainder (cdr).
      func_normal_abspath_tcomponent=`$ECHO "$func_normal_abspath_tpath" | $SED \
          -e "$_G_pathcar"`
      func_normal_abspath_tpath=`$ECHO "$func_normal_abspath_tpath" | $SED \
          -e "$_G_pathcdr"`
      # Figure out what to do with it
      case $func_normal_abspath_tcomponent in
        "")
          # Trailing empty path component, ignore it.
          ;;
        ..)
          # Parent dir; strip last assembled component from result.
          func_dirname "$func_normal_abspath_result"
          func_normal_abspath_result=$func_dirname_result
          ;;
        *)
          # Actual path component, append it.
          func_append func_normal_abspath_result "/$func_normal_abspath_tcomponent"
          ;;
      esac
    done
    # Restore leading double-slash if one was found on entry.
    func_normal_abspath_result=$func_normal_abspath_altnamespace$func_normal_abspath_result
}


# func_notquiet ARG...
# --------------------
# Echo program name prefixed message only when not in quiet mode.
func_notquiet ()
{
    $debug_cmd

    $opt_quiet || func_echo ${1+"$@"}

    # A bug in bash halts the script if the last line of a function
    # fails when set -e is in force, so we need another command to
    # work around that:
    :
}


# func_relative_path SRCDIR DSTDIR
# --------------------------------
# Set func_relative_path_result to the relative path from SRCDIR to DSTDIR.
func_relative_path ()
{
    $debug_cmd

    # Normalise both arguments first; all comparisons below operate on
    # the canonical absolute forms.
    func_relative_path_result=
    func_normal_abspath "$1"
    func_relative_path_tlibdir=$func_normal_abspath_result
    func_normal_abspath "$2"
    func_relative_path_tbindir=$func_normal_abspath_result

    # Ascend the tree starting from libdir
    while :; do
      # check if we have found a prefix of bindir.  The directory is
      # quoted inside the patterns so that glob characters in a path
      # are compared literally.
      case $func_relative_path_tbindir in
        "$func_relative_path_tlibdir")
          # found an exact match
          func_relative_path_tcancelled=
          break
          ;;
        "$func_relative_path_tlibdir"*)
          # found a matching prefix
          func_stripname "$func_relative_path_tlibdir" '' "$func_relative_path_tbindir"
          func_relative_path_tcancelled=$func_stripname_result
          if test -z "$func_relative_path_result"; then
            func_relative_path_result=.
          fi
          break
          ;;
        *)
          # No match yet: step one directory up and record a '..'.
          func_dirname "$func_relative_path_tlibdir"
          func_relative_path_tlibdir=$func_dirname_result
          if test -z "$func_relative_path_tlibdir"; then
            # Have to descend all the way to the root!
            func_relative_path_result=../$func_relative_path_result
            func_relative_path_tcancelled=$func_relative_path_tbindir
            break
          fi
          func_relative_path_result=../$func_relative_path_result
          ;;
      esac
    done

    # Now calculate path; take care to avoid doubling-up slashes.
    func_stripname '' '/' "$func_relative_path_result"
    func_relative_path_result=$func_stripname_result
    func_stripname '/' '/' "$func_relative_path_tcancelled"
    if test -n "$func_stripname_result"; then
      func_append func_relative_path_result "/$func_stripname_result"
    fi

    # Normalisation. If bindir is libdir, return '.' else relative path.
    if test -n "$func_relative_path_result"; then
      func_stripname './' '' "$func_relative_path_result"
      func_relative_path_result=$func_stripname_result
    fi

    test -n "$func_relative_path_result" || func_relative_path_result=.

    :
}


# func_quote_for_eval ARG...
# --------------------------
# Aesthetically quote ARGs to be evaled later.
# This function returns two values:
#   i) func_quote_for_eval_result
#      double-quoted, suitable for a subsequent eval
#  ii) func_quote_for_eval_unquoted_result
#      has all characters that are still active within double
#      quotes backslashified.
func_quote_for_eval ()
{
    $debug_cmd

    func_quote_for_eval_unquoted_result=
    func_quote_for_eval_result=
    while test 0 -lt $#; do
      # Only fork sed when the argument really contains a character
      # that stays active inside double quotes.
      case $1 in
        *[\\\`\"\$]*)
          _G_unquoted_arg=`printf '%s\n' "$1" |$SED "$sed_quote_subst"` ;;
        *)
          _G_unquoted_arg=$1 ;;
      esac
      if test -n "$func_quote_for_eval_unquoted_result"; then
        func_append func_quote_for_eval_unquoted_result " $_G_unquoted_arg"
      else
        func_append func_quote_for_eval_unquoted_result "$_G_unquoted_arg"
      fi

      case $_G_unquoted_arg in
        # Double-quote args containing shell metacharacters to delay
        # word splitting, command substitution and variable expansion
        # for a subsequent eval.
        # Many Bourne shells cannot handle close brackets correctly
        # in scan sets, so we specify it separately.
        # The set ends with an escaped space and an escaped TAB.
        *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*|"")
          _G_quoted_arg=\"$_G_unquoted_arg\"
          ;;
        *)
          _G_quoted_arg=$_G_unquoted_arg
          ;;
      esac

      if test -n "$func_quote_for_eval_result"; then
        func_append func_quote_for_eval_result " $_G_quoted_arg"
      else
        func_append func_quote_for_eval_result "$_G_quoted_arg"
      fi
      shift
    done
}


# func_quote_for_expand ARG
# -------------------------
# Aesthetically quote ARG to be evaled later; same as above,
# but do not quote variable references.
# The result is stored in func_quote_for_expand_result.
func_quote_for_expand ()
{
    $debug_cmd

    case $1 in
      *[\\\`\"]*)
        _G_arg=`$ECHO "$1" | $SED \
            -e "$sed_double_quote_subst" -e "$sed_double_backslash"` ;;
      *)
        _G_arg=$1 ;;
    esac

    case $_G_arg in
      # Double-quote args containing shell metacharacters to delay
      # word splitting and command substitution for a subsequent eval.
      # Many Bourne shells cannot handle close brackets correctly
      # in scan sets, so we specify it separately.
      # The set ends with an escaped space and an escaped TAB.
      *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*|"")
        _G_arg=\"$_G_arg\"
        ;;
    esac

    func_quote_for_expand_result=$_G_arg
}


# func_stripname PREFIX SUFFIX NAME
# ---------------------------------
# strip PREFIX and SUFFIX from NAME, and store in func_stripname_result.
# PREFIX and SUFFIX must not contain globbing or regex special
# characters, hashes, percent signs, but SUFFIX may contain a leading
# dot (in which case that matches only a dot).
if test yes = "$_G_HAVE_XSI_OPS"; then
  eval 'func_stripname ()
  {
    $debug_cmd

    # pdksh 5.2.14 does not do ${X%$Y} correctly if both X and Y are
    # positional parameters, so assign one to ordinary variable first.
    func_stripname_result=$3
    func_stripname_result=${func_stripname_result#"$1"}
    func_stripname_result=${func_stripname_result%"$2"}
  }'
else
  func_stripname ()
  {
    $debug_cmd

    # A leading dot in SUFFIX is escaped so that sed matches it literally.
    case $2 in
      .*) func_stripname_result=`$ECHO "$3" | $SED -e "s%^$1%%" -e "s%\\\\$2\$%%"`;;
      *)  func_stripname_result=`$ECHO "$3" | $SED -e "s%^$1%%" -e "s%$2\$%%"`;;
    esac
  }
fi


# func_show_eval CMD [FAIL_EXP]
# -----------------------------
# Unless opt_quiet is true, then output CMD.  Then, if opt_dryrun is
# not true, evaluate CMD.  If the evaluation of CMD fails, and FAIL_EXP
# is given, then evaluate it.
func_show_eval ()
{
    $debug_cmd

    _G_cmd=$1
    _G_fail_exp=${2-':'}

    func_quote_for_expand "$_G_cmd"
    eval "func_notquiet $func_quote_for_expand_result"

    $opt_dry_run || {
      eval "$_G_cmd"
      _G_status=$?
      if test 0 -ne "$_G_status"; then
        # Re-establish the failing status before running FAIL_EXP.
        eval "(exit $_G_status); $_G_fail_exp"
      fi
    }
}


# func_show_eval_locale CMD [FAIL_EXP]
# ------------------------------------
# Unless opt_quiet is true, then output CMD.  Then, if opt_dryrun is
# not true, evaluate CMD.  If the evaluation of CMD fails, and FAIL_EXP
# is given, then evaluate it.  Use the saved locale for evaluation.
func_show_eval_locale ()
{
    $debug_cmd

    _G_cmd=$1
    _G_fail_exp=${2-':'}

    # Announce the command unless running quietly.
    if $opt_quiet; then
      :
    else
      func_quote_for_expand "$_G_cmd"
      eval "func_echo $func_quote_for_expand_result"
    fi

    # Run the command under the user's locale, then switch back to the
    # safe locale before looking at the exit status.
    if $opt_dry_run; then
      :
    else
      eval "$_G_user_locale
            $_G_cmd"
      _G_status=$?
      eval "$_G_safe_locale"
      if test 0 -ne "$_G_status"; then
        eval "(exit $_G_status); $_G_fail_exp"
      fi
    fi
}


# func_tr_sh
# ----------
# Turn $1 into a string suitable for a shell variable name.
# Result is stored in $func_tr_sh_result.  All characters
# not in the set a-zA-Z0-9_ are replaced with '_'. Further,
# if $1 begins with a digit, a '_' is prepended as well.
func_tr_sh ()
{
    $debug_cmd

    # Start from the unmodified name; only fork sed when it has a
    # leading digit or a character outside a-zA-Z0-9_.
    func_tr_sh_result=$1
    case $1 in
      [0-9]* | *[!a-zA-Z0-9_]*)
        func_tr_sh_result=`$ECHO "$1" | $SED -e 's/^\([0-9]\)/_\1/' -e 's/[^a-zA-Z0-9_]/_/g'`
        ;;
    esac
}


# func_verbose ARG...
# -------------------
# Echo program name prefixed message in verbose mode only.
func_verbose ()
{
    $debug_cmd

    if $opt_verbose; then
      func_echo "$*"
    fi

    # Always report success, whether or not anything was printed.
    :
}


# func_warn_and_continue ARG...
# -----------------------------
# Echo program name prefixed warning message to standard error.
func_warn_and_continue ()
{
    $debug_cmd

    $require_term_colors

    func_echo_infix_1 "${tc_red}warning$tc_reset" "$*" >&2
}


# func_warning CATEGORY ARG...
# ----------------------------
# Echo program name prefixed warning message to standard error. Warning
# messages can be filtered according to CATEGORY, where this function
# elides messages where CATEGORY is not listed in the global variable
# 'opt_warning_types'.
func_warning ()
{
    $debug_cmd

    # CATEGORY must be in the warning_categories list!
    case " $warning_categories " in
      *" $1 "*)
        ;;
      *)
        func_internal_error "invalid warning category '$1'"
        ;;
    esac

    _G_category=$1
    shift

    # Emit the message only when its category is enabled.
    case " $opt_warning_types " in
      *" $_G_category "*)
        $warning_func ${1+"$@"}
        ;;
    esac
}


# func_sort_ver VER1 VER2
# -----------------------
# 'sort -V' is not generally available.
# Note this deviates from the version comparison in automake # in that it treats 1.5 < 1.5.0, and treats 1.4.4a < 1.4-p3a # but this should suffice as we won't be specifying old # version formats or redundant trailing .0 in bootstrap.conf. # If we did want full compatibility then we should probably # use m4_version_compare from autoconf. func_sort_ver () { $debug_cmd printf '%s\n%s\n' "$1" "$2" \ | sort -t. -k 1,1n -k 2,2n -k 3,3n -k 4,4n -k 5,5n -k 6,6n -k 7,7n -k 8,8n -k 9,9n } # func_lt_ver PREV CURR # --------------------- # Return true if PREV and CURR are in the correct order according to # func_sort_ver, otherwise false. Use it like this: # # func_lt_ver "$prev_ver" "$proposed_ver" || func_fatal_error "..." func_lt_ver () { $debug_cmd test "x$1" = x`func_sort_ver "$1" "$2" | $SED 1q` } # Local variables: # mode: shell-script # sh-indentation: 2 # eval: (add-hook 'before-save-hook 'time-stamp) # time-stamp-pattern: "10/scriptversion=%:y-%02m-%02d.%02H; # UTC" # time-stamp-time-zone: "UTC" # End: #! /bin/sh # Set a version string for this script. scriptversion=2014-01-07.03; # UTC # A portable, pluggable option parser for Bourne shell. # Written by Gary V. Vaughan, 2010 # Copyright (C) 2010-2015 Free Software Foundation, Inc. # This is free software; see the source for copying conditions. There is NO # warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program. 
If not, see . # Please report bugs or propose patches to gary@gnu.org. ## ------ ## ## Usage. ## ## ------ ## # This file is a library for parsing options in your shell scripts along # with assorted other useful supporting features that you can make use # of too. # # For the simplest scripts you might need only: # # #!/bin/sh # . relative/path/to/funclib.sh # . relative/path/to/options-parser # scriptversion=1.0 # func_options ${1+"$@"} # eval set dummy "$func_options_result"; shift # ...rest of your script... # # In order for the '--version' option to work, you will need to have a # suitably formatted comment like the one at the top of this file # starting with '# Written by ' and ending with '# warranty; '. # # For '-h' and '--help' to work, you will also need a one line # description of your script's purpose in a comment directly above the # '# Written by ' line, like the one at the top of this file. # # The default options also support '--debug', which will turn on shell # execution tracing (see the comment above debug_cmd below for another # use), and '--verbose' and the func_verbose function to allow your script # to display verbose messages only when your user has specified # '--verbose'. # # After sourcing this file, you can plug processing for additional # options by amending the variables from the 'Configuration' section # below, and following the instructions in the 'Option parsing' # section further down. ## -------------- ## ## Configuration. ## ## -------------- ## # You should override these variables in your script after sourcing this # file so that they reflect the customisations you have added to the # option parser. # The usage line for option parsing errors and the start of '-h' and # '--help' output messages. You can embed shell variables for delayed # expansion at the time the message is displayed, but you will need to # quote other shell meta-characters carefully to prevent them being # expanded when the contents are evaled. 
usage='$progpath [OPTION]...' # Short help message in response to '-h' and '--help'. Add to this or # override it after sourcing this library to reflect the full set of # options your script accepts. usage_message="\ --debug enable verbose shell tracing -W, --warnings=CATEGORY report the warnings falling in CATEGORY [all] -v, --verbose verbosely report processing --version print version information and exit -h, --help print short or long help message and exit " # Additional text appended to 'usage_message' in response to '--help'. long_help_message=" Warning categories include: 'all' show all warnings 'none' turn off all the warnings 'error' warnings are treated as fatal errors" # Help message printed before fatal option parsing errors. fatal_help="Try '\$progname --help' for more information." ## ------------------------- ## ## Hook function management. ## ## ------------------------- ## # This section contains functions for adding, removing, and running hooks # to the main code. A hook is just a named list of of function, that can # be run in order later on. # func_hookable FUNC_NAME # ----------------------- # Declare that FUNC_NAME will run hooks added with # 'func_add_hook FUNC_NAME ...'. func_hookable () { $debug_cmd func_append hookable_fns " $1" } # func_add_hook FUNC_NAME HOOK_FUNC # --------------------------------- # Request that FUNC_NAME call HOOK_FUNC before it returns. FUNC_NAME must # first have been declared "hookable" by a call to 'func_hookable'. func_add_hook () { $debug_cmd case " $hookable_fns " in *" $1 "*) ;; *) func_fatal_error "'$1' does not accept hook functions." ;; esac eval func_append ${1}_hooks '" $2"' } # func_remove_hook FUNC_NAME HOOK_FUNC # ------------------------------------ # Remove HOOK_FUNC from the list of functions called by FUNC_NAME. func_remove_hook () { $debug_cmd eval ${1}_hooks='`$ECHO "\$'$1'_hooks" |$SED "s| '$2'||"`' } # func_run_hooks FUNC_NAME [ARG]... 
# ---------------------------------
# Run all hook functions registered to FUNC_NAME.
# It is assumed that the list of hook functions contains nothing more
# than a whitespace-delimited list of legal shell function names, and
# no effort is wasted trying to catch shell meta-characters or preserve
# whitespace.
# Each hook receives the current argument list and hands the (possibly
# modified) list back in '<hook name>_result'; the final list is stored,
# quoted for eval, in func_run_hooks_result.
func_run_hooks ()
{
    $debug_cmd

    case " $hookable_fns " in
      *" $1 "*) ;;
      *) func_fatal_error "'$1' does not support hook functions." ;;
    esac

    # Fetch the hook list for FUNC_NAME, then drop FUNC_NAME so that the
    # positional parameters hold only the arguments to pass along.
    eval _G_hook_fns=\$$1_hooks; shift

    for _G_hook in $_G_hook_fns; do
      eval $_G_hook '"$@"'

      # store returned options list back into positional
      # parameters for next 'cmd' execution.
      eval _G_hook_result=\$${_G_hook}_result
      eval set dummy "$_G_hook_result"; shift
    done

    func_quote_for_eval ${1+"$@"}
    func_run_hooks_result=$func_quote_for_eval_result
}



## --------------- ##
## Option parsing. ##
## --------------- ##

# In order to add your own option parsing hooks, you must accept the
# full positional parameter list in your hook function, remove any
# options that you action, and then pass back the remaining unprocessed
# options in '<hooked_function_name>_result', escaped suitably for
# 'eval'.  Like this:
#
#    my_options_prep ()
#    {
#        $debug_cmd
#
#        # Extend the existing usage message.
#        usage_message=$usage_message'
#      -s, --silent       don'\''t print informational messages
#    '
#
#        func_quote_for_eval ${1+"$@"}
#        my_options_prep_result=$func_quote_for_eval_result
#    }
#    func_add_hook func_options_prep my_options_prep
#
#
#    my_silent_option ()
#    {
#        $debug_cmd
#
#        # Note that for efficiency, we parse as many options as we can
#        # recognise in a loop before passing the remainder back to the
#        # caller on the first unrecognised argument we encounter.
# while test $# -gt 0; do # opt=$1; shift # case $opt in # --silent|-s) opt_silent=: ;; # # Separate non-argument short options: # -s*) func_split_short_opt "$_G_opt" # set dummy "$func_split_short_opt_name" \ # "-$func_split_short_opt_arg" ${1+"$@"} # shift # ;; # *) set dummy "$_G_opt" "$*"; shift; break ;; # esac # done # # func_quote_for_eval ${1+"$@"} # my_silent_option_result=$func_quote_for_eval_result # } # func_add_hook func_parse_options my_silent_option # # # my_option_validation () # { # $debug_cmd # # $opt_silent && $opt_verbose && func_fatal_help "\ # '--silent' and '--verbose' options are mutually exclusive." # # func_quote_for_eval ${1+"$@"} # my_option_validation_result=$func_quote_for_eval_result # } # func_add_hook func_validate_options my_option_validation # # You'll alse need to manually amend $usage_message to reflect the extra # options you parse. It's preferable to append if you can, so that # multiple option parsing hooks can be added safely. # func_options [ARG]... # --------------------- # All the functions called inside func_options are hookable. See the # individual implementations for details. func_hookable func_options func_options () { $debug_cmd func_options_prep ${1+"$@"} eval func_parse_options \ ${func_options_prep_result+"$func_options_prep_result"} eval func_validate_options \ ${func_parse_options_result+"$func_parse_options_result"} eval func_run_hooks func_options \ ${func_validate_options_result+"$func_validate_options_result"} # save modified positional parameters for caller func_options_result=$func_run_hooks_result } # func_options_prep [ARG]... # -------------------------- # All initialisations required before starting the option parse loop. # Note that when calling hook functions, we pass through the list of # positional parameters. If a hook function modifies that list, and # needs to propogate that back to rest of this script, then the complete # modified list must be put in 'func_run_hooks_result' before # returning. 
func_hookable func_options_prep
func_options_prep ()
{
    $debug_cmd

    # Option defaults:
    opt_verbose=false
    opt_warning_types=

    func_run_hooks func_options_prep ${1+"$@"}

    # save modified positional parameters for caller
    func_options_prep_result=$func_run_hooks_result
}


# func_parse_options [ARG]...
# ---------------------------
# The main option parsing loop.
# Arguments left unconsumed are returned, quoted for eval, in
# func_parse_options_result.
func_hookable func_parse_options
func_parse_options ()
{
    $debug_cmd

    func_parse_options_result=

    # this just eases exit handling
    while test $# -gt 0; do
      # Defer to hook functions for initial option parsing, so they
      # get priority in the event of reusing an option name.
      func_run_hooks func_parse_options ${1+"$@"}

      # Adjust func_parse_options positional parameters to match
      eval set dummy "$func_run_hooks_result"; shift

      # Break out of the loop if we already parsed every option.
      test $# -gt 0 || break

      _G_opt=$1
      shift
      case $_G_opt in
        --debug|-x)   debug_cmd='set -x'
                      func_echo "enabling shell trace mode"
                      $debug_cmd
                      ;;

        # Rewritten as '--warnings none' and parsed on the next pass.
        --no-warnings|--no-warning|--no-warn)
                      set dummy --warnings none ${1+"$@"}
                      shift
                      ;;

        --warnings|--warning|-W)
                      test $# = 0 && func_missing_arg $_G_opt && break
                      case " $warning_categories $1" in
                        *" $1 "*)
                          # trailing space prevents matching last $1 above
                          func_append_uniq opt_warning_types " $1"
                          ;;
                        *all)
                          opt_warning_types=$warning_categories
                          ;;
                        *none)
                          opt_warning_types=none
                          warning_func=:
                          ;;
                        *error)
                          opt_warning_types=$warning_categories
                          warning_func=func_fatal_error
                          ;;
                        *)
                          func_fatal_error \
                             "unsupported warning category: '$1'"
                          ;;
                      esac
                      shift
                      ;;

        --verbose|-v) opt_verbose=: ;;
        --version)    func_version ;;
        -\?|-h)       func_usage ;;
        --help)       func_help ;;

        # Separate optargs to long options (plugins may need this):
        --*=*)        func_split_equals "$_G_opt"
                      set dummy "$func_split_equals_lhs" \
                          "$func_split_equals_rhs" ${1+"$@"}
                      shift
                      ;;

        # Separate optargs to short options:
        -W*)
                      func_split_short_opt "$_G_opt"
                      set dummy "$func_split_short_opt_name" \
                          "$func_split_short_opt_arg" ${1+"$@"}
                      shift
                      ;;

        # Separate non-argument short options:
        -\?*|-h*|-v*|-x*)
                      func_split_short_opt "$_G_opt"
                      set dummy "$func_split_short_opt_name" \
                          "-$func_split_short_opt_arg" ${1+"$@"}
                      shift
                      ;;

        --)           break ;;
        -*)           func_fatal_help "unrecognised option: '$_G_opt'" ;;
        # First non-option argument: push it back and stop parsing.
        *)            set dummy "$_G_opt" ${1+"$@"}; shift; break ;;
      esac
    done

    # save modified positional parameters for caller
    func_quote_for_eval ${1+"$@"}
    func_parse_options_result=$func_quote_for_eval_result
}


# func_validate_options [ARG]...
# ------------------------------
# Perform any sanity checks on option settings and/or unconsumed
# arguments.
func_hookable func_validate_options
func_validate_options ()
{
    $debug_cmd

    # Display all warnings if -W was not given.
    test -n "$opt_warning_types" || opt_warning_types=" $warning_categories"

    func_run_hooks func_validate_options ${1+"$@"}

    # Bail if the options were screwed!
    # ($exit_cmd is set to 'exit' by func_missing_arg.)
    $exit_cmd $EXIT_FAILURE

    # save modified positional parameters for caller
    func_validate_options_result=$func_run_hooks_result
}



## ----------------- ##
## Helper functions. ##
## ----------------- ##

# This section contains the helper functions used by the rest of the
# hookable option parser framework in ascii-betical order.


# func_fatal_help ARG...
# ----------------------
# Echo program name prefixed message to standard error, followed by
# a help hint, and exit.
func_fatal_help ()
{
    $debug_cmd

    # eval so that variables embedded in $usage and $fatal_help are
    # expanded now.
    eval \$ECHO \""Usage: $usage"\"
    eval \$ECHO \""$fatal_help"\"
    func_error ${1+"$@"}
    exit $EXIT_FAILURE
}


# func_help
# ---------
# Echo long help message to standard output and exit.
func_help ()
{
    $debug_cmd

    func_usage_message
    $ECHO "$long_help_message"
    exit 0
}


# func_missing_arg ARGNAME
# ------------------------
# Echo program name prefixed message to standard error and set global
# exit_cmd.
func_missing_arg ()
{
    $debug_cmd

    func_error "Missing argument for '$1'."
    exit_cmd=exit
}


# func_split_equals STRING
# ------------------------
# Set func_split_equals_lhs and func_split_equals_rhs shell variables after
# splitting STRING at the '=' sign.
# Probe once for XSI parameter expansion support (${#x}, ${x%%...},
# ${x%...}, ${x#...}, ${x##...}) unless _G_HAVE_XSI_OPS is already set.
test -z "$_G_HAVE_XSI_OPS" \
    && (eval 'x=a/b/c;
      test 5aa/bb/cc = "${#x}${x%%/*}${x%/*}${x#*/}${x##*/}"') 2>/dev/null \
    && _G_HAVE_XSI_OPS=yes

if test yes = "$_G_HAVE_XSI_OPS"
then
  # This is an XSI compatible shell, allowing a faster implementation...
  eval 'func_split_equals ()
  {
      $debug_cmd

      func_split_equals_lhs=${1%%=*}
      func_split_equals_rhs=${1#*=}
      test "x$func_split_equals_lhs" = "x$1" \
        && func_split_equals_rhs=
  }'
else
  # ...otherwise fall back to using expr, which is often a shell builtin.
  func_split_equals ()
  {
      $debug_cmd

      func_split_equals_lhs=`expr "x$1" : 'x\([^=]*\)'`
      func_split_equals_rhs=
      # Only extract a right-hand side when STRING contained an '='.
      test "x$func_split_equals_lhs" = "x$1" \
        || func_split_equals_rhs=`expr "x$1" : 'x[^=]*=\(.*\)$'`
  }
fi #func_split_equals


# func_split_short_opt SHORTOPT
# -----------------------------
# Set func_split_short_opt_name and func_split_short_opt_arg shell
# variables after splitting SHORTOPT after the 2nd character.
if test yes = "$_G_HAVE_XSI_OPS"
then
  # This is an XSI compatible shell, allowing a faster implementation...
  eval 'func_split_short_opt ()
  {
      $debug_cmd

      func_split_short_opt_arg=${1#??}
      func_split_short_opt_name=${1%"$func_split_short_opt_arg"}
  }'
else
  # ...otherwise fall back to using expr, which is often a shell builtin.
  func_split_short_opt ()
  {
      $debug_cmd

      func_split_short_opt_name=`expr "x$1" : 'x-\(.\)'`
      func_split_short_opt_arg=`expr "x$1" : 'x-.\(.*\)$'`
  }
fi #func_split_short_opt


# func_usage
# ----------
# Echo short help message to standard output and exit.
func_usage ()
{
    $debug_cmd

    func_usage_message
    $ECHO "Run '$progname --help |${PAGER-more}' for full usage"
    exit 0
}


# func_usage_message
# ------------------
# Echo short help message to standard output.
func_usage_message ()
{
    $debug_cmd

    eval \$ECHO \""Usage: $usage"\"
    echo
    # Print the comment line that directly precedes '# Written by' in
    # $progpath (kept in sed's hold space), then stop reading.
    $SED -n 's|^# ||
        /^Written by/{
          x;p;x
        }
        h
        /^Written by/q' < "$progpath"
    echo
    eval \$ECHO \""$usage_message"\"
}


# func_version
# ------------
# Echo version message to standard output and exit.
func_version ()
{
    $debug_cmd

    printf '%s\n' "$progname $scriptversion"
    # Extract the '# Written by ' ... '# warranty; ' comment block from
    # $progpath, strip the comment leaders and shorten the copyright
    # year list.
    $SED -n '
        /(C)/!b go
        :more
        /\./!{
          N
          s|\n# | |
          b more
        }
        :go
        /^# Written by /,/# warranty; / {
          s|^# ||
          s|^# *$||
          s|\((C)\)[ 0-9,-]*[ ,-]\([1-9][0-9]* \)|\1 \2|
          p
        }
        /^# Written by / {
          s|^# ||
          p
        }
        /^warranty; /q' < "$progpath"

    exit $?
}


# Local variables:
# mode: shell-script
# sh-indentation: 2
# eval: (add-hook 'before-save-hook 'time-stamp)
# time-stamp-pattern: "10/scriptversion=%:y-%02m-%02d.%02H; # UTC"
# time-stamp-time-zone: "UTC"
# End:

# Set a version string.
scriptversion='(GNU libtool) 2.4.6'


# func_echo ARG...
# ----------------
# Libtool also displays the current mode in messages, so override
# funclib.sh func_echo with this custom definition.
func_echo ()
{
    $debug_cmd

    _G_message=$*

    # Split the message on newlines only, restoring the caller's IFS
    # while each line is printed and again after the loop.
    func_echo_IFS=$IFS
    IFS=$nl
    for _G_line in $_G_message; do
      IFS=$func_echo_IFS
      $ECHO "$progname${opt_mode+: $opt_mode}: $_G_line"
    done
    IFS=$func_echo_IFS
}


# func_warning ARG...
# -------------------
# Libtool warnings are not categorized, so override funclib.sh
# func_warning with this simpler definition.
func_warning ()
{
    $debug_cmd

    $warning_func ${1+"$@"}
}


## ---------------- ##
## Options parsing. ##
## ---------------- ##

# Hook in the functions to make sure our own options are parsed during
# the option parsing loop.

usage='$progpath [OPTION]... [MODE-ARG]...'

# Short help message in response to '-h'.
usage_message="Options:
       --config             show all configuration variables
       --debug              enable verbose shell tracing
   -n, --dry-run            display commands without modifying any files
       --features           display basic configuration information and exit
       --mode=MODE          use operation mode MODE
       --no-warnings        equivalent to '-Wnone'
       --preserve-dup-deps  don't remove duplicate dependency libraries
       --quiet, --silent    don't print informational messages
       --tag=TAG            use configuration variables from tag TAG
   -v, --verbose            print more informational messages than default
       --version            print version information
   -W, --warnings=CATEGORY  report the warnings falling in CATEGORY [all]
   -h, --help, --help-all   print short, long, or detailed help message
"

# Additional text appended to 'usage_message' in response to '--help'.
# Prints the usage message followed by the list of modes and the
# bug-reporting details, then exits successfully.
func_help ()
{
    $debug_cmd

    func_usage_message
    $ECHO "$long_help_message

MODE must be one of the following:

       clean           remove files from the build directory
       compile         compile a source file into a libtool object
       execute         automatically set library path, then run a program
       finish          complete the installation of libtool libraries
       install         install libraries or executables
       link            create a library or an executable
       uninstall       remove libraries from an installed directory

MODE-ARGS vary depending on the MODE. When passed as first option,
'--mode=MODE' may be abbreviated as 'MODE' or a unique abbreviation of that.
Try '$progname --help --mode=MODE' for a more detailed description of MODE.

When reporting a bug, please describe a test case to reproduce it and
include the following information:

       host-triplet:   $host
       shell:          $SHELL
       compiler:       $LTCC
       compiler flags: $LTCFLAGS
       linker:         $LD (gnu? $with_gnu_ld)
       version:        $progname (GNU libtool) 2.4.6
       automake:       `($AUTOMAKE --version) 2>/dev/null |$SED 1q`
       autoconf:       `($AUTOCONF --version) 2>/dev/null |$SED 1q`

Report bugs to <bug-libtool@gnu.org>.
GNU libtool home page: <http://www.gnu.org/software/libtool/>.
General help using GNU software: <http://www.gnu.org/gethelp/>."
    exit 0
}


# func_lo2o OBJECT-NAME
# ---------------------
# Transform OBJECT-NAME from a '.lo' suffix to the platform specific
# object suffix.
# The result is stored in func_lo2o_result.

# sed scripts doing the same mapping, and its reverse.
lo2o=s/\\.lo\$/.$objext/
o2lo=s/\\.$objext\$/.lo/

if test yes = "$_G_HAVE_XSI_OPS"; then
  eval 'func_lo2o ()
  {
    case $1 in
      *.lo) func_lo2o_result=${1%.lo}.$objext ;;
      *   ) func_lo2o_result=$1               ;;
    esac
  }'

  # func_xform LIBOBJ-OR-SOURCE
  # ---------------------------
  # Transform LIBOBJ-OR-SOURCE from a '.o' or '.c' (or otherwise)
  # suffix to a '.lo' libtool-object suffix.
  # The result is stored in func_xform_result.
  eval 'func_xform ()
  {
    func_xform_result=${1%.*}.lo
  }'
else
  # ...otherwise fall back to using sed.
  func_lo2o ()
  {
    func_lo2o_result=`$ECHO "$1" | $SED "$lo2o"`
  }

  func_xform ()
  {
    func_xform_result=`$ECHO "$1" | $SED 's|\.[^.]*$|.lo|'`
  }
fi


# func_fatal_configuration ARG...
# -------------------------------
# Echo program name prefixed message to standard error, followed by
# a configuration failure hint, and exit.
func_fatal_configuration ()
{
    # This used to call the undefined 'func__fatal_error', which made
    # the shell report "command not found" instead of printing the
    # diagnostic and exiting.
    func_fatal_error ${1+"$@"} \
      "See the $PACKAGE documentation for more information." \
      "Fatal configuration error."
}


# func_config
# -----------
# Display the configuration for all the tags in this script.
func_config ()
{
    re_begincf='^# ### BEGIN LIBTOOL'
    re_endcf='^# ### END LIBTOOL'

    # Default configuration.
    $SED "1,/$re_begincf CONFIG/d;/$re_endcf CONFIG/,\$d" < "$progpath"

    # Now print the configurations for the tags.
    for tagname in $taglist; do
      $SED -n "/$re_begincf TAG CONFIG: $tagname\$/,/$re_endcf TAG CONFIG: $tagname\$/p" < "$progpath"
    done

    exit $?
}


# func_features
# -------------
# Display the features supported by this script.
func_features ()
{
    echo "host: $host"
    if test yes = "$build_libtool_libs"; then
      echo "enable shared libraries"
    else
      echo "disable shared libraries"
    fi
    if test yes = "$build_old_libs"; then
      echo "enable static libraries"
    else
      echo "disable static libraries"
    fi

    exit $?
}


# func_enable_tag TAGNAME
# -----------------------
# Verify that TAGNAME is valid, and either flag an error and exit, or
# enable the TAGNAME tag.  We also add TAGNAME to the global $taglist
# variable here.
func_enable_tag ()
{
    # Global variable:
    tagname=$1

    re_begincf="^# ### BEGIN LIBTOOL TAG CONFIG: $tagname\$"
    re_endcf="^# ### END LIBTOOL TAG CONFIG: $tagname\$"
    sed_extractcf=/$re_begincf/,/$re_endcf/p

    # Validate tagname.
    case $tagname in
      *[!-_A-Za-z0-9,/]*)
        func_fatal_error "invalid tag name: $tagname"
        ;;
    esac

    # Don't test for the "default" C tag, as we know it's
    # there but not specially marked.
    case $tagname in
        CC) ;;
    *)
        if $GREP "$re_begincf" "$progpath" >/dev/null 2>&1; then
          taglist="$taglist $tagname"

          # Evaluate the configuration.  Be careful to quote the path
          # and the sed script, to avoid splitting on whitespace, but
          # also don't use non-portable quotes within backquotes within
          # quotes we have to do it in 2 steps:
          extractedcf=`$SED -n -e "$sed_extractcf" < "$progpath"`
          eval "$extractedcf"
        else
          func_error "ignoring unknown tag $tagname"
        fi
        ;;
    esac
}


# func_check_version_match
# ------------------------
# Ensure that we are using m4 macros, and libtool script from the same
# release of libtool.
# Exits with $EXIT_MISMATCH after printing a diagnostic when
# $package_revision differs from $macro_revision.
func_check_version_match ()
{
    if test "$package_revision" != "$macro_revision"; then
      if test "$VERSION" != "$macro_version"; then
        if test -z "$macro_version"; then
          cat >&2 <<_LT_EOF
$progname: Version mismatch error. This is $PACKAGE $VERSION, but the
$progname: definition of this LT_INIT comes from an older release.
$progname: You should recreate aclocal.m4 with macros from $PACKAGE $VERSION
$progname: and run autoconf again.
_LT_EOF
        else
          cat >&2 <<_LT_EOF
$progname: Version mismatch error. This is $PACKAGE $VERSION, but the
$progname: definition of this LT_INIT comes from $PACKAGE $macro_version.
$progname: You should recreate aclocal.m4 with macros from $PACKAGE $VERSION
$progname: and run autoconf again.
_LT_EOF
        fi
      else
        cat >&2 <<_LT_EOF
$progname: Version mismatch error. This is $PACKAGE $VERSION, revision $package_revision,
$progname: but the definition of this LT_INIT comes from revision $macro_revision.
$progname: You should recreate aclocal.m4 with macros from revision $package_revision
$progname: of $PACKAGE $VERSION and run autoconf again.
_LT_EOF
      fi

      exit $EXIT_MISMATCH
    fi
}


# libtool_options_prep [ARG]...
# -----------------------------
# Preparation for options parsed by libtool.
# Resets the libtool option defaults and expands a leading mode
# shorthand into '--mode MODE'.  The argument list is returned, quoted
# for eval, in libtool_options_prep_result.
libtool_options_prep ()
{
    # Was '$debug_mode', a variable that is never set, so shell tracing
    # requested via debug_cmd did not apply to this function.
    $debug_cmd

    # Option defaults:
    opt_config=false
    opt_dlopen=
    opt_dry_run=false
    opt_help=false
    opt_mode=
    opt_preserve_dup_deps=false
    opt_quiet=false

    nonopt=
    preserve_args=

    # Shorthand for --mode=foo, only valid as the first argument
    case $1 in
    clean|clea|cle|cl)
      shift; set dummy --mode clean ${1+"$@"}; shift
      ;;
    compile|compil|compi|comp|com|co|c)
      shift; set dummy --mode compile ${1+"$@"}; shift
      ;;
    execute|execut|execu|exec|exe|ex|e)
      shift; set dummy --mode execute ${1+"$@"}; shift
      ;;
    finish|finis|fini|fin|fi|f)
      shift; set dummy --mode finish ${1+"$@"}; shift
      ;;
    install|instal|insta|inst|ins|in|i)
      shift; set dummy --mode install ${1+"$@"}; shift
      ;;
    link|lin|li|l)
      shift; set dummy --mode link ${1+"$@"}; shift
      ;;
    uninstall|uninstal|uninsta|uninst|unins|unin|uni|un|u)
      shift; set dummy --mode uninstall ${1+"$@"}; shift
      ;;
    esac

    # Pass back the list of options.
    func_quote_for_eval ${1+"$@"}
    libtool_options_prep_result=$func_quote_for_eval_result
}
func_add_hook func_options_prep libtool_options_prep


# libtool_parse_options [ARG]...
# ---------------------------------
# Provide handling for libtool specific options.
# Arguments that are not libtool options are returned, quoted for eval,
# in libtool_parse_options_result.
libtool_parse_options ()
{
    $debug_cmd

    # Perform our own loop to consume as many options as possible in
    # each iteration.
    while test $# -gt 0; do
      _G_opt=$1
      shift
      case $_G_opt in
        --dry-run|--dryrun|-n)
                        opt_dry_run=:
                        ;;

        --config)       func_config ;;

        --dlopen|-dlopen)
                        opt_dlopen="${opt_dlopen+$opt_dlopen }$1"
                        shift
                        ;;

        --preserve-dup-deps)
                        opt_preserve_dup_deps=: ;;

        --features)     func_features ;;

        --finish)       set dummy --mode finish ${1+"$@"}; shift ;;

        --help)         opt_help=: ;;

        --help-all)     opt_help=': help-all' ;;

        --mode)         test $# = 0 && func_missing_arg $_G_opt && break
                        opt_mode=$1
                        case $1 in
                          # Valid mode arguments:
                          clean|compile|execute|finish|install|link|relink|uninstall) ;;

                          # Catch anything else as an error
                          *) func_error "invalid argument for $_G_opt"
                             exit_cmd=exit
                             break
                             ;;
                        esac
                        shift
                        ;;

        --no-silent|--no-quiet)
                        opt_quiet=false
                        func_append preserve_args " $_G_opt"
                        ;;

        --no-warnings|--no-warning|--no-warn)
                        opt_warning=false
                        func_append preserve_args " $_G_opt"
                        ;;

        --no-verbose)
                        opt_verbose=false
                        func_append preserve_args " $_G_opt"
                        ;;

        --silent|--quiet)
                        opt_quiet=:
                        opt_verbose=false
                        func_append preserve_args " $_G_opt"
                        ;;

        --tag)          test $# = 0 && func_missing_arg $_G_opt && break
                        opt_tag=$1
                        func_append preserve_args " $_G_opt $1"
                        func_enable_tag "$1"
                        shift
                        ;;

        --verbose|-v)   opt_quiet=false
                        opt_verbose=:
                        func_append preserve_args " $_G_opt"
                        ;;

        # An option not handled by this hook function:
        *)              set dummy "$_G_opt" ${1+"$@"}; shift; break ;;
      esac
    done


    # save modified positional parameters for caller
    func_quote_for_eval ${1+"$@"}
    libtool_parse_options_result=$func_quote_for_eval_result
}
func_add_hook func_parse_options libtool_parse_options



# libtool_validate_options [ARG]...
# ---------------------------------
# Perform any sanity checks on option settings and/or unconsumed
# arguments.
# Saves the first non-option argument in $nonopt and returns the rest,
# quoted for eval, in libtool_validate_options_result.
libtool_validate_options ()
{
    # save first non-option argument
    if test 0 -lt $#; then
      nonopt=$1
      shift
    fi

    # preserve --debug
    test : = "$debug_cmd" || func_append preserve_args " --debug"

    case $host in
      # Solaris2 added to fix http://debbugs.gnu.org/cgi/bugreport.cgi?bug=16452
      # see also: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59788
      *cygwin* | *mingw* | *pw32* | *cegcc* | *solaris2* | *os2*)
        # don't eliminate duplications in $postdeps and $predeps
        opt_duplicate_compiler_generated_deps=:
        ;;
      *)
        opt_duplicate_compiler_generated_deps=$opt_preserve_dup_deps
        ;;
    esac

    # The checks below are skipped when help output was requested.
    $opt_help || {
      # Sanity checks first:
      func_check_version_match

      test yes != "$build_libtool_libs" \
        && test yes != "$build_old_libs" \
        && func_fatal_configuration "not configured to build any kind of library"

      # Darwin sucks
      eval std_shrext=\"$shrext_cmds\"

      # Only execute mode is allowed to have -dlopen flags.
      if test -n "$opt_dlopen" && test execute != "$opt_mode"; then
        func_error "unrecognized option '-dlopen'"
        $ECHO "$help" 1>&2
        exit $EXIT_FAILURE
      fi

      # Change the help message to a mode-specific one.
      generic_help=$help
      help="Try '$progname --help --mode=$opt_mode' for more information."
    }

    # Pass back the unparsed argument list
    func_quote_for_eval ${1+"$@"}
    libtool_validate_options_result=$func_quote_for_eval_result
}
func_add_hook func_validate_options libtool_validate_options


# Process options as early as possible so that --help and --version
# can return quickly.
func_options ${1+"$@"}
# Replace the positional parameters with the arguments left over after
# option parsing.
eval set dummy "$func_options_result"; shift



## ----------- ##
##    Main.    ##
## ----------- ##

# Marker strings; $magic_exe is what func_ltwrapper_executable_p greps
# for.
magic='%%%MAGIC variable%%%'
magic_exe='%%%MAGIC EXE variable%%%'

# Global variables.
extracted_archives=
extracted_serial=0

# If this variable is set in any of the actions, the command in it
# will be execed at the end.  This prevents here-documents from being
# left over by shells.
exec_cmd=


# A function that is used when there is no print builtin or printf.
func_fallback_echo ()
{
  # Print $1 followed by a newline by feeding a here-document to cat.
  eval 'cat <<_LTECHO_EOF
$1
_LTECHO_EOF'
}

# func_generated_by_libtool
# True iff stdin has been generated by Libtool. This function is only
# a basic sanity check; it will hardly flush out determined imposters.
func_generated_by_libtool_p ()
{
  $GREP "^# Generated by .*$PACKAGE" > /dev/null 2>&1
}

# func_lalib_p file
# True iff FILE is a libtool '.la' library or '.lo' object file.
# This function is only a basic sanity check; it will hardly flush out
# determined imposters.
func_lalib_p ()
{
  if test -f "$1"; then
    # Only the first four lines are inspected.
    $SED -e 4q "$1" 2>/dev/null | func_generated_by_libtool_p
  else
    false
  fi
}

# func_lalib_unsafe_p file
# True iff FILE is a libtool '.la' library or '.lo' object file.
# This function implements the same check as func_lalib_p without
# resorting to external programs.  To this end, it redirects stdin and
# closes it afterwards, without saving the original file descriptor.
# As a safety measure, use it only where a negative result would be
# fatal anyway.  Works if 'file' does not exist.
func_lalib_unsafe_p ()
{
  lalib_p=no
  if test -f "$1" && test -r "$1" && exec 5<&0 <"$1"; then
    # Read at most four lines, stopping at the first libtool marker.
    for lalib_p_l in 1 2 3 4
    do
      read lalib_p_line
      case $lalib_p_line in
        \#\ Generated\ by\ *$PACKAGE* )
          lalib_p=yes
          break
          ;;
      esac
    done
    # Put the previous stdin back and release descriptor 5.
    exec 0<&5 5<&-
  fi
  test yes = "$lalib_p"
}

# func_ltwrapper_script_p file
# True iff FILE is a libtool wrapper script
# This function is only a basic sanity check; it will hardly flush out
# determined imposters.
func_ltwrapper_script_p ()
{
  if test -f "$1"; then
    $lt_truncate_bin < "$1" 2>/dev/null | func_generated_by_libtool_p
  else
    false
  fi
}

# func_ltwrapper_executable_p file
# True iff FILE is a libtool wrapper executable
# This function is only a basic sanity check; it will hardly flush out
# determined imposters.
func_ltwrapper_executable_p ()
{
    # Look for the magic marker in FILE, appending '.exe' when FILE does
    # not already carry that suffix.
    func_ltwrapper_exec_suffix=
    case $1 in
    *.exe) ;;
    *) func_ltwrapper_exec_suffix=.exe ;;
    esac
    $GREP "$magic_exe" "$1$func_ltwrapper_exec_suffix" >/dev/null 2>&1
}

# func_ltwrapper_scriptname file
# Assumes file is an ltwrapper_executable
# uses FILE ($1) to determine the appropriate filename for a
# temporary ltwrapper_script.  The result is stored in
# func_ltwrapper_scriptname_result.
func_ltwrapper_scriptname ()
{
    func_dirname_and_basename "$1" "" "."
    func_stripname '' '.exe' "$func_basename_result"
    func_ltwrapper_scriptname_result=$func_dirname_result/$objdir/${func_stripname_result}_ltshwrapper
}

# func_ltwrapper_p file
# True iff FILE is a libtool wrapper script or wrapper executable
# This function is only a basic sanity check; it will hardly flush out
# determined imposters.
func_ltwrapper_p ()
{
    func_ltwrapper_script_p "$1" || func_ltwrapper_executable_p "$1"
}


# func_execute_cmds commands fail_cmd
# Execute tilde-delimited COMMANDS.
# If FAIL_CMD is given, eval that upon failure.
# FAIL_CMD may read-access the current command in variable CMD!
func_execute_cmds ()
{
    $debug_cmd

    # Split $1 on '~'; IFS is restored before each command is run.
    save_ifs=$IFS; IFS='~'
    for cmd in $1; do
      IFS=$sp$nl
      eval cmd=\"$cmd\"
      IFS=$save_ifs
      # When no FAIL_CMD was passed, ':' is used in its place.
      func_show_eval "$cmd" "${2-:}"
    done
    IFS=$save_ifs
}


# func_source file
# Source FILE, adding directory component if necessary.
# Note that it is not necessary on cygwin/mingw to append a dot to
# FILE even if both FILE and FILE.exe exist: automatic-append-.exe
# behavior happens only for exec(3), not for open(2)!  Also, sourcing
# 'FILE.' does not work on cygwin managed mounts.
func_source ()
{
    $debug_cmd

    case $1 in
    */* | *\\*) . "$1" ;;
    *)          . "./$1" ;;
    esac
}


# func_resolve_sysroot PATH
# Replace a leading = in PATH with a sysroot.
# Store the result into func_resolve_sysroot_result.
func_resolve_sysroot ()
{
  func_resolve_sysroot_result=$1
  case $func_resolve_sysroot_result in
  =*)
    func_stripname '=' '' "$func_resolve_sysroot_result"
    func_resolve_sysroot_result=$lt_sysroot$func_stripname_result
    ;;
  esac
}

# func_replace_sysroot PATH
# If PATH begins with the sysroot, replace it with = and
# store the result into func_replace_sysroot_result.
func_replace_sysroot ()
{
  case $lt_sysroot:$1 in
  ?*:"$lt_sysroot"*)
    func_stripname "$lt_sysroot" '' "$1"
    func_replace_sysroot_result='='$func_stripname_result
    ;;
  *)
    # Including no sysroot.
    func_replace_sysroot_result=$1
    ;;
  esac
}

# func_infer_tag arg
# Infer tagged configuration to use if any are available and
# if one wasn't chosen via the "--tag" command line option.
# Only attempt this if the compiler in the base compile
# command doesn't match the default compiler.
# arg is usually of the form 'gcc ...'
# On success the chosen tag is left in $tagname; if no tag matches,
# func_fatal_error is called.
func_infer_tag ()
{
    $debug_cmd

    if test -n "$available_tags" && test -z "$tagname"; then
      CC_quoted=
      for arg in $CC; do
        func_append_quoted CC_quoted "$arg"
      done
      CC_expanded=`func_echo_all $CC`
      CC_quoted_expanded=`func_echo_all $CC_quoted`
      case $@ in
      # Blanks in the command may have been stripped by the calling shell,
      # but not from the CC environment variable when configure was run.
      " $CC "* | "$CC "* | " $CC_expanded "* | "$CC_expanded "* | \
      " $CC_quoted"* | "$CC_quoted "* | " $CC_quoted_expanded "* | "$CC_quoted_expanded "*) ;;
      # Blanks at the start of $base_compile will cause this to fail
      # if we don't check for them as well.
      *)
        for z in $available_tags; do
          if $GREP "^# ### BEGIN LIBTOOL TAG CONFIG: $z$" < "$progpath" > /dev/null; then
            # Evaluate the configuration.
            eval "`$SED -n -e '/^# ### BEGIN LIBTOOL TAG CONFIG: '$z'$/,/^# ### END LIBTOOL TAG CONFIG: '$z'$/p' < $progpath`"
            CC_quoted=
            for arg in $CC; do
              # Double-quote args containing other shell metacharacters.
              func_append_quoted CC_quoted "$arg"
            done
            CC_expanded=`func_echo_all $CC`
            CC_quoted_expanded=`func_echo_all $CC_quoted`
            case "$@ " in
            " $CC "* | "$CC "* | " $CC_expanded "* | "$CC_expanded "* | \
            " $CC_quoted"* | "$CC_quoted "* | " $CC_quoted_expanded "* | "$CC_quoted_expanded "*)
              # The compiler in the base compile command matches
              # the one in the tagged configuration.
              # Assume this is the tagged configuration we want.
              tagname=$z
              break
              ;;
            esac
          fi
        done
        # If $tagname still isn't set, then no tagged configuration
        # was found and let the user know that the "--tag" command
        # line option must be used.
        if test -z "$tagname"; then
          func_echo "unable to infer tagged configuration"
          func_fatal_error "specify a tag with '--tag'"
#       else
#         func_verbose "using $tagname tagged configuration"
        fi
        ;;
      esac
    fi
}



# func_write_libtool_object output_name pic_name nonpic_name
# Create a libtool object file (analogous to a ".la" file),
# but don't create it if we're doing a dry run.
func_write_libtool_object ()
{
    write_libobj=$1
    # Each name is recorded single-quoted, or as 'none' when that kind of
    # object is not being built.
    if test yes = "$build_libtool_libs"; then
      write_lobj=\'$2\'
    else
      write_lobj=none
    fi

    if test yes = "$build_old_libs"; then
      write_oldobj=\'$3\'
    else
      write_oldobj=none
    fi

    $opt_dry_run || {
      # NOTE(review): this copy of the file appears to be damaged here.  The
      # text that originally sat between the redirection operator below and
      # the '/dev/null`' fragment (presumably the rest of this function and
      # the beginning of func_convert_core_file_wine_to_w32) is missing from
      # this view; the surviving tokens are kept verbatim.  Restore this
      # region from a pristine ltmain.sh before using the script.
      cat >${write_libobj}T </dev/null`
    if test "$?" -eq 0 && test -n "$func_convert_core_file_wine_to_w32_tmp"; then
      func_convert_core_file_wine_to_w32_result=`$ECHO "$func_convert_core_file_wine_to_w32_tmp" |
        $SED -e "$sed_naive_backslashify"`
    else
      func_convert_core_file_wine_to_w32_result=
    fi
  fi
}
# end: func_convert_core_file_wine_to_w32


# func_convert_core_path_wine_to_w32 ARG
# Helper function used by path conversion functions when $build is *nix, and
# $host is mingw, cygwin, or some other w32 environment. Relies on a correctly
# configured wine environment available, with the winepath program in $build's
# $PATH. Assumes ARG has no leading or trailing path separator characters.
#
# ARG is path to be converted from $build format to win32.
# Result is available in $func_convert_core_path_wine_to_w32_result.
# Unconvertible file (directory) names in ARG are skipped; if no directory names
# are convertible, then the result may be empty.
func_convert_core_path_wine_to_w32 ()
{
  $debug_cmd

  # unfortunately, winepath doesn't convert paths, only file names
  func_convert_core_path_wine_to_w32_result=
  if test -n "$1"; then
    oldIFS=$IFS
    IFS=:
    # Convert each ':'-separated element on its own and join the converted
    # elements with ';'.
    for func_convert_core_path_wine_to_w32_f in $1; do
      IFS=$oldIFS
      func_convert_core_file_wine_to_w32 "$func_convert_core_path_wine_to_w32_f"
      if test -n "$func_convert_core_file_wine_to_w32_result"; then
        if test -z "$func_convert_core_path_wine_to_w32_result"; then
          func_convert_core_path_wine_to_w32_result=$func_convert_core_file_wine_to_w32_result
        else
          func_append func_convert_core_path_wine_to_w32_result ";$func_convert_core_file_wine_to_w32_result"
        fi
      fi
    done
    IFS=$oldIFS
  fi
}
# end: func_convert_core_path_wine_to_w32


# func_cygpath ARGS...
# Wrapper around calling the cygpath program via LT_CYGPATH. This is used
# when (1) $build is *nix and Cygwin is hosted via a wine environment; or (2)
# $build is MSYS and $host is Cygwin, or (3) $build is Cygwin. In case (1) or
# (2), returns the Cygwin file name or path in func_cygpath_result (input
# file name or path is assumed to be in w32 format, as previously converted
# from $build's *nix or MSYS format). In case (3), returns the w32 file name
# or path in func_cygpath_result (input file name or path is assumed to be in
# Cygwin format). Returns an empty string on error.
#
# ARGS are passed to cygpath, with the last one being the file name or path to
# be converted.
#
# Specify the absolute *nix (or w32) name to cygpath in the LT_CYGPATH
# environment variable; do not put it in $PATH.
func_cygpath ()
{
  $debug_cmd

  if test -n "$LT_CYGPATH" && test -f "$LT_CYGPATH"; then
    func_cygpath_result=`$LT_CYGPATH "$@" 2>/dev/null`
    if test "$?" -ne 0; then
      # on failure, ensure result is empty
      func_cygpath_result=
    fi
  else
    func_cygpath_result=
    func_error "LT_CYGPATH is empty or specifies non-existent file: '$LT_CYGPATH'"
  fi
}
#end: func_cygpath


# func_convert_core_msys_to_w32 ARG
# Convert file name or path ARG from MSYS format to w32 format.  Return
# result in func_convert_core_msys_to_w32_result.
func_convert_core_msys_to_w32 ()
{
  $debug_cmd

  # awkward: cmd appends spaces to result
  func_convert_core_msys_to_w32_result=`( cmd //c echo "$1" ) 2>/dev/null |
    $SED -e 's/[ ]*$//' -e "$sed_naive_backslashify"`
}
#end: func_convert_core_msys_to_w32


# func_convert_file_check ARG1 ARG2
# Verify that ARG1 (a file name in $build format) was converted to $host
# format in ARG2. Otherwise, emit an error message, but continue (resetting
# func_to_host_file_result to ARG1).
func_convert_file_check ()
{
  $debug_cmd

  # A non-empty input that produced an empty output counts as a failure.
  if test -z "$2" && test -n "$1"; then
    func_error "Could not determine host file name corresponding to"
    func_error "  '$1'"
    func_error "Continuing, but uninstalled executables may not work."
    # Fallback:
    func_to_host_file_result=$1
  fi
}
# end func_convert_file_check


# func_convert_path_check FROM_PATHSEP TO_PATHSEP FROM_PATH TO_PATH
# Verify that FROM_PATH (a path in $build format) was converted to $host
# format in TO_PATH. Otherwise, emit an error message, but continue, resetting
# func_to_host_path_result to a simplistic fallback value (see below).
func_convert_path_check ()
{
  $debug_cmd

  if test -z "$4" && test -n "$3"; then
    func_error "Could not determine the host path corresponding to"
    func_error "  '$3'"
    func_error "Continuing, but uninstalled executables may not work."
    # Fallback.  This is a deliberately simplistic "conversion" and
    # should not be "improved".  See libtool.info.
    if test "x$1" != "x$2"; then
      # Only the separator characters are rewritten.
      lt_replace_pathsep_chars="s|$1|$2|g"
      func_to_host_path_result=`echo "$3" |
        $SED -e "$lt_replace_pathsep_chars"`
    else
      func_to_host_path_result=$3
    fi
  fi
}
# end func_convert_path_check


# func_convert_path_front_back_pathsep FRONTPAT BACKPAT REPL ORIG
# Modifies func_to_host_path_result by prepending REPL if ORIG matches FRONTPAT
# and appending REPL if ORIG matches BACKPAT.
func_convert_path_front_back_pathsep ()
{
  $debug_cmd

  case $4 in
  $1 ) func_to_host_path_result=$3$func_to_host_path_result
    ;;
  esac
  case $4 in
  $2 ) func_append func_to_host_path_result "$3"
    ;;
  esac
}
# end func_convert_path_front_back_pathsep


##################################################
# $build to $host FILE NAME CONVERSION FUNCTIONS #
##################################################
# invoked via '$to_host_file_cmd ARG'
#
# In each case, ARG is the path to be converted from $build to $host format.
# Result will be available in $func_to_host_file_result.


# func_to_host_file ARG
# Converts the file name ARG from $build format to $host format. Return result
# in func_to_host_file_result.
func_to_host_file ()
{
  $debug_cmd

  # $to_host_file_cmd holds the name of the conversion function to call.
  $to_host_file_cmd "$1"
}
# end func_to_host_file


# func_to_tool_file ARG LAZY
# converts the file name ARG from $build format to toolchain format. Return
# result in func_to_tool_file_result.  If the conversion in use is listed
# in (the comma separated) LAZY, no conversion takes place.
func_to_tool_file ()
{
  $debug_cmd

  case ,$2, in
    *,"$to_tool_file_cmd",*)
      func_to_tool_file_result=$1
      ;;
    *)
      # The conversion functions report through func_to_host_file_result.
      $to_tool_file_cmd "$1"
      func_to_tool_file_result=$func_to_host_file_result
      ;;
  esac
}
# end func_to_tool_file


# func_convert_file_noop ARG
# Copy ARG to func_to_host_file_result.
func_convert_file_noop ()
{
  func_to_host_file_result=$1
}
# end func_convert_file_noop


# func_convert_file_msys_to_w32 ARG
# Convert file name ARG from (mingw) MSYS to (mingw) w32 format; automatic
# conversion to w32 is not available inside the cwrapper.
# Returns result in func_to_host_file_result.
func_convert_file_msys_to_w32 ()
{
  $debug_cmd

  if test -z "$1"; then
    # Nothing to convert: an empty name maps to an empty result.
    func_to_host_file_result=
  else
    func_convert_core_msys_to_w32 "$1"
    func_to_host_file_result=$func_convert_core_msys_to_w32_result
  fi
  # Falls back to ARG (with a diagnostic) if the conversion came out empty.
  func_convert_file_check "$1" "$func_to_host_file_result"
}
# end func_convert_file_msys_to_w32


# func_convert_file_cygwin_to_w32 ARG
# Translate the Cygwin file name ARG into its w32 spelling and leave the
# answer in func_to_host_file_result.
func_convert_file_cygwin_to_w32 ()
{
  $debug_cmd

  if test -z "$1"; then
    func_to_host_file_result=
  else
    # because $build is cygwin, we call "the" cygpath in $PATH; no need to use
    # LT_CYGPATH in this case.
    func_to_host_file_result=`cygpath -m "$1"`
  fi
  func_convert_file_check "$1" "$func_to_host_file_result"
}
# end func_convert_file_cygwin_to_w32


# func_convert_file_nix_to_w32 ARG
# Translate the *nix file name ARG into w32 format by way of
# func_convert_core_file_wine_to_w32 (needs a wine environment with a
# working winepath).  The answer is left in func_to_host_file_result.
func_convert_file_nix_to_w32 ()
{
  $debug_cmd

  if test -z "$1"; then
    func_to_host_file_result=
  else
    func_convert_core_file_wine_to_w32 "$1"
    func_to_host_file_result=$func_convert_core_file_wine_to_w32_result
  fi
  func_convert_file_check "$1" "$func_to_host_file_result"
}
# end func_convert_file_nix_to_w32


# func_convert_file_msys_to_cygwin ARG
# Translate the MSYS file name ARG into Cygwin format: first to w32, then
# through func_cygpath (which needs LT_CYGPATH).  The answer is left in
# func_to_host_file_result.
func_convert_file_msys_to_cygwin ()
{
  $debug_cmd

  if test -z "$1"; then
    func_to_host_file_result=
  else
    func_convert_core_msys_to_w32 "$1"
    func_cygpath -u "$func_convert_core_msys_to_w32_result"
    func_to_host_file_result=$func_cygpath_result
  fi
  func_convert_file_check "$1" "$func_to_host_file_result"
}
# end func_convert_file_msys_to_cygwin


# func_convert_file_nix_to_cygwin ARG
# Convert file name ARG from *nix to Cygwin format.  Requires Cygwin installed
# in a wine environment, working winepath, and LT_CYGPATH set.
Returns result # in func_to_host_file_result. func_convert_file_nix_to_cygwin () { $debug_cmd func_to_host_file_result=$1 if test -n "$1"; then # convert from *nix to w32, then use cygpath to convert from w32 to cygwin. func_convert_core_file_wine_to_w32 "$1" func_cygpath -u "$func_convert_core_file_wine_to_w32_result" func_to_host_file_result=$func_cygpath_result fi func_convert_file_check "$1" "$func_to_host_file_result" } # end func_convert_file_nix_to_cygwin ############################################# # $build to $host PATH CONVERSION FUNCTIONS # ############################################# # invoked via '$to_host_path_cmd ARG' # # In each case, ARG is the path to be converted from $build to $host format. # The result will be available in $func_to_host_path_result. # # Path separators are also converted from $build format to $host format. If # ARG begins or ends with a path separator character, it is preserved (but # converted to $host format) on output. # # All path conversion functions are named using the following convention: # file name conversion function : func_convert_file_X_to_Y () # path conversion function : func_convert_path_X_to_Y () # where, for any given $build/$host combination the 'X_to_Y' value is the # same. If conversion functions are added for new $build/$host combinations, # the two new functions must follow this pattern, or func_init_to_host_path_cmd # will break. # func_init_to_host_path_cmd # Ensures that function "pointer" variable $to_host_path_cmd is set to the # appropriate value, based on the value of $to_host_file_cmd. to_host_path_cmd= func_init_to_host_path_cmd () { $debug_cmd if test -z "$to_host_path_cmd"; then func_stripname 'func_convert_file_' '' "$to_host_file_cmd" to_host_path_cmd=func_convert_path_$func_stripname_result fi } # func_to_host_path ARG # Converts the path ARG from $build format to $host format. Return result # in func_to_host_path_result. 
func_to_host_path ()
{
  $debug_cmd

  # Make sure the path-conversion "pointer" is initialised, then dispatch.
  func_init_to_host_path_cmd
  $to_host_path_cmd "$1"
}
# end func_to_host_path


# func_convert_path_noop ARG
# Identity conversion: ARG is handed back unchanged in
# func_to_host_path_result.
func_convert_path_noop ()
{
  func_to_host_path_result=$1
}
# end func_convert_path_noop


# func_convert_path_msys_to_w32 ARG
# Translate the (mingw) MSYS path ARG into (mingw) w32 format; automatic
# conversion to w32 is not available inside the cwrapper.  The answer is
# left in func_to_host_path_result.
func_convert_path_msys_to_w32 ()
{
  $debug_cmd

  if test -z "$1"; then
    # Nothing to convert: an empty path maps to an empty result.
    func_to_host_path_result=
  else
    # Remove leading and trailing path separator characters from ARG.  MSYS
    # behavior is inconsistent here; cygpath turns them into '.;' and ';.';
    # and winepath ignores them completely.
    func_stripname : : "$1"
    func_to_host_path_tmp1=$func_stripname_result
    func_convert_core_msys_to_w32 "$func_to_host_path_tmp1"
    func_to_host_path_result=$func_convert_core_msys_to_w32_result
    func_convert_path_check : ";" \
      "$func_to_host_path_tmp1" "$func_to_host_path_result"
    # Re-attach, as ';', any separator ARG started or ended with.
    func_convert_path_front_back_pathsep ":*" "*:" ";" "$1"
  fi
}
# end func_convert_path_msys_to_w32


# func_convert_path_cygwin_to_w32 ARG
# Translate the Cygwin path ARG into w32 format with 'cygpath -m -p'.  The
# answer is left in func_to_host_path_result.
func_convert_path_cygwin_to_w32 ()
{
  $debug_cmd

  if test -z "$1"; then
    func_to_host_path_result=
  else
    # See func_convert_path_msys_to_w32:
    func_stripname : : "$1"
    func_to_host_path_tmp1=$func_stripname_result
    func_to_host_path_result=`cygpath -m -p "$func_to_host_path_tmp1"`
    func_convert_path_check : ";" \
      "$func_to_host_path_tmp1" "$func_to_host_path_result"
    func_convert_path_front_back_pathsep ":*" "*:" ";" "$1"
  fi
}
# end func_convert_path_cygwin_to_w32


# func_convert_path_nix_to_w32 ARG
# Convert path ARG from *nix to w32 format.  Requires a wine environment and
# a working winepath.  Returns result in func_to_host_path_result.
func_convert_path_nix_to_w32 () { $debug_cmd func_to_host_path_result=$1 if test -n "$1"; then # See func_convert_path_msys_to_w32: func_stripname : : "$1" func_to_host_path_tmp1=$func_stripname_result func_convert_core_path_wine_to_w32 "$func_to_host_path_tmp1" func_to_host_path_result=$func_convert_core_path_wine_to_w32_result func_convert_path_check : ";" \ "$func_to_host_path_tmp1" "$func_to_host_path_result" func_convert_path_front_back_pathsep ":*" "*:" ";" "$1" fi } # end func_convert_path_nix_to_w32 # func_convert_path_msys_to_cygwin ARG # Convert path ARG from MSYS to Cygwin format. Requires LT_CYGPATH set. # Returns result in func_to_host_file_result. func_convert_path_msys_to_cygwin () { $debug_cmd func_to_host_path_result=$1 if test -n "$1"; then # See func_convert_path_msys_to_w32: func_stripname : : "$1" func_to_host_path_tmp1=$func_stripname_result func_convert_core_msys_to_w32 "$func_to_host_path_tmp1" func_cygpath -u -p "$func_convert_core_msys_to_w32_result" func_to_host_path_result=$func_cygpath_result func_convert_path_check : : \ "$func_to_host_path_tmp1" "$func_to_host_path_result" func_convert_path_front_back_pathsep ":*" "*:" : "$1" fi } # end func_convert_path_msys_to_cygwin # func_convert_path_nix_to_cygwin ARG # Convert path ARG from *nix to Cygwin format. Requires Cygwin installed in a # a wine environment, working winepath, and LT_CYGPATH set. Returns result in # func_to_host_file_result. func_convert_path_nix_to_cygwin () { $debug_cmd func_to_host_path_result=$1 if test -n "$1"; then # Remove leading and trailing path separator characters from # ARG. msys behavior is inconsistent here, cygpath turns them # into '.;' and ';.', and winepath ignores them completely. 
func_stripname : : "$1" func_to_host_path_tmp1=$func_stripname_result func_convert_core_path_wine_to_w32 "$func_to_host_path_tmp1" func_cygpath -u -p "$func_convert_core_path_wine_to_w32_result" func_to_host_path_result=$func_cygpath_result func_convert_path_check : : \ "$func_to_host_path_tmp1" "$func_to_host_path_result" func_convert_path_front_back_pathsep ":*" "*:" : "$1" fi } # end func_convert_path_nix_to_cygwin # func_dll_def_p FILE # True iff FILE is a Windows DLL '.def' file. # Keep in sync with _LT_DLL_DEF_P in libtool.m4 func_dll_def_p () { $debug_cmd func_dll_def_p_tmp=`$SED -n \ -e 's/^[ ]*//' \ -e '/^\(;.*\)*$/d' \ -e 's/^\(EXPORTS\|LIBRARY\)\([ ].*\)*$/DEF/p' \ -e q \ "$1"` test DEF = "$func_dll_def_p_tmp" } # func_mode_compile arg... func_mode_compile () { $debug_cmd # Get the compilation command and the source file. base_compile= srcfile=$nonopt # always keep a non-empty value in "srcfile" suppress_opt=yes suppress_output= arg_mode=normal libobj= later= pie_flag= for arg do case $arg_mode in arg ) # do not "continue". Instead, add this to base_compile lastarg=$arg arg_mode=normal ;; target ) libobj=$arg arg_mode=normal continue ;; normal ) # Accept any command-line options. case $arg in -o) test -n "$libobj" && \ func_fatal_error "you cannot specify '-o' more than once" arg_mode=target continue ;; -pie | -fpie | -fPIE) func_append pie_flag " $arg" continue ;; -shared | -static | -prefer-pic | -prefer-non-pic) func_append later " $arg" continue ;; -no-suppress) suppress_opt=no continue ;; -Xcompiler) arg_mode=arg # the next one goes into the "base_compile" arg list continue # The current "srcfile" will either be retained or ;; # replaced later. I would guess that would be a bug. 
-Wc,*) func_stripname '-Wc,' '' "$arg" args=$func_stripname_result lastarg= save_ifs=$IFS; IFS=, for arg in $args; do IFS=$save_ifs func_append_quoted lastarg "$arg" done IFS=$save_ifs func_stripname ' ' '' "$lastarg" lastarg=$func_stripname_result # Add the arguments to base_compile. func_append base_compile " $lastarg" continue ;; *) # Accept the current argument as the source file. # The previous "srcfile" becomes the current argument. # lastarg=$srcfile srcfile=$arg ;; esac # case $arg ;; esac # case $arg_mode # Aesthetically quote the previous argument. func_append_quoted base_compile "$lastarg" done # for arg case $arg_mode in arg) func_fatal_error "you must specify an argument for -Xcompile" ;; target) func_fatal_error "you must specify a target with '-o'" ;; *) # Get the name of the library object. test -z "$libobj" && { func_basename "$srcfile" libobj=$func_basename_result } ;; esac # Recognize several different file suffixes. # If the user specifies -o file.o, it is replaced with file.lo case $libobj in *.[cCFSifmso] | \ *.ada | *.adb | *.ads | *.asm | \ *.c++ | *.cc | *.ii | *.class | *.cpp | *.cxx | \ *.[fF][09]? | *.for | *.java | *.go | *.obj | *.sx | *.cu | *.cup) func_xform "$libobj" libobj=$func_xform_result ;; esac case $libobj in *.lo) func_lo2o "$libobj"; obj=$func_lo2o_result ;; *) func_fatal_error "cannot determine name of library object from '$libobj'" ;; esac func_infer_tag $base_compile for arg in $later; do case $arg in -shared) test yes = "$build_libtool_libs" \ || func_fatal_configuration "cannot build a shared library" build_old_libs=no continue ;; -static) build_libtool_libs=no build_old_libs=yes continue ;; -prefer-pic) pic_mode=yes continue ;; -prefer-non-pic) pic_mode=no continue ;; esac done func_quote_for_eval "$libobj" test "X$libobj" != "X$func_quote_for_eval_result" \ && $ECHO "X$libobj" | $GREP '[]~#^*{};<>?"'"'"' &()|`$[]' \ && func_warning "libobj name '$libobj' may not contain shell special characters." 
func_dirname_and_basename "$obj" "/" "" objname=$func_basename_result xdir=$func_dirname_result lobj=$xdir$objdir/$objname test -z "$base_compile" && \ func_fatal_help "you must specify a compilation command" # Delete any leftover library objects. if test yes = "$build_old_libs"; then removelist="$obj $lobj $libobj ${libobj}T" else removelist="$lobj $libobj ${libobj}T" fi # On Cygwin there's no "real" PIC flag so we must build both object types case $host_os in cygwin* | mingw* | pw32* | os2* | cegcc*) pic_mode=default ;; esac if test no = "$pic_mode" && test pass_all != "$deplibs_check_method"; then # non-PIC code in shared libraries is not supported pic_mode=default fi # Calculate the filename of the output object if compiler does # not support -o with -c if test no = "$compiler_c_o"; then output_obj=`$ECHO "$srcfile" | $SED 's%^.*/%%; s%\.[^.]*$%%'`.$objext lockfile=$output_obj.lock else output_obj= need_locks=no lockfile= fi # Lock this critical section if it is needed # We use this script file to make the link, it avoids creating a new file if test yes = "$need_locks"; then until $opt_dry_run || ln "$progpath" "$lockfile" 2>/dev/null; do func_echo "Waiting for $lockfile to be removed" sleep 2 done elif test warn = "$need_locks"; then if test -f "$lockfile"; then $ECHO "\ *** ERROR, $lockfile exists and contains: `cat $lockfile 2>/dev/null` This indicates that another process is trying to use the same temporary object file, and libtool could not work around it because your compiler does not support '-c' and '-o' together. If you repeat this compilation, it may succeed, by chance, but you had better avoid parallel builds (make -j) in this platform, or get a better compiler." 
$opt_dry_run || $RM $removelist exit $EXIT_FAILURE fi func_append removelist " $output_obj" $ECHO "$srcfile" > "$lockfile" fi $opt_dry_run || $RM $removelist func_append removelist " $lockfile" trap '$opt_dry_run || $RM $removelist; exit $EXIT_FAILURE' 1 2 15 func_to_tool_file "$srcfile" func_convert_file_msys_to_w32 srcfile=$func_to_tool_file_result func_quote_for_eval "$srcfile" qsrcfile=$func_quote_for_eval_result # Only build a PIC object if we are building libtool libraries. if test yes = "$build_libtool_libs"; then # Without this assignment, base_compile gets emptied. fbsd_hideous_sh_bug=$base_compile if test no != "$pic_mode"; then command="$base_compile $qsrcfile $pic_flag" else # Don't build PIC code command="$base_compile $qsrcfile" fi func_mkdir_p "$xdir$objdir" if test -z "$output_obj"; then # Place PIC objects in $objdir func_append command " -o $lobj" fi func_show_eval_locale "$command" \ 'test -n "$output_obj" && $RM $removelist; exit $EXIT_FAILURE' if test warn = "$need_locks" && test "X`cat $lockfile 2>/dev/null`" != "X$srcfile"; then $ECHO "\ *** ERROR, $lockfile contains: `cat $lockfile 2>/dev/null` but it should contain: $srcfile This indicates that another process is trying to use the same temporary object file, and libtool could not work around it because your compiler does not support '-c' and '-o' together. If you repeat this compilation, it may succeed, by chance, but you had better avoid parallel builds (make -j) in this platform, or get a better compiler." $opt_dry_run || $RM $removelist exit $EXIT_FAILURE fi # Just move the object if needed, then go on to compile the next one if test -n "$output_obj" && test "X$output_obj" != "X$lobj"; then func_show_eval '$MV "$output_obj" "$lobj"' \ 'error=$?; $opt_dry_run || $RM $removelist; exit $error' fi # Allow error messages only from the first compilation. 
if test yes = "$suppress_opt"; then suppress_output=' >/dev/null 2>&1' fi fi # Only build a position-dependent object if we build old libraries. if test yes = "$build_old_libs"; then if test yes != "$pic_mode"; then # Don't build PIC code command="$base_compile $qsrcfile$pie_flag" else command="$base_compile $qsrcfile $pic_flag" fi if test yes = "$compiler_c_o"; then func_append command " -o $obj" fi # Suppress compiler output if we already did a PIC compilation. func_append command "$suppress_output" func_show_eval_locale "$command" \ '$opt_dry_run || $RM $removelist; exit $EXIT_FAILURE' if test warn = "$need_locks" && test "X`cat $lockfile 2>/dev/null`" != "X$srcfile"; then $ECHO "\ *** ERROR, $lockfile contains: `cat $lockfile 2>/dev/null` but it should contain: $srcfile This indicates that another process is trying to use the same temporary object file, and libtool could not work around it because your compiler does not support '-c' and '-o' together. If you repeat this compilation, it may succeed, by chance, but you had better avoid parallel builds (make -j) in this platform, or get a better compiler." $opt_dry_run || $RM $removelist exit $EXIT_FAILURE fi # Just move the object if needed if test -n "$output_obj" && test "X$output_obj" != "X$obj"; then func_show_eval '$MV "$output_obj" "$obj"' \ 'error=$?; $opt_dry_run || $RM $removelist; exit $error' fi fi $opt_dry_run || { func_write_libtool_object "$libobj" "$objdir/$objname" "$objname" # Unlock the critical section if it was locked if test no != "$need_locks"; then removelist=$lockfile $RM "$lockfile" fi } exit $EXIT_SUCCESS } $opt_help || { test compile = "$opt_mode" && func_mode_compile ${1+"$@"} } func_mode_help () { # We need to display help for each of the modes. case $opt_mode in "") # Generic help is extracted from the usage comments # at the start of this file. func_help ;; clean) $ECHO \ "Usage: $progname [OPTION]... --mode=clean RM [RM-OPTION]... FILE... Remove files from the build directory. 
RM is the name of the program to use to delete files associated with each FILE (typically '/bin/rm'). RM-OPTIONS are options (such as '-f') to be passed to RM. If FILE is a libtool library, object or program, all the files associated with it are deleted. Otherwise, only FILE itself is deleted using RM." ;; compile) $ECHO \ "Usage: $progname [OPTION]... --mode=compile COMPILE-COMMAND... SOURCEFILE Compile a source file into a libtool library object. This mode accepts the following additional options: -o OUTPUT-FILE set the output file name to OUTPUT-FILE -no-suppress do not suppress compiler output for multiple passes -prefer-pic try to build PIC objects only -prefer-non-pic try to build non-PIC objects only -shared do not build a '.o' file suitable for static linking -static only build a '.o' file suitable for static linking -Wc,FLAG pass FLAG directly to the compiler COMPILE-COMMAND is a command to be used in creating a 'standard' object file from the given SOURCEFILE. The output file name is determined by removing the directory component from SOURCEFILE, then substituting the C source code suffix '.c' with the library object suffix, '.lo'." ;; execute) $ECHO \ "Usage: $progname [OPTION]... --mode=execute COMMAND [ARGS]... Automatically set library path, then run a program. This mode accepts the following additional options: -dlopen FILE add the directory containing FILE to the library path This mode sets the library path environment variable according to '-dlopen' flags. If any of the ARGS are libtool executable wrappers, then they are translated into their corresponding uninstalled binary, and any of their required library directories are added to the library path. Then, COMMAND is executed, with ARGS as arguments." ;; finish) $ECHO \ "Usage: $progname [OPTION]... --mode=finish [LIBDIR]... Complete the installation of libtool libraries. Each LIBDIR is a directory that contains libtool libraries. 
The commands that this mode executes may require superuser privileges. Use the '--dry-run' option if you just want to see what would be executed." ;; install) $ECHO \ "Usage: $progname [OPTION]... --mode=install INSTALL-COMMAND... Install executables or libraries. INSTALL-COMMAND is the installation command. The first component should be either the 'install' or 'cp' program. The following components of INSTALL-COMMAND are treated specially: -inst-prefix-dir PREFIX-DIR Use PREFIX-DIR as a staging area for installation The rest of the components are interpreted as arguments to that command (only BSD-compatible install options are recognized)." ;; link) $ECHO \ "Usage: $progname [OPTION]... --mode=link LINK-COMMAND... Link object files or libraries together to form another library, or to create an executable program. LINK-COMMAND is a command using the C compiler that you would use to create a program from several object files. The following components of LINK-COMMAND are treated specially: -all-static do not do any dynamic linking at all -avoid-version do not add a version suffix if possible -bindir BINDIR specify path to binaries directory (for systems where libraries must be found in the PATH setting at runtime) -dlopen FILE '-dlpreopen' FILE if it cannot be dlopened at runtime -dlpreopen FILE link in FILE and add its symbols to lt_preloaded_symbols -export-dynamic allow symbols from OUTPUT-FILE to be resolved with dlsym(3) -export-symbols SYMFILE try to export only the symbols listed in SYMFILE -export-symbols-regex REGEX try to export only the symbols matching REGEX -LLIBDIR search LIBDIR for required installed libraries -lNAME OUTPUT-FILE requires the installed library libNAME -module build a library that can dlopened -no-fast-install disable the fast-install mode -no-install link a not-installable executable -no-undefined declare that a library does not refer to external symbols -o OUTPUT-FILE create OUTPUT-FILE from the specified objects -objectlist FILE use a 
list of object files found in FILE to specify objects -os2dllname NAME force a short DLL name on OS/2 (no effect on other OSes) -precious-files-regex REGEX don't remove output files matching REGEX -release RELEASE specify package release information -rpath LIBDIR the created library will eventually be installed in LIBDIR -R[ ]LIBDIR add LIBDIR to the runtime path of programs and libraries -shared only do dynamic linking of libtool libraries -shrext SUFFIX override the standard shared library file extension -static do not do any dynamic linking of uninstalled libtool libraries -static-libtool-libs do not do any dynamic linking of libtool libraries -version-info CURRENT[:REVISION[:AGE]] specify library version info [each variable defaults to 0] -weak LIBNAME declare that the target provides the LIBNAME interface -Wc,FLAG -Xcompiler FLAG pass linker-specific FLAG directly to the compiler -Wl,FLAG -Xlinker FLAG pass linker-specific FLAG directly to the linker -XCClinker FLAG pass link-specific FLAG to the compiler driver (CC) All other options (arguments beginning with '-') are ignored. Every other argument is treated as a filename. Files ending in '.la' are treated as uninstalled libtool libraries, other files are standard or library object files. If the OUTPUT-FILE ends in '.la', then a libtool library is created, only library objects ('.lo' files) may be specified, and '-rpath' is required, except when creating a convenience library. If OUTPUT-FILE ends in '.a' or '.lib', then a standard library is created using 'ar' and 'ranlib', or on Windows using 'lib'. If OUTPUT-FILE ends in '.lo' or '.$objext', then a reloadable object file is created, otherwise an executable program is created." ;; uninstall) $ECHO \ "Usage: $progname [OPTION]... --mode=uninstall RM [RM-OPTION]... FILE... Remove libraries from an installation directory. RM is the name of the program to use to delete files associated with each FILE (typically '/bin/rm'). 
RM-OPTIONS are options (such as '-f') to be passed to RM. If FILE is a libtool library, all the files associated with it are deleted. Otherwise, only FILE itself is deleted using RM."
      ;;

    *)
      func_fatal_help "invalid operation mode '$opt_mode'"
      ;;
  esac

  echo
  $ECHO "Try '$progname --help' for more information about other modes."
}

# Now that we've collected a possible --mode arg, show help if necessary
if $opt_help; then
  if test : = "$opt_help"; then
    func_mode_help
  else
    # First pass: print a combined usage summary, one "or:" line per mode.
    {
      func_help noexit
      for opt_mode in compile link execute install finish uninstall clean; do
        func_mode_help
      done
    } | $SED -n '1p; 2,$s/^Usage:/ or: /p'
    # Second pass: print the per-mode descriptions.  The sed program must
    # keep one command per line (H, d and $x cannot share a line).
    {
      func_help noexit
      for opt_mode in compile link execute install finish uninstall clean; do
        echo
        func_mode_help
      done
    } |
    $SED '1d
      /^When reporting/,/^Report/{
        H
        d
      }
      $x
      /information about other modes/d
      /more detailed .*MODE/d
      s/^Usage:.*--mode=\([^ ]*\) .*/Description of \1 mode:/'
  fi
  exit $?
fi


# func_mode_execute arg...
#
# Prepare to run the command named by $nonopt with the remaining ARGs:
#   * every file in $opt_dlopen has its directory prepended to the
#     variable named by $shlibpath_var;
#   * libtool wrapper scripts/executables among ARGs are replaced by
#     $progdir/$program as read from the wrapper;
#   * in dry-run mode the command is printed and the script exits,
#     otherwise the quoted command line is stored in $exec_cmd.
func_mode_execute ()
{
  $debug_cmd

  # The first argument is the command name.
  cmd=$nonopt
  test -z "$cmd" && \
    func_fatal_help "you must specify a COMMAND"

  # Handle -dlopen flags immediately.
  for file in $opt_dlopen; do
    test -f "$file" \
      || func_fatal_help "'$file' is not a file"

    dir=
    case $file in
    *.la)
      func_resolve_sysroot "$file"
      file=$func_resolve_sysroot_result

      # Check to see that this really is a libtool archive.
      # (The message must name $file; $lib is not set in this function.)
      func_lalib_unsafe_p "$file" \
        || func_fatal_help "'$file' is not a valid libtool archive"

      # Read the libtool library.
      dlname=
      library_names=
      func_source "$file"

      # Skip this library if it cannot be dlopened.
      if test -z "$dlname"; then
        # Warn if it was a shared library.
        test -n "$library_names" && \
          func_warning "'$file' was not linked with '-export-dynamic'"
        continue
      fi

      func_dirname "$file" "" "."
      dir=$func_dirname_result

      if test -f "$dir/$objdir/$dlname"; then
        func_append dir "/$objdir"
      else
        if test ! -f "$dir/$dlname"; then
          func_fatal_error "cannot find '$dlname' in '$dir' or '$dir/$objdir'"
        fi
      fi
      ;;

    *.lo)
      # Just add the directory containing the .lo file.
      func_dirname "$file" "" "."
      dir=$func_dirname_result
      ;;

    *)
      func_warning "'-dlopen' is ignored for non-libtool libraries and objects"
      continue
      ;;
    esac

    # Get the absolute pathname.
    absdir=`cd "$dir" && pwd`
    test -n "$absdir" && dir=$absdir

    # Now add the directory to shlibpath_var.
    if eval "test -z \"\$$shlibpath_var\""; then
      eval "$shlibpath_var=\"\$dir\""
    else
      eval "$shlibpath_var=\"\$dir:\$$shlibpath_var\""
    fi
  done

  # This variable tells wrapper scripts just to set shlibpath_var
  # rather than running their programs.
  libtool_execute_magic=$magic

  # Check if any of the arguments is a wrapper script.
  args=
  for file
  do
    case $file in
    -* | *.la | *.lo ) ;;
    *)
      # Do a test to see if this is really a libtool program.
      if func_ltwrapper_script_p "$file"; then
        func_source "$file"
        # Transform arg to wrapped name.
        file=$progdir/$program
      elif func_ltwrapper_executable_p "$file"; then
        func_ltwrapper_scriptname "$file"
        func_source "$func_ltwrapper_scriptname_result"
        # Transform arg to wrapped name.
        file=$progdir/$program
      fi
      ;;
    esac
    # Quote arguments (to preserve shell metacharacters).
    func_append_quoted args "$file"
  done

  if $opt_dry_run; then
    # Display what would be done.
    if test -n "$shlibpath_var"; then
      eval "\$ECHO \"\$shlibpath_var=\$$shlibpath_var\""
      echo "export $shlibpath_var"
    fi
    $ECHO "$cmd$args"
    exit $EXIT_SUCCESS
  else
    if test -n "$shlibpath_var"; then
      # Export the shlibpath_var.
      eval "export $shlibpath_var"
    fi

    # Restore saved environment variables.  The eval'd text is a complete
    # if/else/fi statement, so its keywords need their own lines.
    for lt_var in LANG LANGUAGE LC_ALL LC_CTYPE LC_COLLATE LC_MESSAGES
    do
      eval "if test \"\${save_$lt_var+set}\" = set; then
              $lt_var=\$save_$lt_var; export $lt_var
            else
              $lt_unset $lt_var
            fi"
    done

    # Now prepare to actually exec the command.
    exec_cmd=\$cmd$args
  fi
}

test execute = "$opt_mode" && func_mode_execute ${1+"$@"}


# func_mode_finish arg...
func_mode_finish () { $debug_cmd libs= libdirs= admincmds= for opt in "$nonopt" ${1+"$@"} do if test -d "$opt"; then func_append libdirs " $opt" elif test -f "$opt"; then if func_lalib_unsafe_p "$opt"; then func_append libs " $opt" else func_warning "'$opt' is not a valid libtool archive" fi else func_fatal_error "invalid argument '$opt'" fi done if test -n "$libs"; then if test -n "$lt_sysroot"; then sysroot_regex=`$ECHO "$lt_sysroot" | $SED "$sed_make_literal_regex"` sysroot_cmd="s/\([ ']\)$sysroot_regex/\1/g;" else sysroot_cmd= fi # Remove sysroot references if $opt_dry_run; then for lib in $libs; do echo "removing references to $lt_sysroot and '=' prefixes from $lib" done else tmpdir=`func_mktempdir` for lib in $libs; do $SED -e "$sysroot_cmd s/\([ ']-[LR]\)=/\1/g; s/\([ ']\)=/\1/g" $lib \ > $tmpdir/tmp-la mv -f $tmpdir/tmp-la $lib done ${RM}r "$tmpdir" fi fi if test -n "$finish_cmds$finish_eval" && test -n "$libdirs"; then for libdir in $libdirs; do if test -n "$finish_cmds"; then # Do each command in the finish commands. func_execute_cmds "$finish_cmds" 'admincmds="$admincmds '"$cmd"'"' fi if test -n "$finish_eval"; then # Do the single finish_eval. eval cmds=\"$finish_eval\" $opt_dry_run || eval "$cmds" || func_append admincmds " $cmds" fi done fi # Exit here if they wanted silent mode. 
$opt_quiet && exit $EXIT_SUCCESS if test -n "$finish_cmds$finish_eval" && test -n "$libdirs"; then echo "----------------------------------------------------------------------" echo "Libraries have been installed in:" for libdir in $libdirs; do $ECHO " $libdir" done echo echo "If you ever happen to want to link against installed libraries" echo "in a given directory, LIBDIR, you must either use libtool, and" echo "specify the full pathname of the library, or use the '-LLIBDIR'" echo "flag during linking and do at least one of the following:" if test -n "$shlibpath_var"; then echo " - add LIBDIR to the '$shlibpath_var' environment variable" echo " during execution" fi if test -n "$runpath_var"; then echo " - add LIBDIR to the '$runpath_var' environment variable" echo " during linking" fi if test -n "$hardcode_libdir_flag_spec"; then libdir=LIBDIR eval flag=\"$hardcode_libdir_flag_spec\" $ECHO " - use the '$flag' linker flag" fi if test -n "$admincmds"; then $ECHO " - have your system administrator run these commands:$admincmds" fi if test -f /etc/ld.so.conf; then echo " - have your system administrator add LIBDIR to '/etc/ld.so.conf'" fi echo echo "See any operating system documentation about shared libraries for" case $host in solaris2.[6789]|solaris2.1[0-9]) echo "more information, such as the ld(1), crle(1) and ld.so(8) manual" echo "pages." ;; *) echo "more information, such as the ld(1) and ld.so(8) manual pages." ;; esac echo "----------------------------------------------------------------------" fi exit $EXIT_SUCCESS } test finish = "$opt_mode" && func_mode_finish ${1+"$@"} # func_mode_install arg... func_mode_install () { $debug_cmd # There may be an optional sh(1) argument at the beginning of # install_prog (especially on Windows NT). if test "$SHELL" = "$nonopt" || test /bin/sh = "$nonopt" || # Allow the use of GNU shtool's install command. case $nonopt in *shtool*) :;; *) false;; esac then # Aesthetically quote it. 
func_quote_for_eval "$nonopt"
      install_prog="$func_quote_for_eval_result "
      arg=$1
      shift
    else
      install_prog=
      arg=$nonopt
    fi

    # The real first argument should be the name of the installation program.
    # Aesthetically quote it.
    func_quote_for_eval "$arg"
    func_append install_prog "$func_quote_for_eval_result"
    install_shared_prog=$install_prog
    # Remember whether the install program is 'cp'; the -f and -m handling
    # below differs for it.
    case " $install_prog " in
      *[\\\ /]cp\ *) install_cp=: ;;
      *) install_cp=false ;;
    esac

    # We need to accept at least all the BSD install flags.
    dest=
    files=
    opts=
    prev=
    install_type=
    isdir=false
    stripme=
    no_mode=:
    for arg
    do
      arg2=
      # Each new non-option argument pushes the previous candidate
      # destination onto $files; the last one remains in $dest.
      if test -n "$dest"; then
        func_append files " $dest"
        dest=$arg
        continue
      fi

      case $arg in
      -d) isdir=: ;;
      -f)
        if $install_cp; then :; else
          prev=$arg
        fi
        ;;
      -g | -m | -o)
        prev=$arg
        ;;
      -s)
        stripme=" -s"
        continue
        ;;
      -*)
        ;;
      *)
        # If the previous option needed an argument, then skip it.
        if test -n "$prev"; then
          if test X-m = "X$prev" && test -n "$install_override_mode"; then
            arg2=$install_override_mode
            no_mode=false
          fi
          prev=
        else
          dest=$arg
          continue
        fi
        ;;
      esac

      # Aesthetically quote the argument.
      func_quote_for_eval "$arg"
      func_append install_prog " $func_quote_for_eval_result"
      # When arg2 is set, the shared-library install command receives the
      # override mode instead of the mode given on the command line.
      if test -n "$arg2"; then
        func_quote_for_eval "$arg2"
      fi
      func_append install_shared_prog " $func_quote_for_eval_result"
    done

    test -z "$install_prog" && \
      func_fatal_help "you must specify an install program"

    test -n "$prev" && \
      func_fatal_help "the '$prev' option requires an argument"

    if test -n "$install_override_mode" && $no_mode; then
      if $install_cp; then :; else
        func_quote_for_eval "$install_override_mode"
        func_append install_shared_prog " -m $func_quote_for_eval_result"
      fi
    fi

    if test -z "$files"; then
      if test -z "$dest"; then
        func_fatal_help "no file or destination specified"
      else
        func_fatal_help "you must specify a destination"
      fi
    fi

    # Strip any trailing slash from the destination.
    func_stripname '' '/' "$dest"
    dest=$func_stripname_result

    # Check to see that the destination is a directory.
    test -d "$dest" && isdir=:
    if $isdir; then
      destdir=$dest
      destname=
    else
      func_dirname_and_basename "$dest" "" "."
      destdir=$func_dirname_result
      destname=$func_basename_result

      # Not a directory, so check to see that there is only one file specified.
      set dummy $files; shift
      test "$#" -gt 1 && \
        func_fatal_help "'$dest' is not a directory"
    fi
    # A relative destination directory is accepted only when every file
    # being installed is a .lo object.
    case $destdir in
    [\\/]* | [A-Za-z]:[\\/]*) ;;
    *)
      for file in $files; do
        case $file in
        *.lo) ;;
        *)
          func_fatal_help "'$destdir' must be an absolute directory name"
          ;;
        esac
      done
      ;;
    esac

    # This variable tells wrapper scripts just to set variables rather
    # than running their programs.
    libtool_install_magic=$magic

    staticlibs=
    future_libdirs=
    current_libdirs=
    for file in $files; do

      # Do each installation.
      case $file in
      *.$libext)
        # Do the static libraries later.
        func_append staticlibs " $file"
        ;;

      *.la)
        func_resolve_sysroot "$file"
        file=$func_resolve_sysroot_result

        # Check to see that this really is a libtool archive.
        func_lalib_unsafe_p "$file" \
          || func_fatal_help "'$file' is not a valid libtool archive"

        library_names=
        old_library=
        relink_command=
        func_source "$file"

        # Add the libdir to current_libdirs if it is the destination.
        if test "X$destdir" = "X$libdir"; then
          case "$current_libdirs " in
          *" $libdir "*) ;;
          *) func_append current_libdirs " $libdir" ;;
          esac
        else
          # Note the libdir as a future libdir.
          case "$future_libdirs " in
          *" $libdir "*) ;;
          *) func_append future_libdirs " $libdir" ;;
          esac
        fi

        func_dirname "$file" "/" ""
        dir=$func_dirname_result
        func_append dir "$objdir"

        if test -n "$relink_command"; then
          # Determine the prefix the user has applied to our future dir.
          inst_prefix_dir=`$ECHO "$destdir" | $SED -e "s%$libdir\$%%"`

          # Don't allow the user to place us outside of our expected
          # location b/c this prevents finding dependent libraries that
          # are installed to the same prefix.
          # At present, this check doesn't affect windows .dll's that
          # are installed into $libdir/../bin (currently, that works fine)
          # but it's something to keep an eye on.
          test "$inst_prefix_dir" = "$destdir" && \
            func_fatal_error "error: cannot install '$file' to a directory not ending in $libdir"

          if test -n "$inst_prefix_dir"; then
            # Stick the inst_prefix_dir data into the link command.
            relink_command=`$ECHO "$relink_command" | $SED "s%@inst_prefix_dir@%-inst-prefix-dir $inst_prefix_dir%"`
          else
            relink_command=`$ECHO "$relink_command" | $SED "s%@inst_prefix_dir@%%"`
          fi

          func_warning "relinking '$file'"
          func_show_eval "$relink_command" \
            'func_fatal_error "error: relink '\''$file'\'' with the above command before installing it"'
        fi

        # See the names of the shared library.
        set dummy $library_names; shift
        if test -n "$1"; then
          realname=$1
          shift

          srcname=$realname
          # A relinked library is installed from the file with a 'T' suffix.
          test -n "$relink_command" && srcname=${realname}T

          # Install the shared library and build the symlinks.
          func_show_eval "$install_shared_prog $dir/$srcname $destdir/$realname" \
            'exit $?'
          tstripme=$stripme
          case $host_os in
          cygwin* | mingw* | pw32* | cegcc*)
            case $realname in
            *.dll.a)
              tstripme=
              ;;
            esac
            ;;
          os2*)
            case $realname in
            *_dll.a)
              tstripme=
              ;;
            esac
            ;;
          esac
          if test -n "$tstripme" && test -n "$striplib"; then
            func_show_eval "$striplib $destdir/$realname" 'exit $?'
          fi

          if test "$#" -gt 0; then
            # Delete the old symlinks, and create new ones.
            # Try 'ln -sf' first, because the 'ln' binary might depend on
            # the symlink we replace! Solaris /bin/ln does not understand -f,
            # so we also need to try rm && ln -s.
            for linkname
            do
              test "$linkname" != "$realname" \
                && func_show_eval "(cd $destdir && { $LN_S -f $realname $linkname || { $RM $linkname && $LN_S $realname $linkname; }; })"
            done
          fi

          # Do each command in the postinstall commands.
          lib=$destdir/$realname
          func_execute_cmds "$postinstall_cmds" 'exit $?'
        fi

        # Install the pseudo-library for information purposes.
        func_basename "$file"
        name=$func_basename_result
        instname=$dir/${name}i
        func_show_eval "$install_prog $instname $destdir/$name" 'exit $?'

        # Maybe install the static library, too.
        test -n "$old_library" && func_append staticlibs " $dir/$old_library"
        ;;

      *.lo)
        # Install (i.e. copy) a libtool object.

        # Figure out destination file name, if it wasn't already specified.
        if test -n "$destname"; then
          destfile=$destdir/$destname
        else
          func_basename "$file"
          destfile=$func_basename_result
          destfile=$destdir/$destfile
        fi

        # Deduce the name of the destination old-style object file.
        case $destfile in
        *.lo)
          func_lo2o "$destfile"
          staticdest=$func_lo2o_result
          ;;
        *.$objext)
          staticdest=$destfile
          destfile=
          ;;
        *)
          func_fatal_help "cannot copy a libtool object to '$destfile'"
          ;;
        esac

        # Install the libtool object if requested.
        test -n "$destfile" && \
          func_show_eval "$install_prog $file $destfile" 'exit $?'

        # Install the old object if enabled.
        if test yes = "$build_old_libs"; then
          # Deduce the name of the old-style object file.
          func_lo2o "$file"
          staticobj=$func_lo2o_result
          func_show_eval "$install_prog \$staticobj \$staticdest" 'exit $?'
        fi
        exit $EXIT_SUCCESS
        ;;

      *)
        # Figure out destination file name, if it wasn't already specified.
        if test -n "$destname"; then
          destfile=$destdir/$destname
        else
          func_basename "$file"
          destfile=$func_basename_result
          destfile=$destdir/$destfile
        fi

        # If the file is missing, and there is a .exe on the end, strip it
        # because it is most likely a libtool script we actually want to
        # install
        stripped_ext=
        case $file in
          *.exe)
            if test ! -f "$file"; then
              func_stripname '' '.exe' "$file"
              file=$func_stripname_result
              stripped_ext=.exe
            fi
            ;;
        esac

        # Do a test to see if this is really a libtool program.
        case $host in
        *cygwin* | *mingw*)
            if func_ltwrapper_executable_p "$file"; then
              func_ltwrapper_scriptname "$file"
              wrapper=$func_ltwrapper_scriptname_result
            else
              func_stripname '' '.exe' "$file"
              wrapper=$func_stripname_result
            fi
            ;;
        *)
            wrapper=$file
            ;;
        esac
        if func_ltwrapper_script_p "$wrapper"; then
          notinst_deplibs=
          relink_command=

          func_source "$wrapper"

          # Check the variables that should have been set.
          test -z "$generated_by_libtool_version" && \
            func_fatal_error "invalid libtool wrapper script '$wrapper'"

          finalize=:
          for lib in $notinst_deplibs; do
            # Check to see that each library is installed.
            libdir=
            if test -f "$lib"; then
              func_source "$lib"
            fi
            libfile=$libdir/`$ECHO "$lib" | $SED 's%^.*/%%g'`
            if test -n "$libdir" && test ! -f "$libfile"; then
              func_warning "'$lib' has not been installed in '$libdir'"
              finalize=false
            fi
          done

          # Re-read the wrapper: sourcing the dependency libraries above
          # may have overwritten relink_command.
          relink_command=
          func_source "$wrapper"

          outputname=
          if test no = "$fast_install" && test -n "$relink_command"; then
            $opt_dry_run || {
              if $finalize; then
                tmpdir=`func_mktempdir`
                func_basename "$file$stripped_ext"
                file=$func_basename_result
                outputname=$tmpdir/$file
                # Replace the output file specification.
                relink_command=`$ECHO "$relink_command" | $SED 's%@OUTPUT@%'"$outputname"'%g'`

                $opt_quiet || {
                  func_quote_for_expand "$relink_command"
                  eval "func_echo $func_quote_for_expand_result"
                }
                if eval "$relink_command"; then :
                  else
                  func_error "error: relink '$file' with the above command before installing it"
                  $opt_dry_run || ${RM}r "$tmpdir"
                  continue
                fi
                file=$outputname
              else
                func_warning "cannot relink '$file'"
              fi
            }
          else
            # Install the binary that we compiled earlier.
            file=`$ECHO "$file$stripped_ext" | $SED "s%\([^/]*\)$%$objdir/\1%"`
          fi
        fi

        # remove .exe since cygwin /usr/bin/install will append another
        # one anyway
        case $install_prog,$host in
        */usr/bin/install*,*cygwin*)
          case $file:$destfile in
          *.exe:*.exe)
            # this is ok
            ;;
          *.exe:*)
            destfile=$destfile.exe
            ;;
          *:*.exe)
            func_stripname '' '.exe' "$destfile"
            destfile=$func_stripname_result
            ;;
          esac
          ;;
        esac
        func_show_eval "$install_prog\$stripme \$file \$destfile" 'exit $?'
        # Remove the temporary relink directory, if one was created above.
        $opt_dry_run || if test -n "$outputname"; then
          ${RM}r "$tmpdir"
        fi
        ;;
      esac
    done

    for file in $staticlibs; do
      func_basename "$file"
      name=$func_basename_result

      # Set up the ranlib parameters.
      oldlib=$destdir/$name
      func_to_tool_file "$oldlib" func_convert_file_msys_to_w32
      tool_oldlib=$func_to_tool_file_result

      func_show_eval "$install_prog \$file \$oldlib" 'exit $?'

      if test -n "$stripme" && test -n "$old_striplib"; then
        func_show_eval "$old_striplib $tool_oldlib" 'exit $?'
      fi

      # Do each command in the postinstall commands.
      func_execute_cmds "$old_postinstall_cmds" 'exit $?'
    done

    test -n "$future_libdirs" && \
      func_warning "remember to run '$progname --finish$future_libdirs'"

    if test -n "$current_libdirs"; then
      # Maybe just do a dry run.
      $opt_dry_run && current_libdirs=" -n$current_libdirs"
      exec_cmd='$SHELL "$progpath" $preserve_args --finish$current_libdirs'
    else
      exit $EXIT_SUCCESS
    fi
}

test install = "$opt_mode" && func_mode_install ${1+"$@"}


# func_generate_dlsyms outputname originator pic_p
# Extract symbols from dlprefiles and create ${outputname}S.o with
# a dlpreopen symbol table.
func_generate_dlsyms ()
{
  $debug_cmd

  # Arguments:
  #   $1  outputname  base name for the generated ${outputname}S.c / S.$objext
  #   $2  originator  recorded as the first table entry; with non-alphanumerics
  #                   replaced by '_' it also forms the table's symbol prefix
  #   $3  pic_p       ':' or 'false' (default false): compile with $pic_flag
  # On return, @SYMFILE@ in $compile_command and $finalize_command has been
  # replaced by the symbol-table object (or removed when none is generated).
  my_outputname=$1
  my_originator=$2
  my_pic_p=${3-false}
  my_prefix=`$ECHO "$my_originator" | $SED 's%[^a-zA-Z0-9]%_%g'`
  my_dlsyms=

  if test -n "$dlfiles$dlprefiles" || test no != "$dlself"; then
    if test -n "$NM" && test -n "$global_symbol_pipe"; then
      my_dlsyms=${my_outputname}S.c
    else
      func_error "not configured to extract global symbols from dlpreopened files"
    fi
  fi

  if test -n "$my_dlsyms"; then
    case $my_dlsyms in
    "") ;;
    *.c)
      # Discover the nlist of each of the dlfiles.
      nlist=$output_objdir/$my_outputname.nm

      func_show_eval "$RM $nlist ${nlist}S ${nlist}T"

      # Parse the name list into a source file.
      func_verbose "creating $output_objdir/$my_dlsyms"

      # The text below is C source: every preprocessor directive has to
      # start on a line of its own.
      $opt_dry_run || $ECHO > "$output_objdir/$my_dlsyms" "\
/* $my_dlsyms - symbol resolution table for '$my_outputname' dlsym emulation. */
/* Generated by $PROGRAM (GNU $PACKAGE) $VERSION */

#ifdef __cplusplus
extern \"C\" {
#endif

#if defined __GNUC__ && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4)) || (__GNUC__ > 4))
#pragma GCC diagnostic ignored \"-Wstrict-prototypes\"
#endif

/* Keep this code in sync between libtool.m4, ltmain, lt_system.h, and tests. */
#if defined _WIN32 || defined __CYGWIN__ || defined _WIN32_WCE
/* DATA imports from DLLs on WIN32 can't be const, because runtime
   relocations are performed -- see ld's documentation on pseudo-relocs. */
# define LT_DLSYM_CONST
#elif defined __osf__
/* This system does not cope well with relocations in const data. */
# define LT_DLSYM_CONST
#else
# define LT_DLSYM_CONST const
#endif

#define STREQ(s1, s2) (strcmp ((s1), (s2)) == 0)

/* External symbol declarations for the compiler. */\
"

      if test yes = "$dlself"; then
        func_verbose "generating symbol list for '$output'"

        $opt_dry_run || echo ': @PROGRAM@ ' > "$nlist"

        # Add our own program objects to the symbol list.
        progfiles=`$ECHO "$objs$old_deplibs" | $SP2NL | $SED "$lo2o" | $NL2SP`
        for progfile in $progfiles; do
          func_to_tool_file "$progfile" func_convert_file_msys_to_w32
          func_verbose "extracting global C symbols from '$func_to_tool_file_result'"
          $opt_dry_run || eval "$NM $func_to_tool_file_result | $global_symbol_pipe >> '$nlist'"
        done

        if test -n "$exclude_expsyms"; then
          $opt_dry_run || {
            eval '$EGREP -v " ($exclude_expsyms)$" "$nlist" > "$nlist"T'
            eval '$MV "$nlist"T "$nlist"'
          }
        fi

        if test -n "$export_symbols_regex"; then
          $opt_dry_run || {
            eval '$EGREP -e "$export_symbols_regex" "$nlist" > "$nlist"T'
            eval '$MV "$nlist"T "$nlist"'
          }
        fi

        # Prepare the list of exported symbols
        if test -z "$export_symbols"; then
          export_symbols=$output_objdir/$outputname.exp
          $opt_dry_run || {
            $RM $export_symbols
            eval "$SED -n -e '/^: @PROGRAM@ $/d' -e 's/^.* \(.*\)$/\1/p' "'< "$nlist" > "$export_symbols"'
            case $host in
            *cygwin* | *mingw* | *cegcc* )
              eval "echo EXPORTS "'> "$output_objdir/$outputname.def"'
              eval 'cat "$export_symbols" >> "$output_objdir/$outputname.def"'
              ;;
            esac
          }
        else
          $opt_dry_run || {
            eval "$SED -e 's/\([].[*^$]\)/\\\\\1/g' -e 's/^/ /' -e 's/$/$/'"' < "$export_symbols" > "$output_objdir/$outputname.exp"'
            eval '$GREP -f "$output_objdir/$outputname.exp" < "$nlist" > "$nlist"T'
            eval '$MV "$nlist"T "$nlist"'
            case $host in
            *cygwin* | *mingw* | *cegcc* )
              eval "echo EXPORTS "'> "$output_objdir/$outputname.def"'
              eval 'cat "$nlist" >> "$output_objdir/$outputname.def"'
              ;;
            esac
          }
        fi
      fi

      for dlprefile in $dlprefiles; do
        func_verbose "extracting global C symbols from '$dlprefile'"
        func_basename "$dlprefile"
        name=$func_basename_result
        case $host in
        *cygwin* | *mingw* | *cegcc* )
          # if an import library, we need to obtain dlname
          if func_win32_import_lib_p "$dlprefile"; then
            func_tr_sh "$dlprefile"
            eval "curr_lafile=\$libfile_$func_tr_sh_result"
            dlprefile_dlbasename=
            if test -n "$curr_lafile" && func_lalib_p "$curr_lafile"; then
              # Use subshell, to avoid clobbering current variable values
              # NOTE(review): 'source' is not a POSIX sh builtin; this branch
              # is only reached on cygwin/mingw/cegcc hosts -- confirm the
              # shell used there provides it.
              dlprefile_dlname=`source "$curr_lafile" && echo "$dlname"`
              if test -n "$dlprefile_dlname"; then
                func_basename "$dlprefile_dlname"
                dlprefile_dlbasename=$func_basename_result
              else
                # no lafile. user explicitly requested -dlpreopen <import library>.
                $sharedlib_from_linklib_cmd "$dlprefile"
                dlprefile_dlbasename=$sharedlib_from_linklib_result
              fi
            fi
            $opt_dry_run || {
              if test -n "$dlprefile_dlbasename"; then
                eval '$ECHO ": $dlprefile_dlbasename" >> "$nlist"'
              else
                func_warning "Could not compute DLL name from $name"
                eval '$ECHO ": $name " >> "$nlist"'
              fi
              func_to_tool_file "$dlprefile" func_convert_file_msys_to_w32
              eval "$NM \"$func_to_tool_file_result\" 2>/dev/null | $global_symbol_pipe |
                $SED -e '/I __imp/d' -e 's/I __nm_/D /;s/_nm__//' >> '$nlist'"
            }
          else # not an import lib
            $opt_dry_run || {
              eval '$ECHO ": $name " >> "$nlist"'
              func_to_tool_file "$dlprefile" func_convert_file_msys_to_w32
              eval "$NM \"$func_to_tool_file_result\" 2>/dev/null | $global_symbol_pipe >> '$nlist'"
            }
          fi
          ;;
        *)
          $opt_dry_run || {
            eval '$ECHO ": $name " >> "$nlist"'
            func_to_tool_file "$dlprefile" func_convert_file_msys_to_w32
            eval "$NM \"$func_to_tool_file_result\" 2>/dev/null | $global_symbol_pipe >> '$nlist'"
          }
          ;;
        esac
      done

      $opt_dry_run || {
        # Make sure we have at least an empty file.
        test -f "$nlist" || : > "$nlist"

        if test -n "$exclude_expsyms"; then
          $EGREP -v " ($exclude_expsyms)$" "$nlist" > "$nlist"T
          $MV "$nlist"T "$nlist"
        fi

        # Try sorting and uniquifying the output.
        # The 'sort -k 3' probe reads from /dev/null and discards its output
        # so that it neither consumes the piped symbol list nor writes into
        # the result; only its exit status selects the sort syntax.
        if $GREP -v "^: " < "$nlist" |
            if sort -k 3 </dev/null >/dev/null 2>&1; then
              sort -k 3
            else
              sort +2
            fi |
            uniq > "$nlist"S; then
          :
        else
          $GREP -v "^: " < "$nlist" > "$nlist"S
        fi

        if test -f "$nlist"S; then
          eval "$global_symbol_to_cdecl"' < "$nlist"S >> "$output_objdir/$my_dlsyms"'
        else
          echo '/* NONE */' >> "$output_objdir/$my_dlsyms"
        fi

        func_show_eval '$RM "${nlist}I"'
        if test -n "$global_symbol_to_import"; then
          eval "$global_symbol_to_import"' < "$nlist"S > "$nlist"I'
        fi

        echo >> "$output_objdir/$my_dlsyms" "\
/* The mapping between symbol names and symbols. */
typedef struct {
  const char *name;
  void *address;
} lt_dlsymlist;
extern LT_DLSYM_CONST lt_dlsymlist
lt_${my_prefix}_LTX_preloaded_symbols[];\
"

        if test -s "$nlist"I; then
          echo >> "$output_objdir/$my_dlsyms" "\
static void lt_syminit(void)
{
  LT_DLSYM_CONST lt_dlsymlist *symbol = lt_${my_prefix}_LTX_preloaded_symbols;
  for (; symbol->name; ++symbol)
    {"
          $SED 's/.*/ if (STREQ (symbol->name, \"&\")) symbol->address = (void *) \&&;/' < "$nlist"I >> "$output_objdir/$my_dlsyms"
          echo >> "$output_objdir/$my_dlsyms" "\
    }
}"
        fi
        echo >> "$output_objdir/$my_dlsyms" "\
LT_DLSYM_CONST lt_dlsymlist
lt_${my_prefix}_LTX_preloaded_symbols[] =
{ {\"$my_originator\", (void *) 0},"

        if test -s "$nlist"I; then
          # The @INIT@ entry holds the address of the lt_syminit function
          # emitted above.
          echo >> "$output_objdir/$my_dlsyms" "\
  {\"@INIT@\", (void *) &lt_syminit},"
        fi

        case $need_lib_prefix in
        no)
          eval "$global_symbol_to_c_name_address" < "$nlist" >> "$output_objdir/$my_dlsyms"
          ;;
        *)
          eval "$global_symbol_to_c_name_address_lib_prefix" < "$nlist" >> "$output_objdir/$my_dlsyms"
          ;;
        esac
        echo >> "$output_objdir/$my_dlsyms" "\
  {0, (void *) 0}
};

/* This works around a problem in FreeBSD linker */
#ifdef FREEBSD_WORKAROUND
static const void *lt_preloaded_setup() {
  return lt_${my_prefix}_LTX_preloaded_symbols;
}
#endif

#ifdef __cplusplus
}
#endif\
"
      } # !$opt_dry_run

      pic_flag_for_symtable=
      case "$compile_command " in
      *" -static "*) ;;
      *)
        case $host in
        # compiling the symbol table file with pic_flag works around
        # a FreeBSD bug that causes programs to crash when -lm is
        # linked before any other PIC object. But we must not use
        # pic_flag when linking with -static. The problem exists in
        # FreeBSD 2.2.6 and is fixed in FreeBSD 3.1.
        *-*-freebsd2.*|*-*-freebsd3.0*|*-*-freebsdelf3.0*)
          pic_flag_for_symtable=" $pic_flag -DFREEBSD_WORKAROUND" ;;
        *-*-hpux*)
          pic_flag_for_symtable=" $pic_flag" ;;
        *)
          $my_pic_p && pic_flag_for_symtable=" $pic_flag"
          ;;
        esac
        ;;
      esac
      # Compile the table with $LTCFLAGS minus any PIE options.
      symtab_cflags=
      for arg in $LTCFLAGS; do
        case $arg in
        -pie | -fpie | -fPIE) ;;
        *) func_append symtab_cflags " $arg" ;;
        esac
      done

      # Now compile the dynamic symbol file.
      func_show_eval '(cd $output_objdir && $LTCC$symtab_cflags -c$no_builtin_flag$pic_flag_for_symtable "$my_dlsyms")' 'exit $?'

      # Clean up the generated files.
      func_show_eval '$RM "$output_objdir/$my_dlsyms" "$nlist" "${nlist}S" "${nlist}T" "${nlist}I"'

      # Transform the symbol file into the correct name.
      symfileobj=$output_objdir/${my_outputname}S.$objext
      case $host in
      *cygwin* | *mingw* | *cegcc* )
        if test -f "$output_objdir/$my_outputname.def"; then
          compile_command=`$ECHO "$compile_command" | $SED "s%@SYMFILE@%$output_objdir/$my_outputname.def $symfileobj%"`
          finalize_command=`$ECHO "$finalize_command" | $SED "s%@SYMFILE@%$output_objdir/$my_outputname.def $symfileobj%"`
        else
          compile_command=`$ECHO "$compile_command" | $SED "s%@SYMFILE@%$symfileobj%"`
          finalize_command=`$ECHO "$finalize_command" | $SED "s%@SYMFILE@%$symfileobj%"`
        fi
        ;;
      *)
        compile_command=`$ECHO "$compile_command" | $SED "s%@SYMFILE@%$symfileobj%"`
        finalize_command=`$ECHO "$finalize_command" | $SED "s%@SYMFILE@%$symfileobj%"`
        ;;
      esac
      ;;
    *)
      func_fatal_error "unknown suffix for '$my_dlsyms'"
      ;;
    esac
  else
    # We keep going just in case the user didn't refer to
    # lt_preloaded_symbols. The linker will fail if global_symbol_pipe
    # really was required.

    # Nullify the symbol file.
    compile_command=`$ECHO "$compile_command" | $SED "s% @SYMFILE@%%"`
    finalize_command=`$ECHO "$finalize_command" | $SED "s% @SYMFILE@%%"`
  fi
}

# func_cygming_gnu_implib_p ARG
# This predicate returns with zero status (TRUE) if
# ARG is a GNU/binutils-style import library. Returns
# with nonzero status (FALSE) otherwise.
func_cygming_gnu_implib_p ()
{
  $debug_cmd

  # True when the filtered symbol list of ARG contains a _head_* or
  # *_iname symbol matching the pattern below.
  func_to_tool_file "$1" func_convert_file_msys_to_w32
  func_cygming_gnu_implib_tmp=`$NM "$func_to_tool_file_result" | eval "$global_symbol_pipe" | $EGREP ' (_head_[A-Za-z0-9_]+_[ad]l*|[A-Za-z0-9_]+_[ad]l*_iname)$'`
  test -n "$func_cygming_gnu_implib_tmp"
}

# func_cygming_ms_implib_p ARG
# This predicate returns with zero status (TRUE) if
# ARG is an MS-style import library. Returns
# with nonzero status (FALSE) otherwise.
func_cygming_ms_implib_p ()
{
  $debug_cmd

  # True when the filtered symbol list of ARG mentions
  # _NULL_IMPORT_DESCRIPTOR.
  func_to_tool_file "$1" func_convert_file_msys_to_w32
  func_cygming_ms_implib_tmp=`$NM "$func_to_tool_file_result" | eval "$global_symbol_pipe" | $GREP '_NULL_IMPORT_DESCRIPTOR'`
  test -n "$func_cygming_ms_implib_tmp"
}

# func_win32_libid arg
# return the library type of file 'arg'
#
# Need a lot of goo to handle *both* DLLs and import libs
# Has to be a shell function in order to 'eat' the argument
# that is supplied when $file_magic_command is called.
# Despite the name, also deal with 64 bit binaries.
#
# Prints one of: "unknown", "x86 archive import", "x86 archive static",
# "x86 DLL".
func_win32_libid ()
{
  $debug_cmd

  win32_libid_type=unknown
  # ARG is quoted so that a path containing blanks or glob characters is
  # passed to file(1) and objdump as a single word.
  win32_fileres=`file -L "$1" 2>/dev/null`
  case $win32_fileres in
  *ar\ archive\ import\ library*) # definitely import
    win32_libid_type="x86 archive import"
    ;;
  *ar\ archive*) # could be an import, or static
    # Keep the egrep pattern in sync with the one in _LT_CHECK_MAGIC_METHOD.
    if eval $OBJDUMP -f \"\$1\" | $SED -e '10q' 2>/dev/null |
       $EGREP 'file format (pei*-i386(.*architecture: i386)?|pe-arm-wince|pe-x86-64)' >/dev/null; then
      case $nm_interface in
      "MS dumpbin")
        if func_cygming_ms_implib_p "$1" ||
           func_cygming_gnu_implib_p "$1"
        then
          win32_nmres=import
        else
          win32_nmres=
        fi
        ;;
      *)
        func_to_tool_file "$1" func_convert_file_msys_to_w32
        # Report "import" if an ' I ' symbol shows up within the first 100
        # lines of nm output.  The sed program needs one command per line.
        win32_nmres=`eval $NM -f posix -A \"$func_to_tool_file_result\" |
          $SED -n -e '
            1,100{
              / I /{
                s|.*|import|
                p
                q
              }
            }'`
        ;;
      esac
      case $win32_nmres in
      import*) win32_libid_type="x86 archive import";;
      *) win32_libid_type="x86 archive static";;
      esac
    fi
    ;;
  *DLL*)
    win32_libid_type="x86 DLL"
    ;;
  *executable*) # but shell scripts are "executable" too...
    case $win32_fileres in
    *MS\ Windows\ PE\ Intel*)
      win32_libid_type="x86 DLL"
      ;;
    esac
    ;;
  esac
  $ECHO "$win32_libid_type"
}

# func_cygming_dll_for_implib ARG
#
# Platform-specific function to extract the
# name of the DLL associated with the specified
# import library ARG.
# Invoked by eval'ing the libtool variable
# $sharedlib_from_linklib_cmd
# Result is available in the variable
# $sharedlib_from_linklib_result
func_cygming_dll_for_implib ()
{
  $debug_cmd

  sharedlib_from_linklib_result=`$DLLTOOL --identify-strict --identify "$1"`
}

# func_cygming_dll_for_implib_fallback_core SECTION_NAME LIBNAMEs
#
# The is the core of a fallback implementation of a
# platform-specific function to extract the name of the
# DLL associated with the specified import library LIBNAME.
#
# SECTION_NAME is either .idata$6 or .idata$7, depending
# on the platform and compiler that created the implib.
#
# Echos the name of the DLL associated with the
# specified import library.
func_cygming_dll_for_implib_fallback_core ()
{
  $debug_cmd

  # $1 is the section name, $2 the import library.  The sed programs below
  # contain comments and a label, so every command must sit on its own
  # line: on a single line the first '#' would swallow the rest.
  match_literal=`$ECHO "$1" | $SED "$sed_make_literal_regex"`
  $OBJDUMP -s --section "$1" "$2" 2>/dev/null |
    $SED '/^Contents of section '"$match_literal"':/{
      # Place marker at beginning of archive member dllname section
      s/.*/====MARK====/
      p
      d
    }
    # These lines can sometimes be longer than 43 characters, but
    # are always uninteresting
    /:[ ]*file format pe[i]\{,1\}-/d
    /^In archive [^:]*:/d
    # Ensure marker is printed
    /^====MARK====/p
    # Remove all lines with less than 43 characters
    /^.\{43\}/!d
    # From remaining lines, remove first 43 characters
    s/^.\{43\}//' |
    $SED -n '
      # Join marker and all lines until next marker into a single line
      /^====MARK====/ b para
      H
      $ b para
      b
      :para
      x
      s/\n//g
      # Remove the marker
      s/^====MARK====//
      # Remove trailing dots and whitespace
      s/[\. \t]*$//
      # Print
      /./p' |
    # we now have a list, one entry per line, of the stringified
    # contents of the appropriate section of all members of the
    # archive that possess that section. Heuristic: eliminate
    # all those that have a first or second character that is
    # a '.' (that is, objdump's representation of an unprintable
    # character.) This should work for all archives with less than
    # 0x302f exports -- but will fail for DLLs whose name actually
    # begins with a literal '.' or a single character followed by
    # a '.'.
    #
    # Of those that remain, print the first one.
    $SED -e '/^\./d;/^.\./d;q'
}

# func_cygming_dll_for_implib_fallback ARG
# Platform-specific function to extract the
# name of the DLL associated with the specified
# import library ARG.
#
# This fallback implementation is for use when $DLLTOOL
# does not support the --identify-strict option.
# Invoked by eval'ing the libtool variable # $sharedlib_from_linklib_cmd # Result is available in the variable # $sharedlib_from_linklib_result func_cygming_dll_for_implib_fallback () { $debug_cmd if func_cygming_gnu_implib_p "$1"; then # binutils import library sharedlib_from_linklib_result=`func_cygming_dll_for_implib_fallback_core '.idata$7' "$1"` elif func_cygming_ms_implib_p "$1"; then # ms-generated import library sharedlib_from_linklib_result=`func_cygming_dll_for_implib_fallback_core '.idata$6' "$1"` else # unknown sharedlib_from_linklib_result= fi } # func_extract_an_archive dir oldlib func_extract_an_archive () { $debug_cmd f_ex_an_ar_dir=$1; shift f_ex_an_ar_oldlib=$1 if test yes = "$lock_old_archive_extraction"; then lockfile=$f_ex_an_ar_oldlib.lock until $opt_dry_run || ln "$progpath" "$lockfile" 2>/dev/null; do func_echo "Waiting for $lockfile to be removed" sleep 2 done fi func_show_eval "(cd \$f_ex_an_ar_dir && $AR x \"\$f_ex_an_ar_oldlib\")" \ 'stat=$?; rm -f "$lockfile"; exit $stat' if test yes = "$lock_old_archive_extraction"; then $opt_dry_run || rm -f "$lockfile" fi if ($AR t "$f_ex_an_ar_oldlib" | sort | sort -uc >/dev/null 2>&1); then : else func_fatal_error "object name conflicts in archive: $f_ex_an_ar_dir/$f_ex_an_ar_oldlib" fi } # func_extract_archives gentop oldlib ... func_extract_archives () { $debug_cmd my_gentop=$1; shift my_oldlibs=${1+"$@"} my_oldobjs= my_xlib= my_xabs= my_xdir= for my_xlib in $my_oldlibs; do # Extract the objects. 
case $my_xlib in [\\/]* | [A-Za-z]:[\\/]*) my_xabs=$my_xlib ;; *) my_xabs=`pwd`"/$my_xlib" ;; esac func_basename "$my_xlib" my_xlib=$func_basename_result my_xlib_u=$my_xlib while :; do case " $extracted_archives " in *" $my_xlib_u "*) func_arith $extracted_serial + 1 extracted_serial=$func_arith_result my_xlib_u=lt$extracted_serial-$my_xlib ;; *) break ;; esac done extracted_archives="$extracted_archives $my_xlib_u" my_xdir=$my_gentop/$my_xlib_u func_mkdir_p "$my_xdir" case $host in *-darwin*) func_verbose "Extracting $my_xabs" # Do not bother doing anything if just a dry run $opt_dry_run || { darwin_orig_dir=`pwd` cd $my_xdir || exit $? darwin_archive=$my_xabs darwin_curdir=`pwd` func_basename "$darwin_archive" darwin_base_archive=$func_basename_result darwin_arches=`$LIPO -info "$darwin_archive" 2>/dev/null | $GREP Architectures 2>/dev/null || true` if test -n "$darwin_arches"; then darwin_arches=`$ECHO "$darwin_arches" | $SED -e 's/.*are://'` darwin_arch= func_verbose "$darwin_base_archive has multiple architectures $darwin_arches" for darwin_arch in $darwin_arches; do func_mkdir_p "unfat-$$/$darwin_base_archive-$darwin_arch" $LIPO -thin $darwin_arch -output "unfat-$$/$darwin_base_archive-$darwin_arch/$darwin_base_archive" "$darwin_archive" cd "unfat-$$/$darwin_base_archive-$darwin_arch" func_extract_an_archive "`pwd`" "$darwin_base_archive" cd "$darwin_curdir" $RM "unfat-$$/$darwin_base_archive-$darwin_arch/$darwin_base_archive" done # $darwin_arches ## Okay now we've a bunch of thin objects, gotta fatten them up :) darwin_filelist=`find unfat-$$ -type f -name \*.o -print -o -name \*.lo -print | $SED -e "$sed_basename" | sort -u` darwin_file= darwin_files= for darwin_file in $darwin_filelist; do darwin_files=`find unfat-$$ -name $darwin_file -print | sort | $NL2SP` $LIPO -create -output "$darwin_file" $darwin_files done # $darwin_filelist $RM -rf unfat-$$ cd "$darwin_orig_dir" else cd $darwin_orig_dir func_extract_an_archive "$my_xdir" "$my_xabs" fi # 
$darwin_arches } # !$opt_dry_run ;; *) func_extract_an_archive "$my_xdir" "$my_xabs" ;; esac my_oldobjs="$my_oldobjs "`find $my_xdir -name \*.$objext -print -o -name \*.lo -print | sort | $NL2SP` done func_extract_archives_result=$my_oldobjs } # func_emit_wrapper [arg=no] # # Emit a libtool wrapper script on stdout. # Don't directly open a file because we may want to # incorporate the script contents within a cygwin/mingw # wrapper executable. Must ONLY be called from within # func_mode_link because it depends on a number of variables # set therein. # # ARG is the value that the WRAPPER_SCRIPT_BELONGS_IN_OBJDIR # variable will take. If 'yes', then the emitted script # will assume that the directory where it is stored is # the $objdir directory. This is a cygwin/mingw-specific # behavior. func_emit_wrapper () { func_emit_wrapper_arg1=${1-no} $ECHO "\ #! $SHELL # $output - temporary wrapper script for $objdir/$outputname # Generated by $PROGRAM (GNU $PACKAGE) $VERSION # # The $output program cannot be directly executed until all the libtool # libraries that it depends on are installed. # # This wrapper script should never be moved out of the build directory. # If it is, it will not operate correctly. # Sed substitution that helps us do robust quoting. It backslashifies # metacharacters that are still active within double-quoted strings. sed_quote_subst='$sed_quote_subst' # Be Bourne compatible if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then emulate sh NULLCMD=: # Zsh 3.x and 4.x performs word splitting on \${1+\"\$@\"}, which # is contrary to our usage. Disable this feature. alias -g '\${1+\"\$@\"}'='\"\$@\"' setopt NO_GLOB_SUBST else case \`(set -o) 2>/dev/null\` in *posix*) set -o posix;; esac fi BIN_SH=xpg4; export BIN_SH # for Tru64 DUALCASE=1; export DUALCASE # for MKS sh # The HP-UX ksh and POSIX shell print the target directory to stdout # if CDPATH is set. 
(unset CDPATH) >/dev/null 2>&1 && unset CDPATH relink_command=\"$relink_command\" # This environment variable determines our operation mode. if test \"\$libtool_install_magic\" = \"$magic\"; then # install mode needs the following variables: generated_by_libtool_version='$macro_version' notinst_deplibs='$notinst_deplibs' else # When we are sourced in execute mode, \$file and \$ECHO are already set. if test \"\$libtool_execute_magic\" != \"$magic\"; then file=\"\$0\"" qECHO=`$ECHO "$ECHO" | $SED "$sed_quote_subst"` $ECHO "\ # A function that is used when there is no print builtin or printf. func_fallback_echo () { eval 'cat <<_LTECHO_EOF \$1 _LTECHO_EOF' } ECHO=\"$qECHO\" fi # Very basic option parsing. These options are (a) specific to # the libtool wrapper, (b) are identical between the wrapper # /script/ and the wrapper /executable/ that is used only on # windows platforms, and (c) all begin with the string "--lt-" # (application programs are unlikely to have options that match # this pattern). # # There are only two supported options: --lt-debug and # --lt-dump-script. There is, deliberately, no --lt-help. # # The first argument to this parsing function should be the # script's $0 value, followed by "$@". lt_option_debug= func_parse_lt_options () { lt_script_arg0=\$0 shift for lt_opt do case \"\$lt_opt\" in --lt-debug) lt_option_debug=1 ;; --lt-dump-script) lt_dump_D=\`\$ECHO \"X\$lt_script_arg0\" | $SED -e 's/^X//' -e 's%/[^/]*$%%'\` test \"X\$lt_dump_D\" = \"X\$lt_script_arg0\" && lt_dump_D=. lt_dump_F=\`\$ECHO \"X\$lt_script_arg0\" | $SED -e 's/^X//' -e 's%^.*/%%'\` cat \"\$lt_dump_D/\$lt_dump_F\" exit 0 ;; --lt-*) \$ECHO \"Unrecognized --lt- option: '\$lt_opt'\" 1>&2 exit 1 ;; esac done # Print the debug banner immediately: if test -n \"\$lt_option_debug\"; then echo \"$outputname:$output:\$LINENO: libtool wrapper (GNU $PACKAGE) $VERSION\" 1>&2 fi } # Used when --lt-debug. 
Prints its arguments to stdout # (redirection is the responsibility of the caller) func_lt_dump_args () { lt_dump_args_N=1; for lt_arg do \$ECHO \"$outputname:$output:\$LINENO: newargv[\$lt_dump_args_N]: \$lt_arg\" lt_dump_args_N=\`expr \$lt_dump_args_N + 1\` done } # Core function for launching the target application func_exec_program_core () { " case $host in # Backslashes separate directories on plain windows *-*-mingw | *-*-os2* | *-cegcc*) $ECHO "\ if test -n \"\$lt_option_debug\"; then \$ECHO \"$outputname:$output:\$LINENO: newargv[0]: \$progdir\\\\\$program\" 1>&2 func_lt_dump_args \${1+\"\$@\"} 1>&2 fi exec \"\$progdir\\\\\$program\" \${1+\"\$@\"} " ;; *) $ECHO "\ if test -n \"\$lt_option_debug\"; then \$ECHO \"$outputname:$output:\$LINENO: newargv[0]: \$progdir/\$program\" 1>&2 func_lt_dump_args \${1+\"\$@\"} 1>&2 fi exec \"\$progdir/\$program\" \${1+\"\$@\"} " ;; esac $ECHO "\ \$ECHO \"\$0: cannot exec \$program \$*\" 1>&2 exit 1 } # A function to encapsulate launching the target application # Strips options in the --lt-* namespace from \$@ and # launches target application with the remaining arguments. func_exec_program () { case \" \$* \" in *\\ --lt-*) for lt_wr_arg do case \$lt_wr_arg in --lt-*) ;; *) set x \"\$@\" \"\$lt_wr_arg\"; shift;; esac shift done ;; esac func_exec_program_core \${1+\"\$@\"} } # Parse options func_parse_lt_options \"\$0\" \${1+\"\$@\"} # Find the directory that this script lives in. thisdir=\`\$ECHO \"\$file\" | $SED 's%/[^/]*$%%'\` test \"x\$thisdir\" = \"x\$file\" && thisdir=. # Follow symbolic links until we get to the real thisdir. file=\`ls -ld \"\$file\" | $SED -n 's/.*-> //p'\` while test -n \"\$file\"; do destdir=\`\$ECHO \"\$file\" | $SED 's%/[^/]*\$%%'\` # If there was a directory component, then change thisdir. 
if test \"x\$destdir\" != \"x\$file\"; then case \"\$destdir\" in [\\\\/]* | [A-Za-z]:[\\\\/]*) thisdir=\"\$destdir\" ;; *) thisdir=\"\$thisdir/\$destdir\" ;; esac fi file=\`\$ECHO \"\$file\" | $SED 's%^.*/%%'\` file=\`ls -ld \"\$thisdir/\$file\" | $SED -n 's/.*-> //p'\` done # Usually 'no', except on cygwin/mingw when embedded into # the cwrapper. WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=$func_emit_wrapper_arg1 if test \"\$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR\" = \"yes\"; then # special case for '.' if test \"\$thisdir\" = \".\"; then thisdir=\`pwd\` fi # remove .libs from thisdir case \"\$thisdir\" in *[\\\\/]$objdir ) thisdir=\`\$ECHO \"\$thisdir\" | $SED 's%[\\\\/][^\\\\/]*$%%'\` ;; $objdir ) thisdir=. ;; esac fi # Try to get the absolute directory name. absdir=\`cd \"\$thisdir\" && pwd\` test -n \"\$absdir\" && thisdir=\"\$absdir\" " if test yes = "$fast_install"; then $ECHO "\ program=lt-'$outputname'$exeext progdir=\"\$thisdir/$objdir\" if test ! -f \"\$progdir/\$program\" || { file=\`ls -1dt \"\$progdir/\$program\" \"\$progdir/../\$program\" 2>/dev/null | $SED 1q\`; \\ test \"X\$file\" != \"X\$progdir/\$program\"; }; then file=\"\$\$-\$program\" if test ! -d \"\$progdir\"; then $MKDIR \"\$progdir\" else $RM \"\$progdir/\$file\" fi" $ECHO "\ # relink executable if necessary if test -n \"\$relink_command\"; then if relink_command_output=\`eval \$relink_command 2>&1\`; then : else \$ECHO \"\$relink_command_output\" >&2 $RM \"\$progdir/\$file\" exit 1 fi fi $MV \"\$progdir/\$file\" \"\$progdir/\$program\" 2>/dev/null || { $RM \"\$progdir/\$program\"; $MV \"\$progdir/\$file\" \"\$progdir/\$program\"; } $RM \"\$progdir/\$file\" fi" else $ECHO "\ program='$outputname' progdir=\"\$thisdir/$objdir\" " fi $ECHO "\ if test -f \"\$progdir/\$program\"; then" # fixup the dll searchpath if we need to. # # Fix the DLL searchpath if we need to. Do this before prepending # to shlibpath, because on Windows, both are PATH and uninstalled # libraries must come first. 
if test -n "$dllsearchpath"; then $ECHO "\ # Add the dll search path components to the executable PATH PATH=$dllsearchpath:\$PATH " fi # Export our shlibpath_var if we have one. if test yes = "$shlibpath_overrides_runpath" && test -n "$shlibpath_var" && test -n "$temp_rpath"; then $ECHO "\ # Add our own library path to $shlibpath_var $shlibpath_var=\"$temp_rpath\$$shlibpath_var\" # Some systems cannot cope with colon-terminated $shlibpath_var # The second colon is a workaround for a bug in BeOS R4 sed $shlibpath_var=\`\$ECHO \"\$$shlibpath_var\" | $SED 's/::*\$//'\` export $shlibpath_var " fi $ECHO "\ if test \"\$libtool_execute_magic\" != \"$magic\"; then # Run the actual program with our arguments. func_exec_program \${1+\"\$@\"} fi else # The program doesn't exist. \$ECHO \"\$0: error: '\$progdir/\$program' does not exist\" 1>&2 \$ECHO \"This script is just a wrapper for \$program.\" 1>&2 \$ECHO \"See the $PACKAGE documentation for more information.\" 1>&2 exit 1 fi fi\ " } # func_emit_cwrapperexe_src # emit the source code for a wrapper executable on stdout # Must ONLY be called from within func_mode_link because # it depends on a number of variable set therein. func_emit_cwrapperexe_src () { cat < #include #ifdef _MSC_VER # include # include # include #else # include # include # ifdef __CYGWIN__ # include # endif #endif #include #include #include #include #include #include #include #include #define STREQ(s1, s2) (strcmp ((s1), (s2)) == 0) /* declarations of non-ANSI functions */ #if defined __MINGW32__ # ifdef __STRICT_ANSI__ int _putenv (const char *); # endif #elif defined __CYGWIN__ # ifdef __STRICT_ANSI__ char *realpath (const char *, char *); int putenv (char *); int setenv (const char *, const char *, int); # endif /* #elif defined other_platform || defined ... 
*/ #endif /* portability defines, excluding path handling macros */ #if defined _MSC_VER # define setmode _setmode # define stat _stat # define chmod _chmod # define getcwd _getcwd # define putenv _putenv # define S_IXUSR _S_IEXEC #elif defined __MINGW32__ # define setmode _setmode # define stat _stat # define chmod _chmod # define getcwd _getcwd # define putenv _putenv #elif defined __CYGWIN__ # define HAVE_SETENV # define FOPEN_WB "wb" /* #elif defined other platforms ... */ #endif #if defined PATH_MAX # define LT_PATHMAX PATH_MAX #elif defined MAXPATHLEN # define LT_PATHMAX MAXPATHLEN #else # define LT_PATHMAX 1024 #endif #ifndef S_IXOTH # define S_IXOTH 0 #endif #ifndef S_IXGRP # define S_IXGRP 0 #endif /* path handling portability macros */ #ifndef DIR_SEPARATOR # define DIR_SEPARATOR '/' # define PATH_SEPARATOR ':' #endif #if defined _WIN32 || defined __MSDOS__ || defined __DJGPP__ || \ defined __OS2__ # define HAVE_DOS_BASED_FILE_SYSTEM # define FOPEN_WB "wb" # ifndef DIR_SEPARATOR_2 # define DIR_SEPARATOR_2 '\\' # endif # ifndef PATH_SEPARATOR_2 # define PATH_SEPARATOR_2 ';' # endif #endif #ifndef DIR_SEPARATOR_2 # define IS_DIR_SEPARATOR(ch) ((ch) == DIR_SEPARATOR) #else /* DIR_SEPARATOR_2 */ # define IS_DIR_SEPARATOR(ch) \ (((ch) == DIR_SEPARATOR) || ((ch) == DIR_SEPARATOR_2)) #endif /* DIR_SEPARATOR_2 */ #ifndef PATH_SEPARATOR_2 # define IS_PATH_SEPARATOR(ch) ((ch) == PATH_SEPARATOR) #else /* PATH_SEPARATOR_2 */ # define IS_PATH_SEPARATOR(ch) ((ch) == PATH_SEPARATOR_2) #endif /* PATH_SEPARATOR_2 */ #ifndef FOPEN_WB # define FOPEN_WB "w" #endif #ifndef _O_BINARY # define _O_BINARY 0 #endif #define XMALLOC(type, num) ((type *) xmalloc ((num) * sizeof(type))) #define XFREE(stale) do { \ if (stale) { free (stale); stale = 0; } \ } while (0) #if defined LT_DEBUGWRAPPER static int lt_debug = 1; #else static int lt_debug = 0; #endif const char *program_name = "libtool-wrapper"; /* in case xstrdup fails */ void *xmalloc (size_t num); char *xstrdup (const char 
*string); const char *base_name (const char *name); char *find_executable (const char *wrapper); char *chase_symlinks (const char *pathspec); int make_executable (const char *path); int check_executable (const char *path); char *strendzap (char *str, const char *pat); void lt_debugprintf (const char *file, int line, const char *fmt, ...); void lt_fatal (const char *file, int line, const char *message, ...); static const char *nonnull (const char *s); static const char *nonempty (const char *s); void lt_setenv (const char *name, const char *value); char *lt_extend_str (const char *orig_value, const char *add, int to_end); void lt_update_exe_path (const char *name, const char *value); void lt_update_lib_path (const char *name, const char *value); char **prepare_spawn (char **argv); void lt_dump_script (FILE *f); EOF cat <= 0) && (st.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) return 1; else return 0; } int make_executable (const char *path) { int rval = 0; struct stat st; lt_debugprintf (__FILE__, __LINE__, "(make_executable): %s\n", nonempty (path)); if ((!path) || (!*path)) return 0; if (stat (path, &st) >= 0) { rval = chmod (path, st.st_mode | S_IXOTH | S_IXGRP | S_IXUSR); } return rval; } /* Searches for the full path of the wrapper. Returns newly allocated full path name if found, NULL otherwise Does not chase symlinks, even on platforms that support them. */ char * find_executable (const char *wrapper) { int has_slash = 0; const char *p; const char *p_next; /* static buffer for getcwd */ char tmp[LT_PATHMAX + 1]; size_t tmp_len; char *concat_name; lt_debugprintf (__FILE__, __LINE__, "(find_executable): %s\n", nonempty (wrapper)); if ((wrapper == NULL) || (*wrapper == '\0')) return NULL; /* Absolute path? 
*/ #if defined HAVE_DOS_BASED_FILE_SYSTEM if (isalpha ((unsigned char) wrapper[0]) && wrapper[1] == ':') { concat_name = xstrdup (wrapper); if (check_executable (concat_name)) return concat_name; XFREE (concat_name); } else { #endif if (IS_DIR_SEPARATOR (wrapper[0])) { concat_name = xstrdup (wrapper); if (check_executable (concat_name)) return concat_name; XFREE (concat_name); } #if defined HAVE_DOS_BASED_FILE_SYSTEM } #endif for (p = wrapper; *p; p++) if (*p == '/') { has_slash = 1; break; } if (!has_slash) { /* no slashes; search PATH */ const char *path = getenv ("PATH"); if (path != NULL) { for (p = path; *p; p = p_next) { const char *q; size_t p_len; for (q = p; *q; q++) if (IS_PATH_SEPARATOR (*q)) break; p_len = (size_t) (q - p); p_next = (*q == '\0' ? q : q + 1); if (p_len == 0) { /* empty path: current directory */ if (getcwd (tmp, LT_PATHMAX) == NULL) lt_fatal (__FILE__, __LINE__, "getcwd failed: %s", nonnull (strerror (errno))); tmp_len = strlen (tmp); concat_name = XMALLOC (char, tmp_len + 1 + strlen (wrapper) + 1); memcpy (concat_name, tmp, tmp_len); concat_name[tmp_len] = '/'; strcpy (concat_name + tmp_len + 1, wrapper); } else { concat_name = XMALLOC (char, p_len + 1 + strlen (wrapper) + 1); memcpy (concat_name, p, p_len); concat_name[p_len] = '/'; strcpy (concat_name + p_len + 1, wrapper); } if (check_executable (concat_name)) return concat_name; XFREE (concat_name); } } /* not found in PATH; assume curdir */ } /* Relative path | not found in path: prepend cwd */ if (getcwd (tmp, LT_PATHMAX) == NULL) lt_fatal (__FILE__, __LINE__, "getcwd failed: %s", nonnull (strerror (errno))); tmp_len = strlen (tmp); concat_name = XMALLOC (char, tmp_len + 1 + strlen (wrapper) + 1); memcpy (concat_name, tmp, tmp_len); concat_name[tmp_len] = '/'; strcpy (concat_name + tmp_len + 1, wrapper); if (check_executable (concat_name)) return concat_name; XFREE (concat_name); return NULL; } char * chase_symlinks (const char *pathspec) { #ifndef S_ISLNK return xstrdup 
(pathspec); #else char buf[LT_PATHMAX]; struct stat s; char *tmp_pathspec = xstrdup (pathspec); char *p; int has_symlinks = 0; while (strlen (tmp_pathspec) && !has_symlinks) { lt_debugprintf (__FILE__, __LINE__, "checking path component for symlinks: %s\n", tmp_pathspec); if (lstat (tmp_pathspec, &s) == 0) { if (S_ISLNK (s.st_mode) != 0) { has_symlinks = 1; break; } /* search backwards for last DIR_SEPARATOR */ p = tmp_pathspec + strlen (tmp_pathspec) - 1; while ((p > tmp_pathspec) && (!IS_DIR_SEPARATOR (*p))) p--; if ((p == tmp_pathspec) && (!IS_DIR_SEPARATOR (*p))) { /* no more DIR_SEPARATORS left */ break; } *p = '\0'; } else { lt_fatal (__FILE__, __LINE__, "error accessing file \"%s\": %s", tmp_pathspec, nonnull (strerror (errno))); } } XFREE (tmp_pathspec); if (!has_symlinks) { return xstrdup (pathspec); } tmp_pathspec = realpath (pathspec, buf); if (tmp_pathspec == 0) { lt_fatal (__FILE__, __LINE__, "could not follow symlinks for %s", pathspec); } return xstrdup (tmp_pathspec); #endif } char * strendzap (char *str, const char *pat) { size_t len, patlen; assert (str != NULL); assert (pat != NULL); len = strlen (str); patlen = strlen (pat); if (patlen <= len) { str += len - patlen; if (STREQ (str, pat)) *str = '\0'; } return str; } void lt_debugprintf (const char *file, int line, const char *fmt, ...) { va_list args; if (lt_debug) { (void) fprintf (stderr, "%s:%s:%d: ", program_name, file, line); va_start (args, fmt); (void) vfprintf (stderr, fmt, args); va_end (args); } } static void lt_error_core (int exit_status, const char *file, int line, const char *mode, const char *message, va_list ap) { fprintf (stderr, "%s:%s:%d: %s: ", program_name, file, line, mode); vfprintf (stderr, message, ap); fprintf (stderr, ".\n"); if (exit_status >= 0) exit (exit_status); } void lt_fatal (const char *file, int line, const char *message, ...) 
{ va_list ap; va_start (ap, message); lt_error_core (EXIT_FAILURE, file, line, "FATAL", message, ap); va_end (ap); } static const char * nonnull (const char *s) { return s ? s : "(null)"; } static const char * nonempty (const char *s) { return (s && !*s) ? "(empty)" : nonnull (s); } void lt_setenv (const char *name, const char *value) { lt_debugprintf (__FILE__, __LINE__, "(lt_setenv) setting '%s' to '%s'\n", nonnull (name), nonnull (value)); { #ifdef HAVE_SETENV /* always make a copy, for consistency with !HAVE_SETENV */ char *str = xstrdup (value); setenv (name, str, 1); #else size_t len = strlen (name) + 1 + strlen (value) + 1; char *str = XMALLOC (char, len); sprintf (str, "%s=%s", name, value); if (putenv (str) != EXIT_SUCCESS) { XFREE (str); } #endif } } char * lt_extend_str (const char *orig_value, const char *add, int to_end) { char *new_value; if (orig_value && *orig_value) { size_t orig_value_len = strlen (orig_value); size_t add_len = strlen (add); new_value = XMALLOC (char, add_len + orig_value_len + 1); if (to_end) { strcpy (new_value, orig_value); strcpy (new_value + orig_value_len, add); } else { strcpy (new_value, add); strcpy (new_value + add_len, orig_value); } } else { new_value = xstrdup (add); } return new_value; } void lt_update_exe_path (const char *name, const char *value) { lt_debugprintf (__FILE__, __LINE__, "(lt_update_exe_path) modifying '%s' by prepending '%s'\n", nonnull (name), nonnull (value)); if (name && *name && value && *value) { char *new_value = lt_extend_str (getenv (name), value, 0); /* some systems can't cope with a ':'-terminated path #' */ size_t len = strlen (new_value); while ((len > 0) && IS_PATH_SEPARATOR (new_value[len-1])) { new_value[--len] = '\0'; } lt_setenv (name, new_value); XFREE (new_value); } } void lt_update_lib_path (const char *name, const char *value) { lt_debugprintf (__FILE__, __LINE__, "(lt_update_lib_path) modifying '%s' by prepending '%s'\n", nonnull (name), nonnull (value)); if (name && *name && 
value && *value) { char *new_value = lt_extend_str (getenv (name), value, 0); lt_setenv (name, new_value); XFREE (new_value); } } EOF case $host_os in mingw*) cat <<"EOF" /* Prepares an argument vector before calling spawn(). Note that spawn() does not by itself call the command interpreter (getenv ("COMSPEC") != NULL ? getenv ("COMSPEC") : ({ OSVERSIONINFO v; v.dwOSVersionInfoSize = sizeof(OSVERSIONINFO); GetVersionEx(&v); v.dwPlatformId == VER_PLATFORM_WIN32_NT; }) ? "cmd.exe" : "command.com"). Instead it simply concatenates the arguments, separated by ' ', and calls CreateProcess(). We must quote the arguments since Win32 CreateProcess() interprets characters like ' ', '\t', '\\', '"' (but not '<' and '>') in a special way: - Space and tab are interpreted as delimiters. They are not treated as delimiters if they are surrounded by double quotes: "...". - Unescaped double quotes are removed from the input. Their only effect is that within double quotes, space and tab are treated like normal characters. - Backslashes not followed by double quotes are not special. - But 2*n+1 backslashes followed by a double quote become n backslashes followed by a double quote (n >= 0): \" -> " \\\" -> \" \\\\\" -> \\" */ #define SHELL_SPECIAL_CHARS "\"\\ \001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037" #define SHELL_SPACE_CHARS " \001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037" char ** prepare_spawn (char **argv) { size_t argc; char **new_argv; size_t i; /* Count number of arguments. */ for (argc = 0; argv[argc] != NULL; argc++) ; /* Allocate new argument vector. */ new_argv = XMALLOC (char *, argc + 1); /* Put quoted arguments into the new argument vector. 
*/ for (i = 0; i < argc; i++) { const char *string = argv[i]; if (string[0] == '\0') new_argv[i] = xstrdup ("\"\""); else if (strpbrk (string, SHELL_SPECIAL_CHARS) != NULL) { int quote_around = (strpbrk (string, SHELL_SPACE_CHARS) != NULL); size_t length; unsigned int backslashes; const char *s; char *quoted_string; char *p; length = 0; backslashes = 0; if (quote_around) length++; for (s = string; *s != '\0'; s++) { char c = *s; if (c == '"') length += backslashes + 1; length++; if (c == '\\') backslashes++; else backslashes = 0; } if (quote_around) length += backslashes + 1; quoted_string = XMALLOC (char, length + 1); p = quoted_string; backslashes = 0; if (quote_around) *p++ = '"'; for (s = string; *s != '\0'; s++) { char c = *s; if (c == '"') { unsigned int j; for (j = backslashes + 1; j > 0; j--) *p++ = '\\'; } *p++ = c; if (c == '\\') backslashes++; else backslashes = 0; } if (quote_around) { unsigned int j; for (j = backslashes; j > 0; j--) *p++ = '\\'; *p++ = '"'; } *p = '\0'; new_argv[i] = quoted_string; } else new_argv[i] = (char *) string; } new_argv[argc] = NULL; return new_argv; } EOF ;; esac cat <<"EOF" void lt_dump_script (FILE* f) { EOF func_emit_wrapper yes | $SED -n -e ' s/^\(.\{79\}\)\(..*\)/\1\ \2/ h s/\([\\"]\)/\\\1/g s/$/\\n/ s/\([^\n]*\).*/ fputs ("\1", f);/p g D' cat <<"EOF" } EOF } # end: func_emit_cwrapperexe_src # func_win32_import_lib_p ARG # True if ARG is an import lib, as indicated by $file_magic_cmd func_win32_import_lib_p () { $debug_cmd case `eval $file_magic_cmd \"\$1\" 2>/dev/null | $SED -e 10q` in *import*) : ;; *) false ;; esac } # func_suncc_cstd_abi # !!ONLY CALL THIS FOR SUN CC AFTER $compile_command IS FULLY EXPANDED!! # Several compiler flags select an ABI that is incompatible with the # Cstd library. Avoid specifying it if any are in CXXFLAGS. 
func_suncc_cstd_abi () { $debug_cmd case " $compile_command " in *" -compat=g "*|*\ -std=c++[0-9][0-9]\ *|*" -library=stdcxx4 "*|*" -library=stlport4 "*) suncc_use_cstd_abi=no ;; *) suncc_use_cstd_abi=yes ;; esac } # func_mode_link arg... func_mode_link () { $debug_cmd case $host in *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-cegcc*) # It is impossible to link a dll without this setting, and # we shouldn't force the makefile maintainer to figure out # what system we are compiling for in order to pass an extra # flag for every libtool invocation. # allow_undefined=no # FIXME: Unfortunately, there are problems with the above when trying # to make a dll that has undefined symbols, in which case not # even a static library is built. For now, we need to specify # -no-undefined on the libtool link line when we can be certain # that all symbols are satisfied, otherwise we get a static library. allow_undefined=yes ;; *) allow_undefined=yes ;; esac libtool_args=$nonopt base_compile="$nonopt $@" compile_command=$nonopt finalize_command=$nonopt compile_rpath= finalize_rpath= compile_shlibpath= finalize_shlibpath= convenience= old_convenience= deplibs= old_deplibs= compiler_flags= linker_flags= dllsearchpath= lib_search_path=`pwd` inst_prefix_dir= new_inherited_linker_flags= avoid_version=no bindir= dlfiles= dlprefiles= dlself=no export_dynamic=no export_symbols= export_symbols_regex= generated= libobjs= ltlibs= module=no no_install=no objs= os2dllname= non_pic_objects= precious_files_regex= prefer_static_libs=no preload=false prev= prevarg= release= rpath= xrpath= perm_rpath= temp_rpath= thread_safe=no vinfo= vinfo_number=no weak_libs= single_module=$wl-single_module func_infer_tag $base_compile # We need to know -static, to get the right output filenames. 
for arg do case $arg in -shared) test yes != "$build_libtool_libs" \ && func_fatal_configuration "cannot build a shared library" build_old_libs=no break ;; -all-static | -static | -static-libtool-libs) case $arg in -all-static) if test yes = "$build_libtool_libs" && test -z "$link_static_flag"; then func_warning "complete static linking is impossible in this configuration" fi if test -n "$link_static_flag"; then dlopen_self=$dlopen_self_static fi prefer_static_libs=yes ;; -static) if test -z "$pic_flag" && test -n "$link_static_flag"; then dlopen_self=$dlopen_self_static fi prefer_static_libs=built ;; -static-libtool-libs) if test -z "$pic_flag" && test -n "$link_static_flag"; then dlopen_self=$dlopen_self_static fi prefer_static_libs=yes ;; esac build_libtool_libs=no build_old_libs=yes break ;; esac done # See if our shared archives depend on static archives. test -n "$old_archive_from_new_cmds" && build_old_libs=yes # Go through the arguments, transforming them on the way. while test "$#" -gt 0; do arg=$1 shift func_quote_for_eval "$arg" qarg=$func_quote_for_eval_unquoted_result func_append libtool_args " $func_quote_for_eval_result" # If the previous option needs an argument, assign it. if test -n "$prev"; then case $prev in output) func_append compile_command " @OUTPUT@" func_append finalize_command " @OUTPUT@" ;; esac case $prev in bindir) bindir=$arg prev= continue ;; dlfiles|dlprefiles) $preload || { # Add the symbol object into the linking commands. func_append compile_command " @SYMFILE@" func_append finalize_command " @SYMFILE@" preload=: } case $arg in *.la | *.lo) ;; # We handle these cases below. 
force) if test no = "$dlself"; then dlself=needless export_dynamic=yes fi prev= continue ;; self) if test dlprefiles = "$prev"; then dlself=yes elif test dlfiles = "$prev" && test yes != "$dlopen_self"; then dlself=yes else dlself=needless export_dynamic=yes fi prev= continue ;; *) if test dlfiles = "$prev"; then func_append dlfiles " $arg" else func_append dlprefiles " $arg" fi prev= continue ;; esac ;; expsyms) export_symbols=$arg test -f "$arg" \ || func_fatal_error "symbol file '$arg' does not exist" prev= continue ;; expsyms_regex) export_symbols_regex=$arg prev= continue ;; framework) case $host in *-*-darwin*) case "$deplibs " in *" $qarg.ltframework "*) ;; *) func_append deplibs " $qarg.ltframework" # this is fixed later ;; esac ;; esac prev= continue ;; inst_prefix) inst_prefix_dir=$arg prev= continue ;; mllvm) # Clang does not use LLVM to link, so we can simply discard any # '-mllvm $arg' options when doing the link step. prev= continue ;; objectlist) if test -f "$arg"; then save_arg=$arg moreargs= for fil in `cat "$save_arg"` do # func_append moreargs " $fil" arg=$fil # A libtool-controlled object. # Check to see that this really is a libtool object. if func_lalib_unsafe_p "$arg"; then pic_object= non_pic_object= # Read the .lo file func_source "$arg" if test -z "$pic_object" || test -z "$non_pic_object" || test none = "$pic_object" && test none = "$non_pic_object"; then func_fatal_error "cannot find name of object for '$arg'" fi # Extract subdirectory from the argument. func_dirname "$arg" "/" "" xdir=$func_dirname_result if test none != "$pic_object"; then # Prepend the subdirectory the object is found in. pic_object=$xdir$pic_object if test dlfiles = "$prev"; then if test yes = "$build_libtool_libs" && test yes = "$dlopen_support"; then func_append dlfiles " $pic_object" prev= continue else # If libtool objects are unsupported, then we need to preload. prev=dlprefiles fi fi # CHECK ME: I think I busted this. 
-Ossama if test dlprefiles = "$prev"; then # Preload the old-style object. func_append dlprefiles " $pic_object" prev= fi # A PIC object. func_append libobjs " $pic_object" arg=$pic_object fi # Non-PIC object. if test none != "$non_pic_object"; then # Prepend the subdirectory the object is found in. non_pic_object=$xdir$non_pic_object # A standard non-PIC object func_append non_pic_objects " $non_pic_object" if test -z "$pic_object" || test none = "$pic_object"; then arg=$non_pic_object fi else # If the PIC object exists, use it instead. # $xdir was prepended to $pic_object above. non_pic_object=$pic_object func_append non_pic_objects " $non_pic_object" fi else # Only an error if not doing a dry-run. if $opt_dry_run; then # Extract subdirectory from the argument. func_dirname "$arg" "/" "" xdir=$func_dirname_result func_lo2o "$arg" pic_object=$xdir$objdir/$func_lo2o_result non_pic_object=$xdir$func_lo2o_result func_append libobjs " $pic_object" func_append non_pic_objects " $non_pic_object" else func_fatal_error "'$arg' is not a valid libtool object" fi fi done else func_fatal_error "link input file '$arg' does not exist" fi arg=$save_arg prev= continue ;; os2dllname) os2dllname=$arg prev= continue ;; precious_regex) precious_files_regex=$arg prev= continue ;; release) release=-$arg prev= continue ;; rpath | xrpath) # We need an absolute path. 
case $arg in [\\/]* | [A-Za-z]:[\\/]*) ;; *) func_fatal_error "only absolute run-paths are allowed" ;; esac if test rpath = "$prev"; then case "$rpath " in *" $arg "*) ;; *) func_append rpath " $arg" ;; esac else case "$xrpath " in *" $arg "*) ;; *) func_append xrpath " $arg" ;; esac fi prev= continue ;; shrext) shrext_cmds=$arg prev= continue ;; weak) func_append weak_libs " $arg" prev= continue ;; xcclinker) func_append linker_flags " $qarg" func_append compiler_flags " $qarg" prev= func_append compile_command " $qarg" func_append finalize_command " $qarg" continue ;; xcompiler) func_append compiler_flags " $qarg" prev= func_append compile_command " $qarg" func_append finalize_command " $qarg" continue ;; xlinker) func_append linker_flags " $qarg" func_append compiler_flags " $wl$qarg" prev= func_append compile_command " $wl$qarg" func_append finalize_command " $wl$qarg" continue ;; *) eval "$prev=\"\$arg\"" prev= continue ;; esac fi # test -n "$prev" prevarg=$arg case $arg in -all-static) if test -n "$link_static_flag"; then # See comment for -static flag below, for more details. func_append compile_command " $link_static_flag" func_append finalize_command " $link_static_flag" fi continue ;; -allow-undefined) # FIXME: remove this flag sometime in the future. 
func_fatal_error "'-allow-undefined' must not be used because it is the default" ;; -avoid-version) avoid_version=yes continue ;; -bindir) prev=bindir continue ;; -dlopen) prev=dlfiles continue ;; -dlpreopen) prev=dlprefiles continue ;; -export-dynamic) export_dynamic=yes continue ;; -export-symbols | -export-symbols-regex) if test -n "$export_symbols" || test -n "$export_symbols_regex"; then func_fatal_error "more than one -exported-symbols argument is not allowed" fi if test X-export-symbols = "X$arg"; then prev=expsyms else prev=expsyms_regex fi continue ;; -framework) prev=framework continue ;; -inst-prefix-dir) prev=inst_prefix continue ;; # The native IRIX linker understands -LANG:*, -LIST:* and -LNO:* # so, if we see these flags be careful not to treat them like -L -L[A-Z][A-Z]*:*) case $with_gcc/$host in no/*-*-irix* | /*-*-irix*) func_append compile_command " $arg" func_append finalize_command " $arg" ;; esac continue ;; -L*) func_stripname "-L" '' "$arg" if test -z "$func_stripname_result"; then if test "$#" -gt 0; then func_fatal_error "require no space between '-L' and '$1'" else func_fatal_error "need path for '-L' option" fi fi func_resolve_sysroot "$func_stripname_result" dir=$func_resolve_sysroot_result # We need an absolute path. 
case $dir in [\\/]* | [A-Za-z]:[\\/]*) ;; *) absdir=`cd "$dir" && pwd` test -z "$absdir" && \ func_fatal_error "cannot determine absolute directory name of '$dir'" dir=$absdir ;; esac case "$deplibs " in *" -L$dir "* | *" $arg "*) # Will only happen for absolute or sysroot arguments ;; *) # Preserve sysroot, but never include relative directories case $dir in [\\/]* | [A-Za-z]:[\\/]* | =*) func_append deplibs " $arg" ;; *) func_append deplibs " -L$dir" ;; esac func_append lib_search_path " $dir" ;; esac case $host in *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-cegcc*) testbindir=`$ECHO "$dir" | $SED 's*/lib$*/bin*'` case :$dllsearchpath: in *":$dir:"*) ;; ::) dllsearchpath=$dir;; *) func_append dllsearchpath ":$dir";; esac case :$dllsearchpath: in *":$testbindir:"*) ;; ::) dllsearchpath=$testbindir;; *) func_append dllsearchpath ":$testbindir";; esac ;; esac continue ;; -l*) if test X-lc = "X$arg" || test X-lm = "X$arg"; then case $host in *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-beos* | *-cegcc* | *-*-haiku*) # These systems don't actually have a C or math library (as such) continue ;; *-*-os2*) # These systems don't actually have a C library (as such) test X-lc = "X$arg" && continue ;; *-*-openbsd* | *-*-freebsd* | *-*-dragonfly* | *-*-bitrig*) # Do not include libc due to us having libc/libc_r. test X-lc = "X$arg" && continue ;; *-*-rhapsody* | *-*-darwin1.[012]) # Rhapsody C and math libraries are in the System framework func_append deplibs " System.ltframework" continue ;; *-*-sco3.2v5* | *-*-sco5v6*) # Causes problems with __ctype test X-lc = "X$arg" && continue ;; *-*-sysv4.2uw2* | *-*-sysv5* | *-*-unixware* | *-*-OpenUNIX*) # Compiler inserts libc in the correct place for threads to work test X-lc = "X$arg" && continue ;; esac elif test X-lc_r = "X$arg"; then case $host in *-*-openbsd* | *-*-freebsd* | *-*-dragonfly* | *-*-bitrig*) # Do not include libc_r directly, use -pthread flag. 
continue ;; esac fi func_append deplibs " $arg" continue ;; -mllvm) prev=mllvm continue ;; -module) module=yes continue ;; # Tru64 UNIX uses -model [arg] to determine the layout of C++ # classes, name mangling, and exception handling. # Darwin uses the -arch flag to determine output architecture. -model|-arch|-isysroot|--sysroot) func_append compiler_flags " $arg" func_append compile_command " $arg" func_append finalize_command " $arg" prev=xcompiler continue ;; -mt|-mthreads|-kthread|-Kthread|-pthread|-pthreads|--thread-safe \ |-threads|-fopenmp|-openmp|-mp|-xopenmp|-omp|-qsmp=*) func_append compiler_flags " $arg" func_append compile_command " $arg" func_append finalize_command " $arg" case "$new_inherited_linker_flags " in *" $arg "*) ;; * ) func_append new_inherited_linker_flags " $arg" ;; esac continue ;; -multi_module) single_module=$wl-multi_module continue ;; -no-fast-install) fast_install=no continue ;; -no-install) case $host in *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-*-darwin* | *-cegcc*) # The PATH hackery in wrapper scripts is required on Windows # and Darwin in order for the loader to find any dlls it needs. func_warning "'-no-install' is ignored for $host" func_warning "assuming '-no-fast-install' instead" fast_install=no ;; *) no_install=yes ;; esac continue ;; -no-undefined) allow_undefined=no continue ;; -objectlist) prev=objectlist continue ;; -os2dllname) prev=os2dllname continue ;; -o) prev=output ;; -precious-files-regex) prev=precious_regex continue ;; -release) prev=release continue ;; -rpath) prev=rpath continue ;; -R) prev=xrpath continue ;; -R*) func_stripname '-R' '' "$arg" dir=$func_stripname_result # We need an absolute path. 
case $dir in [\\/]* | [A-Za-z]:[\\/]*) ;; =*) func_stripname '=' '' "$dir" dir=$lt_sysroot$func_stripname_result ;; *) func_fatal_error "only absolute run-paths are allowed" ;; esac case "$xrpath " in *" $dir "*) ;; *) func_append xrpath " $dir" ;; esac continue ;; -shared) # The effects of -shared are defined in a previous loop. continue ;; -shrext) prev=shrext continue ;; -static | -static-libtool-libs) # The effects of -static are defined in a previous loop. # We used to do the same as -all-static on platforms that # didn't have a PIC flag, but the assumption that the effects # would be equivalent was wrong. It would break on at least # Digital Unix and AIX. continue ;; -thread-safe) thread_safe=yes continue ;; -version-info) prev=vinfo continue ;; -version-number) prev=vinfo vinfo_number=yes continue ;; -weak) prev=weak continue ;; -Wc,*) func_stripname '-Wc,' '' "$arg" args=$func_stripname_result arg= save_ifs=$IFS; IFS=, for flag in $args; do IFS=$save_ifs func_quote_for_eval "$flag" func_append arg " $func_quote_for_eval_result" func_append compiler_flags " $func_quote_for_eval_result" done IFS=$save_ifs func_stripname ' ' '' "$arg" arg=$func_stripname_result ;; -Wl,*) func_stripname '-Wl,' '' "$arg" args=$func_stripname_result arg= save_ifs=$IFS; IFS=, for flag in $args; do IFS=$save_ifs func_quote_for_eval "$flag" func_append arg " $wl$func_quote_for_eval_result" func_append compiler_flags " $wl$func_quote_for_eval_result" func_append linker_flags " $func_quote_for_eval_result" done IFS=$save_ifs func_stripname ' ' '' "$arg" arg=$func_stripname_result ;; -Xcompiler) prev=xcompiler continue ;; -Xlinker) prev=xlinker continue ;; -XCClinker) prev=xcclinker continue ;; # -msg_* for osf cc -msg_*) func_quote_for_eval "$arg" arg=$func_quote_for_eval_result ;; # Flags to be passed through unchanged, with rationale: # -64, -mips[0-9] enable 64-bit mode for the SGI compiler # -r[0-9][0-9]* specify processor for the SGI compiler # -xarch=*, -xtarget=* enable 64-bit 
mode for the Sun compiler # +DA*, +DD* enable 64-bit mode for the HP compiler # -q* compiler args for the IBM compiler # -m*, -t[45]*, -txscale* architecture-specific flags for GCC # -F/path path to uninstalled frameworks, gcc on darwin # -p, -pg, --coverage, -fprofile-* profiling flags for GCC # -fstack-protector* stack protector flags for GCC # @file GCC response files # -tp=* Portland pgcc target processor selection # --sysroot=* for sysroot support # -O*, -g*, -flto*, -fwhopr*, -fuse-linker-plugin GCC link-time optimization # -stdlib=* select c++ std lib with clang -64|-mips[0-9]|-r[0-9][0-9]*|-xarch=*|-xtarget=*|+DA*|+DD*|-q*|-m*| \ -t[45]*|-txscale*|-p|-pg|--coverage|-fprofile-*|-F*|@*|-tp=*|--sysroot=*| \ -O*|-g*|-flto*|-fwhopr*|-fuse-linker-plugin|-fstack-protector*|-stdlib=*) func_quote_for_eval "$arg" arg=$func_quote_for_eval_result func_append compile_command " $arg" func_append finalize_command " $arg" func_append compiler_flags " $arg" continue ;; -Z*) if test os2 = "`expr $host : '.*\(os2\)'`"; then # OS/2 uses -Zxxx to specify OS/2-specific options compiler_flags="$compiler_flags $arg" func_append compile_command " $arg" func_append finalize_command " $arg" case $arg in -Zlinker | -Zstack) prev=xcompiler ;; esac continue else # Otherwise treat like 'Some other compiler flag' below func_quote_for_eval "$arg" arg=$func_quote_for_eval_result fi ;; # Some other compiler flag. -* | +*) func_quote_for_eval "$arg" arg=$func_quote_for_eval_result ;; *.$objext) # A standard object. func_append objs " $arg" ;; *.lo) # A libtool-controlled object. # Check to see that this really is a libtool object. if func_lalib_unsafe_p "$arg"; then pic_object= non_pic_object= # Read the .lo file func_source "$arg" if test -z "$pic_object" || test -z "$non_pic_object" || test none = "$pic_object" && test none = "$non_pic_object"; then func_fatal_error "cannot find name of object for '$arg'" fi # Extract subdirectory from the argument. 
func_dirname "$arg" "/" "" xdir=$func_dirname_result test none = "$pic_object" || { # Prepend the subdirectory the object is found in. pic_object=$xdir$pic_object if test dlfiles = "$prev"; then if test yes = "$build_libtool_libs" && test yes = "$dlopen_support"; then func_append dlfiles " $pic_object" prev= continue else # If libtool objects are unsupported, then we need to preload. prev=dlprefiles fi fi # CHECK ME: I think I busted this. -Ossama if test dlprefiles = "$prev"; then # Preload the old-style object. func_append dlprefiles " $pic_object" prev= fi # A PIC object. func_append libobjs " $pic_object" arg=$pic_object } # Non-PIC object. if test none != "$non_pic_object"; then # Prepend the subdirectory the object is found in. non_pic_object=$xdir$non_pic_object # A standard non-PIC object func_append non_pic_objects " $non_pic_object" if test -z "$pic_object" || test none = "$pic_object"; then arg=$non_pic_object fi else # If the PIC object exists, use it instead. # $xdir was prepended to $pic_object above. non_pic_object=$pic_object func_append non_pic_objects " $non_pic_object" fi else # Only an error if not doing a dry-run. if $opt_dry_run; then # Extract subdirectory from the argument. func_dirname "$arg" "/" "" xdir=$func_dirname_result func_lo2o "$arg" pic_object=$xdir$objdir/$func_lo2o_result non_pic_object=$xdir$func_lo2o_result func_append libobjs " $pic_object" func_append non_pic_objects " $non_pic_object" else func_fatal_error "'$arg' is not a valid libtool object" fi fi ;; *.$libext) # An archive. func_append deplibs " $arg" func_append old_deplibs " $arg" continue ;; *.la) # A libtool-controlled library. func_resolve_sysroot "$arg" if test dlfiles = "$prev"; then # This library was specified with -dlopen. func_append dlfiles " $func_resolve_sysroot_result" prev= elif test dlprefiles = "$prev"; then # The library was specified with -dlpreopen. 
func_append dlprefiles " $func_resolve_sysroot_result" prev= else func_append deplibs " $func_resolve_sysroot_result" fi continue ;; # Some other compiler argument. *) # Unknown arguments in both finalize_command and compile_command need # to be aesthetically quoted because they are evaled later. func_quote_for_eval "$arg" arg=$func_quote_for_eval_result ;; esac # arg # Now actually substitute the argument into the commands. if test -n "$arg"; then func_append compile_command " $arg" func_append finalize_command " $arg" fi done # argument parsing loop test -n "$prev" && \ func_fatal_help "the '$prevarg' option requires an argument" if test yes = "$export_dynamic" && test -n "$export_dynamic_flag_spec"; then eval arg=\"$export_dynamic_flag_spec\" func_append compile_command " $arg" func_append finalize_command " $arg" fi oldlibs= # calculate the name of the file, without its directory func_basename "$output" outputname=$func_basename_result libobjs_save=$libobjs if test -n "$shlibpath_var"; then # get the directories listed in $shlibpath_var eval shlib_search_path=\`\$ECHO \"\$$shlibpath_var\" \| \$SED \'s/:/ /g\'\` else shlib_search_path= fi eval sys_lib_search_path=\"$sys_lib_search_path_spec\" eval sys_lib_dlsearch_path=\"$sys_lib_dlsearch_path_spec\" # Definition is injected by LT_CONFIG during libtool generation. func_munge_path_list sys_lib_dlsearch_path "$LT_SYS_LIBRARY_PATH" func_dirname "$output" "/" "" output_objdir=$func_dirname_result$objdir func_to_tool_file "$output_objdir/" tool_output_objdir=$func_to_tool_file_result # Create the object directory. func_mkdir_p "$output_objdir" # Determine the type of output case $output in "") func_fatal_help "you must specify an output file" ;; *.$libext) linkmode=oldlib ;; *.lo | *.$objext) linkmode=obj ;; *.la) linkmode=lib ;; *) linkmode=prog ;; # Anything else should be a program. esac specialdeplibs= libs= # Find all interdependent deplibs by searching for libraries # that are linked more than once (e.g. 
-la -lb -la) for deplib in $deplibs; do if $opt_preserve_dup_deps; then case "$libs " in *" $deplib "*) func_append specialdeplibs " $deplib" ;; esac fi func_append libs " $deplib" done if test lib = "$linkmode"; then libs="$predeps $libs $compiler_lib_search_path $postdeps" # Compute libraries that are listed more than once in $predeps # $postdeps and mark them as special (i.e., whose duplicates are # not to be eliminated). pre_post_deps= if $opt_duplicate_compiler_generated_deps; then for pre_post_dep in $predeps $postdeps; do case "$pre_post_deps " in *" $pre_post_dep "*) func_append specialdeplibs " $pre_post_deps" ;; esac func_append pre_post_deps " $pre_post_dep" done fi pre_post_deps= fi deplibs= newdependency_libs= newlib_search_path= need_relink=no # whether we're linking any uninstalled libtool libraries notinst_deplibs= # not-installed libtool libraries notinst_path= # paths that contain not-installed libtool libraries case $linkmode in lib) passes="conv dlpreopen link" for file in $dlfiles $dlprefiles; do case $file in *.la) ;; *) func_fatal_help "libraries can '-dlopen' only libtool libraries: $file" ;; esac done ;; prog) compile_deplibs= finalize_deplibs= alldeplibs=false newdlfiles= newdlprefiles= passes="conv scan dlopen dlpreopen link" ;; *) passes="conv" ;; esac for pass in $passes; do # The preopen pass in lib mode reverses $deplibs; put it back here # so that -L comes before libs that need it for instance... 
if test lib,link = "$linkmode,$pass"; then ## FIXME: Find the place where the list is rebuilt in the wrong ## order, and fix it there properly tmp_deplibs= for deplib in $deplibs; do tmp_deplibs="$deplib $tmp_deplibs" done deplibs=$tmp_deplibs fi if test lib,link = "$linkmode,$pass" || test prog,scan = "$linkmode,$pass"; then libs=$deplibs deplibs= fi if test prog = "$linkmode"; then case $pass in dlopen) libs=$dlfiles ;; dlpreopen) libs=$dlprefiles ;; link) libs="$deplibs %DEPLIBS% $dependency_libs" ;; esac fi if test lib,dlpreopen = "$linkmode,$pass"; then # Collect and forward deplibs of preopened libtool libs for lib in $dlprefiles; do # Ignore non-libtool-libs dependency_libs= func_resolve_sysroot "$lib" case $lib in *.la) func_source "$func_resolve_sysroot_result" ;; esac # Collect preopened libtool deplibs, except any this library # has declared as weak libs for deplib in $dependency_libs; do func_basename "$deplib" deplib_base=$func_basename_result case " $weak_libs " in *" $deplib_base "*) ;; *) func_append deplibs " $deplib" ;; esac done done libs=$dlprefiles fi if test dlopen = "$pass"; then # Collect dlpreopened libraries save_deplibs=$deplibs deplibs= fi for deplib in $libs; do lib= found=false case $deplib in -mt|-mthreads|-kthread|-Kthread|-pthread|-pthreads|--thread-safe \ |-threads|-fopenmp|-openmp|-mp|-xopenmp|-omp|-qsmp=*) if test prog,link = "$linkmode,$pass"; then compile_deplibs="$deplib $compile_deplibs" finalize_deplibs="$deplib $finalize_deplibs" else func_append compiler_flags " $deplib" if test lib = "$linkmode"; then case "$new_inherited_linker_flags " in *" $deplib "*) ;; * ) func_append new_inherited_linker_flags " $deplib" ;; esac fi fi continue ;; -l*) if test lib != "$linkmode" && test prog != "$linkmode"; then func_warning "'-l' is ignored for archives/objects" continue fi func_stripname '-l' '' "$deplib" name=$func_stripname_result if test lib = "$linkmode"; then searchdirs="$newlib_search_path $lib_search_path 
$compiler_lib_search_dirs $sys_lib_search_path $shlib_search_path" else searchdirs="$newlib_search_path $lib_search_path $sys_lib_search_path $shlib_search_path" fi for searchdir in $searchdirs; do for search_ext in .la $std_shrext .so .a; do # Search the libtool library lib=$searchdir/lib$name$search_ext if test -f "$lib"; then if test .la = "$search_ext"; then found=: else found=false fi break 2 fi done done if $found; then # deplib is a libtool library # If $allow_libtool_libs_with_static_runtimes && $deplib is a stdlib, # We need to do some special things here, and not later. if test yes = "$allow_libtool_libs_with_static_runtimes"; then case " $predeps $postdeps " in *" $deplib "*) if func_lalib_p "$lib"; then library_names= old_library= func_source "$lib" for l in $old_library $library_names; do ll=$l done if test "X$ll" = "X$old_library"; then # only static version available found=false func_dirname "$lib" "" "." ladir=$func_dirname_result lib=$ladir/$old_library if test prog,link = "$linkmode,$pass"; then compile_deplibs="$deplib $compile_deplibs" finalize_deplibs="$deplib $finalize_deplibs" else deplibs="$deplib $deplibs" test lib = "$linkmode" && newdependency_libs="$deplib $newdependency_libs" fi continue fi fi ;; *) ;; esac fi else # deplib doesn't seem to be a libtool library if test prog,link = "$linkmode,$pass"; then compile_deplibs="$deplib $compile_deplibs" finalize_deplibs="$deplib $finalize_deplibs" else deplibs="$deplib $deplibs" test lib = "$linkmode" && newdependency_libs="$deplib $newdependency_libs" fi continue fi ;; # -l *.ltframework) if test prog,link = "$linkmode,$pass"; then compile_deplibs="$deplib $compile_deplibs" finalize_deplibs="$deplib $finalize_deplibs" else deplibs="$deplib $deplibs" if test lib = "$linkmode"; then case "$new_inherited_linker_flags " in *" $deplib "*) ;; * ) func_append new_inherited_linker_flags " $deplib" ;; esac fi fi continue ;; -L*) case $linkmode in lib) deplibs="$deplib $deplibs" test conv = "$pass" && 
continue newdependency_libs="$deplib $newdependency_libs" func_stripname '-L' '' "$deplib" func_resolve_sysroot "$func_stripname_result" func_append newlib_search_path " $func_resolve_sysroot_result" ;; prog) if test conv = "$pass"; then deplibs="$deplib $deplibs" continue fi if test scan = "$pass"; then deplibs="$deplib $deplibs" else compile_deplibs="$deplib $compile_deplibs" finalize_deplibs="$deplib $finalize_deplibs" fi func_stripname '-L' '' "$deplib" func_resolve_sysroot "$func_stripname_result" func_append newlib_search_path " $func_resolve_sysroot_result" ;; *) func_warning "'-L' is ignored for archives/objects" ;; esac # linkmode continue ;; # -L -R*) if test link = "$pass"; then func_stripname '-R' '' "$deplib" func_resolve_sysroot "$func_stripname_result" dir=$func_resolve_sysroot_result # Make sure the xrpath contains only unique directories. case "$xrpath " in *" $dir "*) ;; *) func_append xrpath " $dir" ;; esac fi deplibs="$deplib $deplibs" continue ;; *.la) func_resolve_sysroot "$deplib" lib=$func_resolve_sysroot_result ;; *.$libext) if test conv = "$pass"; then deplibs="$deplib $deplibs" continue fi case $linkmode in lib) # Linking convenience modules into shared libraries is allowed, # but linking other static libraries is non-portable. case " $dlpreconveniencelibs " in *" $deplib "*) ;; *) valid_a_lib=false case $deplibs_check_method in match_pattern*) set dummy $deplibs_check_method; shift match_pattern_regex=`expr "$deplibs_check_method" : "$1 \(.*\)"` if eval "\$ECHO \"$deplib\"" 2>/dev/null | $SED 10q \ | $EGREP "$match_pattern_regex" > /dev/null; then valid_a_lib=: fi ;; pass_all) valid_a_lib=: ;; esac if $valid_a_lib; then echo $ECHO "*** Warning: Linking the shared library $output against the" $ECHO "*** static library $deplib is not portable!" deplibs="$deplib $deplibs" else echo $ECHO "*** Warning: Trying to link with static lib archive $deplib." 
echo "*** I have the capability to make that library automatically link in when" echo "*** you link to this library. But I can only do this if you have a" echo "*** shared version of the library, which you do not appear to have" echo "*** because the file extensions .$libext of this argument makes me believe" echo "*** that it is just a static archive that I should not use here." fi ;; esac continue ;; prog) if test link != "$pass"; then deplibs="$deplib $deplibs" else compile_deplibs="$deplib $compile_deplibs" finalize_deplibs="$deplib $finalize_deplibs" fi continue ;; esac # linkmode ;; # *.$libext *.lo | *.$objext) if test conv = "$pass"; then deplibs="$deplib $deplibs" elif test prog = "$linkmode"; then if test dlpreopen = "$pass" || test yes != "$dlopen_support" || test no = "$build_libtool_libs"; then # If there is no dlopen support or we're linking statically, # we need to preload. func_append newdlprefiles " $deplib" compile_deplibs="$deplib $compile_deplibs" finalize_deplibs="$deplib $finalize_deplibs" else func_append newdlfiles " $deplib" fi fi continue ;; %DEPLIBS%) alldeplibs=: continue ;; esac # case $deplib $found || test -f "$lib" \ || func_fatal_error "cannot find the library '$lib' or unhandled argument '$deplib'" # Check to see that this really is a libtool archive. func_lalib_unsafe_p "$lib" \ || func_fatal_error "'$lib' is not a valid libtool archive" func_dirname "$lib" "" "." 
ladir=$func_dirname_result dlname= dlopen= dlpreopen= libdir= library_names= old_library= inherited_linker_flags= # If the library was installed with an old release of libtool, # it will not redefine variables installed, or shouldnotlink installed=yes shouldnotlink=no avoidtemprpath= # Read the .la file func_source "$lib" # Convert "-framework foo" to "foo.ltframework" if test -n "$inherited_linker_flags"; then tmp_inherited_linker_flags=`$ECHO "$inherited_linker_flags" | $SED 's/-framework \([^ $]*\)/\1.ltframework/g'` for tmp_inherited_linker_flag in $tmp_inherited_linker_flags; do case " $new_inherited_linker_flags " in *" $tmp_inherited_linker_flag "*) ;; *) func_append new_inherited_linker_flags " $tmp_inherited_linker_flag";; esac done fi dependency_libs=`$ECHO " $dependency_libs" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'` if test lib,link = "$linkmode,$pass" || test prog,scan = "$linkmode,$pass" || { test prog != "$linkmode" && test lib != "$linkmode"; }; then test -n "$dlopen" && func_append dlfiles " $dlopen" test -n "$dlpreopen" && func_append dlprefiles " $dlpreopen" fi if test conv = "$pass"; then # Only check for convenience libraries deplibs="$lib $deplibs" if test -z "$libdir"; then if test -z "$old_library"; then func_fatal_error "cannot find name of link library for '$lib'" fi # It is a libtool convenience library, so add in its objects. func_append convenience " $ladir/$objdir/$old_library" func_append old_convenience " $ladir/$objdir/$old_library" elif test prog != "$linkmode" && test lib != "$linkmode"; then func_fatal_error "'$lib' is not a convenience library" fi tmp_libs= for deplib in $dependency_libs; do deplibs="$deplib $deplibs" if $opt_preserve_dup_deps; then case "$tmp_libs " in *" $deplib "*) func_append specialdeplibs " $deplib" ;; esac fi func_append tmp_libs " $deplib" done continue fi # $pass = conv # Get the name of the library we link against. 
linklib= if test -n "$old_library" && { test yes = "$prefer_static_libs" || test built,no = "$prefer_static_libs,$installed"; }; then linklib=$old_library else for l in $old_library $library_names; do linklib=$l done fi if test -z "$linklib"; then func_fatal_error "cannot find name of link library for '$lib'" fi # This library was specified with -dlopen. if test dlopen = "$pass"; then test -z "$libdir" \ && func_fatal_error "cannot -dlopen a convenience library: '$lib'" if test -z "$dlname" || test yes != "$dlopen_support" || test no = "$build_libtool_libs" then # If there is no dlname, no dlopen support or we're linking # statically, we need to preload. We also need to preload any # dependent libraries so libltdl's deplib preloader doesn't # bomb out in the load deplibs phase. func_append dlprefiles " $lib $dependency_libs" else func_append newdlfiles " $lib" fi continue fi # $pass = dlopen # We need an absolute path. case $ladir in [\\/]* | [A-Za-z]:[\\/]*) abs_ladir=$ladir ;; *) abs_ladir=`cd "$ladir" && pwd` if test -z "$abs_ladir"; then func_warning "cannot determine absolute directory name of '$ladir'" func_warning "passing it literally to the linker, although it might fail" abs_ladir=$ladir fi ;; esac func_basename "$lib" laname=$func_basename_result # Find the relevant object directory and library name. if test yes = "$installed"; then if test ! -f "$lt_sysroot$libdir/$linklib" && test -f "$abs_ladir/$linklib"; then func_warning "library '$lib' was moved." dir=$ladir absdir=$abs_ladir libdir=$abs_ladir else dir=$lt_sysroot$libdir absdir=$lt_sysroot$libdir fi test yes = "$hardcode_automatic" && avoidtemprpath=yes else if test ! 
-f "$ladir/$objdir/$linklib" && test -f "$abs_ladir/$linklib"; then dir=$ladir absdir=$abs_ladir # Remove this search path later func_append notinst_path " $abs_ladir" else dir=$ladir/$objdir absdir=$abs_ladir/$objdir # Remove this search path later func_append notinst_path " $abs_ladir" fi fi # $installed = yes func_stripname 'lib' '.la' "$laname" name=$func_stripname_result # This library was specified with -dlpreopen. if test dlpreopen = "$pass"; then if test -z "$libdir" && test prog = "$linkmode"; then func_fatal_error "only libraries may -dlpreopen a convenience library: '$lib'" fi case $host in # special handling for platforms with PE-DLLs. *cygwin* | *mingw* | *cegcc* ) # Linker will automatically link against shared library if both # static and shared are present. Therefore, ensure we extract # symbols from the import library if a shared library is present # (otherwise, the dlopen module name will be incorrect). We do # this by putting the import library name into $newdlprefiles. # We recover the dlopen module name by 'saving' the la file # name in a special purpose variable, and (later) extracting the # dlname from the la file. if test -n "$dlname"; then func_tr_sh "$dir/$linklib" eval "libfile_$func_tr_sh_result=\$abs_ladir/\$laname" func_append newdlprefiles " $dir/$linklib" else func_append newdlprefiles " $dir/$old_library" # Keep a list of preopened convenience libraries to check # that they are being used correctly in the link pass. test -z "$libdir" && \ func_append dlpreconveniencelibs " $dir/$old_library" fi ;; * ) # Prefer using a static library (so that no silly _DYNAMIC symbols # are required to link). if test -n "$old_library"; then func_append newdlprefiles " $dir/$old_library" # Keep a list of preopened convenience libraries to check # that they are being used correctly in the link pass. test -z "$libdir" && \ func_append dlpreconveniencelibs " $dir/$old_library" # Otherwise, use the dlname, so that lt_dlopen finds it. 
elif test -n "$dlname"; then func_append newdlprefiles " $dir/$dlname" else func_append newdlprefiles " $dir/$linklib" fi ;; esac fi # $pass = dlpreopen if test -z "$libdir"; then # Link the convenience library if test lib = "$linkmode"; then deplibs="$dir/$old_library $deplibs" elif test prog,link = "$linkmode,$pass"; then compile_deplibs="$dir/$old_library $compile_deplibs" finalize_deplibs="$dir/$old_library $finalize_deplibs" else deplibs="$lib $deplibs" # used for prog,scan pass fi continue fi if test prog = "$linkmode" && test link != "$pass"; then func_append newlib_search_path " $ladir" deplibs="$lib $deplibs" linkalldeplibs=false if test no != "$link_all_deplibs" || test -z "$library_names" || test no = "$build_libtool_libs"; then linkalldeplibs=: fi tmp_libs= for deplib in $dependency_libs; do case $deplib in -L*) func_stripname '-L' '' "$deplib" func_resolve_sysroot "$func_stripname_result" func_append newlib_search_path " $func_resolve_sysroot_result" ;; esac # Need to link against all dependency_libs? if $linkalldeplibs; then deplibs="$deplib $deplibs" else # Need to hardcode shared library paths # or/and link against static libraries newdependency_libs="$deplib $newdependency_libs" fi if $opt_preserve_dup_deps; then case "$tmp_libs " in *" $deplib "*) func_append specialdeplibs " $deplib" ;; esac fi func_append tmp_libs " $deplib" done # for deplib continue fi # $linkmode = prog... if test prog,link = "$linkmode,$pass"; then if test -n "$library_names" && { { test no = "$prefer_static_libs" || test built,yes = "$prefer_static_libs,$installed"; } || test -z "$old_library"; }; then # We need to hardcode the library path if test -n "$shlibpath_var" && test -z "$avoidtemprpath"; then # Make sure the rpath contains only unique directories. case $temp_rpath: in *"$absdir:"*) ;; *) func_append temp_rpath "$absdir:" ;; esac fi # Hardcode the library path. # Skip directories that are in the system default run-time # search path. 
case " $sys_lib_dlsearch_path " in *" $absdir "*) ;; *) case "$compile_rpath " in *" $absdir "*) ;; *) func_append compile_rpath " $absdir" ;; esac ;; esac case " $sys_lib_dlsearch_path " in *" $libdir "*) ;; *) case "$finalize_rpath " in *" $libdir "*) ;; *) func_append finalize_rpath " $libdir" ;; esac ;; esac fi # $linkmode,$pass = prog,link... if $alldeplibs && { test pass_all = "$deplibs_check_method" || { test yes = "$build_libtool_libs" && test -n "$library_names"; }; }; then # We only need to search for static libraries continue fi fi link_static=no # Whether the deplib will be linked statically use_static_libs=$prefer_static_libs if test built = "$use_static_libs" && test yes = "$installed"; then use_static_libs=no fi if test -n "$library_names" && { test no = "$use_static_libs" || test -z "$old_library"; }; then case $host in *cygwin* | *mingw* | *cegcc* | *os2*) # No point in relinking DLLs because paths are not encoded func_append notinst_deplibs " $lib" need_relink=no ;; *) if test no = "$installed"; then func_append notinst_deplibs " $lib" need_relink=yes fi ;; esac # This is a shared library # Warn about portability, can't link against -module's on some # systems (darwin). Don't bleat about dlopened modules though! dlopenmodule= for dlpremoduletest in $dlprefiles; do if test "X$dlpremoduletest" = "X$lib"; then dlopenmodule=$dlpremoduletest break fi done if test -z "$dlopenmodule" && test yes = "$shouldnotlink" && test link = "$pass"; then echo if test prog = "$linkmode"; then $ECHO "*** Warning: Linking the executable $output against the loadable module" else $ECHO "*** Warning: Linking the shared library $output against the loadable module" fi $ECHO "*** $linklib is not portable!" fi if test lib = "$linkmode" && test yes = "$hardcode_into_libs"; then # Hardcode the library path. # Skip directories that are in the system default run-time # search path. 
case " $sys_lib_dlsearch_path " in *" $absdir "*) ;; *) case "$compile_rpath " in *" $absdir "*) ;; *) func_append compile_rpath " $absdir" ;; esac ;; esac case " $sys_lib_dlsearch_path " in *" $libdir "*) ;; *) case "$finalize_rpath " in *" $libdir "*) ;; *) func_append finalize_rpath " $libdir" ;; esac ;; esac fi if test -n "$old_archive_from_expsyms_cmds"; then # figure out the soname set dummy $library_names shift realname=$1 shift libname=`eval "\\$ECHO \"$libname_spec\""` # use dlname if we got it. it's perfectly good, no? if test -n "$dlname"; then soname=$dlname elif test -n "$soname_spec"; then # bleh windows case $host in *cygwin* | mingw* | *cegcc* | *os2*) func_arith $current - $age major=$func_arith_result versuffix=-$major ;; esac eval soname=\"$soname_spec\" else soname=$realname fi # Make a new name for the extract_expsyms_cmds to use soroot=$soname func_basename "$soroot" soname=$func_basename_result func_stripname 'lib' '.dll' "$soname" newlib=libimp-$func_stripname_result.a # If the library has no export list, then create one now if test -f "$output_objdir/$soname-def"; then : else func_verbose "extracting exported symbol list from '$soname'" func_execute_cmds "$extract_expsyms_cmds" 'exit $?' fi # Create $newlib if test -f "$output_objdir/$newlib"; then :; else func_verbose "generating import library for '$soname'" func_execute_cmds "$old_archive_from_expsyms_cmds" 'exit $?' 
fi # make sure the library variables are pointing to the new library dir=$output_objdir linklib=$newlib fi # test -n "$old_archive_from_expsyms_cmds" if test prog = "$linkmode" || test relink != "$opt_mode"; then add_shlibpath= add_dir= add= lib_linked=yes case $hardcode_action in immediate | unsupported) if test no = "$hardcode_direct"; then add=$dir/$linklib case $host in *-*-sco3.2v5.0.[024]*) add_dir=-L$dir ;; *-*-sysv4*uw2*) add_dir=-L$dir ;; *-*-sysv5OpenUNIX* | *-*-sysv5UnixWare7.[01].[10]* | \ *-*-unixware7*) add_dir=-L$dir ;; *-*-darwin* ) # if the lib is a (non-dlopened) module then we cannot # link against it, someone is ignoring the earlier warnings if /usr/bin/file -L $add 2> /dev/null | $GREP ": [^:]* bundle" >/dev/null; then if test "X$dlopenmodule" != "X$lib"; then $ECHO "*** Warning: lib $linklib is a module, not a shared library" if test -z "$old_library"; then echo echo "*** And there doesn't seem to be a static archive available" echo "*** The link will probably fail, sorry" else add=$dir/$old_library fi elif test -n "$old_library"; then add=$dir/$old_library fi fi esac elif test no = "$hardcode_minus_L"; then case $host in *-*-sunos*) add_shlibpath=$dir ;; esac add_dir=-L$dir add=-l$name elif test no = "$hardcode_shlibpath_var"; then add_shlibpath=$dir add=-l$name else lib_linked=no fi ;; relink) if test yes = "$hardcode_direct" && test no = "$hardcode_direct_absolute"; then add=$dir/$linklib elif test yes = "$hardcode_minus_L"; then add_dir=-L$absdir # Try looking first in the location we're being installed to. 
if test -n "$inst_prefix_dir"; then case $libdir in [\\/]*) func_append add_dir " -L$inst_prefix_dir$libdir" ;; esac fi add=-l$name elif test yes = "$hardcode_shlibpath_var"; then add_shlibpath=$dir add=-l$name else lib_linked=no fi ;; *) lib_linked=no ;; esac if test yes != "$lib_linked"; then func_fatal_configuration "unsupported hardcode properties" fi if test -n "$add_shlibpath"; then case :$compile_shlibpath: in *":$add_shlibpath:"*) ;; *) func_append compile_shlibpath "$add_shlibpath:" ;; esac fi if test prog = "$linkmode"; then test -n "$add_dir" && compile_deplibs="$add_dir $compile_deplibs" test -n "$add" && compile_deplibs="$add $compile_deplibs" else test -n "$add_dir" && deplibs="$add_dir $deplibs" test -n "$add" && deplibs="$add $deplibs" if test yes != "$hardcode_direct" && test yes != "$hardcode_minus_L" && test yes = "$hardcode_shlibpath_var"; then case :$finalize_shlibpath: in *":$libdir:"*) ;; *) func_append finalize_shlibpath "$libdir:" ;; esac fi fi fi if test prog = "$linkmode" || test relink = "$opt_mode"; then add_shlibpath= add_dir= add= # Finalize command for both is simple: just hardcode it. if test yes = "$hardcode_direct" && test no = "$hardcode_direct_absolute"; then add=$libdir/$linklib elif test yes = "$hardcode_minus_L"; then add_dir=-L$libdir add=-l$name elif test yes = "$hardcode_shlibpath_var"; then case :$finalize_shlibpath: in *":$libdir:"*) ;; *) func_append finalize_shlibpath "$libdir:" ;; esac add=-l$name elif test yes = "$hardcode_automatic"; then if test -n "$inst_prefix_dir" && test -f "$inst_prefix_dir$libdir/$linklib"; then add=$inst_prefix_dir$libdir/$linklib else add=$libdir/$linklib fi else # We cannot seem to hardcode it, guess we'll fake it. add_dir=-L$libdir # Try looking first in the location we're being installed to. 
if test -n "$inst_prefix_dir"; then case $libdir in [\\/]*) func_append add_dir " -L$inst_prefix_dir$libdir" ;; esac fi add=-l$name fi if test prog = "$linkmode"; then test -n "$add_dir" && finalize_deplibs="$add_dir $finalize_deplibs" test -n "$add" && finalize_deplibs="$add $finalize_deplibs" else test -n "$add_dir" && deplibs="$add_dir $deplibs" test -n "$add" && deplibs="$add $deplibs" fi fi elif test prog = "$linkmode"; then # Here we assume that one of hardcode_direct or hardcode_minus_L # is not unsupported. This is valid on all known static and # shared platforms. if test unsupported != "$hardcode_direct"; then test -n "$old_library" && linklib=$old_library compile_deplibs="$dir/$linklib $compile_deplibs" finalize_deplibs="$dir/$linklib $finalize_deplibs" else compile_deplibs="-l$name -L$dir $compile_deplibs" finalize_deplibs="-l$name -L$dir $finalize_deplibs" fi elif test yes = "$build_libtool_libs"; then # Not a shared library if test pass_all != "$deplibs_check_method"; then # We're trying link a shared library against a static one # but the system doesn't support it. # Just print a warning and add the library to dependency_libs so # that the program can be linked against the static library. echo $ECHO "*** Warning: This system cannot link to static lib archive $lib." echo "*** I have the capability to make that library automatically link in when" echo "*** you link to this library. But I can only do this if you have a" echo "*** shared version of the library, which you do not appear to have." if test yes = "$module"; then echo "*** But as you try to build a module library, libtool will still create " echo "*** a static module, that should work as long as the dlopening application" echo "*** is linked with the -dlopen flag to resolve symbols at runtime." 
if test -z "$global_symbol_pipe"; then echo echo "*** However, this would only work if libtool was able to extract symbol" echo "*** lists from a program, using 'nm' or equivalent, but libtool could" echo "*** not find such a program. So, this module is probably useless." echo "*** 'nm' from GNU binutils and a full rebuild may help." fi if test no = "$build_old_libs"; then build_libtool_libs=module build_old_libs=yes else build_libtool_libs=no fi fi else deplibs="$dir/$old_library $deplibs" link_static=yes fi fi # link shared/static library? if test lib = "$linkmode"; then if test -n "$dependency_libs" && { test yes != "$hardcode_into_libs" || test yes = "$build_old_libs" || test yes = "$link_static"; }; then # Extract -R from dependency_libs temp_deplibs= for libdir in $dependency_libs; do case $libdir in -R*) func_stripname '-R' '' "$libdir" temp_xrpath=$func_stripname_result case " $xrpath " in *" $temp_xrpath "*) ;; *) func_append xrpath " $temp_xrpath";; esac;; *) func_append temp_deplibs " $libdir";; esac done dependency_libs=$temp_deplibs fi func_append newlib_search_path " $absdir" # Link against this library test no = "$link_static" && newdependency_libs="$abs_ladir/$laname $newdependency_libs" # ... and its dependency_libs tmp_libs= for deplib in $dependency_libs; do newdependency_libs="$deplib $newdependency_libs" case $deplib in -L*) func_stripname '-L' '' "$deplib" func_resolve_sysroot "$func_stripname_result";; *) func_resolve_sysroot "$deplib" ;; esac if $opt_preserve_dup_deps; then case "$tmp_libs " in *" $func_resolve_sysroot_result "*) func_append specialdeplibs " $func_resolve_sysroot_result" ;; esac fi func_append tmp_libs " $func_resolve_sysroot_result" done if test no != "$link_all_deplibs"; then # Add the search paths of all dependency libraries for deplib in $dependency_libs; do path= case $deplib in -L*) path=$deplib ;; *.la) func_resolve_sysroot "$deplib" deplib=$func_resolve_sysroot_result func_dirname "$deplib" "" "." 
dir=$func_dirname_result # We need an absolute path. case $dir in [\\/]* | [A-Za-z]:[\\/]*) absdir=$dir ;; *) absdir=`cd "$dir" && pwd` if test -z "$absdir"; then func_warning "cannot determine absolute directory name of '$dir'" absdir=$dir fi ;; esac if $GREP "^installed=no" $deplib > /dev/null; then case $host in *-*-darwin*) depdepl= eval deplibrary_names=`$SED -n -e 's/^library_names=\(.*\)$/\1/p' $deplib` if test -n "$deplibrary_names"; then for tmp in $deplibrary_names; do depdepl=$tmp done if test -f "$absdir/$objdir/$depdepl"; then depdepl=$absdir/$objdir/$depdepl darwin_install_name=`$OTOOL -L $depdepl | awk '{if (NR == 2) {print $1;exit}}'` if test -z "$darwin_install_name"; then darwin_install_name=`$OTOOL64 -L $depdepl | awk '{if (NR == 2) {print $1;exit}}'` fi func_append compiler_flags " $wl-dylib_file $wl$darwin_install_name:$depdepl" func_append linker_flags " -dylib_file $darwin_install_name:$depdepl" path= fi fi ;; *) path=-L$absdir/$objdir ;; esac else eval libdir=`$SED -n -e 's/^libdir=\(.*\)$/\1/p' $deplib` test -z "$libdir" && \ func_fatal_error "'$deplib' is not a valid libtool archive" test "$absdir" != "$libdir" && \ func_warning "'$deplib' seems to be moved" path=-L$absdir fi ;; esac case " $deplibs " in *" $path "*) ;; *) deplibs="$path $deplibs" ;; esac done fi # link_all_deplibs != no fi # linkmode = lib done # for deplib in $libs if test link = "$pass"; then if test prog = "$linkmode"; then compile_deplibs="$new_inherited_linker_flags $compile_deplibs" finalize_deplibs="$new_inherited_linker_flags $finalize_deplibs" else compiler_flags="$compiler_flags "`$ECHO " $new_inherited_linker_flags" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'` fi fi dependency_libs=$newdependency_libs if test dlpreopen = "$pass"; then # Link the dlpreopened libraries before other libraries for deplib in $save_deplibs; do deplibs="$deplib $deplibs" done fi if test dlopen != "$pass"; then test conv = "$pass" || { # Make sure lib_search_path contains only 
unique directories. lib_search_path= for dir in $newlib_search_path; do case "$lib_search_path " in *" $dir "*) ;; *) func_append lib_search_path " $dir" ;; esac done newlib_search_path= } if test prog,link = "$linkmode,$pass"; then vars="compile_deplibs finalize_deplibs" else vars=deplibs fi for var in $vars dependency_libs; do # Add libraries to $var in reverse order eval tmp_libs=\"\$$var\" new_libs= for deplib in $tmp_libs; do # FIXME: Pedantically, this is the right thing to do, so # that some nasty dependency loop isn't accidentally # broken: #new_libs="$deplib $new_libs" # Pragmatically, this seems to cause very few problems in # practice: case $deplib in -L*) new_libs="$deplib $new_libs" ;; -R*) ;; *) # And here is the reason: when a library appears more # than once as an explicit dependence of a library, or # is implicitly linked in more than once by the # compiler, it is considered special, and multiple # occurrences thereof are not removed. Compare this # with having the same library being listed as a # dependency of multiple other libraries: in this case, # we know (pedantically, we assume) the library does not # need to be listed more than once, so we keep only the # last copy. This is not always right, but it is rare # enough that we require users that really mean to play # such unportable linking tricks to link the library # using -Wl,-lname, so that libtool does not consider it # for duplicate removal. 
case " $specialdeplibs " in *" $deplib "*) new_libs="$deplib $new_libs" ;; *) case " $new_libs " in *" $deplib "*) ;; *) new_libs="$deplib $new_libs" ;; esac ;; esac ;; esac done tmp_libs= for deplib in $new_libs; do case $deplib in -L*) case " $tmp_libs " in *" $deplib "*) ;; *) func_append tmp_libs " $deplib" ;; esac ;; *) func_append tmp_libs " $deplib" ;; esac done eval $var=\"$tmp_libs\" done # for var fi # Add Sun CC postdeps if required: test CXX = "$tagname" && { case $host_os in linux*) case `$CC -V 2>&1 | sed 5q` in *Sun\ C*) # Sun C++ 5.9 func_suncc_cstd_abi if test no != "$suncc_use_cstd_abi"; then func_append postdeps ' -library=Cstd -library=Crun' fi ;; esac ;; solaris*) func_cc_basename "$CC" case $func_cc_basename_result in CC* | sunCC*) func_suncc_cstd_abi if test no != "$suncc_use_cstd_abi"; then func_append postdeps ' -library=Cstd -library=Crun' fi ;; esac ;; esac } # Last step: remove runtime libs from dependency_libs # (they stay in deplibs) tmp_libs= for i in $dependency_libs; do case " $predeps $postdeps $compiler_lib_search_path " in *" $i "*) i= ;; esac if test -n "$i"; then func_append tmp_libs " $i" fi done dependency_libs=$tmp_libs done # for pass if test prog = "$linkmode"; then dlfiles=$newdlfiles fi if test prog = "$linkmode" || test lib = "$linkmode"; then dlprefiles=$newdlprefiles fi case $linkmode in oldlib) if test -n "$dlfiles$dlprefiles" || test no != "$dlself"; then func_warning "'-dlopen' is ignored for archives" fi case " $deplibs" in *\ -l* | *\ -L*) func_warning "'-l' and '-L' are ignored for archives" ;; esac test -n "$rpath" && \ func_warning "'-rpath' is ignored for archives" test -n "$xrpath" && \ func_warning "'-R' is ignored for archives" test -n "$vinfo" && \ func_warning "'-version-info/-version-number' is ignored for archives" test -n "$release" && \ func_warning "'-release' is ignored for archives" test -n "$export_symbols$export_symbols_regex" && \ func_warning "'-export-symbols' is ignored for archives" # Now 
set the variables for building old libraries. build_libtool_libs=no oldlibs=$output func_append objs "$old_deplibs" ;; lib) # Make sure we only generate libraries of the form 'libNAME.la'. case $outputname in lib*) func_stripname 'lib' '.la' "$outputname" name=$func_stripname_result eval shared_ext=\"$shrext_cmds\" eval libname=\"$libname_spec\" ;; *) test no = "$module" \ && func_fatal_help "libtool library '$output' must begin with 'lib'" if test no != "$need_lib_prefix"; then # Add the "lib" prefix for modules if required func_stripname '' '.la' "$outputname" name=$func_stripname_result eval shared_ext=\"$shrext_cmds\" eval libname=\"$libname_spec\" else func_stripname '' '.la' "$outputname" libname=$func_stripname_result fi ;; esac if test -n "$objs"; then if test pass_all != "$deplibs_check_method"; then func_fatal_error "cannot build libtool library '$output' from non-libtool objects on this host:$objs" else echo $ECHO "*** Warning: Linking the shared library $output against the non-libtool" $ECHO "*** objects $objs is not portable!" func_append libobjs " $objs" fi fi test no = "$dlself" \ || func_warning "'-dlopen self' is ignored for libtool libraries" set dummy $rpath shift test 1 -lt "$#" \ && func_warning "ignoring multiple '-rpath's for a libtool library" install_libdir=$1 oldlibs= if test -z "$rpath"; then if test yes = "$build_libtool_libs"; then # Building a libtool convenience library. # Some compilers have problems with a '.al' extension so # convenience libraries should have the same extension an # archive normally would. oldlibs="$output_objdir/$libname.$libext $oldlibs" build_libtool_libs=convenience build_old_libs=yes fi test -n "$vinfo" && \ func_warning "'-version-info/-version-number' is ignored for convenience libraries" test -n "$release" && \ func_warning "'-release' is ignored for convenience libraries" else # Parse the version information argument. 
save_ifs=$IFS; IFS=: set dummy $vinfo 0 0 0 shift IFS=$save_ifs test -n "$7" && \ func_fatal_help "too many parameters to '-version-info'" # convert absolute version numbers to libtool ages # this retains compatibility with .la files and attempts # to make the code below a bit more comprehensible case $vinfo_number in yes) number_major=$1 number_minor=$2 number_revision=$3 # # There are really only two kinds -- those that # use the current revision as the major version # and those that subtract age and use age as # a minor version. But, then there is irix # that has an extra 1 added just for fun # case $version_type in # correct linux to gnu/linux during the next big refactor darwin|freebsd-elf|linux|osf|windows|none) func_arith $number_major + $number_minor current=$func_arith_result age=$number_minor revision=$number_revision ;; freebsd-aout|qnx|sunos) current=$number_major revision=$number_minor age=0 ;; irix|nonstopux) func_arith $number_major + $number_minor current=$func_arith_result age=$number_minor revision=$number_minor lt_irix_increment=no ;; esac ;; no) current=$1 revision=$2 age=$3 ;; esac # Check that each of the things are valid numbers. 
case $current in 0|[1-9]|[1-9][0-9]|[1-9][0-9][0-9]|[1-9][0-9][0-9][0-9]|[1-9][0-9][0-9][0-9][0-9]) ;; *) func_error "CURRENT '$current' must be a nonnegative integer" func_fatal_error "'$vinfo' is not valid version information" ;; esac case $revision in 0|[1-9]|[1-9][0-9]|[1-9][0-9][0-9]|[1-9][0-9][0-9][0-9]|[1-9][0-9][0-9][0-9][0-9]) ;; *) func_error "REVISION '$revision' must be a nonnegative integer" func_fatal_error "'$vinfo' is not valid version information" ;; esac case $age in 0|[1-9]|[1-9][0-9]|[1-9][0-9][0-9]|[1-9][0-9][0-9][0-9]|[1-9][0-9][0-9][0-9][0-9]) ;; *) func_error "AGE '$age' must be a nonnegative integer" func_fatal_error "'$vinfo' is not valid version information" ;; esac if test "$age" -gt "$current"; then func_error "AGE '$age' is greater than the current interface number '$current'" func_fatal_error "'$vinfo' is not valid version information" fi # Calculate the version variables. major= versuffix= verstring= case $version_type in none) ;; darwin) # Like Linux, but with the current version available in # verstring for coding it into the library header func_arith $current - $age major=.$func_arith_result versuffix=$major.$age.$revision # Darwin ld doesn't like 0 for these options... 
func_arith $current + 1 minor_current=$func_arith_result xlcverstring="$wl-compatibility_version $wl$minor_current $wl-current_version $wl$minor_current.$revision" verstring="-compatibility_version $minor_current -current_version $minor_current.$revision" # On Darwin other compilers case $CC in nagfor*) verstring="$wl-compatibility_version $wl$minor_current $wl-current_version $wl$minor_current.$revision" ;; *) verstring="-compatibility_version $minor_current -current_version $minor_current.$revision" ;; esac ;; freebsd-aout) major=.$current versuffix=.$current.$revision ;; freebsd-elf) func_arith $current - $age major=.$func_arith_result versuffix=$major.$age.$revision ;; irix | nonstopux) if test no = "$lt_irix_increment"; then func_arith $current - $age else func_arith $current - $age + 1 fi major=$func_arith_result case $version_type in nonstopux) verstring_prefix=nonstopux ;; *) verstring_prefix=sgi ;; esac verstring=$verstring_prefix$major.$revision # Add in all the interfaces that we are compatible with. loop=$revision while test 0 -ne "$loop"; do func_arith $revision - $loop iface=$func_arith_result func_arith $loop - 1 loop=$func_arith_result verstring=$verstring_prefix$major.$iface:$verstring done # Before this point, $major must not contain '.'. major=.$major versuffix=$major.$revision ;; linux) # correct to gnu/linux during the next big refactor func_arith $current - $age major=.$func_arith_result versuffix=$major.$age.$revision ;; osf) func_arith $current - $age major=.$func_arith_result versuffix=.$current.$age.$revision verstring=$current.$age.$revision # Add in all the interfaces that we are compatible with. loop=$age while test 0 -ne "$loop"; do func_arith $current - $loop iface=$func_arith_result func_arith $loop - 1 loop=$func_arith_result verstring=$verstring:$iface.0 done # Make executables depend on our current version. 
func_append verstring ":$current.0" ;; qnx) major=.$current versuffix=.$current ;; sco) major=.$current versuffix=.$current ;; sunos) major=.$current versuffix=.$current.$revision ;; windows) # Use '-' rather than '.', since we only want one # extension on DOS 8.3 file systems. func_arith $current - $age major=$func_arith_result versuffix=-$major ;; *) func_fatal_configuration "unknown library version type '$version_type'" ;; esac # Clear the version info if we defaulted, and they specified a release. if test -z "$vinfo" && test -n "$release"; then major= case $version_type in darwin) # we can't check for "0.0" in archive_cmds due to quoting # problems, so we reset it completely verstring= ;; *) verstring=0.0 ;; esac if test no = "$need_version"; then versuffix= else versuffix=.0.0 fi fi # Remove version info from name if versioning should be avoided if test yes,no = "$avoid_version,$need_version"; then major= versuffix= verstring= fi # Check to see if the archive will have undefined symbols. if test yes = "$allow_undefined"; then if test unsupported = "$allow_undefined_flag"; then if test yes = "$build_old_libs"; then func_warning "undefined symbols not allowed in $host shared libraries; building static only" build_libtool_libs=no else func_fatal_error "can't build $host shared library unless -no-undefined is specified" fi fi else # Don't allow undefined symbols. allow_undefined_flag=$no_undefined_flag fi fi func_generate_dlsyms "$libname" "$libname" : func_append libobjs " $symfileobj" test " " = "$libobjs" && libobjs= if test relink != "$opt_mode"; then # Remove our outputs, but don't remove object files since they # may have been created when compiling PIC objects. 
removelist= tempremovelist=`$ECHO "$output_objdir/*"` for p in $tempremovelist; do case $p in *.$objext | *.gcno) ;; $output_objdir/$outputname | $output_objdir/$libname.* | $output_objdir/$libname$release.*) if test -n "$precious_files_regex"; then if $ECHO "$p" | $EGREP -e "$precious_files_regex" >/dev/null 2>&1 then continue fi fi func_append removelist " $p" ;; *) ;; esac done test -n "$removelist" && \ func_show_eval "${RM}r \$removelist" fi # Now set the variables for building old libraries. if test yes = "$build_old_libs" && test convenience != "$build_libtool_libs"; then func_append oldlibs " $output_objdir/$libname.$libext" # Transform .lo files to .o files. oldobjs="$objs "`$ECHO "$libobjs" | $SP2NL | $SED "/\.$libext$/d; $lo2o" | $NL2SP` fi # Eliminate all temporary directories. #for path in $notinst_path; do # lib_search_path=`$ECHO "$lib_search_path " | $SED "s% $path % %g"` # deplibs=`$ECHO "$deplibs " | $SED "s% -L$path % %g"` # dependency_libs=`$ECHO "$dependency_libs " | $SED "s% -L$path % %g"` #done if test -n "$xrpath"; then # If the user specified any rpath flags, then add them. 
temp_xrpath= for libdir in $xrpath; do func_replace_sysroot "$libdir" func_append temp_xrpath " -R$func_replace_sysroot_result" case "$finalize_rpath " in *" $libdir "*) ;; *) func_append finalize_rpath " $libdir" ;; esac done if test yes != "$hardcode_into_libs" || test yes = "$build_old_libs"; then dependency_libs="$temp_xrpath $dependency_libs" fi fi # Make sure dlfiles contains only unique files that won't be dlpreopened old_dlfiles=$dlfiles dlfiles= for lib in $old_dlfiles; do case " $dlprefiles $dlfiles " in *" $lib "*) ;; *) func_append dlfiles " $lib" ;; esac done # Make sure dlprefiles contains only unique files old_dlprefiles=$dlprefiles dlprefiles= for lib in $old_dlprefiles; do case "$dlprefiles " in *" $lib "*) ;; *) func_append dlprefiles " $lib" ;; esac done if test yes = "$build_libtool_libs"; then if test -n "$rpath"; then case $host in *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-*-beos* | *-cegcc* | *-*-haiku*) # these systems don't actually have a c library (as such)! ;; *-*-rhapsody* | *-*-darwin1.[012]) # Rhapsody C library is in the System framework func_append deplibs " System.ltframework" ;; *-*-netbsd*) # Don't link with libc until the a.out ld.so is fixed. ;; *-*-openbsd* | *-*-freebsd* | *-*-dragonfly*) # Do not include libc due to us having libc/libc_r. ;; *-*-sco3.2v5* | *-*-sco5v6*) # Causes problems with __ctype ;; *-*-sysv4.2uw2* | *-*-sysv5* | *-*-unixware* | *-*-OpenUNIX*) # Compiler inserts libc in the correct place for threads to work ;; *) # Add libc to deplibs on all other systems if necessary. if test yes = "$build_libtool_need_lc"; then func_append deplibs " -lc" fi ;; esac fi # Transform deplibs into only deplibs that can be linked in shared. name_save=$name libname_save=$libname release_save=$release versuffix_save=$versuffix major_save=$major # I'm not sure if I'm treating the release correctly. I think # release should show up in the -l (ie -lgmp5) so we don't want to # add it in twice. Is that correct? 
release= versuffix= major= newdeplibs= droppeddeps=no case $deplibs_check_method in pass_all) # Don't check for shared/static. Everything works. # This might be a little naive. We might want to check # whether the library exists or not. But this is on # osf3 & osf4 and I'm not really sure... Just # implementing what was already the behavior. newdeplibs=$deplibs ;; test_compile) # This code stresses the "libraries are programs" paradigm to its # limits. Maybe even breaks it. We compile a program, linking it # against the deplibs as a proxy for the library. Then we can check # whether they linked in statically or dynamically with ldd. $opt_dry_run || $RM conftest.c cat > conftest.c </dev/null` $nocaseglob else potential_libs=`ls $i/$libnameglob[.-]* 2>/dev/null` fi for potent_lib in $potential_libs; do # Follow soft links. if ls -lLd "$potent_lib" 2>/dev/null | $GREP " -> " >/dev/null; then continue fi # The statement above tries to avoid entering an # endless loop below, in case of cyclic links. # We might still enter an endless loop, since a link # loop can be closed while we follow links, # but so what? potlib=$potent_lib while test -h "$potlib" 2>/dev/null; do potliblink=`ls -ld $potlib | $SED 's/.* -> //'` case $potliblink in [\\/]* | [A-Za-z]:[\\/]*) potlib=$potliblink;; *) potlib=`$ECHO "$potlib" | $SED 's|[^/]*$||'`"$potliblink";; esac done if eval $file_magic_cmd \"\$potlib\" 2>/dev/null | $SED -e 10q | $EGREP "$file_magic_regex" > /dev/null; then func_append newdeplibs " $a_deplib" a_deplib= break 2 fi done done fi if test -n "$a_deplib"; then droppeddeps=yes echo $ECHO "*** Warning: linker path does not have real file for library $a_deplib." echo "*** I have the capability to make that library automatically link in when" echo "*** you link to this library. 
But I can only do this if you have a" echo "*** shared version of the library, which you do not appear to have" echo "*** because I did check the linker path looking for a file starting" if test -z "$potlib"; then $ECHO "*** with $libname but no candidates were found. (...for file magic test)" else $ECHO "*** with $libname and none of the candidates passed a file format test" $ECHO "*** using a file magic. Last file checked: $potlib" fi fi ;; *) # Add a -L argument. func_append newdeplibs " $a_deplib" ;; esac done # Gone through all deplibs. ;; match_pattern*) set dummy $deplibs_check_method; shift match_pattern_regex=`expr "$deplibs_check_method" : "$1 \(.*\)"` for a_deplib in $deplibs; do case $a_deplib in -l*) func_stripname -l '' "$a_deplib" name=$func_stripname_result if test yes = "$allow_libtool_libs_with_static_runtimes"; then case " $predeps $postdeps " in *" $a_deplib "*) func_append newdeplibs " $a_deplib" a_deplib= ;; esac fi if test -n "$a_deplib"; then libname=`eval "\\$ECHO \"$libname_spec\""` for i in $lib_search_path $sys_lib_search_path $shlib_search_path; do potential_libs=`ls $i/$libname[.-]* 2>/dev/null` for potent_lib in $potential_libs; do potlib=$potent_lib # see symlink-check above in file_magic test if eval "\$ECHO \"$potent_lib\"" 2>/dev/null | $SED 10q | \ $EGREP "$match_pattern_regex" > /dev/null; then func_append newdeplibs " $a_deplib" a_deplib= break 2 fi done done fi if test -n "$a_deplib"; then droppeddeps=yes echo $ECHO "*** Warning: linker path does not have real file for library $a_deplib." echo "*** I have the capability to make that library automatically link in when" echo "*** you link to this library. But I can only do this if you have a" echo "*** shared version of the library, which you do not appear to have" echo "*** because I did check the linker path looking for a file starting" if test -z "$potlib"; then $ECHO "*** with $libname but no candidates were found. 
(...for regex pattern test)" else $ECHO "*** with $libname and none of the candidates passed a file format test" $ECHO "*** using a regex pattern. Last file checked: $potlib" fi fi ;; *) # Add a -L argument. func_append newdeplibs " $a_deplib" ;; esac done # Gone through all deplibs. ;; none | unknown | *) newdeplibs= tmp_deplibs=`$ECHO " $deplibs" | $SED 's/ -lc$//; s/ -[LR][^ ]*//g'` if test yes = "$allow_libtool_libs_with_static_runtimes"; then for i in $predeps $postdeps; do # can't use Xsed below, because $i might contain '/' tmp_deplibs=`$ECHO " $tmp_deplibs" | $SED "s|$i||"` done fi case $tmp_deplibs in *[!\ \ ]*) echo if test none = "$deplibs_check_method"; then echo "*** Warning: inter-library dependencies are not supported in this platform." else echo "*** Warning: inter-library dependencies are not known to be supported." fi echo "*** All declared inter-library dependencies are being dropped." droppeddeps=yes ;; esac ;; esac versuffix=$versuffix_save major=$major_save release=$release_save libname=$libname_save name=$name_save case $host in *-*-rhapsody* | *-*-darwin1.[012]) # On Rhapsody replace the C library with the System framework newdeplibs=`$ECHO " $newdeplibs" | $SED 's/ -lc / System.ltframework /'` ;; esac if test yes = "$droppeddeps"; then if test yes = "$module"; then echo echo "*** Warning: libtool could not satisfy all declared inter-library" $ECHO "*** dependencies of module $libname. Therefore, libtool will create" echo "*** a static module, that should work as long as the dlopening" echo "*** application is linked with the -dlopen flag." if test -z "$global_symbol_pipe"; then echo echo "*** However, this would only work if libtool was able to extract symbol" echo "*** lists from a program, using 'nm' or equivalent, but libtool could" echo "*** not find such a program. So, this module is probably useless." echo "*** 'nm' from GNU binutils and a full rebuild may help." 
fi if test no = "$build_old_libs"; then oldlibs=$output_objdir/$libname.$libext build_libtool_libs=module build_old_libs=yes else build_libtool_libs=no fi else echo "*** The inter-library dependencies that have been dropped here will be" echo "*** automatically added whenever a program is linked with this library" echo "*** or is declared to -dlopen it." if test no = "$allow_undefined"; then echo echo "*** Since this library must not contain undefined symbols," echo "*** because either the platform does not support them or" echo "*** it was explicitly requested with -no-undefined," echo "*** libtool will only create a static version of it." if test no = "$build_old_libs"; then oldlibs=$output_objdir/$libname.$libext build_libtool_libs=module build_old_libs=yes else build_libtool_libs=no fi fi fi fi # Done checking deplibs! deplibs=$newdeplibs fi # Time to change all our "foo.ltframework" stuff back to "-framework foo" case $host in *-*-darwin*) newdeplibs=`$ECHO " $newdeplibs" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'` new_inherited_linker_flags=`$ECHO " $new_inherited_linker_flags" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'` deplibs=`$ECHO " $deplibs" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'` ;; esac # move library search paths that coincide with paths to not yet # installed libraries to the beginning of the library search list new_libs= for path in $notinst_path; do case " $new_libs " in *" -L$path/$objdir "*) ;; *) case " $deplibs " in *" -L$path/$objdir "*) func_append new_libs " -L$path/$objdir" ;; esac ;; esac done for deplib in $deplibs; do case $deplib in -L*) case " $new_libs " in *" $deplib "*) ;; *) func_append new_libs " $deplib" ;; esac ;; *) func_append new_libs " $deplib" ;; esac done deplibs=$new_libs # All the library-specific variables (install_libdir is set above). 
library_names= old_library= dlname= # Test again, we may have decided not to build it any more if test yes = "$build_libtool_libs"; then # Remove $wl instances when linking with ld. # FIXME: should test the right _cmds variable. case $archive_cmds in *\$LD\ *) wl= ;; esac if test yes = "$hardcode_into_libs"; then # Hardcode the library paths hardcode_libdirs= dep_rpath= rpath=$finalize_rpath test relink = "$opt_mode" || rpath=$compile_rpath$rpath for libdir in $rpath; do if test -n "$hardcode_libdir_flag_spec"; then if test -n "$hardcode_libdir_separator"; then func_replace_sysroot "$libdir" libdir=$func_replace_sysroot_result if test -z "$hardcode_libdirs"; then hardcode_libdirs=$libdir else # Just accumulate the unique libdirs. case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*) ;; *) func_append hardcode_libdirs "$hardcode_libdir_separator$libdir" ;; esac fi else eval flag=\"$hardcode_libdir_flag_spec\" func_append dep_rpath " $flag" fi elif test -n "$runpath_var"; then case "$perm_rpath " in *" $libdir "*) ;; *) func_append perm_rpath " $libdir" ;; esac fi done # Substitute the hardcoded libdirs into the rpath. if test -n "$hardcode_libdir_separator" && test -n "$hardcode_libdirs"; then libdir=$hardcode_libdirs eval "dep_rpath=\"$hardcode_libdir_flag_spec\"" fi if test -n "$runpath_var" && test -n "$perm_rpath"; then # We should set the runpath_var. rpath= for dir in $perm_rpath; do func_append rpath "$dir:" done eval "$runpath_var='$rpath\$$runpath_var'; export $runpath_var" fi test -n "$dep_rpath" && deplibs="$dep_rpath $deplibs" fi shlibpath=$finalize_shlibpath test relink = "$opt_mode" || shlibpath=$compile_shlibpath$shlibpath if test -n "$shlibpath"; then eval "$shlibpath_var='$shlibpath\$$shlibpath_var'; export $shlibpath_var" fi # Get the real and link names of the library. 
eval shared_ext=\"$shrext_cmds\" eval library_names=\"$library_names_spec\" set dummy $library_names shift realname=$1 shift if test -n "$soname_spec"; then eval soname=\"$soname_spec\" else soname=$realname fi if test -z "$dlname"; then dlname=$soname fi lib=$output_objdir/$realname linknames= for link do func_append linknames " $link" done # Use standard objects if they are pic test -z "$pic_flag" && libobjs=`$ECHO "$libobjs" | $SP2NL | $SED "$lo2o" | $NL2SP` test "X$libobjs" = "X " && libobjs= delfiles= if test -n "$export_symbols" && test -n "$include_expsyms"; then $opt_dry_run || cp "$export_symbols" "$output_objdir/$libname.uexp" export_symbols=$output_objdir/$libname.uexp func_append delfiles " $export_symbols" fi orig_export_symbols= case $host_os in cygwin* | mingw* | cegcc*) if test -n "$export_symbols" && test -z "$export_symbols_regex"; then # exporting using user supplied symfile func_dll_def_p "$export_symbols" || { # and it's NOT already a .def file. Must figure out # which of the given symbols are data symbols and tag # them as such. So, trigger use of export_symbols_cmds. # export_symbols gets reassigned inside the "prepare # the list of exported symbols" if statement, so the # include_expsyms logic still works. orig_export_symbols=$export_symbols export_symbols= always_export_symbols=yes } fi ;; esac # Prepare the list of exported symbols if test -z "$export_symbols"; then if test yes = "$always_export_symbols" || test -n "$export_symbols_regex"; then func_verbose "generating symbol list for '$libname.la'" export_symbols=$output_objdir/$libname.exp $opt_dry_run || $RM $export_symbols cmds=$export_symbols_cmds save_ifs=$IFS; IFS='~' for cmd1 in $cmds; do IFS=$save_ifs # Take the normal branch if the nm_file_list_spec branch # doesn't work or if tool conversion is not needed. 
case $nm_file_list_spec~$to_tool_file_cmd in *~func_convert_file_noop | *~func_convert_file_msys_to_w32 | ~*) try_normal_branch=yes eval cmd=\"$cmd1\" func_len " $cmd" len=$func_len_result ;; *) try_normal_branch=no ;; esac if test yes = "$try_normal_branch" \ && { test "$len" -lt "$max_cmd_len" \ || test "$max_cmd_len" -le -1; } then func_show_eval "$cmd" 'exit $?' skipped_export=false elif test -n "$nm_file_list_spec"; then func_basename "$output" output_la=$func_basename_result save_libobjs=$libobjs save_output=$output output=$output_objdir/$output_la.nm func_to_tool_file "$output" libobjs=$nm_file_list_spec$func_to_tool_file_result func_append delfiles " $output" func_verbose "creating $NM input file list: $output" for obj in $save_libobjs; do func_to_tool_file "$obj" $ECHO "$func_to_tool_file_result" done > "$output" eval cmd=\"$cmd1\" func_show_eval "$cmd" 'exit $?' output=$save_output libobjs=$save_libobjs skipped_export=false else # The command line is too long to execute in one step. func_verbose "using reloadable object file for export list..." skipped_export=: # Break out early, otherwise skipped_export may be # set to false by a later but shorter cmd. break fi done IFS=$save_ifs if test -n "$export_symbols_regex" && test : != "$skipped_export"; then func_show_eval '$EGREP -e "$export_symbols_regex" "$export_symbols" > "${export_symbols}T"' func_show_eval '$MV "${export_symbols}T" "$export_symbols"' fi fi fi if test -n "$export_symbols" && test -n "$include_expsyms"; then tmp_export_symbols=$export_symbols test -n "$orig_export_symbols" && tmp_export_symbols=$orig_export_symbols $opt_dry_run || eval '$ECHO "$include_expsyms" | $SP2NL >> "$tmp_export_symbols"' fi if test : != "$skipped_export" && test -n "$orig_export_symbols"; then # The given exports_symbols file has to be filtered, so filter it. 
func_verbose "filter symbol list for '$libname.la' to tag DATA exports" # FIXME: $output_objdir/$libname.filter potentially contains lots of # 's' commands, which not all seds can handle. GNU sed should be fine # though. Also, the filter scales superlinearly with the number of # global variables. join(1) would be nice here, but unfortunately # isn't a blessed tool. $opt_dry_run || $SED -e '/[ ,]DATA/!d;s,\(.*\)\([ \,].*\),s|^\1$|\1\2|,' < $export_symbols > $output_objdir/$libname.filter func_append delfiles " $export_symbols $output_objdir/$libname.filter" export_symbols=$output_objdir/$libname.def $opt_dry_run || $SED -f $output_objdir/$libname.filter < $orig_export_symbols > $export_symbols fi tmp_deplibs= for test_deplib in $deplibs; do case " $convenience " in *" $test_deplib "*) ;; *) func_append tmp_deplibs " $test_deplib" ;; esac done deplibs=$tmp_deplibs if test -n "$convenience"; then if test -n "$whole_archive_flag_spec" && test yes = "$compiler_needs_object" && test -z "$libobjs"; then # extract the archives, so we have objects to list. # TODO: could optimize this to just extract one archive. whole_archive_flag_spec= fi if test -n "$whole_archive_flag_spec"; then save_libobjs=$libobjs eval libobjs=\"\$libobjs $whole_archive_flag_spec\" test "X$libobjs" = "X " && libobjs= else gentop=$output_objdir/${outputname}x func_append generated " $gentop" func_extract_archives $gentop $convenience func_append libobjs " $func_extract_archives_result" test "X$libobjs" = "X " && libobjs= fi fi if test yes = "$thread_safe" && test -n "$thread_safe_flag_spec"; then eval flag=\"$thread_safe_flag_spec\" func_append linker_flags " $flag" fi # Make a backup of the uninstalled library when relinking if test relink = "$opt_mode"; then $opt_dry_run || eval '(cd $output_objdir && $RM ${realname}U && $MV $realname ${realname}U)' || exit $? fi # Do each of the archive commands. 
if test yes = "$module" && test -n "$module_cmds"; then if test -n "$export_symbols" && test -n "$module_expsym_cmds"; then eval test_cmds=\"$module_expsym_cmds\" cmds=$module_expsym_cmds else eval test_cmds=\"$module_cmds\" cmds=$module_cmds fi else if test -n "$export_symbols" && test -n "$archive_expsym_cmds"; then eval test_cmds=\"$archive_expsym_cmds\" cmds=$archive_expsym_cmds else eval test_cmds=\"$archive_cmds\" cmds=$archive_cmds fi fi if test : != "$skipped_export" && func_len " $test_cmds" && len=$func_len_result && test "$len" -lt "$max_cmd_len" || test "$max_cmd_len" -le -1; then : else # The command line is too long to link in one step, link piecewise # or, if using GNU ld and skipped_export is not :, use a linker # script. # Save the value of $output and $libobjs because we want to # use them later. If we have whole_archive_flag_spec, we # want to use save_libobjs as it was before # whole_archive_flag_spec was expanded, because we can't # assume the linker understands whole_archive_flag_spec. # This may have to be revisited, in case too many # convenience libraries get linked in and end up exceeding # the spec. if test -z "$convenience" || test -z "$whole_archive_flag_spec"; then save_libobjs=$libobjs fi save_output=$output func_basename "$output" output_la=$func_basename_result # Clear the reloadable object creation command queue and # initialize k to one. 
test_cmds= concat_cmds= objlist= last_robj= k=1 if test -n "$save_libobjs" && test : != "$skipped_export" && test yes = "$with_gnu_ld"; then output=$output_objdir/$output_la.lnkscript func_verbose "creating GNU ld script: $output" echo 'INPUT (' > $output for obj in $save_libobjs do func_to_tool_file "$obj" $ECHO "$func_to_tool_file_result" >> $output done echo ')' >> $output func_append delfiles " $output" func_to_tool_file "$output" output=$func_to_tool_file_result elif test -n "$save_libobjs" && test : != "$skipped_export" && test -n "$file_list_spec"; then output=$output_objdir/$output_la.lnk func_verbose "creating linker input file list: $output" : > $output set x $save_libobjs shift firstobj= if test yes = "$compiler_needs_object"; then firstobj="$1 " shift fi for obj do func_to_tool_file "$obj" $ECHO "$func_to_tool_file_result" >> $output done func_append delfiles " $output" func_to_tool_file "$output" output=$firstobj\"$file_list_spec$func_to_tool_file_result\" else if test -n "$save_libobjs"; then func_verbose "creating reloadable object files..." output=$output_objdir/$output_la-$k.$objext eval test_cmds=\"$reload_cmds\" func_len " $test_cmds" len0=$func_len_result len=$len0 # Loop over the list of objects to be linked. for obj in $save_libobjs do func_len " $obj" func_arith $len + $func_len_result len=$func_arith_result if test -z "$objlist" || test "$len" -lt "$max_cmd_len"; then func_append objlist " $obj" else # The command $test_cmds is almost too long, add a # command to the queue. if test 1 -eq "$k"; then # The first file doesn't have a previous command to add. reload_objs=$objlist eval concat_cmds=\"$reload_cmds\" else # All subsequent reloadable object files will link in # the last one created. 
reload_objs="$objlist $last_robj" eval concat_cmds=\"\$concat_cmds~$reload_cmds~\$RM $last_robj\" fi last_robj=$output_objdir/$output_la-$k.$objext func_arith $k + 1 k=$func_arith_result output=$output_objdir/$output_la-$k.$objext objlist=" $obj" func_len " $last_robj" func_arith $len0 + $func_len_result len=$func_arith_result fi done # Handle the remaining objects by creating one last # reloadable object file. All subsequent reloadable object # files will link in the last one created. test -z "$concat_cmds" || concat_cmds=$concat_cmds~ reload_objs="$objlist $last_robj" eval concat_cmds=\"\$concat_cmds$reload_cmds\" if test -n "$last_robj"; then eval concat_cmds=\"\$concat_cmds~\$RM $last_robj\" fi func_append delfiles " $output" else output= fi ${skipped_export-false} && { func_verbose "generating symbol list for '$libname.la'" export_symbols=$output_objdir/$libname.exp $opt_dry_run || $RM $export_symbols libobjs=$output # Append the command to create the export file. test -z "$concat_cmds" || concat_cmds=$concat_cmds~ eval concat_cmds=\"\$concat_cmds$export_symbols_cmds\" if test -n "$last_robj"; then eval concat_cmds=\"\$concat_cmds~\$RM $last_robj\" fi } test -n "$save_libobjs" && func_verbose "creating a temporary reloadable object file: $output" # Loop through the commands generated above and execute them. save_ifs=$IFS; IFS='~' for cmd in $concat_cmds; do IFS=$save_ifs $opt_quiet || { func_quote_for_expand "$cmd" eval "func_echo $func_quote_for_expand_result" } $opt_dry_run || eval "$cmd" || { lt_exit=$? 
# Restore the uninstalled library and exit if test relink = "$opt_mode"; then ( cd "$output_objdir" && \ $RM "${realname}T" && \ $MV "${realname}U" "$realname" ) fi exit $lt_exit } done IFS=$save_ifs if test -n "$export_symbols_regex" && ${skipped_export-false}; then func_show_eval '$EGREP -e "$export_symbols_regex" "$export_symbols" > "${export_symbols}T"' func_show_eval '$MV "${export_symbols}T" "$export_symbols"' fi fi ${skipped_export-false} && { if test -n "$export_symbols" && test -n "$include_expsyms"; then tmp_export_symbols=$export_symbols test -n "$orig_export_symbols" && tmp_export_symbols=$orig_export_symbols $opt_dry_run || eval '$ECHO "$include_expsyms" | $SP2NL >> "$tmp_export_symbols"' fi if test -n "$orig_export_symbols"; then # The given exports_symbols file has to be filtered, so filter it. func_verbose "filter symbol list for '$libname.la' to tag DATA exports" # FIXME: $output_objdir/$libname.filter potentially contains lots of # 's' commands, which not all seds can handle. GNU sed should be fine # though. Also, the filter scales superlinearly with the number of # global variables. join(1) would be nice here, but unfortunately # isn't a blessed tool. $opt_dry_run || $SED -e '/[ ,]DATA/!d;s,\(.*\)\([ \,].*\),s|^\1$|\1\2|,' < $export_symbols > $output_objdir/$libname.filter func_append delfiles " $export_symbols $output_objdir/$libname.filter" export_symbols=$output_objdir/$libname.def $opt_dry_run || $SED -f $output_objdir/$libname.filter < $orig_export_symbols > $export_symbols fi } libobjs=$output # Restore the value of output. output=$save_output if test -n "$convenience" && test -n "$whole_archive_flag_spec"; then eval libobjs=\"\$libobjs $whole_archive_flag_spec\" test "X$libobjs" = "X " && libobjs= fi # Expand the library linking commands again to reset the # value of $libobjs for piecewise linking. # Do each of the archive commands. 
if test yes = "$module" && test -n "$module_cmds"; then if test -n "$export_symbols" && test -n "$module_expsym_cmds"; then cmds=$module_expsym_cmds else cmds=$module_cmds fi else if test -n "$export_symbols" && test -n "$archive_expsym_cmds"; then cmds=$archive_expsym_cmds else cmds=$archive_cmds fi fi fi if test -n "$delfiles"; then # Append the command to remove temporary files to $cmds. eval cmds=\"\$cmds~\$RM $delfiles\" fi # Add any objects from preloaded convenience libraries if test -n "$dlprefiles"; then gentop=$output_objdir/${outputname}x func_append generated " $gentop" func_extract_archives $gentop $dlprefiles func_append libobjs " $func_extract_archives_result" test "X$libobjs" = "X " && libobjs= fi save_ifs=$IFS; IFS='~' for cmd in $cmds; do IFS=$sp$nl eval cmd=\"$cmd\" IFS=$save_ifs $opt_quiet || { func_quote_for_expand "$cmd" eval "func_echo $func_quote_for_expand_result" } $opt_dry_run || eval "$cmd" || { lt_exit=$? # Restore the uninstalled library and exit if test relink = "$opt_mode"; then ( cd "$output_objdir" && \ $RM "${realname}T" && \ $MV "${realname}U" "$realname" ) fi exit $lt_exit } done IFS=$save_ifs # Restore the uninstalled library and exit if test relink = "$opt_mode"; then $opt_dry_run || eval '(cd $output_objdir && $RM ${realname}T && $MV $realname ${realname}T && $MV ${realname}U $realname)' || exit $? if test -n "$convenience"; then if test -z "$whole_archive_flag_spec"; then func_show_eval '${RM}r "$gentop"' fi fi exit $EXIT_SUCCESS fi # Create links to the real library. for linkname in $linknames; do if test "$realname" != "$linkname"; then func_show_eval '(cd "$output_objdir" && $RM "$linkname" && $LN_S "$realname" "$linkname")' 'exit $?' fi done # If -module or -export-dynamic was specified, set the dlname. if test yes = "$module" || test yes = "$export_dynamic"; then # On all known operating systems, these are identical. 
dlname=$soname fi fi ;; obj) if test -n "$dlfiles$dlprefiles" || test no != "$dlself"; then func_warning "'-dlopen' is ignored for objects" fi case " $deplibs" in *\ -l* | *\ -L*) func_warning "'-l' and '-L' are ignored for objects" ;; esac test -n "$rpath" && \ func_warning "'-rpath' is ignored for objects" test -n "$xrpath" && \ func_warning "'-R' is ignored for objects" test -n "$vinfo" && \ func_warning "'-version-info' is ignored for objects" test -n "$release" && \ func_warning "'-release' is ignored for objects" case $output in *.lo) test -n "$objs$old_deplibs" && \ func_fatal_error "cannot build library object '$output' from non-libtool objects" libobj=$output func_lo2o "$libobj" obj=$func_lo2o_result ;; *) libobj= obj=$output ;; esac # Delete the old objects. $opt_dry_run || $RM $obj $libobj # Objects from convenience libraries. This assumes # single-version convenience libraries. Whenever we create # different ones for PIC/non-PIC, this we'll have to duplicate # the extraction. reload_conv_objs= gentop= # if reload_cmds runs $LD directly, get rid of -Wl from # whole_archive_flag_spec and hope we can get by with turning comma # into space. case $reload_cmds in *\$LD[\ \$]*) wl= ;; esac if test -n "$convenience"; then if test -n "$whole_archive_flag_spec"; then eval tmp_whole_archive_flags=\"$whole_archive_flag_spec\" test -n "$wl" || tmp_whole_archive_flags=`$ECHO "$tmp_whole_archive_flags" | $SED 's|,| |g'` reload_conv_objs=$reload_objs\ $tmp_whole_archive_flags else gentop=$output_objdir/${obj}x func_append generated " $gentop" func_extract_archives $gentop $convenience reload_conv_objs="$reload_objs $func_extract_archives_result" fi fi # If we're not building shared, we need to use non_pic_objs test yes = "$build_libtool_libs" || libobjs=$non_pic_objects # Create the old-style object. 
reload_objs=$objs$old_deplibs' '`$ECHO "$libobjs" | $SP2NL | $SED "/\.$libext$/d; /\.lib$/d; $lo2o" | $NL2SP`' '$reload_conv_objs output=$obj func_execute_cmds "$reload_cmds" 'exit $?' # Exit if we aren't doing a library object file. if test -z "$libobj"; then if test -n "$gentop"; then func_show_eval '${RM}r "$gentop"' fi exit $EXIT_SUCCESS fi test yes = "$build_libtool_libs" || { if test -n "$gentop"; then func_show_eval '${RM}r "$gentop"' fi # Create an invalid libtool object if no PIC, so that we don't # accidentally link it into a program. # $show "echo timestamp > $libobj" # $opt_dry_run || eval "echo timestamp > $libobj" || exit $? exit $EXIT_SUCCESS } if test -n "$pic_flag" || test default != "$pic_mode"; then # Only do commands if we really have different PIC objects. reload_objs="$libobjs $reload_conv_objs" output=$libobj func_execute_cmds "$reload_cmds" 'exit $?' fi if test -n "$gentop"; then func_show_eval '${RM}r "$gentop"' fi exit $EXIT_SUCCESS ;; prog) case $host in *cygwin*) func_stripname '' '.exe' "$output" output=$func_stripname_result.exe;; esac test -n "$vinfo" && \ func_warning "'-version-info' is ignored for programs" test -n "$release" && \ func_warning "'-release' is ignored for programs" $preload \ && test unknown,unknown,unknown = "$dlopen_support,$dlopen_self,$dlopen_self_static" \ && func_warning "'LT_INIT([dlopen])' not used. Assuming no dlopen support." case $host in *-*-rhapsody* | *-*-darwin1.[012]) # On Rhapsody replace the C library is the System framework compile_deplibs=`$ECHO " $compile_deplibs" | $SED 's/ -lc / System.ltframework /'` finalize_deplibs=`$ECHO " $finalize_deplibs" | $SED 's/ -lc / System.ltframework /'` ;; esac case $host in *-*-darwin*) # Don't allow lazy linking, it breaks C++ global constructors # But is supposedly fixed on 10.4 or later (yay!). 
if test CXX = "$tagname"; then case ${MACOSX_DEPLOYMENT_TARGET-10.0} in 10.[0123]) func_append compile_command " $wl-bind_at_load" func_append finalize_command " $wl-bind_at_load" ;; esac fi # Time to change all our "foo.ltframework" stuff back to "-framework foo" compile_deplibs=`$ECHO " $compile_deplibs" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'` finalize_deplibs=`$ECHO " $finalize_deplibs" | $SED 's% \([^ $]*\).ltframework% -framework \1%g'` ;; esac # move library search paths that coincide with paths to not yet # installed libraries to the beginning of the library search list new_libs= for path in $notinst_path; do case " $new_libs " in *" -L$path/$objdir "*) ;; *) case " $compile_deplibs " in *" -L$path/$objdir "*) func_append new_libs " -L$path/$objdir" ;; esac ;; esac done for deplib in $compile_deplibs; do case $deplib in -L*) case " $new_libs " in *" $deplib "*) ;; *) func_append new_libs " $deplib" ;; esac ;; *) func_append new_libs " $deplib" ;; esac done compile_deplibs=$new_libs func_append compile_command " $compile_deplibs" func_append finalize_command " $finalize_deplibs" if test -n "$rpath$xrpath"; then # If the user specified any rpath flags, then add them. for libdir in $rpath $xrpath; do # This is the magic to use -rpath. case "$finalize_rpath " in *" $libdir "*) ;; *) func_append finalize_rpath " $libdir" ;; esac done fi # Now hardcode the library paths rpath= hardcode_libdirs= for libdir in $compile_rpath $finalize_rpath; do if test -n "$hardcode_libdir_flag_spec"; then if test -n "$hardcode_libdir_separator"; then if test -z "$hardcode_libdirs"; then hardcode_libdirs=$libdir else # Just accumulate the unique libdirs. 
case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*) ;; *) func_append hardcode_libdirs "$hardcode_libdir_separator$libdir" ;; esac fi else eval flag=\"$hardcode_libdir_flag_spec\" func_append rpath " $flag" fi elif test -n "$runpath_var"; then case "$perm_rpath " in *" $libdir "*) ;; *) func_append perm_rpath " $libdir" ;; esac fi case $host in *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-cegcc*) testbindir=`$ECHO "$libdir" | $SED -e 's*/lib$*/bin*'` case :$dllsearchpath: in *":$libdir:"*) ;; ::) dllsearchpath=$libdir;; *) func_append dllsearchpath ":$libdir";; esac case :$dllsearchpath: in *":$testbindir:"*) ;; ::) dllsearchpath=$testbindir;; *) func_append dllsearchpath ":$testbindir";; esac ;; esac done # Substitute the hardcoded libdirs into the rpath. if test -n "$hardcode_libdir_separator" && test -n "$hardcode_libdirs"; then libdir=$hardcode_libdirs eval rpath=\" $hardcode_libdir_flag_spec\" fi compile_rpath=$rpath rpath= hardcode_libdirs= for libdir in $finalize_rpath; do if test -n "$hardcode_libdir_flag_spec"; then if test -n "$hardcode_libdir_separator"; then if test -z "$hardcode_libdirs"; then hardcode_libdirs=$libdir else # Just accumulate the unique libdirs. case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*) ;; *) func_append hardcode_libdirs "$hardcode_libdir_separator$libdir" ;; esac fi else eval flag=\"$hardcode_libdir_flag_spec\" func_append rpath " $flag" fi elif test -n "$runpath_var"; then case "$finalize_perm_rpath " in *" $libdir "*) ;; *) func_append finalize_perm_rpath " $libdir" ;; esac fi done # Substitute the hardcoded libdirs into the rpath. 
if test -n "$hardcode_libdir_separator" && test -n "$hardcode_libdirs"; then libdir=$hardcode_libdirs eval rpath=\" $hardcode_libdir_flag_spec\" fi finalize_rpath=$rpath if test -n "$libobjs" && test yes = "$build_old_libs"; then # Transform all the library objects into standard objects. compile_command=`$ECHO "$compile_command" | $SP2NL | $SED "$lo2o" | $NL2SP` finalize_command=`$ECHO "$finalize_command" | $SP2NL | $SED "$lo2o" | $NL2SP` fi func_generate_dlsyms "$outputname" "@PROGRAM@" false # template prelinking step if test -n "$prelink_cmds"; then func_execute_cmds "$prelink_cmds" 'exit $?' fi wrappers_required=: case $host in *cegcc* | *mingw32ce*) # Disable wrappers for cegcc and mingw32ce hosts, we are cross compiling anyway. wrappers_required=false ;; *cygwin* | *mingw* ) test yes = "$build_libtool_libs" || wrappers_required=false ;; *) if test no = "$need_relink" || test yes != "$build_libtool_libs"; then wrappers_required=false fi ;; esac $wrappers_required || { # Replace the output file specification. compile_command=`$ECHO "$compile_command" | $SED 's%@OUTPUT@%'"$output"'%g'` link_command=$compile_command$compile_rpath # We have no uninstalled library dependencies, so finalize right now. exit_status=0 func_show_eval "$link_command" 'exit_status=$?' if test -n "$postlink_cmds"; then func_to_tool_file "$output" postlink_cmds=`func_echo_all "$postlink_cmds" | $SED -e 's%@OUTPUT@%'"$output"'%g' -e 's%@TOOL_OUTPUT@%'"$func_to_tool_file_result"'%g'` func_execute_cmds "$postlink_cmds" 'exit $?' fi # Delete the generated files. 
if test -f "$output_objdir/${outputname}S.$objext"; then func_show_eval '$RM "$output_objdir/${outputname}S.$objext"' fi exit $exit_status } if test -n "$compile_shlibpath$finalize_shlibpath"; then compile_command="$shlibpath_var=\"$compile_shlibpath$finalize_shlibpath\$$shlibpath_var\" $compile_command" fi if test -n "$finalize_shlibpath"; then finalize_command="$shlibpath_var=\"$finalize_shlibpath\$$shlibpath_var\" $finalize_command" fi compile_var= finalize_var= if test -n "$runpath_var"; then if test -n "$perm_rpath"; then # We should set the runpath_var. rpath= for dir in $perm_rpath; do func_append rpath "$dir:" done compile_var="$runpath_var=\"$rpath\$$runpath_var\" " fi if test -n "$finalize_perm_rpath"; then # We should set the runpath_var. rpath= for dir in $finalize_perm_rpath; do func_append rpath "$dir:" done finalize_var="$runpath_var=\"$rpath\$$runpath_var\" " fi fi if test yes = "$no_install"; then # We don't need to create a wrapper script. link_command=$compile_var$compile_command$compile_rpath # Replace the output file specification. link_command=`$ECHO "$link_command" | $SED 's%@OUTPUT@%'"$output"'%g'` # Delete the old output file. $opt_dry_run || $RM $output # Link the executable and exit func_show_eval "$link_command" 'exit $?' if test -n "$postlink_cmds"; then func_to_tool_file "$output" postlink_cmds=`func_echo_all "$postlink_cmds" | $SED -e 's%@OUTPUT@%'"$output"'%g' -e 's%@TOOL_OUTPUT@%'"$func_to_tool_file_result"'%g'` func_execute_cmds "$postlink_cmds" 'exit $?' 
fi exit $EXIT_SUCCESS fi case $hardcode_action,$fast_install in relink,*) # Fast installation is not supported link_command=$compile_var$compile_command$compile_rpath relink_command=$finalize_var$finalize_command$finalize_rpath func_warning "this platform does not like uninstalled shared libraries" func_warning "'$output' will be relinked during installation" ;; *,yes) link_command=$finalize_var$compile_command$finalize_rpath relink_command=`$ECHO "$compile_var$compile_command$compile_rpath" | $SED 's%@OUTPUT@%\$progdir/\$file%g'` ;; *,no) link_command=$compile_var$compile_command$compile_rpath relink_command=$finalize_var$finalize_command$finalize_rpath ;; *,needless) link_command=$finalize_var$compile_command$finalize_rpath relink_command= ;; esac # Replace the output file specification. link_command=`$ECHO "$link_command" | $SED 's%@OUTPUT@%'"$output_objdir/$outputname"'%g'` # Delete the old output files. $opt_dry_run || $RM $output $output_objdir/$outputname $output_objdir/lt-$outputname func_show_eval "$link_command" 'exit $?' if test -n "$postlink_cmds"; then func_to_tool_file "$output_objdir/$outputname" postlink_cmds=`func_echo_all "$postlink_cmds" | $SED -e 's%@OUTPUT@%'"$output_objdir/$outputname"'%g' -e 's%@TOOL_OUTPUT@%'"$func_to_tool_file_result"'%g'` func_execute_cmds "$postlink_cmds" 'exit $?' fi # Now create the wrapper script. func_verbose "creating $output" # Quote the relink command for shipping. 
if test -n "$relink_command"; then # Preserve any variables that may affect compiler behavior for var in $variables_saved_for_relink; do if eval test -z \"\${$var+set}\"; then relink_command="{ test -z \"\${$var+set}\" || $lt_unset $var || { $var=; export $var; }; }; $relink_command" elif eval var_value=\$$var; test -z "$var_value"; then relink_command="$var=; export $var; $relink_command" else func_quote_for_eval "$var_value" relink_command="$var=$func_quote_for_eval_result; export $var; $relink_command" fi done relink_command="(cd `pwd`; $relink_command)" relink_command=`$ECHO "$relink_command" | $SED "$sed_quote_subst"` fi # Only actually do things if not in dry run mode. $opt_dry_run || { # win32 will think the script is a binary if it has # a .exe suffix, so we strip it off here. case $output in *.exe) func_stripname '' '.exe' "$output" output=$func_stripname_result ;; esac # test for cygwin because mv fails w/o .exe extensions case $host in *cygwin*) exeext=.exe func_stripname '' '.exe' "$outputname" outputname=$func_stripname_result ;; *) exeext= ;; esac case $host in *cygwin* | *mingw* ) func_dirname_and_basename "$output" "" "." output_name=$func_basename_result output_path=$func_dirname_result cwrappersource=$output_path/$objdir/lt-$output_name.c cwrapper=$output_path/$output_name.exe $RM $cwrappersource $cwrapper trap "$RM $cwrappersource $cwrapper; exit $EXIT_FAILURE" 1 2 15 func_emit_cwrapperexe_src > $cwrappersource # The wrapper executable is built using the $host compiler, # because it contains $host paths and files. If cross- # compiling, it, like the target executable, must be # executed on the $host or under an emulation environment. 
$opt_dry_run || { $LTCC $LTCFLAGS -o $cwrapper $cwrappersource $STRIP $cwrapper } # Now, create the wrapper script for func_source use: func_ltwrapper_scriptname $cwrapper $RM $func_ltwrapper_scriptname_result trap "$RM $func_ltwrapper_scriptname_result; exit $EXIT_FAILURE" 1 2 15 $opt_dry_run || { # note: this script will not be executed, so do not chmod. if test "x$build" = "x$host"; then $cwrapper --lt-dump-script > $func_ltwrapper_scriptname_result else func_emit_wrapper no > $func_ltwrapper_scriptname_result fi } ;; * ) $RM $output trap "$RM $output; exit $EXIT_FAILURE" 1 2 15 func_emit_wrapper no > $output chmod +x $output ;; esac } exit $EXIT_SUCCESS ;; esac # See if we need to build an old-fashioned archive. for oldlib in $oldlibs; do case $build_libtool_libs in convenience) oldobjs="$libobjs_save $symfileobj" addlibs=$convenience build_libtool_libs=no ;; module) oldobjs=$libobjs_save addlibs=$old_convenience build_libtool_libs=no ;; *) oldobjs="$old_deplibs $non_pic_objects" $preload && test -f "$symfileobj" \ && func_append oldobjs " $symfileobj" addlibs=$old_convenience ;; esac if test -n "$addlibs"; then gentop=$output_objdir/${outputname}x func_append generated " $gentop" func_extract_archives $gentop $addlibs func_append oldobjs " $func_extract_archives_result" fi # Do each command in the archive commands. if test -n "$old_archive_from_new_cmds" && test yes = "$build_libtool_libs"; then cmds=$old_archive_from_new_cmds else # Add any objects from preloaded convenience libraries if test -n "$dlprefiles"; then gentop=$output_objdir/${outputname}x func_append generated " $gentop" func_extract_archives $gentop $dlprefiles func_append oldobjs " $func_extract_archives_result" fi # POSIX demands no paths to be encoded in archives. 
We have # to avoid creating archives with duplicate basenames if we # might have to extract them afterwards, e.g., when creating a # static archive out of a convenience library, or when linking # the entirety of a libtool archive into another (currently # not supported by libtool). if (for obj in $oldobjs do func_basename "$obj" $ECHO "$func_basename_result" done | sort | sort -uc >/dev/null 2>&1); then : else echo "copying selected object files to avoid basename conflicts..." gentop=$output_objdir/${outputname}x func_append generated " $gentop" func_mkdir_p "$gentop" save_oldobjs=$oldobjs oldobjs= counter=1 for obj in $save_oldobjs do func_basename "$obj" objbase=$func_basename_result case " $oldobjs " in " ") oldobjs=$obj ;; *[\ /]"$objbase "*) while :; do # Make sure we don't pick an alternate name that also # overlaps. newobj=lt$counter-$objbase func_arith $counter + 1 counter=$func_arith_result case " $oldobjs " in *[\ /]"$newobj "*) ;; *) if test ! -f "$gentop/$newobj"; then break; fi ;; esac done func_show_eval "ln $obj $gentop/$newobj || cp $obj $gentop/$newobj" func_append oldobjs " $gentop/$newobj" ;; *) func_append oldobjs " $obj" ;; esac done fi func_to_tool_file "$oldlib" func_convert_file_msys_to_w32 tool_oldlib=$func_to_tool_file_result eval cmds=\"$old_archive_cmds\" func_len " $cmds" len=$func_len_result if test "$len" -lt "$max_cmd_len" || test "$max_cmd_len" -le -1; then cmds=$old_archive_cmds elif test -n "$archiver_list_spec"; then func_verbose "using command file archive linking..." for obj in $oldobjs do func_to_tool_file "$obj" $ECHO "$func_to_tool_file_result" done > $output_objdir/$libname.libcmd func_to_tool_file "$output_objdir/$libname.libcmd" oldobjs=" $archiver_list_spec$func_to_tool_file_result" cmds=$old_archive_cmds else # the command line is too long to link in one step, link in parts func_verbose "using piecewise archive linking..." 
save_RANLIB=$RANLIB RANLIB=: objlist= concat_cmds= save_oldobjs=$oldobjs oldobjs= # Is there a better way of finding the last object in the list? for obj in $save_oldobjs do last_oldobj=$obj done eval test_cmds=\"$old_archive_cmds\" func_len " $test_cmds" len0=$func_len_result len=$len0 for obj in $save_oldobjs do func_len " $obj" func_arith $len + $func_len_result len=$func_arith_result func_append objlist " $obj" if test "$len" -lt "$max_cmd_len"; then : else # the above command should be used before it gets too long oldobjs=$objlist if test "$obj" = "$last_oldobj"; then RANLIB=$save_RANLIB fi test -z "$concat_cmds" || concat_cmds=$concat_cmds~ eval concat_cmds=\"\$concat_cmds$old_archive_cmds\" objlist= len=$len0 fi done RANLIB=$save_RANLIB oldobjs=$objlist if test -z "$oldobjs"; then eval cmds=\"\$concat_cmds\" else eval cmds=\"\$concat_cmds~\$old_archive_cmds\" fi fi fi func_execute_cmds "$cmds" 'exit $?' done test -n "$generated" && \ func_show_eval "${RM}r$generated" # Now create the libtool archive. case $output in *.la) old_library= test yes = "$build_old_libs" && old_library=$libname.$libext func_verbose "creating $output" # Preserve any variables that may affect compiler behavior for var in $variables_saved_for_relink; do if eval test -z \"\${$var+set}\"; then relink_command="{ test -z \"\${$var+set}\" || $lt_unset $var || { $var=; export $var; }; }; $relink_command" elif eval var_value=\$$var; test -z "$var_value"; then relink_command="$var=; export $var; $relink_command" else func_quote_for_eval "$var_value" relink_command="$var=$func_quote_for_eval_result; export $var; $relink_command" fi done # Quote the link command for shipping. relink_command="(cd `pwd`; $SHELL \"$progpath\" $preserve_args --mode=relink $libtool_args @inst_prefix_dir@)" relink_command=`$ECHO "$relink_command" | $SED "$sed_quote_subst"` if test yes = "$hardcode_automatic"; then relink_command= fi # Only create the output if not a dry run. 
$opt_dry_run || { for installed in no yes; do if test yes = "$installed"; then if test -z "$install_libdir"; then break fi output=$output_objdir/${outputname}i # Replace all uninstalled libtool libraries with the installed ones newdependency_libs= for deplib in $dependency_libs; do case $deplib in *.la) func_basename "$deplib" name=$func_basename_result func_resolve_sysroot "$deplib" eval libdir=`$SED -n -e 's/^libdir=\(.*\)$/\1/p' $func_resolve_sysroot_result` test -z "$libdir" && \ func_fatal_error "'$deplib' is not a valid libtool archive" func_append newdependency_libs " ${lt_sysroot:+=}$libdir/$name" ;; -L*) func_stripname -L '' "$deplib" func_replace_sysroot "$func_stripname_result" func_append newdependency_libs " -L$func_replace_sysroot_result" ;; -R*) func_stripname -R '' "$deplib" func_replace_sysroot "$func_stripname_result" func_append newdependency_libs " -R$func_replace_sysroot_result" ;; *) func_append newdependency_libs " $deplib" ;; esac done dependency_libs=$newdependency_libs newdlfiles= for lib in $dlfiles; do case $lib in *.la) func_basename "$lib" name=$func_basename_result eval libdir=`$SED -n -e 's/^libdir=\(.*\)$/\1/p' $lib` test -z "$libdir" && \ func_fatal_error "'$lib' is not a valid libtool archive" func_append newdlfiles " ${lt_sysroot:+=}$libdir/$name" ;; *) func_append newdlfiles " $lib" ;; esac done dlfiles=$newdlfiles newdlprefiles= for lib in $dlprefiles; do case $lib in *.la) # Only pass preopened files to the pseudo-archive (for # eventual linking with the app. 
that links it) if we # didn't already link the preopened objects directly into # the library: func_basename "$lib" name=$func_basename_result eval libdir=`$SED -n -e 's/^libdir=\(.*\)$/\1/p' $lib` test -z "$libdir" && \ func_fatal_error "'$lib' is not a valid libtool archive" func_append newdlprefiles " ${lt_sysroot:+=}$libdir/$name" ;; esac done dlprefiles=$newdlprefiles else newdlfiles= for lib in $dlfiles; do case $lib in [\\/]* | [A-Za-z]:[\\/]*) abs=$lib ;; *) abs=`pwd`"/$lib" ;; esac func_append newdlfiles " $abs" done dlfiles=$newdlfiles newdlprefiles= for lib in $dlprefiles; do case $lib in [\\/]* | [A-Za-z]:[\\/]*) abs=$lib ;; *) abs=`pwd`"/$lib" ;; esac func_append newdlprefiles " $abs" done dlprefiles=$newdlprefiles fi $RM $output # place dlname in correct position for cygwin # In fact, it would be nice if we could use this code for all target # systems that can't hard-code library paths into their executables # and that have no shared library path variable independent of PATH, # but it turns out we can't easily determine that from inspecting # libtool variables, so we have to hard-code the OSs to which it # applies here; at the moment, that means platforms that use the PE # object format with DLL files. See the long comment at the top of # tests/bindir.at for full details. tdlname=$dlname case $host,$output,$installed,$module,$dlname in *cygwin*,*lai,yes,no,*.dll | *mingw*,*lai,yes,no,*.dll | *cegcc*,*lai,yes,no,*.dll) # If a -bindir argument was supplied, place the dll there. if test -n "$bindir"; then func_relative_path "$install_libdir" "$bindir" tdlname=$func_relative_path_result/$dlname else # Otherwise fall back on heuristic. tdlname=../bin/$dlname fi ;; esac $ECHO > $output "\ # $outputname - a libtool library file # Generated by $PROGRAM (GNU $PACKAGE) $VERSION # # Please DO NOT delete this file! # It is necessary for linking the library. # The name that we can dlopen(3). dlname='$tdlname' # Names of this library. 
library_names='$library_names' # The name of the static archive. old_library='$old_library' # Linker flags that cannot go in dependency_libs. inherited_linker_flags='$new_inherited_linker_flags' # Libraries that this one depends upon. dependency_libs='$dependency_libs' # Names of additional weak libraries provided by this library weak_library_names='$weak_libs' # Version information for $libname. current=$current age=$age revision=$revision # Is this an already installed library? installed=$installed # Should we warn about portability when linking against -modules? shouldnotlink=$module # Files to dlopen/dlpreopen dlopen='$dlfiles' dlpreopen='$dlprefiles' # Directory that this library needs to be installed in: libdir='$install_libdir'" if test no,yes = "$installed,$need_relink"; then $ECHO >> $output "\ relink_command=\"$relink_command\"" fi done } # Do a symbolic link so that the libtool archive can be found in # LD_LIBRARY_PATH before the program is installed. func_show_eval '( cd "$output_objdir" && $RM "$outputname" && $LN_S "../$outputname" "$outputname" )' 'exit $?' ;; esac exit $EXIT_SUCCESS } if test link = "$opt_mode" || test relink = "$opt_mode"; then func_mode_link ${1+"$@"} fi # func_mode_uninstall arg... func_mode_uninstall () { $debug_cmd RM=$nonopt files= rmforce=false exit_status=0 # This variable tells wrapper scripts just to set variables rather # than running their programs. libtool_install_magic=$magic for arg do case $arg in -f) func_append RM " $arg"; rmforce=: ;; -*) func_append RM " $arg" ;; *) func_append files " $arg" ;; esac done test -z "$RM" && \ func_fatal_help "you must specify an RM program" rmdirs= for file in $files; do func_dirname "$file" "" "." dir=$func_dirname_result if test . 
= "$dir"; then odir=$objdir else odir=$dir/$objdir fi func_basename "$file" name=$func_basename_result test uninstall = "$opt_mode" && odir=$dir # Remember odir for removal later, being careful to avoid duplicates if test clean = "$opt_mode"; then case " $rmdirs " in *" $odir "*) ;; *) func_append rmdirs " $odir" ;; esac fi # Don't error if the file doesn't exist and rm -f was used. if { test -L "$file"; } >/dev/null 2>&1 || { test -h "$file"; } >/dev/null 2>&1 || test -f "$file"; then : elif test -d "$file"; then exit_status=1 continue elif $rmforce; then continue fi rmfiles=$file case $name in *.la) # Possibly a libtool archive, so verify it. if func_lalib_p "$file"; then func_source $dir/$name # Delete the libtool libraries and symlinks. for n in $library_names; do func_append rmfiles " $odir/$n" done test -n "$old_library" && func_append rmfiles " $odir/$old_library" case $opt_mode in clean) case " $library_names " in *" $dlname "*) ;; *) test -n "$dlname" && func_append rmfiles " $odir/$dlname" ;; esac test -n "$libdir" && func_append rmfiles " $odir/$name $odir/${name}i" ;; uninstall) if test -n "$library_names"; then # Do each command in the postuninstall commands. func_execute_cmds "$postuninstall_cmds" '$rmforce || exit_status=1' fi if test -n "$old_library"; then # Do each command in the old_postuninstall commands. func_execute_cmds "$old_postuninstall_cmds" '$rmforce || exit_status=1' fi # FIXME: should reinstall the best remaining shared library. ;; esac fi ;; *.lo) # Possibly a libtool object, so verify it. if func_lalib_p "$file"; then # Read the .lo file func_source $dir/$name # Add PIC object to the list of files to remove. if test -n "$pic_object" && test none != "$pic_object"; then func_append rmfiles " $dir/$pic_object" fi # Add non-PIC object to the list of files to remove. 
if test -n "$non_pic_object" && test none != "$non_pic_object"; then func_append rmfiles " $dir/$non_pic_object" fi fi ;; *) if test clean = "$opt_mode"; then noexename=$name case $file in *.exe) func_stripname '' '.exe' "$file" file=$func_stripname_result func_stripname '' '.exe' "$name" noexename=$func_stripname_result # $file with .exe has already been added to rmfiles, # add $file without .exe func_append rmfiles " $file" ;; esac # Do a test to see if this is a libtool program. if func_ltwrapper_p "$file"; then if func_ltwrapper_executable_p "$file"; then func_ltwrapper_scriptname "$file" relink_command= func_source $func_ltwrapper_scriptname_result func_append rmfiles " $func_ltwrapper_scriptname_result" else relink_command= func_source $dir/$noexename fi # note $name still contains .exe if it was in $file originally # as does the version of $file that was added into $rmfiles func_append rmfiles " $odir/$name $odir/${name}S.$objext" if test yes = "$fast_install" && test -n "$relink_command"; then func_append rmfiles " $odir/lt-$name" fi if test "X$noexename" != "X$name"; then func_append rmfiles " $odir/lt-$noexename.c" fi fi fi ;; esac func_show_eval "$RM $rmfiles" 'exit_status=1' done # Try to remove the $objdir's in the directories where we deleted files for dir in $rmdirs; do if test -d "$dir"; then func_show_eval "rmdir $dir >/dev/null 2>&1" fi done exit $exit_status } if test uninstall = "$opt_mode" || test clean = "$opt_mode"; then func_mode_uninstall ${1+"$@"} fi test -z "$opt_mode" && { help=$generic_help func_fatal_help "you must specify a MODE" } test -z "$exec_cmd" && \ func_fatal_help "invalid operation mode '$opt_mode'" if test -n "$exec_cmd"; then eval exec "$exec_cmd" exit $EXIT_FAILURE fi exit $exit_status # The TAGs below are defined such that we never get into a situation # where we disable both kinds of libraries. 
Given conflicting # choices, we go for a static library, that is the most portable, # since we can't tell whether shared libraries were disabled because # the user asked for that or because the platform doesn't support # them. This is particularly important on AIX, because we don't # support having both static and shared libraries enabled at the same # time on that platform, so we default to a shared-only configuration. # If a disable-shared tag is given, we'll fallback to a static-only # configuration. But we'll never go from static-only to shared-only. # ### BEGIN LIBTOOL TAG CONFIG: disable-shared build_libtool_libs=no build_old_libs=yes # ### END LIBTOOL TAG CONFIG: disable-shared # ### BEGIN LIBTOOL TAG CONFIG: disable-static build_old_libs=`case $build_libtool_libs in yes) echo no;; *) echo yes;; esac` # ### END LIBTOOL TAG CONFIG: disable-static # Local Variables: # mode:shell-script # sh-indentation:2 # End: elpa-2016.05.001/compile0000755000312500001440000001624512717533405011417 00000000000000#! /bin/sh # Wrapper for compilers which do not understand '-c -o'. scriptversion=2012-10-14.11; # UTC # Copyright (C) 1999-2014 Free Software Foundation, Inc. # Written by Tom Tromey . # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
# configuration script generated by Autoconf, you may include it under
# the same distribution terms that you use for the rest of that program.

# This file is maintained in Automake, please report
# bugs to <bug-automake@gnu.org> or send patches to
# <automake-patches@gnu.org>.

# A single newline character.  The closing quote must stay on its own line.
nl='
'
# A single tab character.  It is produced with printf instead of being typed
# literally so that whitespace-normalizing tools cannot turn it into spaces.
tab=`printf '\t'`

# We need space, tab and new line, in precisely that order.
IFS=" $tab$nl"

# Conversion style chosen by func_file_conv (mingw, cygwin or wine).  It stays
# empty until the first absolute path is converted, then it is reused.
file_conv=

# func_file_conv build_file lazy
# Convert a $build file to $host form and store it in $file
# Currently only supports Windows hosts. If the determined conversion
# type is listed in (the comma separated) LAZY, no conversion will
# take place.
#
# Relative paths and UNC-style paths (leading '//') are stored in $file
# unchanged; only '/' and '/x...' style absolute paths are converted.
func_file_conv ()
{
  file=$1
  case $file in
    / | /[!/]*) # absolute file, and not a UNC file
      if test -z "$file_conv"; then
        # lazily determine how to convert abs files
        case `uname -s` in
          MINGW*)
            file_conv=mingw
            ;;
          CYGWIN*)
            file_conv=cygwin
            ;;
          *)
            file_conv=wine
            ;;
        esac
      fi
      # The subject is "<type>/,<lazy list>,".  The first pattern matches
      # when <type> occurs as an element of the lazy list, in which case the
      # path is deliberately left alone.
      case $file_conv/,$2, in
        *,$file_conv,*)
          ;;
        mingw/*)
          # NOTE(review): presumably relies on MSYS translating the path while
          # passing it to cmd; the sed strips the quotes and the padding space
          # from what cmd echoes back.  Confirm on a MinGW host.
          file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'`
          ;;
        cygwin/*)
          # Fall back to the unconverted name if cygpath fails.
          file=`cygpath -m "$file" || echo "$file"`
          ;;
        wine/*)
          # Fall back to the unconverted name if winepath fails.
          file=`winepath -w "$file" || echo "$file"`
          ;;
      esac
      ;;
  esac
}

# func_cl_dashL linkdir
# Make cl look for libraries in LINKDIR
#
# The converted directory is appended to the ';'-separated $lib_path (used by
# func_cl_dashl for its own lookup) and to $linker_opts as a -LIBPATH: option.
func_cl_dashL ()
{
  func_file_conv "$1"
  if test -z "$lib_path"; then
    lib_path=$file
  else
    lib_path="$lib_path;$file"
  fi
  linker_opts="$linker_opts -LIBPATH:$file"
}

# func_cl_dashl library
# Do a library search-path lookup for cl
#
# Searches every directory of $lib_path and then of $LIB (both ';'-separated)
# and stores the result in $lib.  Within one directory the candidates are
# tried in this order:
#   LIBRARY.dll.lib   (only while $shared is true)
#   LIBRARY.lib
#   libLIBRARY.a
# If nothing is found, $lib is set to LIBRARY.lib.
func_cl_dashl ()
{
  lib=$1
  found=no
  save_IFS=$IFS
  IFS=';'
  for dir in $lib_path $LIB
  do
    # The list has been split by now; restore IFS for the loop body.
    IFS=$save_IFS
    if $shared && test -f "$dir/$lib.dll.lib"; then
      found=yes
      lib=$dir/$lib.dll.lib
      break
    fi
    if test -f "$dir/$lib.lib"; then
      found=yes
      lib=$dir/$lib.lib
      break
    fi
    if test -f "$dir/lib$lib.a"; then
      found=yes
      lib=$dir/lib$lib.a
      break
    fi
  done
  # Needed when the loop body never ran (both search lists empty).
  IFS=$save_IFS

  if test "$found" != yes; then
    lib=$lib.lib
  fi
}

# func_cl_wrapper cl arg...
# Adjust compile command to suit cl
#
# Rewrites the positional parameters (compiler name followed by its
# arguments) from Unix-style options to their cl equivalents and then execs
# the result.  Does not return.
#
# Rewriting idiom: each pass of the loop looks at $1.  Arguments that are kept
# are re-appended at the end of the list with 'set x "$@" NEW; shift' (the
# dummy 'x' protects a leading '-' and is removed at once), and the 'shift'
# at the bottom of the loop drops the original from the front.  After one
# pass per argument the list consists only of translated arguments, in order.
# $eat is non-empty while the current argument was already consumed as the
# operand of the previous option.
func_cl_wrapper ()
{
  # Assume a capable shell
  lib_path=
  shared=:
  linker_opts=
  for arg
  do
    if test -n "$eat"; then
      eat=
    else
      case $1 in
        -o)
          # configure might choose to run compile as 'compile cc -o foo foo.c'.
          eat=1
          case $2 in
            *.o | *.[oO][bB][jJ])
              # Object output: -Fo
              func_file_conv "$2"
              set x "$@" -Fo"$file"
              shift
              ;;
            *)
              # Any other output: -Fe
              func_file_conv "$2"
              set x "$@" -Fe"$file"
              shift
              ;;
          esac
          ;;
        -I)
          eat=1
          # 'mingw' is the lazy list: no conversion when the style is mingw.
          func_file_conv "$2" mingw
          set x "$@" -I"$file"
          shift
          ;;
        -I*)
          func_file_conv "${1#-I}" mingw
          set x "$@" -I"$file"
          shift
          ;;
        -l)
          eat=1
          func_cl_dashl "$2"
          set x "$@" "$lib"
          shift
          ;;
        -l*)
          func_cl_dashl "${1#-l}"
          set x "$@" "$lib"
          shift
          ;;
        -L)
          # Not re-appended: the directory is recorded in $lib_path and
          # $linker_opts by func_cl_dashL.
          eat=1
          func_cl_dashL "$2"
          ;;
        -L*)
          func_cl_dashL "${1#-L}"
          ;;
        -static)
          # From here on func_cl_dashl no longer prefers LIBRARY.dll.lib.
          shared=false
          ;;
        -Wl,*)
          # Split the comma-separated linker flags and collect them.
          arg=${1#-Wl,}
          save_ifs="$IFS"; IFS=','
          for flag in $arg; do
            IFS="$save_ifs"
            linker_opts="$linker_opts $flag"
          done
          IFS="$save_ifs"
          ;;
        -Xlinker)
          eat=1
          linker_opts="$linker_opts $2"
          ;;
        -*)
          # Any other option is passed through untouched.
          set x "$@" "$1"
          shift
          ;;
        *.cc | *.CC | *.cxx | *.CXX | *.[cC]++)
          # C++ sources with these extensions are passed via -Tp.
          func_file_conv "$1"
          set x "$@" -Tp"$file"
          shift
          ;;
        *.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO])
          func_file_conv "$1" mingw
          set x "$@" "$file"
          shift
          ;;
        *)
          set x "$@" "$1"
          shift
          ;;
      esac
    fi
    shift
  done
  # Every collected linker option starts with a space, so this yields
  # "-link opt1 opt2 ...".
  if test -n "$linker_opts"; then
    linker_opts="-link$linker_opts"
  fi
  # $linker_opts is intentionally unquoted: it is a list of options.
  exec "$@" $linker_opts
  # Only reached when exec itself failed.
  exit 1
}

eat=

# Dispatch on the first argument: usage error, help, version, or a cl-style
# compiler that needs the full option translation above.  Anything else falls
# through to the generic '-c -o' handling below.
case $1 in
  '')
     echo "$0: No command. Try '$0 --help' for more information." 1>&2
     exit 1;
     ;;
  -h | --h*)
    cat <<\EOF
Usage: compile [--help] [--version] PROGRAM [ARGS]

Wrapper for compilers which do not understand '-c -o'.
Remove '-o dest.o' from ARGS, run PROGRAM with the remaining
arguments, and rename the output as expected.

If you are trying to build a whole package this is not the
right script to run: please start by reading the file 'INSTALL'.

Report bugs to <bug-automake@gnu.org>.
EOF
    exit $?
    ;;
  -v | --v*)
    echo "compile $scriptversion"
    exit $?
    ;;
  cl | *[/\\]cl | cl.exe | *[/\\]cl.exe )
    func_cl_wrapper "$@"      # Doesn't return...
    ;;
esac

ofile=
cfile=

# Same rotation idiom as in func_cl_wrapper: rebuild the argument list while
# remembering the object file named by '-o' (which is removed from the list)
# and the '.c' source file (which is kept).
for arg
do
  if test -n "$eat"; then
    eat=
  else
    case $1 in
      -o)
        # configure might choose to run compile as 'compile cc -o foo foo.c'.
        # So we strip '-o arg' only if arg is an object.
        eat=1
        case $2 in
          *.o | *.obj)
            ofile=$2
            ;;
          *)
            set x "$@" -o "$2"
            shift
            ;;
        esac
        ;;
      *.c)
        cfile=$1
        set x "$@" "$1"
        shift
        ;;
      *)
        set x "$@" "$1"
        shift
        ;;
    esac
  fi
  shift
done

if test -z "$ofile" || test -z "$cfile"; then
  # If no '-o' option was seen then we might have been invoked from a
  # pattern rule where we don't need one.  That is ok -- this is a
  # normal compilation that the losing compiler can handle.  If no
  # '.c' file was seen then we are probably linking.  That is also
  # ok.
  exec "$@"
fi

# Name of file we expect compiler to create.
# (Directory part and any drive letter removed, '.c' replaced by '.o'.)
cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'`

# Create the lock directory.
# Note: use '[/\\:.-]' here to ensure that we don't use the same name
# that we are using for the .o file.  Also, base the name on the expected
# object file name, since that is what matters with a parallel build.
lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d
# mkdir either creates the directory or fails, so it serves as the lock;
# retry once per second until it succeeds.
while true; do
  if mkdir "$lockdir" >/dev/null 2>&1; then
    break
  fi
  sleep 1
done
# FIXME: race condition here if user kills between mkdir and trap.
# Release the lock on signals 1, 2 and 15 (HUP, INT and TERM).
trap "rmdir '$lockdir'; exit 1" 1 2 15

# Run the compile.
"$@"
ret=$?
if test -f "$cofile"; then test "$cofile" = "$ofile" || mv "$cofile" "$ofile" elif test -f "${cofile}bj"; then test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile" fi rmdir "$lockdir" exit $ret # Local Variables: # mode: shell-script # sh-indentation: 2 # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "scriptversion=" # time-stamp-format: "%:y-%02m-%02d.%02H" # time-stamp-time-zone: "UTC" # time-stamp-end: "; # UTC" # End: elpa-2016.05.001/COPYING/0000755000312500001440000000000012717541041011214 500000000000000elpa-2016.05.001/COPYING/lgpl.txt0000644000312500001440000001674312664056454012660 00000000000000 GNU LESSER GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. This version of the GNU Lesser General Public License incorporates the terms and conditions of version 3 of the GNU General Public License, supplemented by the additional permissions listed below. 0. Additional Definitions. As used herein, "this License" refers to version 3 of the GNU Lesser General Public License, and the "GNU GPL" refers to version 3 of the GNU General Public License. "The Library" refers to a covered work governed by this License, other than an Application or a Combined Work as defined below. An "Application" is any work that makes use of an interface provided by the Library, but which is not otherwise based on the Library. Defining a subclass of a class defined by the Library is deemed a mode of using an interface provided by the Library. A "Combined Work" is a work produced by combining or linking an Application with the Library. The particular version of the Library with which the Combined Work was made is also called the "Linked Version". 
The "Minimal Corresponding Source" for a Combined Work means the Corresponding Source for the Combined Work, excluding any source code for portions of the Combined Work that, considered in isolation, are based on the Application, and not on the Linked Version. The "Corresponding Application Code" for a Combined Work means the object code and/or source code for the Application, including any data and utility programs needed for reproducing the Combined Work from the Application, but excluding the System Libraries of the Combined Work. 1. Exception to Section 3 of the GNU GPL. You may convey a covered work under sections 3 and 4 of this License without being bound by section 3 of the GNU GPL. 2. Conveying Modified Versions. If you modify a copy of the Library, and, in your modifications, a facility refers to a function or data to be supplied by an Application that uses the facility (other than as an argument passed when the facility is invoked), then you may convey a copy of the modified version: a) under this License, provided that you make a good faith effort to ensure that, in the event an Application does not supply the function or data, the facility still operates, and performs whatever part of its purpose remains meaningful, or b) under the GNU GPL, with none of the additional permissions of this License applicable to that copy. 3. Object Code Incorporating Material from Library Header Files. The object code form of an Application may incorporate material from a header file that is part of the Library. You may convey such object code under terms of your choice, provided that, if the incorporated material is not limited to numerical parameters, data structure layouts and accessors, or small macros, inline functions and templates (ten or fewer lines in length), you do both of the following: a) Give prominent notice with each copy of the object code that the Library is used in it and that the Library and its use are covered by this License. 
b) Accompany the object code with a copy of the GNU GPL and this license document. 4. Combined Works. You may convey a Combined Work under terms of your choice that, taken together, effectively do not restrict modification of the portions of the Library contained in the Combined Work and reverse engineering for debugging such modifications, if you also do each of the following: a) Give prominent notice with each copy of the Combined Work that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the Combined Work with a copy of the GNU GPL and this license document. c) For a Combined Work that displays copyright notices during execution, include the copyright notice for the Library among these notices, as well as a reference directing the user to the copies of the GNU GPL and this license document. d) Do one of the following: 0) Convey the Minimal Corresponding Source under the terms of this License, and the Corresponding Application Code in a form suitable for, and under terms that permit, the user to recombine or relink the Application with a modified version of the Linked Version to produce a modified Combined Work, in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source. 1) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (a) uses at run time a copy of the Library already present on the user's computer system, and (b) will operate properly with a modified version of the Library that is interface-compatible with the Linked Version. e) Provide Installation Information, but only if you would otherwise be required to provide such information under section 6 of the GNU GPL, and only to the extent that such information is necessary to install and execute a modified version of the Combined Work produced by recombining or relinking the Application with a modified version of the Linked Version. 
(If you use option 4d0, the Installation Information must accompany the Minimal Corresponding Source and Corresponding Application Code. If you use option 4d1, you must provide the Installation Information in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source.) 5. Combined Libraries. You may place library facilities that are a work based on the Library side by side in a single library together with other library facilities that are not Applications and are not covered by this License, and convey such a combined library under terms of your choice, if you do both of the following: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities, conveyed under the terms of this License. b) Give prominent notice with the combined library that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 6. Revised Versions of the GNU Lesser General Public License. The Free Software Foundation may publish revised and/or new versions of the GNU Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library as you received it specifies that a certain numbered version of the GNU Lesser General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that published version or of any later version published by the Free Software Foundation. If the Library as you received it does not specify a version number of the GNU Lesser General Public License, you may choose any version of the GNU Lesser General Public License ever published by the Free Software Foundation. 
If the Library as you received it specifies that a proxy can decide whether future versions of the GNU Lesser General Public License shall apply, that proxy's public statement of acceptance of any version is permanent authorization for you to choose that version for the Library. elpa-2016.05.001/COPYING/gpl.txt0000644000312500001440000010451312664056454012475 00000000000000 GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. 
For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. 
Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. 
"Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. 
Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. 
When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. 
This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: <program> Copyright (C) <year> <name of author> This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. 
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see <http://www.gnu.org/licenses/>. The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <http://www.gnu.org/philosophy/why-not-lgpl.html>. elpa-2016.05.001/COPYING/COPYING0000644000312500001440000000227012717402663012176 00000000000000Licensing and copyright terms for the ELPA library: ELPA Consortium (2011) **** Copyright of the original code rests with the authors inside the ELPA consortium. The copyright of any additional modifications shall rest with their original authors, but shall adhere to the licensing terms set forth below. **** The code is distributed under the terms of the GNU Lesser General Public License version 3 (LGPL). The full text can be found in the file "lgpl.txt" in this repository. "COPYING/lgpl.txt" makes reference to the GPL v3, which can also be found in this repository ("COPYING/gpl.txt"). **** ELPA reflects a substantial effort on the part of the original ELPA consortium, and we ask you to respect the spirit of the license that we chose: i.e., please contribute any changes you may have back to the original ELPA library distribution, and keep any derivatives of ELPA under the same license that we chose for the original distribution, the GNU Lesser General Public License. When in doubt, talk to us. 
What we would like to ensure is that the ELPA code can be used as needed, while providing a strong incentive for others to contribute their modifications back to the original development. **** elpa-2016.05.001/src/0000755000312500001440000000000012717541041010673 500000000000000elpa-2016.05.001/src/redist_band.X900000644000312500001440000002172612717516040013404 00000000000000#if 0 ! This file is part of ELPA. ! ! The ELPA library was originally created by the ELPA consortium, ! consisting of the following organizations: ! ! - Max Planck Computing and Data Facility (MPCDF), formerly known as ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte ! Informatik, ! - Technische Universität München, Lehrstuhl für Informatik mit ! Schwerpunkt Wissenschaftliches Rechnen, ! - Fritz-Haber-Institut, Berlin, Abt. Theorie, ! - Max-Planck-Institut für Mathematik in den Naturwissenschaften, ! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, ! and ! - IBM Deutschland GmbH ! ! ! More information can be found here: ! http://elpa.mpcdf.mpg.de/ ! ! ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see <http://www.gnu.org/licenses/> ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! 
any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! Author: Andreas Marek, MPCDF
#endif
! --------------------------------------------------------------------------------------------------
! redist_band: redistributes band from 2D block cyclic form to 1D band
!
! This source is compiled once per data type through the preprocessor:
!   REALCASE==1    defines redist_band_real    (input r_a, output r_ab)
!   COMPLEXCASE==1 defines redist_band_complex (input c_a, output c_ab)
!
! Arguments
!   r_a / c_a     (in)  local matrix storage, declared as (lda, matrixCols)
!   lda           (in)  leading dimension of the local matrix storage
!   na            (in)  global extent: (na-1)/nblk is the index of the last block row
!                       and (na-1)/nbw + 1 is the number of band blocks
!   nblk          (in)  edge length of the square blocks that are packed and exchanged
!   nbw           (in)  band width; nbw+1 entries are written per output column
!   matrixCols    (in)  second dimension of the local matrix storage
!   mpi_comm_rows (in)  communicator whose rank/size give my_prow/np_rows
!   mpi_comm_cols (in)  communicator whose rank/size give my_pcol/np_cols
!   mpi_comm      (in)  communicator whose rank/size give my_pe/n_pes; used for the exchange
!   r_ab / c_ab   (out) band storage, rows 1:nbw+1, one column per matrix column
!                       assigned to this process
!
! Outline
!   1. build global_id, the table mapping (process row, process column) to rank in mpi_comm
!   2. obtain the band block ranges per process from divide_band
!   3. count, pack and exchange the nblk x nblk blocks (MPI_Alltoallv, or a plain
!      copy when built without WITH_MPI)
!   4. unpack the received blocks into the band storage
!
! NOTE(review): nfact = nbw/nblk is an integer division and is used as a divisor below,
! so nbw >= nblk is required; presumably nbw is a multiple of nblk - confirm against callers.
! NOTE(review): divide_band and local_index are defined outside this file section; their
! exact contracts are not visible here.
#if REALCASE==1
subroutine redist_band_real(r_a, lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, r_ab)
#endif
#if COMPLEXCASE==1
subroutine redist_band_complex(c_a, lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, c_ab)
#endif

#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
  use precision
  implicit none

  integer(kind=ik), intent(in) :: lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm
#if REALCASE==1
  real(kind=rk), intent(in) :: r_a(lda, matrixCols)
#endif
#if COMPLEXCASE==1
  complex(kind=ck), intent(in) :: c_a(lda, matrixCols)
#endif
#if REALCASE==1
  real(kind=rk), intent(out) :: r_ab(:,:)
#endif
#if COMPLEXCASE==1
  complex(kind=ck), intent(out) :: c_ab(:,:)
#endif

  ! ncnt_s/nstart_s: per-receiver send counts and offsets
  ! ncnt_r/nstart_r: per-sender receive counts and offsets
  ! (all four are held in blocks while packing/unpacking and in elements for the exchange)
  integer(kind=ik), allocatable :: ncnt_s(:), nstart_s(:), ncnt_r(:), nstart_r(:), &
  global_id(:,:), global_id_tmp(:,:), block_limits(:)
  ! sbuf/rbuf: send and receive buffers, one nblk x nblk slab per block
  ! buf:       scratch holding the transposed blocks of one block row
#if REALCASE==1
  real(kind=rk), allocatable :: r_sbuf(:,:,:), r_rbuf(:,:,:), r_buf(:,:)
#endif
#if COMPLEXCASE==1
  complex(kind=ck), allocatable :: c_sbuf(:,:,:), c_rbuf(:,:,:), c_buf(:,:)
#endif
  integer(kind=ik) :: i, j, my_pe, n_pes, my_prow, np_rows, my_pcol, np_cols, &
  nfact, np, npr, npc, mpierr, is, js
  integer(kind=ik) :: nblocks_total, il, jl, l_rows, l_cols, n_off

#ifdef HAVE_DETAILED_TIMINGS
#if REALCASE==1
  call timer%start("redist_band_real")
#endif
#if COMPLEXCASE==1
  call timer%start("redist_band_complex")
#endif
#endif

  ! Rank and size in the global, the row and the column communicator
  call mpi_comm_rank(mpi_comm,my_pe,mpierr)
  call mpi_comm_size(mpi_comm,n_pes,mpierr)
  call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
  call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
  call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
  call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

  ! Get global_id mapping 2D processor coordinates to global id
  ! Each process enters its own rank at (my_prow, my_pcol) in an otherwise zero table;
  ! the sum-allreduce over mpi_comm then merges the entries of all processes.
  ! With WITH_OPENMP a separate send copy (global_id_tmp) is used instead of mpi_in_place.
  ! Without WITH_MPI no reduction is done and only the local entry is set.

  allocate(global_id(0:np_rows-1,0:np_cols-1))
#ifdef WITH_OPENMP
  allocate(global_id_tmp(0:np_rows-1,0:np_cols-1))
#endif
  global_id(:,:) = 0
  global_id(my_prow, my_pcol) = my_pe
#ifdef WITH_MPI
#ifdef WITH_OPENMP
  global_id_tmp(:,:) = global_id(:,:)
  call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr)
  deallocate(global_id_tmp)
#else
  call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr)
#endif
#endif

  ! Set work distribution
  ! nblocks_total is the number of band blocks of width nbw. block_limits is filled by
  ! divide_band; below, process p handles band blocks block_limits(p) .. block_limits(p+1)-1.

  nblocks_total = (na-1)/nbw + 1

  allocate(block_limits(0:n_pes))
  call divide_band(nblocks_total, n_pes, block_limits)

  allocate(ncnt_s(0:n_pes-1))
  allocate(nstart_s(0:n_pes-1))
  allocate(ncnt_r(0:n_pes-1))
  allocate(nstart_r(0:n_pes-1))

  ! Number of nblk-sized block rows per band block
  nfact = nbw/nblk

  ! Count how many blocks go to which PE
  ! For block row j the block columns j .. j+nfact are sent; only the blocks whose
  ! block row and block column map to this process (cyclic in np_rows / np_cols) count.

  ncnt_s(:) = 0
  np = 0 ! receiver PE number
  do j=0,(na-1)/nblk ! loop over rows of blocks
    ! move on to the next receiver once j reaches its first band block
    if (j/nfact==block_limits(np+1)) np = np+1
    if (mod(j,np_rows) == my_prow) then
      do i=0,nfact
        if (mod(i+j,np_cols) == my_pcol) then
          ncnt_s(np) = ncnt_s(np) + 1
        endif
      enddo
    endif
  enddo

  ! Allocate send buffer
  ! Zero-initialised, so parts of a slab that are not filled below stay zero.

#if REALCASE==1
  allocate(r_sbuf(nblk,nblk,sum(ncnt_s)))
  r_sbuf(:,:,:) = 0.
#endif
#if COMPLEXCASE==1
  allocate(c_sbuf(nblk,nblk,sum(ncnt_s)))
  c_sbuf(:,:,:) = 0.
#endif

  ! Determine start offsets in send buffer
  ! nstart_s(p) = number of blocks destined for PEs 0 .. p-1

  nstart_s(0) = 0
  do i=1,n_pes-1
    nstart_s(i) = nstart_s(i-1) + ncnt_s(i-1)
  enddo

  ! Fill send buffer
  ! Same traversal as the counting loop above. nstart_s(np) is incremented before
  ! it is used and therefore acts as the one-based write position for receiver np.

  l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
  l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of a

  np = 0
  do j=0,(na-1)/nblk ! loop over rows of blocks
    if (j/nfact==block_limits(np+1)) np = np+1
    if (mod(j,np_rows) == my_prow) then
      do i=0,nfact
        if (mod(i+j,np_cols) == my_pcol) then
          nstart_s(np) = nstart_s(np) + 1
          ! js/is: zero-based local row/column offset of the block
          js = (j/np_rows)*nblk
          is = ((i+j)/np_cols)*nblk
          ! jl/il: rows/columns of the block that exist locally (at most nblk)
          jl = MIN(nblk,l_rows-js)
          il = MIN(nblk,l_cols-is)

#if REALCASE==1
          r_sbuf(1:jl,1:il,nstart_s(np)) = r_a(js+1:js+jl,is+1:is+il)
#endif
#if COMPLEXCASE==1
          c_sbuf(1:jl,1:il,nstart_s(np)) = c_a(js+1:js+jl,is+1:is+il)
#endif
        endif
      enddo
    endif
  enddo

  ! Count how many blocks we get from which PE
  ! Walk the block rows of the band blocks assigned to this process and look up in
  ! global_id the rank holding each of the nfact+1 blocks of that block row.

  ncnt_r(:) = 0
  do j=block_limits(my_pe)*nfact,min(block_limits(my_pe+1)*nfact-1,(na-1)/nblk)
    npr = mod(j,np_rows)
    do i=0,nfact
      npc = mod(i+j,np_cols)
      np = global_id(npr,npc)
      ncnt_r(np) = ncnt_r(np) + 1
    enddo
  enddo

  ! Allocate receive buffer

#if REALCASE==1
  allocate(r_rbuf(nblk,nblk,sum(ncnt_r)))
#endif
#if COMPLEXCASE==1
  allocate(c_rbuf(nblk,nblk,sum(ncnt_r)))
#endif

  ! Set send counts/send offsets, receive counts/receive offsets
  ! now actually in variables, not in blocks
  ! (each block contributes nblk*nblk elements; nstart_s is rebuilt because it was
  ! used as a write cursor while filling the send buffer)

  ncnt_s(:) = ncnt_s(:)*nblk*nblk

  nstart_s(0) = 0
  do i=1,n_pes-1
    nstart_s(i) = nstart_s(i-1) + ncnt_s(i-1)
  enddo

  ncnt_r(:) = ncnt_r(:)*nblk*nblk

  nstart_r(0) = 0
  do i=1,n_pes-1
    nstart_r(i) = nstart_r(i-1) + ncnt_r(i-1)
  enddo

  ! Exchange all data with MPI_Alltoallv
  ! Without WITH_MPI the send buffer is simply copied into the receive buffer.

#ifdef WITH_MPI
#if REALCASE==1
  call MPI_Alltoallv(r_sbuf,ncnt_s,nstart_s,MPI_REAL8,r_rbuf,ncnt_r,nstart_r,MPI_REAL8,mpi_comm,mpierr)
#endif
#if COMPLEXCASE==1
  call MPI_Alltoallv(c_sbuf,ncnt_s,nstart_s,MPI_COMPLEX16,c_rbuf,ncnt_r,nstart_r,MPI_COMPLEX16,mpi_comm,mpierr)
#endif
#else /* WITH_MPI */
#if REALCASE==1
  r_rbuf = r_sbuf
#endif
#if COMPLEXCASE==1
  c_rbuf = c_sbuf
#endif
#endif /* WITH_MPI */

  ! set band from receive buffer
  ! Receive counts go back to units of blocks and the block offsets are rebuilt;
  ! nstart_r(np) then serves as the one-based read position for sender np.

  ncnt_r(:) = ncnt_r(:)/(nblk*nblk)

  nstart_r(0) = 0
  do i=1,n_pes-1
    nstart_r(i) = nstart_r(i-1) + ncnt_r(i-1)
  enddo

  ! Scratch for one block row: nfact+1 transposed blocks stacked on top of each other
#if REALCASE==1
  allocate(r_buf((nfact+1)*nblk,nblk))
#endif
#if COMPLEXCASE==1
  allocate(c_buf((nfact+1)*nblk,nblk))
#endif

  ! n_off: Offset of ab within band
  n_off = block_limits(my_pe)*nbw

  ! Same traversal order as the receive count loop above, so blocks are read back
  ! per sender in the order they were packed.
  do j=block_limits(my_pe)*nfact,min(block_limits(my_pe+1)*nfact-1,(na-1)/nblk)
    npr = mod(j,np_rows)
    do i=0,nfact
      npc = mod(i+j,np_cols)
      np = global_id(npr,npc)
      nstart_r(np) = nstart_r(np) + 1
      ! block i of this block row goes, transposed (conjugate transposed in the
      ! complex case), into rows i*nblk+1 .. i*nblk+nblk of the scratch buffer
#if REALCASE==1
      r_buf(i*nblk+1:i*nblk+nblk,:) = transpose(r_rbuf(:,:,nstart_r(np)))
#endif
#if COMPLEXCASE==1
      c_buf(i*nblk+1:i*nblk+nblk,:) = conjg(transpose(c_rbuf(:,:,nstart_r(np))))
#endif
    enddo
    ! Column i of the scratch buffer, starting at row i, supplies the nbw+1 band
    ! entries of matrix column j*nblk+i; MIN(...) stops at column na in the last block row.
    do i=1,MIN(nblk,na-j*nblk)
#if REALCASE==1
      r_ab(1:nbw+1,i+j*nblk-n_off) = r_buf(i:i+nbw,i)
#endif
#if COMPLEXCASE==1
      c_ab(1:nbw+1,i+j*nblk-n_off) = c_buf(i:i+nbw,i)
#endif
    enddo
  enddo

  deallocate(ncnt_s, nstart_s)
  deallocate(ncnt_r, nstart_r)
  deallocate(global_id)
  deallocate(block_limits)

#if REALCASE==1
  deallocate(r_sbuf, r_rbuf, r_buf)
#endif
#if COMPLEXCASE==1
  deallocate(c_sbuf, c_rbuf, c_buf)
#endif

#ifdef HAVE_DETAILED_TIMINGS
#if REALCASE==1
  call timer%stop("redist_band_real")
#endif
#if COMPLEXCASE==1
  call timer%stop("redist_band_complex")
#endif
#endif

end subroutine
elpa-2016.05.001/src/elpa_utilities.F900000644000312500001440000001013312717402663014113 00000000000000
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!
GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! ! Copyright of the original code rests with the authors inside the ELPA ! consortium. The copyright of any additional modifications shall rest ! with their original authors, but shall adhere to the licensing terms ! distributed along with the original code in the file "COPYING". ! ! Author: Andreas Marek, MPCDF
#include "config-f90.h"

! Small helpers shared by the ELPA solvers: the error output unit, a query
! for the ELPA_DEBUG_MESSAGES environment variable, and the mapping from a
! global row/column index to the owning process row/column of a
! block-cyclic distribution.
module ELPA_utilities

#ifdef HAVE_ISO_FORTRAN_ENV
  use iso_fortran_env, only : error_unit
#endif
  use precision

  implicit none

  private ! By default, all routines contained are private

  public :: debug_messages_via_environment_variable, pcol, prow, error_unit

#ifndef HAVE_ISO_FORTRAN_ENV
  ! Fallback when the compiler does not provide iso_fortran_env
  integer(kind=ik), parameter :: error_unit = 0
#endif

  !******
  contains

    !-------------------------------------------------------------------------------
    ! Returns .true. if and only if the environment variable
    ! ELPA_DEBUG_MESSAGES has the value "yes".
    !
    ! The variable is only queried when HAVE_ENVIRONMENT_CHECKING is defined;
    ! otherwise the result is always .false.
    function debug_messages_via_environment_variable() result(isSet)
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      use precision
      implicit none
      logical            :: isSet
      CHARACTER(len=255) :: ELPA_DEBUG_MESSAGES

#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("debug_messages_via_environment_variable")
#endif

      ! Give the buffer a defined (blank) value: without
      ! HAVE_ENVIRONMENT_CHECKING nothing else assigns it before it is read.
      ELPA_DEBUG_MESSAGES = ""

#if defined(HAVE_ENVIRONMENT_CHECKING)
      call get_environment_variable("ELPA_DEBUG_MESSAGES",ELPA_DEBUG_MESSAGES)
#endif

      ! Only an explicit "yes" switches debug messages on. A value of "no"
      ! (like any other value, or an unset variable) leaves them off.
      isSet = (trim(ELPA_DEBUG_MESSAGES) .eq. "yes")

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("debug_messages_via_environment_variable")
#endif

    end function debug_messages_via_environment_variable

    !-------------------------------------------------------------------------------
    ! Processor col for global col number
    !
    !   i        global column number (the code treats it as 1-based: i-1 is used)
    !   nblk     block size of the cyclic distribution
    !   np_cols  number of process columns
    !
    ! Result: MOD((i-1)/nblk, np_cols)
    pure function pcol(i, nblk, np_cols) result(col)
      use precision
      implicit none
      integer(kind=ik), intent(in) :: i, nblk, np_cols
      integer(kind=ik)             :: col

      col = MOD((i-1)/nblk,np_cols)
    end function pcol

    !-------------------------------------------------------------------------------
    ! Processor row for global row number
    !
    !   i        global row number (the code treats it as 1-based: i-1 is used)
    !   nblk     block size of the cyclic distribution
    !   np_rows  number of process rows
    !
    ! Result: MOD((i-1)/nblk, np_rows)
    pure function prow(i, nblk, np_rows) result(row)
      use precision
      implicit none
      integer(kind=ik), intent(in) :: i, nblk, np_rows
      integer(kind=ik)             :: row

      row = MOD((i-1)/nblk,np_rows)
    end function prow

    !-------------------------------------------------------------------------------

end module ELPA_utilities
elpa-2016.05.001/src/elpa2_compute.F900000644000312500001440000065730112717516040013647 00000000000000! This file is part of ELPA. ! ! The ELPA library was originally created by the ELPA consortium, ! consisting of the following organizations: ! ! - Max Planck Computing and Data Facility (MPCDF), fomerly known as ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte ! Informatik, ! - Technische Universität München, Lehrstuhl für Informatik mit ! Schwerpunkt Wissenschaftliches Rechnen , ! - Fritz-Haber-Institut, Berlin, Abt. Theorie, ! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, ! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, ! and ! - IBM Deutschland GmbH ! ! This particular source code file contains additions, changes and ! enhancements authored by Intel Corporation which is not part of ! the ELPA consortium. ! ! More information can be found here: ! http://elpa.mpcdf.mpg.de/ ! ! 
ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines ! ! Copyright of the original code rests with the authors inside the ELPA ! consortium. The copyright of any additional modifications shall rest ! with their original authors, but shall adhere to the licensing terms ! distributed along with the original code in the file "COPYING". ! ELPA2 -- 2-stage solver for ELPA ! ! Copyright of the original code rests with the authors inside the ELPA ! consortium. The copyright of any additional modifications shall rest ! with their original authors, but shall adhere to the licensing terms ! distributed along with the original code in the file "COPYING". #include "config-f90.h" module ELPA2_compute ! Version 1.1.2, 2011-02-21 use elpa_utilities USE ELPA1_compute use elpa1, only : elpa_print_times, time_evp_back, time_evp_fwd, time_evp_solve use elpa2_utilities use elpa_pdgeqrf use elpa_mpi use aligned_mem implicit none PRIVATE ! 
By default, all routines contained are private public :: bandred_real public :: tridiag_band_real public :: trans_ev_tridi_to_band_real public :: trans_ev_band_to_full_real public :: bandred_complex public :: tridiag_band_complex public :: trans_ev_tridi_to_band_complex public :: trans_ev_band_to_full_complex public :: band_band_real public :: divide_band integer, public :: which_qr_decomposition = 1 ! defines, which QR-decomposition algorithm will be used ! 0 for unblocked ! 1 for blocked (maxrank: nblk) contains subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, & tmat, wantDebug, success, useQR) !------------------------------------------------------------------------------- ! bandred_real: Reduces a distributed symmetric matrix to band form ! ! Parameters ! ! na Order of matrix ! ! a(lda,matrixCols) Distributed matrix which should be reduced. ! Distribution is like in Scalapack. ! Opposed to Scalapack, a(:,:) must be set completely (upper and lower half) ! a(:,:) is overwritten on exit with the band and the Householder vectors ! in the upper half. ! ! lda Leading dimension of a ! matrixCols local columns of matrix a ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ! nbw semi bandwith of output matrix ! ! mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns ! ! tmat(nbw,nbw,numBlocks) where numBlocks = (na-1)/nbw + 1 ! Factors for the Householder vectors (returned), needed for back transformation ! 
!------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif #ifdef WITH_OPENMP use omp_lib #endif use precision implicit none integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols #ifdef DESPERATELY_WANT_ASSUMED_SIZE real(kind=rk) :: a(lda,*), tmat(nbw,nbw,*) #else real(kind=rk) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks) #endif integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: l_cols, l_rows, vmrCols integer(kind=ik) :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow integer(kind=ik) :: istep, ncol, lch, lcx, nlc, mynlc integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile real(kind=rk) :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw) real(kind=rk), allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:) ! needed for blocked QR decomposition integer(kind=ik) :: PQRPARAM(11), work_size real(kind=rk) :: dwork_size(1) real(kind=rk), allocatable :: work_blocked(:), tauvector(:), blockheuristic(:) logical, intent(in) :: wantDebug logical, intent(out) :: success logical, intent(in) :: useQR integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, ii, pp, transformChunkSize #ifdef HAVE_DETAILED_TIMINGS call timer%start("bandred_real") #endif call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) success = .true. ! Semibandwith nbw must be a multiple of blocksize nblk if (mod(nbw,nblk)/=0) then if (my_prow==0 .and. my_pcol==0) then if (wantDebug) then write(error_unit,*) 'ELPA2_bandred_real: ERROR: nbw=',nbw,', nblk=',nblk write(error_unit,*) 'ELPA2_bandred_real: ELPA2 works only for nbw==n*nblk' endif success = .false. return endif endif ! 
Matrix is split into tiles; work is done only for tiles on the diagonal or above tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide l_rows_tile = tile_size/np_rows ! local rows of a tile l_cols_tile = tile_size/np_cols ! local cols of a tile if (useQR) then if (which_qr_decomposition == 1) then call qr_pqrparam_init(pqrparam(1:11), nblk,'M',0, nblk,'M',0, nblk,'M',1,'s') allocate(tauvector(na)) allocate(blockheuristic(nblk)) l_rows = local_index(na, my_prow, np_rows, nblk, -1) allocate(vmr(max(l_rows,1),na)) vmrCols = na #ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR call qr_pdgeqrf_2dcomm(a, lda, matrixCols, vmr, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), & nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), & mpi_comm_rows, mpi_comm_cols, blockheuristic) #else call qr_pdgeqrf_2dcomm(a(1:lda,1:matrixCols), matrixCols, lda, vmr(1:max(l_rows,1),1:vmrCols), max(l_rows,1), & vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, & nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), & mpi_comm_rows, mpi_comm_cols, blockheuristic) #endif work_size = dwork_size(1) allocate(work_blocked(work_size)) work_blocked = 0.0d0 deallocate(vmr) endif endif do istep = (na-1)/nbw, 1, -1 n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step ! Number of local columns/rows of remaining matrix l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1) l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1) ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces allocate(vmr(max(l_rows,1),2*n_cols)) allocate(umc(max(l_cols,1),2*n_cols)) allocate(vr(l_rows+1)) vmr(1:l_rows,1:n_cols) = 0. vr(:) = 0 tmat(:,:,istep) = 0 ! 
Reduce current block to lower triangular form if (useQR) then if (which_qr_decomposition == 1) then vmrCols = 2*n_cols #ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR call qr_pdgeqrf_2dcomm(a, lda, matrixCols, vmr, max(l_rows,1), vmrCols, tauvector(1), & na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size, & work_size, na, n_cols, nblk, nblk, & istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,& 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,& blockheuristic) #else call qr_pdgeqrf_2dcomm(a(1:lda,1:matrixCols), lda, matrixCols, vmr(1:max(l_rows,1),1:vmrCols) , & max(l_rows,1), vmrCols, tauvector(1:na), na, & tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, & work_size, na, n_cols, nblk, nblk, & istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,& 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,& blockheuristic) #endif endif else do lc = n_cols, 1, -1 ncol = istep*nbw + lc ! absolute column number of householder vector nrow = ncol - nbw ! Absolute number of pivot row lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number tau = 0 if (nrow == 1) exit ! Nothing to do cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block if (my_pcol==cur_pcol) then ! Get vector to be transformed; distribute last element and norm of ! remaining elements to all procs in current column vr(1:lr) = a(1:lr,lch) ! vector to be transformed if (my_prow==prow(nrow, nblk, np_rows)) then aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1)) aux1(2) = vr(lr) else aux1(1) = dot_product(vr(1:lr),vr(1:lr)) aux1(2) = 0. endif #ifdef WITH_MPI call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) #else aux2 = aux1 #endif vnorm2 = aux2(1) vrl = aux2(2) ! Householder transformation call hh_transform_real(vrl, vnorm2, xf, tau) ! 
Scale vr and store Householder vector for back transformation vr(1:lr) = vr(1:lr) * xf if (my_prow==prow(nrow, nblk, np_rows)) then a(1:lr-1,lch) = vr(1:lr-1) a(lr,lch) = vrl vr(lr) = 1. else a(1:lr,lch) = vr(1:lr) endif endif ! Broadcast Householder vector and tau along columns vr(lr+1) = tau #ifdef WITH_MPI call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr) #endif vmr(1:lr,lc) = vr(1:lr) tau = vr(lr+1) tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat ! Transform remaining columns in current block with Householder vector ! Local dot product aux1 = 0 #ifdef WITH_OPENMP !Open up one omp region to avoid paying openmp overhead. !This does not help performance due to the addition of two openmp barriers around the MPI call, !But in the future this may be beneficial if these barriers are replaced with a faster implementation !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1) mynlc = 0 ! number of local columns !This loop does not have independent iterations, !'mynlc' is incremented each iteration, and it is difficult to remove this dependency !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration !That is, a thread only executes the work associated with an iteration if its thread id is congruent to !the iteration number modulo the number of threads do j=1,lc-1 lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) if (lcx>0 ) then mynlc = mynlc+1 if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx)) endif endif enddo ! Get global dot products !$omp barrier !$omp single #ifdef WITH_MPI if (mynlc>0) call mpi_allreduce(aux1,aux2,mynlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) #else if (mynlc>0) aux2 = aux1 #endif !$omp end single !$omp barrier ! 
Transform transformChunkSize=32 mynlc = 0 do j=1,lc-1 lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) if (lcx>0) then mynlc = mynlc+1 !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32 !However, for some reason this is slower than doing it manually, so it is parallelized as below. do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize do pp = 1,transformChunkSize if (pp + ii > lr) exit a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp) enddo enddo endif enddo !$omp end parallel #else /* WITH_OPENMP */ nlc = 0 ! number of local columns do j=1,lc-1 lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) if (lcx>0) then nlc = nlc+1 if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx)) endif enddo ! Get global dot products #ifdef WITH_MPI if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) #else if (nlc>0) aux2=aux1 #endif ! Transform nlc = 0 do j=1,lc-1 lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) if (lcx>0) then nlc = nlc+1 a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr) endif enddo #endif /* WITH_OPENMP */ enddo ! Calculate scalar products of stored Householder vectors. ! This can be done in different ways, we use dsyrk vav = 0 if (l_rows>0) & call dsyrk('U','T',n_cols,l_rows,1.d0,vmr,ubound(vmr,dim=1),0.d0,vav,ubound(vav,dim=1)) call symm_matrix_allreduce(n_cols,vav, nbw, nbw,mpi_comm_rows) ! Calculate triangular matrix T for block Householder Transformation do lc=n_cols,1,-1 tau = tmat(lc,lc,istep) if (lc vmc (stored in umc, second half) call elpa_transpose_vectors_real (vmr, ubound(vmr,dim=1), mpi_comm_rows, & umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, & 1, istep*nbw, n_cols, nblk) ! Calculate umc = A**T * vmr ! Note that the distributed A has to be transposed ! Opposed to direct tridiagonalization there is no need to use the cache locality ! 
of the tiles, so we can use strips of the matrix !Code for Algorithm 4 n_way = 1 #ifdef WITH_OPENMP n_way = omp_get_max_threads() #endif !umc(1:l_cols,1:n_cols) = 0.d0 !vmr(1:l_rows,n_cols+1:2*n_cols) = 0 #ifdef WITH_OPENMP !$omp parallel private( i,lcs,lce,lrs,lre) #endif if (n_way > 1) then !$omp do do i=1,min(l_cols_tile, l_cols) umc(i,1:n_cols) = 0.d0 enddo !$omp do do i=1,l_rows vmr(i,n_cols+1:2*n_cols) = 0.d0 enddo if (l_cols>0 .and. l_rows>0) then !SYMM variant 4 !Partitioned Matrix Expression: ! Ct = Atl Bt + Atr Bb ! Cb = Atr' Bt + Abl Bb ! !Loop invariant: ! Ct = Atl Bt + Atr Bb ! !Update: ! C1 = A10'B0 + A11B1 + A21 B2 ! !This algorithm chosen because in this algoirhtm, the loop around the dgemm calls !is easily parallelized, and regardless of choise of algorithm, !the startup cost for parallelizing the dgemms inside the loop is too great !$omp do schedule(static,1) do i=0,(istep*nbw-1)/tile_size lcs = i*l_cols_tile+1 ! local column start lce = min(l_cols, (i+1)*l_cols_tile) ! local column end lrs = i*l_rows_tile+1 ! local row start lre = min(l_rows, (i+1)*l_rows_tile) ! local row end !C1 += [A11 A12] [B1 ! B2] if( lre > lrs .and. l_cols > lcs ) then call DGEMM('N','N', lre-lrs+1, n_cols, l_cols-lcs+1, & 1.d0, a(lrs,lcs), ubound(a,dim=1), & umc(lcs,n_cols+1), ubound(umc,dim=1), & 0.d0, vmr(lrs,n_cols+1), ubound(vmr,dim=1)) endif ! C1 += A10' B0 if( lce > lcs .and. i > 0 ) then call DGEMM('T','N', lce-lcs+1, n_cols, lrs-1, & 1.d0, a(1,lcs), ubound(a,dim=1), & vmr(1,1), ubound(vmr,dim=1), & 0.d0, umc(lcs,1), ubound(umc,dim=1)) endif enddo endif else umc(1:l_cols,1:n_cols) = 0.d0 vmr(1:l_rows,n_cols+1:2*n_cols) = 0 if (l_cols>0 .and. 
l_rows>0) then do i=0,(istep*nbw-1)/tile_size lcs = i*l_cols_tile+1 lce = min(l_cols,(i+1)*l_cols_tile) if (lce 1) then call elpa_reduce_add_vectors_real (vmr(1,n_cols+1),ubound(vmr,dim=1),mpi_comm_rows, & umc, ubound(umc,dim=1), mpi_comm_cols, & istep*nbw, n_cols, nblk) endif #ifdef WITH_MPI if (l_cols>0) then allocate(tmp(l_cols,n_cols)) call mpi_allreduce(umc,tmp,l_cols*n_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols) deallocate(tmp) endif #endif ! U = U * Tmat**T call dtrmm('Right','Upper','Trans','Nonunit',l_cols,n_cols,1.d0,tmat(1,1,istep),ubound(tmat,dim=1),umc,ubound(umc,dim=1)) ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T call dgemm('T','N',n_cols,n_cols,l_cols,1.d0,umc,ubound(umc,dim=1),umc(1,n_cols+1), & ubound(umc,dim=1),0.d0,vav,ubound(vav,dim=1)) call dtrmm('Right','Upper','Trans','Nonunit',n_cols,n_cols,1.d0,tmat(1,1,istep), & ubound(tmat,dim=1),vav,ubound(vav,dim=1)) call symm_matrix_allreduce(n_cols,vav, nbw, nbw ,mpi_comm_cols) ! U = U - 0.5 * V * VAV call dgemm('N','N',l_cols,n_cols,n_cols,-0.5d0,umc(1,n_cols+1),ubound(umc,dim=1),vav, & ubound(vav,dim=1),1.d0,umc,ubound(umc,dim=1)) ! Transpose umc -> umr (stored in vmr, second half) call elpa_transpose_vectors_real (umc, ubound(umc,dim=1), mpi_comm_cols, & vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, & 1, istep*nbw, n_cols, nblk) ! 
A = A - V*U**T - U*V**T #ifdef WITH_OPENMP !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend ) n_threads = omp_get_num_threads() if (mod(n_threads, 2) == 0) then n_way = 2 else n_way = 1 endif m_way = n_threads / n_way m_id = mod(omp_get_thread_num(), m_way) n_id = omp_get_thread_num() / m_way do ii=n_id*tile_size,(istep*nbw-1),tile_size*n_way i = ii / tile_size lcs = i*l_cols_tile+1 lce = min(l_cols,(i+1)*l_cols_tile) lre = min(l_rows,(i+1)*l_rows_tile) if (lce lre ) myend = lre if ( myend-mystart+1 < 1) cycle call dgemm('N','T',myend-mystart+1, lce-lcs+1, 2*n_cols, -1.d0, & vmr(mystart, 1), ubound(vmr,1), umc(lcs,1), ubound(umc,1), & 1.d0,a(mystart,lcs),ubound(a,1)) enddo !$omp end parallel #else /* WITH_OPENMP */ do i=0,(istep*nbw-1)/tile_size lcs = i*l_cols_tile+1 lce = min(l_cols,(i+1)*l_cols_tile) lre = min(l_rows,(i+1)*l_rows_tile) if (lce= ((t_blocking+1)*nbw) ) then cwy_blocking = t_blocking * nbw allocate(tmp1(max_local_cols*cwy_blocking)) allocate(tmp2(max_local_cols*cwy_blocking)) allocate(hvb(max_local_rows*cwy_blocking)) allocate(hvm(max_local_rows,cwy_blocking)) allocate(tmat_complete(cwy_blocking,cwy_blocking)) allocate(t_tmp(cwy_blocking,nbw)) allocate(t_tmp2(cwy_blocking,nbw)) ! else ! allocate(tmp1(max_local_cols*nbw)) ! allocate(tmp2(max_local_cols*nbw)) ! allocate(hvb(max_local_rows*nbw)) ! allocate(hvm(max_local_rows,nbw)) ! endif hvm = 0 ! Must be set to 0 !!! hvb = 0 ! Safety only l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q ! if ( na >= ((t_blocking+1)*nbw) ) then do istep=1,((na-1)/nbw-1)/t_blocking + 1 ! This the call when using na >= ((t_blocking+1)*nbw) ! n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step ! As an alternative we add some special case handling if na < cwy_blocking IF (na < cwy_blocking) THEN n_cols = MAX(0, na-nbw) IF ( n_cols .eq. 
0 ) THEN EXIT END IF ELSE n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step END IF ! Broadcast all Householder vectors for current step compressed in hvb nb = 0 ns = 0 do lc = 1, n_cols ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder vector nrow = ncol - nbw ! absolute number of pivot row l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast l_colh = local_index(ncol , my_pcol, np_cols, nblk, -1) ! HV local column number if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh) nb = nb+l_rows if (lc==n_cols .or. mod(ncol,nblk)==0) then #ifdef WITH_MPI call MPI_Bcast(hvb(ns+1),nb-ns,MPI_REAL8,pcol(ncol, nblk, np_cols),mpi_comm_cols,mpierr) #endif ns = nb endif enddo ! Expand compressed Householder vectors into matrix hvm nb = 0 do lc = 1, n_cols nrow = (istep-1)*cwy_blocking + lc ! absolute number of pivot row l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows) if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1. nb = nb+l_rows enddo l_rows = local_index(MIN(na,(istep+1)*cwy_blocking), my_prow, np_rows, nblk, -1) ! 
compute tmat2 out of tmat(:,:,) tmat_complete = 0 do i = 1, t_blocking t_cols = MIN(nbw, n_cols - (i-1)*nbw) if (t_cols <= 0) exit t_rows = (i - 1) * nbw tmat_complete(t_rows+1:t_rows+t_cols,t_rows+1:t_rows+t_cols) = tmat(1:t_cols,1:t_cols,(istep-1)*t_blocking + i) if (i > 1) then call dgemm('T', 'N', t_rows, t_cols, l_rows, 1.d0, hvm(1,1), max_local_rows, hvm(1,(i-1)*nbw+1), & max_local_rows, 0.d0, t_tmp, cwy_blocking) #ifdef WITH_MPI call mpi_allreduce(t_tmp,t_tmp2,cwy_blocking*nbw,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) #else t_tmp2 = t_tmp #endif call dtrmm('L','U','N','N',t_rows,t_cols,1.0d0,tmat_complete,cwy_blocking,t_tmp2,cwy_blocking) call dtrmm('R','U','N','N',t_rows,t_cols,-1.0d0,tmat_complete(t_rows+1,t_rows+1),cwy_blocking,t_tmp2,cwy_blocking) tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols) endif enddo ! Q = Q - V * T**T * V**T * Q if (l_rows>0) then call dgemm('T','N',n_cols,l_cols,l_rows,1.d0,hvm,ubound(hvm,dim=1), & q,ldq,0.d0,tmp1,n_cols) else tmp1(1:l_cols*n_cols) = 0 endif #ifdef WITH_MPI call mpi_allreduce(tmp1,tmp2,n_cols*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) #else tmp2=tmp1 #endif if (l_rows>0) then call dtrmm('L','U','T','N',n_cols,l_cols,1.0d0,tmat_complete,cwy_blocking,tmp2,n_cols) call dgemm('N','N',l_rows,l_cols,n_cols,-1.d0,hvm,ubound(hvm,dim=1), tmp2,n_cols,1.d0,q,ldq) endif enddo ! else ! ! do istep=1,(na-1)/nbw ! ! n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step ! ! ! Broadcast all Householder vectors for current step compressed in hvb ! ! nb = 0 ! ns = 0 ! ! do lc = 1, n_cols ! ncol = istep*nbw + lc ! absolute column number of householder vector ! nrow = ncol - nbw ! absolute number of pivot row ! ! l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast ! l_colh = local_index(ncol , my_pcol, np_cols, nblk, -1) ! HV local column number ! ! if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh) ! ! nb = nb+l_rows ! ! 
if (lc==n_cols .or. mod(ncol,nblk)==0) then ! call MPI_Bcast(hvb(ns+1),nb-ns,MPI_REAL8,pcol(ncol, nblk, np_cols),mpi_comm_cols,mpierr) ! ns = nb ! endif ! enddo ! ! ! Expand compressed Householder vectors into matrix hvm ! ! nb = 0 ! do lc = 1, n_cols ! nrow = (istep-1)*nbw+lc ! absolute number of pivot row ! l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast ! ! hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows) ! if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1. ! ! nb = nb+l_rows ! enddo ! ! l_rows = local_index(MIN(na,(istep+1)*nbw), my_prow, np_rows, nblk, -1) ! ! ! Q = Q - V * T**T * V**T * Q ! ! if (l_rows>0) then ! call dgemm('T','N',n_cols,l_cols,l_rows,1.d0,hvm,ubound(hvm,dim=1), & ! q,ldq,0.d0,tmp1,n_cols) ! else ! tmp1(1:l_cols*n_cols) = 0 ! endif ! ! call mpi_allreduce(tmp1,tmp2,n_cols*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) ! ! if (l_rows>0) then ! call dtrmm('L','U','T','N',n_cols,l_cols,1.0d0,tmat(1,1,istep),ubound(tmat,dim=1),tmp2,n_cols) ! call dgemm('N','N',l_rows,l_cols,n_cols,-1.d0,hvm,ubound(hvm,dim=1), & ! tmp2,n_cols,1.d0,q,ldq) ! endif ! enddo ! endif deallocate(tmp1, tmp2, hvb, hvm) ! if ( na >= ((t_blocking+1)*nbw) ) then deallocate(tmat_complete, t_tmp, t_tmp2) ! endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("trans_ev_band_to_full_real") #endif end subroutine trans_ev_band_to_full_real subroutine tridiag_band_real(na, nb, nblk, a, lda, d, e, matrixCols, hh_trans_real, & mpi_comm_rows, mpi_comm_cols, mpi_comm) !------------------------------------------------------------------------------- ! tridiag_band_real: ! Reduces a real symmetric band matrix to tridiagonal form ! ! na Order of matrix a ! ! nb Semi bandwith ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ! a(lda,matrixCols) Distributed system matrix reduced to banded form in the upper diagonal ! ! lda Leading dimension of a ! matrixCols local columns of matrix a ! ! 
d(na) Diagonal of tridiagonal matrix, set only on PE 0 (output) ! ! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0 (output) ! ! mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns ! mpi_comm ! MPI-Communicator for the total processor set !------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm #ifdef DESPERATELY_WANT_ASSUMED_SIZE real(kind=rk), intent(in) :: a(lda,*) #else real(kind=rk), intent(in) :: a(lda,matrixCols) #endif real(kind=rk), intent(out) :: d(na), e(na) ! set only on PE 0 real(kind=rk), intent(out), & allocatable :: hh_trans_real(:,:) real(kind=rk) :: vnorm2, hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf real(kind=rk) :: hd(nb), hs(nb) integer(kind=ik) :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt integer(kind=ik) :: my_pe, n_pes, mpierr integer(kind=ik) :: my_prow, np_rows, my_pcol, np_cols integer(kind=ik) :: ireq_ab, ireq_hv integer(kind=ik) :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off #ifdef WITH_OPENMP integer(kind=ik) :: max_threads, my_thread, my_block_s, my_block_e, iter #ifdef WITH_MPI integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) #endif integer(kind=ik), allocatable :: mpi_statuses(:,:), global_id_tmp(:,:) integer(kind=ik), allocatable :: omp_block_limits(:) real(kind=rk), allocatable :: hv_t(:,:), tau_t(:) #endif integer(kind=ik), allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:) integer(kind=ik), allocatable :: limits(:), snd_limits(:,:) integer(kind=ik), allocatable :: block_limits(:) real(kind=rk), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:) #ifdef WITH_OPENMP integer(kind=ik) :: omp_get_max_threads #endif #ifndef WITH_MPI integer(kind=ik) :: startAddr #endif #ifdef HAVE_DETAILED_TIMINGS call 
timer%start("tridiag_band_real") #endif call mpi_comm_rank(mpi_comm,my_pe,mpierr) call mpi_comm_size(mpi_comm,n_pes,mpierr) call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) ! Get global_id mapping 2D procssor coordinates to global id allocate(global_id(0:np_rows-1,0:np_cols-1)) global_id(:,:) = 0 global_id(my_prow, my_pcol) = my_pe #ifdef WITH_OPENMP allocate(global_id_tmp(0:np_rows-1,0:np_cols-1)) #endif #ifdef WITH_MPI #ifndef WITH_OPENMP call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) #else global_id_tmp(:,:) = global_id(:,:) call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) deallocate(global_id_tmp) #endif #endif /* WITH_MPI */ ! Total number of blocks in the band: nblocks_total = (na-1)/nb + 1 ! Set work distribution allocate(block_limits(0:n_pes)) call divide_band(nblocks_total, n_pes, block_limits) ! nblocks: the number of blocks for my task nblocks = block_limits(my_pe+1) - block_limits(my_pe) ! allocate the part of the band matrix which is needed by this PE ! The size is 1 block larger than needed to avoid extensive shifts allocate(ab(2*nb,(nblocks+1)*nb)) ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety ! n_off: Offset of ab within band n_off = block_limits(my_pe)*nb ! Redistribute band in a to ab call redist_band_real(a, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab) ! Calculate the workload for each sweep in the back transformation ! 
and the space requirements to hold the HH vectors allocate(limits(0:np_rows)) call determine_workload(na, nb, np_rows, limits) max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) num_hh_vecs = 0 num_chunks = 0 nx = na do n = 1, nblocks_total call determine_workload(nx, nb, np_rows, limits) local_size = limits(my_prow+1) - limits(my_prow) ! add to number of householder vectors ! please note: for nx==1 the one and only HH vector is 0 and is neither calculated nor send below! if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then num_hh_vecs = num_hh_vecs + local_size num_chunks = num_chunks+1 endif nx = nx - nb enddo ! Allocate space for HH vectors allocate(hh_trans_real(nb,num_hh_vecs)) ! Allocate and init MPI requests allocate(ireq_hhr(num_chunks)) ! Recv requests allocate(ireq_hhs(nblocks)) ! Send requests num_hh_vecs = 0 num_chunks = 0 nx = na nt = 0 do n = 1, nblocks_total call determine_workload(nx, nb, np_rows, limits) local_size = limits(my_prow+1) - limits(my_prow) if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then num_chunks = num_chunks+1 #ifdef WITH_MPI call mpi_irecv(hh_trans_real(1,num_hh_vecs+1), nb*local_size, mpi_real8, nt, & 10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr) #else ! carefull non-block recv data copy must be done at wait or send ! hh_trans_real(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk) #endif num_hh_vecs = num_hh_vecs + local_size endif nx = nx - nb if (n == block_limits(nt+1)) then nt = nt + 1 endif enddo #ifdef WITH_MPI ireq_hhs(:) = MPI_REQUEST_NULL #endif ! Buffers for gathering/sending the HH vectors allocate(hh_gath(nb,max_blk_size,nblocks)) ! gathers HH vectors allocate(hh_send(nb,max_blk_size,nblocks)) ! send buffer for HH vectors hh_gath(:,:,:) = 0 hh_send(:,:,:) = 0 ! Some counters allocate(hh_cnt(nblocks)) allocate(hh_dst(nblocks)) hh_cnt(:) = 1 ! The first transfomation vector is always 0 and not calculated at all hh_dst(:) = 0 ! 
PE number for receive #ifdef WITH_MPI ireq_ab = MPI_REQUEST_NULL ireq_hv = MPI_REQUEST_NULL #endif ! Limits for sending allocate(snd_limits(0:np_rows,nblocks)) do iblk=1,nblocks call determine_workload(na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk)) enddo #ifdef WITH_OPENMP ! OpenMP work distribution: max_threads = 1 max_threads = omp_get_max_threads() ! For OpenMP we need at least 2 blocks for every thread max_threads = MIN(max_threads, nblocks/2) if (max_threads==0) max_threads = 1 allocate(omp_block_limits(0:max_threads)) ! Get the OpenMP block limits call divide_band(nblocks, max_threads, omp_block_limits) allocate(hv_t(nb,max_threads), tau_t(max_threads)) hv_t = 0 tau_t = 0 #endif ! --------------------------------------------------------------------------- ! Start of calculations na_s = block_limits(my_pe)*nb + 1 if (my_pe>0 .and. na_s<=na) then ! send first column to previous PE ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also) ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) #ifdef WITH_MPI call mpi_isend(ab_s,nb+1,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr) #endif endif #ifndef WITH_MPI startAddr = ubound(hh_trans_real,dim=2) #endif #ifdef WITH_OPENMP do istep=1,na-1-block_limits(my_pe)*nb #else do istep=1,na-1 #endif if (my_pe==0) then n = MIN(na-na_s,nb) ! number of rows to be reduced hv(:) = 0 tau = 0 ! The last step (istep=na-1) is only needed for sending the last HH vectors. ! We don't want the sign of the last element flipped (analogous to the other sweeps) if (istep < na-1) then ! Transform first column of remaining matrix vnorm2 = sum(ab(3:n+1,na_s-n_off)**2) call hh_transform_real(ab(2,na_s-n_off),vnorm2,hf,tau) hv(1) = 1 hv(2:n) = ab(3:n+1,na_s-n_off)*hf endif d(istep) = ab(1,na_s-n_off) e(istep) = ab(2,na_s-n_off) if (istep == na-1) then d(na) = ab(1,na_s+1-n_off) e(na) = 0 endif else if (na>na_s) then ! 
Receive Householder vector from previous task, from PE owning subdiagonal #ifdef WITH_MPI #ifdef WITH_OPENMP call mpi_recv(hv,nb,mpi_real8,my_pe-1,2,mpi_comm,MPI_STATUS,mpierr) #else call mpi_recv(hv,nb,mpi_real8,my_pe-1,2,mpi_comm,MPI_STATUS_IGNORE,mpierr) #endif #else /* WITH_MPI */ #ifdef WITH_OPENMP hv(1:nb) = hv_s(1:nb) #else hv(1:nb) = hv_s(1:nb) #endif #endif /* WITH_MPI */ tau = hv(1) hv(1) = 1. endif endif na_s = na_s+1 if (na_s-n_off > nb) then ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb) ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0 n_off = n_off + nb endif #ifdef WITH_OPENMP if (max_threads > 1) then ! Codepath for OpenMP ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread! ! Every thread is one reduction cycle behind its predecessor and thus starts one step later. ! This simulates the behaviour of the MPI tasks which also work after each other. ! The code would be considerably easier, if the MPI communication would be made within ! the parallel region - this is avoided here since this would require ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program. hv_t(:,1) = hv tau_t(1) = tau do iter = 1, 2 ! iter=1 : work on first block ! iter=2 : work on remaining blocks ! This is done in 2 iterations so that we have a barrier in between: ! After the first iteration, it is guaranteed that the last row of the last block ! is completed by the next thread. ! After the first iteration it is also the place to exchange the last row ! 
with MPI calls #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, & !$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads) do my_thread = 1, max_threads if (iter == 1) then my_block_s = omp_block_limits(my_thread-1) + 1 my_block_e = my_block_s else my_block_s = omp_block_limits(my_thread-1) + 2 my_block_e = omp_block_limits(my_thread) endif do iblk = my_block_s, my_block_e ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block ne = ns+nb-1 ! last column in block if (istepna) exit hv = hv_t(:,my_thread) tau = tau_t(my_thread) ! Store Householder vector for back transformation hh_cnt(iblk) = hh_cnt(iblk) + 1 hh_gath(1 ,hh_cnt(iblk),iblk) = tau hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) ! Note that nr>=0 implies that diagonal block is full (nc==nb)! ! Transform diagonal block call DSYMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,0.d0,hd,1) x = dot_product(hv(1:nc),hd(1:nc))*tau hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc) call DSYR2('L',nc,-1.d0,hd,1,hv,1,ab(1,ns),2*nb-1) hv_t(:,my_thread) = 0 tau_t(my_thread) = 0 if (nr<=0) cycle ! No subdiagonal block present any more ! Transform subdiagonal block call DGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,0.d0,hs,1) if (nr>1) then ! complete (old) Householder transformation for first column ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 ! calculate new Householder transformation for first column ! (stored in hv_t(:,my_thread) and tau_t(my_thread)) vnorm2 = sum(ab(nb+2:nb+nr,ns)**2) call hh_transform_real(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread)) hv_t(1 ,my_thread) = 1. hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf ab(nb+2:,ns) = 0 ! update subdiagonal block for old and new Householder transformation ! 
This way we can use a nonsymmetric rank 2 update which is (hopefully) faster call DGEMV('T',nr,nb-1,tau_t(my_thread),ab(nb,ns+1),2*nb-1,hv_t(1,my_thread),1,0.d0,h(2),1) x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread) h(2:nb) = h(2:nb) - x*hv(2:nb) ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2") do i=2,nb ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)*h(i) - hs(1:nr)*hv(i) enddo else ! No new Householder transformation for nr=1, just complete the old one ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1 do i=2,nb ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i) enddo ! For safety: there is one remaining dummy transformation (but tau is 0 anyways) hv_t(1,my_thread) = 1. endif enddo enddo ! my_thread !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif if (iter==1) then ! We are at the end of the first block ! Send our first column to previous PE if (my_pe>0 .and. na_s <= na) then #ifdef WITH_MPI call mpi_wait(ireq_ab,mpi_status,mpierr) #endif ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) #ifdef WITH_MPI call mpi_isend(ab_s,nb+1,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr) #endif endif ! Request last column from next PE ne = na_s + nblocks*nb - (max_threads-1) - 1 #ifdef WITH_MPI if (istep>=max_threads .and. ne <= na) then call mpi_recv(ab(1,ne-n_off),nb+1,mpi_real8,my_pe+1,1,mpi_comm,mpi_status,mpierr) endif #else if (istep>=max_threads .and. ne <= na) then ab(1:nb+1,ne-n_off) = ab_s(1:nb+1) endif #endif else ! We are at the end of all blocks ! Send last HH vector and TAU to next PE if it has been calculated above ne = na_s + nblocks*nb - (max_threads-1) - 1 if (istep>=max_threads .and. ne < na) then #ifdef WITH_MPI call mpi_wait(ireq_hv,mpi_status,mpierr) #endif hv_s(1) = tau_t(max_threads) hv_s(2:) = hv_t(2:,max_threads) #ifdef WITH_MPI call mpi_isend(hv_s,nb,mpi_real8,my_pe+1,2,mpi_comm,ireq_hv,mpierr) #endif endif ! 
"Send" HH vector and TAU to next OpenMP thread do my_thread = max_threads, 2, -1 hv_t(:,my_thread) = hv_t(:,my_thread-1) tau_t(my_thread) = tau_t(my_thread-1) enddo endif enddo ! iter else ! Codepath for 1 thread without OpenMP ! The following code is structured in a way to keep waiting times for ! other PEs at a minimum, especially if there is only one block. ! For this reason, it requests the last column as late as possible ! and sends the Householder vector and the first column as early ! as possible. #endif /* WITH_OPENMP */ do iblk=1,nblocks ns = na_s + (iblk-1)*nb - n_off ! first column in block ne = ns+nb-1 ! last column in block if (ns+n_off>na) exit ! Store Householder vector for back transformation hh_cnt(iblk) = hh_cnt(iblk) + 1 hh_gath(1 ,hh_cnt(iblk),iblk) = tau hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) #ifndef WITH_OPENMP if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then ! Wait for last transfer to finish #ifdef WITH_MPI call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr) #endif ! Copy vectors into send buffer hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) ! Send to destination #ifdef WITH_MPI call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_real8, & global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), & 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) #else startAddr = startAddr - hh_cnt(iblk) hh_trans_real(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk) #endif /* WITH_MPI */ ! Reset counter and increase destination row hh_cnt(iblk) = 0 hh_dst(iblk) = hh_dst(iblk)+1 endif ! The following code is structured in a way to keep waiting times for ! other PEs at a minimum, especially if there is only one block. ! For this reason, it requests the last column as late as possible ! and sends the Householder vector and the first column as early ! as possible. #endif /* WITH_OPENMP */ nc = MIN(na-ns-n_off+1,nb) ! 
number of columns in diagonal block nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) ! Note that nr>=0 implies that diagonal block is full (nc==nb)! ! Multiply diagonal block and subdiagonal block with Householder vector if (iblk==nblocks .and. nc==nb) then ! We need the last column from the next PE. ! First do the matrix multiplications without last column ... ! Diagonal block, the contribution of the last element is added below! ab(1,ne) = 0 call DSYMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,0.d0,hd,1) ! Subdiagonal block if (nr>0) call DGEMV('N',nr,nb-1,tau,ab(nb+1,ns),2*nb-1,hv,1,0.d0,hs,1) ! ... then request last column ... #ifdef WITH_MPI #ifdef WITH_OPENMP call mpi_recv(ab(1,ne),nb+1,mpi_real8,my_pe+1,1,mpi_comm,MPI_STATUS,mpierr) #else call mpi_recv(ab(1,ne),nb+1,mpi_real8,my_pe+1,1,mpi_comm,MPI_STATUS_IGNORE,mpierr) #endif #else /* WITH_MPI */ #ifdef WITH_OPENMP ab(1:nb+1,ne) = ab_s(1:nb+1) #else ab(1:nb+1,ne) = ab_s(1:nb+1) #endif #endif /* WITH_MPI */ ! ... and complete the result hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb) hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau else ! Normal matrix multiply call DSYMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,0.d0,hd,1) if (nr>0) call DGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,0.d0,hs,1) endif ! Calculate first column of subdiagonal block and calculate new ! Householder transformation for this column hv_new(:) = 0 ! Needed, last rows must be 0 for nr < nb tau_new = 0 if (nr>0) then ! complete (old) Householder transformation for first column ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 ! calculate new Householder transformation ... if (nr>1) then vnorm2 = sum(ab(nb+2:nb+nr,ns)**2) call hh_transform_real(ab(nb+1,ns),vnorm2,hf,tau_new) hv_new(1) = 1. hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf ab(nb+2:,ns) = 0 endif ! ... 
and send it away immediatly if this is the last block if (iblk==nblocks) then #ifdef WITH_MPI #ifdef WITH_OPENMP call mpi_wait(ireq_hv,MPI_STATUS,mpierr) #else call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) #endif #endif /* WITH_MPI */ hv_s(1) = tau_new hv_s(2:) = hv_new(2:) #ifdef WITH_MPI call mpi_isend(hv_s,nb,mpi_real8,my_pe+1,2,mpi_comm,ireq_hv,mpierr) #endif endif endif ! Transform diagonal block x = dot_product(hv(1:nc),hd(1:nc))*tau hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc) if (my_pe>0 .and. iblk==1) then ! The first column of the diagonal block has to be send to the previous PE ! Calculate first column only ... ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*hv(1) - hv(1:nc)*hd(1) ! ... send it away ... #ifdef WITH_MPI #ifdef WITH_OPENMP call mpi_wait(ireq_ab,MPI_STATUS,mpierr) #else call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) #endif #endif /* WITH_MPI */ ab_s(1:nb+1) = ab(1:nb+1,ns) #ifdef WITH_MPI call mpi_isend(ab_s,nb+1,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr) #endif ! ... and calculate remaining columns with rank-2 update if (nc>1) call DSYR2('L',nc-1,-1.d0,hd(2),1,hv(2),1,ab(1,ns+1),2*nb-1) else ! No need to send, just a rank-2 update call DSYR2('L',nc,-1.d0,hd,1,hv,1,ab(1,ns),2*nb-1) endif ! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... nb if (nr>0) then if (nr>1) then call DGEMV('T',nr,nb-1,tau_new,ab(nb,ns+1),2*nb-1,hv_new,1,0.d0,h(2),1) x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new h(2:nb) = h(2:nb) - x*hv(2:nb) ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update do i=2,nb ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*h(i) - hs(1:nr)*hv(i) enddo else ! No double Householder transformation for nr=1, just complete the row do i=2,nb ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i) enddo endif endif ! 
Use new HH vector for the next block hv(:) = hv_new(:) tau = tau_new enddo #ifdef WITH_OPENMP endif do iblk = 1, nblocks if (hh_dst(iblk) >= np_rows) exit if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then ! Wait for last transfer to finish #ifdef WITH_MPI call mpi_wait(ireq_hhs(iblk), mpi_status, mpierr) #endif ! Copy vectors into send buffer hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) ! Send to destination #ifdef WITH_MPI call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_real8, & global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), & 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) #else startAddr = startAddr - hh_cnt(iblk) hh_trans_real(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk) #endif ! Reset counter and increase destination row hh_cnt(iblk) = 0 hh_dst(iblk) = hh_dst(iblk)+1 endif enddo #endif /* WITH_OPENMP */ enddo ! 
Finish the last outstanding requests #ifdef WITH_OPENMP #ifdef WITH_MPI call mpi_wait(ireq_ab,MPI_STATUS,mpierr) call mpi_wait(ireq_hv,MPI_STATUS,mpierr) allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks))) call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES, mpierr) call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES, mpierr) deallocate(mpi_statuses) #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr) call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr) #endif #endif /* WITH_OPENMP */ #ifdef WITH_MPI call mpi_barrier(mpi_comm,mpierr) #endif deallocate(ab) deallocate(ireq_hhr, ireq_hhs) deallocate(hh_cnt, hh_dst) deallocate(hh_gath, hh_send) deallocate(limits, snd_limits) deallocate(block_limits) deallocate(global_id) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("tridiag_band_real") #endif end subroutine tridiag_band_real subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, & mpi_comm_rows, mpi_comm_cols, wantDebug, success, & THIS_REAL_ELPA_KERNEL) !------------------------------------------------------------------------------- ! trans_ev_tridi_to_band_real: ! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix ! ! Parameters ! ! na Order of matrix a, number of rows of matrix q ! ! nev Number eigenvectors to compute (= columns of matrix q) ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ! nb semi bandwith ! ! q On input: Eigenvectors of tridiagonal matrix ! On output: Transformed eigenvectors ! Distribution is like in Scalapack. ! ! ldq Leading dimension of q ! matrixCols local columns of matrix q ! ! mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns/both ! 
!------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision use pack_unpack_real use compute_hh_trafo_real implicit none integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols #ifdef DESPERATELY_WANT_ASSUMED_SIZE real(kind=rk) :: q(ldq,*) #else real(kind=rk) :: q(ldq,matrixCols) #endif real(kind=rk), intent(in) :: hh_trans_real(:,:) integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2 integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count #ifdef WITH_OPENMP integer(kind=ik) :: thread_width, csw, b_off, b_len #endif integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd integer(kind=ik) :: a_off, current_tv_off, max_blk_size integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk #ifdef WITH_OPENMP #ifdef WITH_MPI integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) #endif #endif logical :: flag #ifdef WITH_OPENMP real(kind=rk), pointer :: a(:,:,:,:) #else real(kind=rk), pointer :: a(:,:,:) #endif real(kind=rk) :: a_real type(c_ptr) :: a_ptr real(kind=rk), allocatable :: row(:) #ifdef WITH_OPENMP real(kind=rk), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:) real(kind=rk), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:) #else real(kind=rk), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:) real(kind=rk), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:) #endif real(kind=rk), allocatable :: result_buffer(:,:,:) real(kind=rk), allocatable :: bcast_buffer(:,:) 
integer(kind=ik) :: n_off integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:) integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:) integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:) #ifdef WITH_OPENMP integer(kind=ik), allocatable :: mpi_statuses(:,:) #endif ! MPI send/recv tags, arbitrary integer(kind=ik), parameter :: bottom_recv_tag = 111 integer(kind=ik), parameter :: top_recv_tag = 222 integer(kind=ik), parameter :: result_recv_tag = 333 ! Just for measuring the kernel performance real(kind=rk) :: kernel_time ! long integer integer(kind=lik) :: kernel_flops #ifdef WITH_OPENMP integer(kind=ik) :: max_threads, my_thread integer(kind=ik) :: omp_get_max_threads #endif logical, intent(in) :: wantDebug logical :: success #ifndef WITH_MPI integer(kind=ik) :: j1 #endif #ifdef HAVE_DETAILED_TIMINGS call timer%start("trans_ev_tridi_to_band_real") #endif success = .true. kernel_time = 1.d-100 kernel_flops = 0 #ifdef WITH_OPENMP max_threads = 1 max_threads = omp_get_max_threads() #endif call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr) call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr) call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr) call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr) if (mod(nbw,nblk)/=0) then if (my_prow==0 .and. my_pcol==0) then if (wantDebug) then write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: ERROR: nbw=',nbw,', nblk=',nblk write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: band backtransform works only for nbw==n*nblk' endif success = .false. return endif endif nfact = nbw / nblk ! local number of eigenvectors l_nev = local_index(nev, my_pcol, np_cols, nblk, -1) if (l_nev==0) then #ifdef WITH_OPENMP thread_width = 0 #endif stripe_width = 0 stripe_count = 0 last_stripe_width = 0 else ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into ! every primary cache #ifdef WITH_OPENMP thread_width = (l_nev-1)/max_threads + 1 ! 
number of eigenvectors per OMP thread #endif stripe_width = 48 ! Must be a multiple of 4 #ifdef WITH_OPENMP stripe_count = (thread_width-1)/stripe_width + 1 #else stripe_count = (l_nev-1)/stripe_width + 1 #endif ! Adapt stripe width so that last one doesn't get too small #ifdef WITH_OPENMP stripe_width = (thread_width-1)/stripe_count + 1 #else stripe_width = (l_nev-1)/stripe_count + 1 #endif stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 !!! last_stripe_width = l_nev - (stripe_count-1)*stripe_width endif ! Determine the matrix distribution at the beginning allocate(limits(0:np_rows)) call determine_workload(na, nbw, np_rows, limits) max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) a_dim2 = max_blk_size + nbw #ifdef WITH_OPENMP if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a_real)) /= 0) then #else if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a_real)) /= 0) then #endif write(error_unit,*) "Cannot allocate memory" success = .false. return endif call c_f_pointer(a_ptr, a, & #ifdef WITH_OPENMP [stripe_width,a_dim2,stripe_count,max_threads] & #else [stripe_width,a_dim2,stripe_count] & #endif ) #ifndef WITH_OPENMP a(:,:,:) = 0 #else ! a(:,:,:,:) should be set to 0 in a parallel region, not here! #endif allocate(row(l_nev)) row(:) = 0 ! Copy q from a block cyclic distribution into a distribution with contiguous rows, ! and transpose the matrix using stripes of given stripe_width for cache blocking. ! The peculiar way it is done below is due to the fact that the last row should be ! ready first since it is the first one to start below #ifdef WITH_OPENMP ! Please note about the OMP usage below: ! This is not for speed, but because we want the matrix a in the memory and ! 
in the cache of the correct thread (if possible) #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads a(:,:,:,my_thread) = 0 ! if possible, do first touch allocation! enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #endif do ip = np_rows-1, 0, -1 if (my_prow == ip) then ! Receive my rows which have not yet been received src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) do i=limits(ip)+1,limits(ip+1) src = mod((i-1)/nblk, np_rows) if (src < my_prow) then #ifdef WITH_OPENMP #ifdef WITH_MPI call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS, mpierr) #else row(1:l_nev) = row(1:l_nev) #endif #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads call unpack_row_real_cpu_openmp(a, row,i-limits(ip),my_thread, stripe_count, & thread_width, stripe_width, l_nev) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) #else row(1:l_nev) = row(1:l_nev) #endif call unpack_row_real_cpu(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) #endif /* WITH_OPENMP */ elseif (src==my_prow) then src_offset = src_offset+1 row(:) = q(src_offset, 1:l_nev) #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads call unpack_row_real_cpu_openmp(a, row,i-limits(ip),my_thread, & stripe_count, thread_width, stripe_width, l_nev) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #else /* WITH_OPENMP */ call unpack_row_real_cpu(a, row,i-limits(ip), 
stripe_count, stripe_width, last_stripe_width) #endif /* WITH_OPENMP */ endif enddo ! Send all rows which have not yet been send src_offset = 0 do dst = 0, ip-1 do i=limits(dst)+1,limits(dst+1) if (mod((i-1)/nblk, np_rows) == my_prow) then src_offset = src_offset+1 row(:) = q(src_offset, 1:l_nev) #ifdef WITH_MPI call MPI_Send(row, l_nev, MPI_REAL8, dst, 0, mpi_comm_rows, mpierr) #endif endif enddo enddo else if (my_prow < ip) then ! Send all rows going to PE ip src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) do i=limits(ip)+1,limits(ip+1) src = mod((i-1)/nblk, np_rows) if (src == my_prow) then src_offset = src_offset+1 row(:) = q(src_offset, 1:l_nev) #ifdef WITH_MPI call MPI_Send(row, l_nev, MPI_REAL8, ip, 0, mpi_comm_rows, mpierr) #endif endif enddo ! Receive all rows from PE ip do i=limits(my_prow)+1,limits(my_prow+1) src = mod((i-1)/nblk, np_rows) if (src == ip) then #ifdef WITH_OPENMP #ifdef WITH_MPI call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS, mpierr) #else row(1:l_nev) = row(1:l_nev) #endif #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads call unpack_row_real_cpu_openmp(a, row,i-limits(my_prow),my_thread, & stripe_count, thread_width, stripe_width, l_nev) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) #else row(1:l_nev) = row(1:l_nev) #endif call unpack_row_real_cpu(a, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width) #endif /* WITH_OPENMP */ endif enddo endif enddo ! 
Set up result buffer queue num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows num_result_buffers = 4*nfact allocate(result_buffer(l_nev,nblk,num_result_buffers)) allocate(result_send_request(num_result_buffers)) allocate(result_recv_request(num_result_buffers)) #ifdef WITH_MPI result_send_request(:) = MPI_REQUEST_NULL result_recv_request(:) = MPI_REQUEST_NULL #endif ! Queue up buffers if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends do j = 1, min(num_result_buffers, num_result_blocks) #ifdef WITH_MPI call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_REAL8, 0, result_recv_tag, & mpi_comm_rows, result_recv_request(j), mpierr) #else ! carefull the "recv" has to be done at the corresponding wait or send ! result_buffer(1: l_nev*nblk,1,j) =result_buffer(1:l_nev*nblk,1,nbuf) #endif enddo endif num_bufs_recvd = 0 ! No buffers received yet ! Initialize top/bottom requests allocate(top_send_request(stripe_count)) allocate(top_recv_request(stripe_count)) allocate(bottom_send_request(stripe_count)) allocate(bottom_recv_request(stripe_count)) #ifdef WITH_MPI top_send_request(:) = MPI_REQUEST_NULL top_recv_request(:) = MPI_REQUEST_NULL bottom_send_request(:) = MPI_REQUEST_NULL bottom_recv_request(:) = MPI_REQUEST_NULL #endif #ifdef WITH_OPENMP allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count)) allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count)) allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count)) allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count)) top_border_send_buffer(:,:) = 0 top_border_recv_buffer(:,:) = 0 bottom_border_send_buffer(:,:) = 0 bottom_border_recv_buffer(:,:) = 0 ! 
Initialize broadcast buffer #else allocate(top_border_send_buffer(stripe_width, nbw, stripe_count)) allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count)) allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count)) allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count)) top_border_send_buffer(:,:,:) = 0 top_border_recv_buffer(:,:,:) = 0 bottom_border_send_buffer(:,:,:) = 0 bottom_border_recv_buffer(:,:,:) = 0 #endif allocate(bcast_buffer(nbw, max_blk_size)) bcast_buffer = 0 current_tv_off = 0 ! Offset of next row to be broadcast ! ------------------- start of work loop ------------------- a_off = 0 ! offset in A (to avoid unnecessary shifts) top_msg_length = 0 bottom_msg_length = 0 do sweep = 0, (na-1)/nbw current_n = na - sweep*nbw call determine_workload(current_n, nbw, np_rows, limits) current_n_start = limits(my_prow) current_n_end = limits(my_prow+1) current_local_n = current_n_end - current_n_start next_n = max(current_n - nbw, 0) call determine_workload(next_n, nbw, np_rows, limits) next_n_start = limits(my_prow) next_n_end = limits(my_prow+1) next_local_n = next_n_end - next_n_start if (next_n_end < next_n) then bottom_msg_length = current_n_end - next_n_end else bottom_msg_length = 0 endif if (next_local_n > 0) then next_top_msg_length = current_n_start - next_n_start else next_top_msg_length = 0 endif if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then do i = 1, stripe_count #ifdef WITH_OPENMP csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width" b_len = csw*nbw*max_threads #ifdef WITH_MPI call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_REAL8, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #else ! carefull the "recieve" has to be done at the corresponding wait or send ! 
bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #else ! carefull the recieve has to be done at the corresponding wait or send ! bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i) #endif #endif /* WITH_OPENMP */ enddo endif if (current_local_n > 1) then if (my_pcol == mod(sweep,np_cols)) then bcast_buffer(:,1:current_local_n) = hh_trans_real(:,current_tv_off+1:current_tv_off+current_local_n) current_tv_off = current_tv_off + current_local_n endif #ifdef WITH_MPI call mpi_bcast(bcast_buffer, nbw*current_local_n, MPI_REAL8, mod(sweep,np_cols), mpi_comm_cols, mpierr) #endif else ! for current_local_n == 1 the one and only HH vector is 0 and not stored in hh_trans_real bcast_buffer(:,1) = 0 endif if (l_nev == 0) cycle if (current_local_n > 0) then do i = 1, stripe_count #ifdef WITH_OPENMP ! Get real stripe width for strip i; ! The last OpenMP tasks may have an even smaller stripe with, ! but we don't care about this, i.e. we send/recv a bit too much in this case. ! 
csw: current_stripe_width csw = min(stripe_width, thread_width-(i-1)*stripe_width) #endif !wait_b if (current_n_end < current_n) then #ifdef WITH_OPENMP #ifdef WITH_MPI call MPI_Wait(bottom_recv_request(i), MPI_STATUS, mpierr) #endif #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads n_off = current_local_n+a_off b_len = csw*nbw b_off = (my_thread-1)*b_len a(1:csw,n_off+1:n_off+nbw,i,my_thread) = & reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /)) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #else #ifdef WITH_MPI call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr) #endif n_off = current_local_n+a_off a(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i) #endif if (next_n_end < next_n) then #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, & MPI_REAL8, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #else call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #endif #else /* WITH_MPI */ #ifdef WITH_OPENMP ! carefull the recieve has to be done at the corresponding wait or send ! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) #else ! carefull the recieve has to be done at the corresponding wait or send ! 
bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i) #endif #endif /* WITH_MPI */ endif endif if (current_local_n <= bottom_msg_length + top_msg_length) then !wait_t if (top_msg_length>0) then #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(top_recv_request(i), MPI_STATUS, mpierr) #else call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) #endif #endif endif !compute #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads if (top_msg_length>0) then b_len = csw*top_msg_length b_off = (my_thread-1)*b_len a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) endif call compute_hh_trafo_real_cpu_openmp(a,stripe_width,a_dim2,stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & 0, current_local_n, i, my_thread, thread_width, & THIS_REAL_ELPA_KERNEL) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #else call compute_hh_trafo_real_cpu(a, stripe_width,a_dim2,stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & 0, current_local_n, i, & last_stripe_width, THIS_REAL_ELPA_KERNEL) #endif !send_b #ifdef WITH_OPENMP #ifdef WITH_MPI call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) #endif if (bottom_msg_length>0) then n_off = current_local_n+nbw-bottom_msg_length+a_off b_len = csw*bottom_msg_length*max_threads bottom_border_send_buffer(1:b_len,i) = & reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) #ifdef WITH_MPI call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_REAL8, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #else if 
(next_top_msg_length > 0) then top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = & bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) endif #endif endif #else /* WITH_OPENMP */ #ifdef WITH_MPI call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) #endif if (bottom_msg_length>0) then n_off = current_local_n+nbw-bottom_msg_length+a_off bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) #ifdef WITH_MPI call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_REAL8, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #else if (next_top_msg_length > 0) then top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = & bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i) endif #endif endif #endif /* WITH_OPENMP */ else !compute #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads call compute_hh_trafo_real_cpu_openmp(a, stripe_width,a_dim2,stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, thread_width, & THIS_REAL_ELPA_KERNEL) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif !send_b #ifdef WITH_MPI call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) #endif if (bottom_msg_length > 0) then n_off = current_local_n+nbw-bottom_msg_length+a_off b_len = csw*bottom_msg_length*max_threads bottom_border_send_buffer(1:b_len,i) = & reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) #ifdef WITH_MPI call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_REAL8, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #else if (next_top_msg_length > 0) then 
top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = & bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) endif #endif endif #else /* WITH_OPENMP */ call compute_hh_trafo_real_cpu(a, stripe_width,a_dim2,stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & current_local_n - bottom_msg_length, bottom_msg_length, i, & last_stripe_width, THIS_REAL_ELPA_KERNEL) !send_b #ifdef WITH_MPI call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) #endif if (bottom_msg_length > 0) then n_off = current_local_n+nbw-bottom_msg_length+a_off bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) #ifdef WITH_MPI call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_REAL8, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #else if (next_top_msg_length > 0) then top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = & bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i) endif #endif endif #endif /* WITH_OPENMP */ !compute #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads call compute_hh_trafo_real_cpu_openmp(a,stripe_width,a_dim2,stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, my_thread, thread_width, & THIS_REAL_ELPA_KERNEL) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #else call compute_hh_trafo_real_cpu(a, stripe_width,a_dim2,stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & last_stripe_width, THIS_REAL_ELPA_KERNEL) #endif !wait_t if (top_msg_length>0) then #ifdef WITH_MPI #ifdef WITH_OPENMP 
call MPI_Wait(top_recv_request(i), mpi_status, mpierr) #else call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) #endif #endif endif !compute #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads if (top_msg_length>0) then b_len = csw*top_msg_length b_off = (my_thread-1)*b_len a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) endif call compute_hh_trafo_real_cpu_openmp(a, stripe_width,a_dim2,stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & 0, top_msg_length, i, my_thread, thread_width, THIS_REAL_ELPA_KERNEL) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #else call compute_hh_trafo_real_cpu(a, stripe_width,a_dim2,stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & 0, top_msg_length, i, & last_stripe_width, THIS_REAL_ELPA_KERNEL) #endif endif if (next_top_msg_length > 0) then !request top_border data #ifdef WITH_OPENMP b_len = csw*next_top_msg_length*max_threads #ifdef WITH_MPI call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_REAL8, my_prow-1, & top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) #else ! carefull the "recieve" has to be done at the corresponding wait or send ! top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, MPI_REAL8, my_prow-1, & top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) #else ! carefull the "recieve" has to be done at the corresponding wait or send ! 
top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = & ! bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i) #endif #endif /* WITH_OPENMP */ endif !send_t if (my_prow > 0) then #ifdef WITH_OPENMP #ifdef WITH_MPI call MPI_Wait(top_send_request(i), mpi_status, mpierr) #endif b_len = csw*nbw*max_threads top_border_send_buffer(1:b_len,i) = reshape(a(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /)) #ifdef WITH_MPI call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_REAL8, & my_prow-1, bottom_recv_tag, & mpi_comm_rows, top_send_request(i), mpierr) #else if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) endif if (next_n_end < next_n) then bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) endif #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) #endif top_border_send_buffer(:,1:nbw,i) = a(:,a_off+1:a_off+nbw,i) #ifdef WITH_MPI call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow-1, bottom_recv_tag, & mpi_comm_rows, top_send_request(i), mpierr) #else if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i) endif if (next_n_end < next_n) then bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i) endif #endif #endif /* WITH_OPENMP */ endif ! 
Care that there are not too many outstanding top_recv_request's if (stripe_count > 1) then if (i>1) then #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(top_recv_request(i-1), MPI_STATUS, mpierr) #else call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr) #endif #endif else #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS, mpierr) #else call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr) #endif #endif endif endif enddo top_msg_length = next_top_msg_length else ! wait for last top_send_request #ifdef WITH_MPI do i = 1, stripe_count #ifdef WITH_OPENMP call MPI_Wait(top_send_request(i), MPI_STATUS, mpierr) #else call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) #endif enddo #endif endif ! Care about the result if (my_prow == 0) then ! topmost process sends nbw rows to destination processes do j=0,nfact-1 num_blk = sweep*nfact+j ! global number of destination block, 0 based if (num_blk*nblk >= na) exit nbuf = mod(num_blk, num_result_buffers) + 1 ! 
buffer number to get this block #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(result_send_request(nbuf), MPI_STATUS, mpierr) #else call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr) #endif #endif dst = mod(num_blk, np_rows) if (dst == 0) then do i = 1, min(na - num_blk*nblk, nblk) #ifdef WITH_OPENMP call pack_row_real_cpu_openmp(a, row, j*nblk+i+a_off, stripe_width, & stripe_count, max_threads, thread_width, l_nev) #else call pack_row_real_cpu(a, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) #endif q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:) enddo else do i = 1, nblk #ifdef WITH_OPENMP call pack_row_real_cpu_openmp(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, & stripe_width, stripe_count, max_threads, thread_width, l_nev) #else call pack_row_real_cpu(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) #endif enddo #ifdef WITH_MPI call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_REAL8, dst, & result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr) #else if (j+num_result_buffers < num_result_blocks) & result_buffer(1:l_nev,1:nblk,nbuf) = result_buffer(1:l_nev,1:nblk,nbuf) if (my_prow > 0 .and. l_nev>0) then do j1 = 1, min(num_result_buffers, num_result_blocks) result_buffer(1:l_nev,1:nblk,j1) = result_buffer(1:l_nev,1:nblk,nbuf) enddo endif #endif endif enddo else ! receive and store final result do j = num_bufs_recvd, num_result_blocks-1 nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block ! If there is still work to do, just test for the next result request ! and leave the loop if it is not ready, otherwise wait for all ! outstanding requests if (next_local_n > 0) then #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS, mpierr) #else call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr) #endif #else /* WITH_MPI */ flag = .true. 
#endif /* WITH_MPI */ if (.not.flag) exit else #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(result_recv_request(nbuf), MPI_STATUS, mpierr) #else call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr) #endif #endif endif ! Fill result buffer into q num_blk = j*np_rows + my_prow ! global number of current block, 0 based do i = 1, min(na - num_blk*nblk, nblk) q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf) enddo ! Queue result buffer again if there are outstanding blocks left #ifdef WITH_MPI if (j+num_result_buffers < num_result_blocks) & call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_REAL8, 0, result_recv_tag, & mpi_comm_rows, result_recv_request(nbuf), mpierr) #else ! carefull the "recieve" has to be done at the corresponding wait or send ! if (j+num_result_buffers < num_result_blocks) & ! result_buffer(1:l_nev*nblk,1,nbuf) = result_buffer(1:l_nev*nblk,1,nbuf) #endif enddo num_bufs_recvd = j endif ! Shift the remaining rows to the front of A (if necessary) offset = nbw - top_msg_length if (offset<0) then if (wantDebug) write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: internal error, offset for shifting = ',offset success = .false. return endif a_off = a_off + offset if (a_off + next_local_n + nbw > a_dim2) then #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread, i, j), schedule(static, 1) do my_thread = 1, max_threads do i = 1, stripe_count do j = top_msg_length+1, top_msg_length+next_local_n A(:,j,i,my_thread) = A(:,j+a_off,i,my_thread) enddo #else do i = 1, stripe_count do j = top_msg_length+1, top_msg_length+next_local_n A(:,j,i) = A(:,j+a_off,i) #endif enddo enddo #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #endif a_off = 0 endif enddo #ifdef WITH_MPI ! 
! Just for safety:
      if (ANY(top_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_send_request ***',my_prow,my_pcol
      if (ANY(bottom_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_send_request ***',my_prow,my_pcol
      if (ANY(top_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_recv_request ***',my_prow,my_pcol
      if (ANY(bottom_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_recv_request ***',my_prow,my_pcol
#endif

      if (my_prow == 0) then
#ifdef WITH_MPI
#ifdef WITH_OPENMP
        allocate(mpi_statuses(MPI_STATUS_SIZE,num_result_buffers))
        call MPI_Waitall(num_result_buffers, result_send_request, mpi_statuses, mpierr)
        deallocate(mpi_statuses)
#else
        call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr)
#endif
#endif
      endif

#ifdef WITH_MPI
      if (ANY(result_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_send_request ***',my_prow,my_pcol
      if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol
#endif

      if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
          write(error_unit,'(" Kernel time:",f10.3," MFlops: ",f10.3)') kernel_time, kernel_flops/kernel_time*1.d-6

      ! deallocate all working space
      nullify(a)
      call free(a_ptr)
      deallocate(row)
      deallocate(limits)
      deallocate(result_send_request)
      deallocate(result_recv_request)
      deallocate(top_border_send_buffer)
      deallocate(top_border_recv_buffer)
      deallocate(bottom_border_send_buffer)
      deallocate(bottom_border_recv_buffer)
      deallocate(result_buffer)
      deallocate(bcast_buffer)
      deallocate(top_send_request)
      deallocate(top_recv_request)
      deallocate(bottom_send_request)
      deallocate(bottom_recv_request)
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("trans_ev_tridi_to_band_real")
#endif
      return
    end subroutine trans_ev_tridi_to_band_real

    !---------------------------------------------------------------------------
    ! single_hh_trafo: apply one Householder-type update, in place, to the
    ! first nb columns of q (rows 1:nq only).
    !
    ! Arguments
    !   q(ldq,*)  matrix updated in place; only q(1:nq,1:nb) is read/written
    !   hh(*)     hh(1) is the scalar factor (called "tau" in the comments
    !             below); hh(2:nb) are the vector components applied to
    !             columns 2..nb. Column 1 is combined with an implicit
    !             coefficient of 1 (hh(1) is never used as a vector entry).
    !   nb        number of columns of q taking part in the update
    !   nq        number of rows of q taking part in the update
    !   ldq       leading dimension of q
    !
    ! Effect, with w = (1, hh(2), ..., hh(nb)):
    !   v        = hh(1) * sum_i q(1:nq,i) * w(i)
    !   q(1:nq,i) = q(1:nq,i) - v * w(i)      for i = 1..nb
    !
    ! No argument carries an intent attribute; q is the only one modified.
    ! v(nq) is an automatic array sized by the dummy argument nq.
    !---------------------------------------------------------------------------
    subroutine single_hh_trafo(q, hh, nb, nq, ldq)
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      use precision
      ! Perform single real Householder transformation.
      ! This routine is not performance critical and thus it is coded here in Fortran

      implicit none
      integer(kind=ik) :: nb, nq, ldq
      real(kind=rk) :: q(ldq, *), hh(*)

      integer(kind=ik) :: i
      real(kind=rk) :: v(nq)

#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("single_hh_trafo")
#endif

      ! v = q * hh
      ! (column 1 enters unscaled: its coefficient is the implicit 1)
      v(:) = q(1:nq,1)
      do i=2,nb
        v(:) = v(:) + q(1:nq,i) * hh(i)
      enddo

      ! v = v * tau
      ! (tau is stored in hh(1))
      v(:) = v(:) * hh(1)

      ! q = q - v * hh**T
      q(1:nq,1) = q(1:nq,1) - v(:)
      do i=2,nb
        q(1:nq,i) = q(1:nq,i) - v(:) * hh(i)
      enddo

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("single_hh_trafo")
#endif
    end subroutine

    !---------------------------------------------------------------------------
    ! determine_workload: fill limits(0:nprocs) with cumulative boundaries
    ! that split na items among nprocs processes.
    !
    ! Arguments
    !   na      (in)  number of items to distribute
    !   nb      (in)  block size used when there is too little work
    !   nprocs  (in)  number of processes
    !   limits  (out) limits(0:nprocs), non-decreasing for positive nb;
    !                 callers visible in this file take limits(p) and
    !                 limits(p+1) as the start/end offsets of process p
    !
    ! Cases
    !   na <= 0         : every entry is 0
    !   nb*nprocs > na  : limits(i) = min(na, i*nb), i.e. the leading
    !                     processes get nb items each until na is used up and
    !                     the remaining processes get an empty range
    !   otherwise       : limits(i) = (i*na)/nprocs (integer division), an
    !                     even split with limits(0) = 0 and limits(nprocs) = na
    !---------------------------------------------------------------------------
    subroutine determine_workload(na, nb, nprocs, limits)
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      use precision
      implicit none

      integer(kind=ik), intent(in) :: na, nb, nprocs
      integer(kind=ik), intent(out) :: limits(0:nprocs)

      integer(kind=ik) :: i

#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("determine_workload")
#endif

      ! Nothing to distribute: all boundaries collapse to 0.
      ! The timer is stopped here as well because of the early return.
      if (na <= 0) then
        limits(:) = 0
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("determine_workload")
#endif
        return
      endif

      if (nb*nprocs > na) then
        ! there is not enough work for all
        do i = 0, nprocs
          limits(i) = min(na, i*nb)
        enddo
      else
        ! enough work for everybody: even split, rounded down
        do i = 0, nprocs
          limits(i) = (i*na)/nprocs
        enddo
      endif

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("determine_workload")
#endif
    end subroutine

    subroutine bandred_complex(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, wantDebug, success)

    !-------------------------------------------------------------------------------
    ! bandred_complex: Reduces a distributed hermitian matrix to band form
    !
    ! Parameters
    !
    ! na Order of matrix
    !
    ! a(lda,matrixCols) Distributed matrix which should be reduced.
    ! Distribution is like in Scalapack.
    ! Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
    ! a(:,:) is overwritten on exit with the band and the Householder vectors
    ! in the upper half.
    !
    ! lda Leading dimension of a
    ! matrixCols local columns of matrix a
    !
    !
nblk blocksize of cyclic distribution, must be the same in both directions! ! ! nbw semi bandwith of output matrix ! ! mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns ! ! tmat(nbw,nbw,numBlocks) where numBlocks = (na-1)/nbw + 1 ! Factors for the Householder vectors (returned), needed for back transformation ! !------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols #ifdef DESPERATELY_WANT_ASSUMED_SIZE complex(kind=ck) :: a(lda,*), tmat(nbw,nbw,*) #else complex(kind=ck) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks) #endif complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: l_cols, l_rows integer(kind=ik) :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow integer(kind=ik) :: istep, ncol, lch, lcx, nlc integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile real(kind=rk) :: vnorm2 complex(kind=ck) :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw) complex(kind=ck), allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:) logical, intent(in) :: wantDebug logical, intent(out) :: success #ifdef HAVE_DETAILED_TIMINGS call timer%start("bandred_complex") #endif call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) success = .true. ! Semibandwith nbw must be a multiple of blocksize nblk if (mod(nbw,nblk)/=0) then if (my_prow==0 .and. my_pcol==0) then if (wantDebug) then write(error_unit,*) 'ELPA2_bandred_complex: ERROR: nbw=',nbw,', nblk=',nblk write(error_unit,*) 'ELPA2_bandred_complex: ELPA2 works only for nbw==n*nblk' endif success = .false. return endif endif ! 
Matrix is split into tiles; work is done only for tiles on the diagonal or above tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide l_rows_tile = tile_size/np_rows ! local rows of a tile l_cols_tile = tile_size/np_cols ! local cols of a tile do istep = (na-1)/nbw, 1, -1 n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step ! Number of local columns/rows of remaining matrix l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1) l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1) ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces allocate(vmr(max(l_rows,1),2*n_cols)) allocate(umc(max(l_cols,1),2*n_cols)) allocate(vr(l_rows+1)) vmr(1:l_rows,1:n_cols) = 0. vr(:) = 0 tmat(:,:,istep) = 0 ! Reduce current block to lower triangular form do lc = n_cols, 1, -1 ncol = istep*nbw + lc ! absolute column number of householder vector nrow = ncol - nbw ! Absolute number of pivot row lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number tau = 0 if(nrow == 1) exit ! Nothing to do cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block if (my_pcol==cur_pcol) then ! Get vector to be transformed; distribute last element and norm of ! remaining elements to all procs in current column vr(1:lr) = a(1:lr,lch) ! vector to be transformed if (my_prow==prow(nrow, nblk, np_rows)) then aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1)) aux1(2) = vr(lr) else aux1(1) = dot_product(vr(1:lr),vr(1:lr)) aux1(2) = 0. endif #ifdef WITH_MPI call mpi_allreduce(aux1,aux2,2,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) #else aux2 = aux1 #endif vnorm2 = aux2(1) vrl = aux2(2) ! Householder transformation call hh_transform_complex(vrl, vnorm2, xf, tau) ! 
Scale vr and store Householder vector for back transformation vr(1:lr) = vr(1:lr) * xf if (my_prow==prow(nrow, nblk, np_rows)) then a(1:lr-1,lch) = vr(1:lr-1) a(lr,lch) = vrl vr(lr) = 1. else a(1:lr,lch) = vr(1:lr) endif endif ! Broadcast Householder vector and tau along columns vr(lr+1) = tau #ifdef WITH_MPI call MPI_Bcast(vr,lr+1,MPI_DOUBLE_COMPLEX,cur_pcol,mpi_comm_cols,mpierr) #endif vmr(1:lr,lc) = vr(1:lr) tau = vr(lr+1) tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat ! Transform remaining columns in current block with Householder vector ! Local dot product aux1 = 0 nlc = 0 ! number of local columns do j=1,lc-1 lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) if (lcx>0) then nlc = nlc+1 aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx)) endif enddo ! Get global dot products #ifdef WITH_MPI if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) #else if (nlc>0) aux2=aux1 #endif ! Transform nlc = 0 do j=1,lc-1 lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) if (lcx>0) then nlc = nlc+1 a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr) endif enddo enddo ! Calculate scalar products of stored Householder vectors. ! This can be done in different ways, we use zherk vav = 0 if (l_rows>0) & call zherk('U','C',n_cols,l_rows,CONE,vmr,ubound(vmr,dim=1),CZERO,vav,ubound(vav,dim=1)) call herm_matrix_allreduce(n_cols,vav, nbw,nbw,mpi_comm_rows) ! Calculate triangular matrix T for block Householder Transformation do lc=n_cols,1,-1 tau = tmat(lc,lc,istep) if (lc vmc (stored in umc, second half) call elpa_transpose_vectors_complex (vmr, ubound(vmr,dim=1), mpi_comm_rows, & umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, & 1, istep*nbw, n_cols, nblk) ! Calculate umc = A**T * vmr ! Note that the distributed A has to be transposed ! Opposed to direct tridiagonalization there is no need to use the cache locality ! 
of the tiles, so we can use strips of the matrix umc(1:l_cols,1:n_cols) = 0.d0 vmr(1:l_rows,n_cols+1:2*n_cols) = 0 if (l_cols>0 .and. l_rows>0) then do i=0,(istep*nbw-1)/tile_size lcs = i*l_cols_tile+1 lce = min(l_cols,(i+1)*l_cols_tile) if (lce0) then allocate(tmp(l_cols,n_cols)) call mpi_allreduce(umc,tmp,l_cols*n_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols) deallocate(tmp) endif #endif ! U = U * Tmat**T call ztrmm('Right','Upper','C','Nonunit',l_cols,n_cols,CONE,tmat(1,1,istep),ubound(tmat,dim=1),umc,ubound(umc,dim=1)) ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T call zgemm('C','N',n_cols,n_cols,l_cols,CONE,umc,ubound(umc,dim=1),umc(1,n_cols+1), & ubound(umc,dim=1),CZERO,vav,ubound(vav,dim=1)) call ztrmm('Right','Upper','C','Nonunit',n_cols,n_cols,CONE,tmat(1,1,istep),ubound(tmat,dim=1),vav,ubound(vav,dim=1)) call herm_matrix_allreduce(n_cols,vav, nbw,nbw,mpi_comm_cols) ! U = U - 0.5 * V * VAV call zgemm('N','N',l_cols,n_cols,n_cols,(-0.5d0,0.d0),umc(1,n_cols+1),ubound(umc,dim=1),vav,ubound(vav,dim=1), & CONE,umc,ubound(umc,dim=1)) ! Transpose umc -> umr (stored in vmr, second half) call elpa_transpose_vectors_complex (umc, ubound(umc,dim=1), mpi_comm_cols, & vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, & 1, istep*nbw, n_cols, nblk) ! 
A = A - V*U**T - U*V**T do i=0,(istep*nbw-1)/tile_size lcs = i*l_cols_tile+1 lce = min(l_cols,(i+1)*l_cols_tile) lre = min(l_rows,(i+1)*l_rows_tile) if (lce0) then call zgemm('C','N',n_cols,l_cols,l_rows,CONE,hvm,ubound(hvm,dim=1), & q,ldq,CZERO,tmp1,n_cols) else tmp1(1:l_cols*n_cols) = 0 endif #ifdef WITH_MPI call mpi_allreduce(tmp1,tmp2,n_cols*l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) #else tmp2=tmp1 #endif if (l_rows>0) then call ztrmm('L','U','C','N',n_cols,l_cols,CONE,tmat(1,1,istep),ubound(tmat,dim=1),tmp2,n_cols) call zgemm('N','N',l_rows,l_cols,n_cols,-CONE,hvm,ubound(hvm,dim=1), & tmp2,n_cols,CONE,q,ldq) endif enddo deallocate(tmp1, tmp2, hvb, hvm) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("trans_ev_band_to_full_complex") #endif end subroutine trans_ev_band_to_full_complex subroutine tridiag_band_complex(na, nb, nblk, a, lda, d, e, matrixCols, hh_trans_complex, & mpi_comm_rows, mpi_comm_cols, mpi_comm) !------------------------------------------------------------------------------- ! tridiag_band_complex: ! Reduces a complex hermitian symmetric band matrix to tridiagonal form ! ! na Order of matrix a ! ! nb Semi bandwith ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ! a(lda,matrixCols) Distributed system matrix reduced to banded form in the upper diagonal ! ! lda Leading dimension of a ! matrixCols local columns of matrix a ! ! d(na) Diagonal of tridiagonal matrix, set only on PE 0 (output) ! ! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0 (output) ! ! mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns ! mpi_comm ! 
MPI-Communicator for the total processor set !------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm #ifdef DESPERATELY_WANT_ASSUMED_SIZE complex(kind=ck),intent(in) :: a(lda,*) #else complex(kind=ck), intent(in) :: a(lda,matrixCols) #endif real(kind=rk), intent(out) :: d(na), e(na) ! set only on PE 0 complex(kind=ck), intent(inout), & allocatable :: hh_trans_complex(:,:) real(kind=rk) :: vnorm2 complex(kind=ck) :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf complex(kind=ck) :: hd(nb), hs(nb) integer(kind=ik) :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt integer(kind=ik) :: my_pe, n_pes, mpierr integer(kind=ik) :: my_prow, np_rows, my_pcol, np_cols integer(kind=ik) :: ireq_ab, ireq_hv integer(kind=ik) :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off #ifdef WITH_OPENMP integer(kind=ik), allocatable :: mpi_statuses(:,:) integer(kind=ik), allocatable :: omp_block_limits(:) integer(kind=ik) :: max_threads, my_thread, my_block_s, my_block_e, iter integer(kind=ik) :: omp_get_max_threads #ifdef WITH_MPI integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) #endif complex(kind=ck), allocatable :: hv_t(:,:), tau_t(:) #endif integer(kind=ik), allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:) integer(kind=ik), allocatable :: limits(:), snd_limits(:,:) integer(kind=ik), allocatable :: block_limits(:) complex(kind=ck), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:) #ifndef WITH_MPI integer(kind=ik) :: startAddr #endif ! ! dummies for calling redist_band ! 
real*8 :: r_a(1,1), r_ab(1,1) #ifdef HAVE_DETAILED_TIMINGS call timer%start("tridiag_band_complex") #endif call mpi_comm_rank(mpi_comm,my_pe,mpierr) call mpi_comm_size(mpi_comm,n_pes,mpierr) call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) ! Get global_id mapping 2D procssor coordinates to global id allocate(global_id(0:np_rows-1,0:np_cols-1)) global_id(:,:) = 0 global_id(my_prow, my_pcol) = my_pe #ifdef WITH_MPI call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) #endif ! Total number of blocks in the band: nblocks_total = (na-1)/nb + 1 ! Set work distribution allocate(block_limits(0:n_pes)) call divide_band(nblocks_total, n_pes, block_limits) ! nblocks: the number of blocks for my task nblocks = block_limits(my_pe+1) - block_limits(my_pe) ! allocate the part of the band matrix which is needed by this PE ! The size is 1 block larger than needed to avoid extensive shifts allocate(ab(2*nb,(nblocks+1)*nb)) ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety ! n_off: Offset of ab within band n_off = block_limits(my_pe)*nb ! Redistribute band in a to ab call redist_band_complex(a, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab) ! Calculate the workload for each sweep in the back transformation ! and the space requirements to hold the HH vectors allocate(limits(0:np_rows)) call determine_workload(na, nb, np_rows, limits) max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) num_hh_vecs = 0 num_chunks = 0 nx = na do n = 1, nblocks_total call determine_workload(nx, nb, np_rows, limits) local_size = limits(my_prow+1) - limits(my_prow) ! add to number of householder vectors ! please note: for nx==1 the one and only HH vector is 0 and is neither calculated nor send below! if (mod(n-1,np_cols) == my_pcol .and. 
local_size>0 .and. nx>1) then num_hh_vecs = num_hh_vecs + local_size num_chunks = num_chunks+1 endif nx = nx - nb enddo ! Allocate space for HH vectors allocate(hh_trans_complex(nb,num_hh_vecs)) ! Allocate and init MPI requests allocate(ireq_hhr(num_chunks)) ! Recv requests allocate(ireq_hhs(nblocks)) ! Send requests num_hh_vecs = 0 num_chunks = 0 nx = na nt = 0 do n = 1, nblocks_total call determine_workload(nx, nb, np_rows, limits) local_size = limits(my_prow+1) - limits(my_prow) if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then num_chunks = num_chunks+1 #ifdef WITH_MPI call mpi_irecv(hh_trans_complex(1,num_hh_vecs+1), nb*local_size, MPI_COMPLEX16, nt, & 10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr) #else ! carefull non-block recv data copy must be done at wait or send ! hh_trans_complex(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk) #endif num_hh_vecs = num_hh_vecs + local_size endif nx = nx - nb if (n == block_limits(nt+1)) then nt = nt + 1 endif enddo #ifdef WITH_MPI ireq_hhs(:) = MPI_REQUEST_NULL #endif ! Buffers for gathering/sending the HH vectors allocate(hh_gath(nb,max_blk_size,nblocks)) ! gathers HH vectors allocate(hh_send(nb,max_blk_size,nblocks)) ! send buffer for HH vectors hh_gath(:,:,:) = 0 hh_send(:,:,:) = 0 ! Some counters allocate(hh_cnt(nblocks)) allocate(hh_dst(nblocks)) hh_cnt(:) = 1 ! The first transfomation vector is always 0 and not calculated at all hh_dst(:) = 0 ! PE number for receive #ifdef WITH_MPI ireq_ab = MPI_REQUEST_NULL ireq_hv = MPI_REQUEST_NULL #endif ! Limits for sending allocate(snd_limits(0:np_rows,nblocks)) do iblk=1,nblocks call determine_workload(na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk)) enddo #ifdef WITH_OPENMP ! OpenMP work distribution: max_threads = 1 !$ max_threads = omp_get_max_threads() ! 
For OpenMP we need at least 2 blocks for every thread max_threads = MIN(max_threads, nblocks/2) if (max_threads==0) max_threads = 1 allocate(omp_block_limits(0:max_threads)) ! Get the OpenMP block limits call divide_band(nblocks, max_threads, omp_block_limits) allocate(hv_t(nb,max_threads), tau_t(max_threads)) hv_t = 0 tau_t = 0 #endif ! --------------------------------------------------------------------------- ! Start of calculations na_s = block_limits(my_pe)*nb + 1 if (my_pe>0 .and. na_s<=na) then ! send first column to previous PE ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also) ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) #ifdef WITH_MPI call mpi_isend(ab_s,nb+1,MPI_COMPLEX16,my_pe-1,1,mpi_comm,ireq_ab,mpierr) #endif endif #ifndef WITH_MPI startAddr = ubound(hh_trans_complex,dim=2) #endif #ifdef WITH_OPENMP do istep=1,na-1-block_limits(my_pe)*nb #else do istep=1,na-1 #endif if (my_pe==0) then n = MIN(na-na_s,nb) ! number of rows to be reduced hv(:) = 0 tau = 0 ! Transform first column of remaining matrix ! Opposed to the real case, the last step (istep=na-1) is needed here for making ! the last subdiagonal element a real number vnorm2 = sum(dble(ab(3:n+1,na_s-n_off))**2+dimag(ab(3:n+1,na_s-n_off))**2) if (n<2) vnorm2 = 0. ! Safety only call hh_transform_complex(ab(2,na_s-n_off),vnorm2,hf,tau) hv(1) = 1 hv(2:n) = ab(3:n+1,na_s-n_off)*hf d(istep) = ab(1,na_s-n_off) e(istep) = ab(2,na_s-n_off) if (istep == na-1) then d(na) = ab(1,na_s+1-n_off) e(na) = 0 endif else if (na>na_s) then ! Receive Householder vector from previous task, from PE owning subdiagonal #ifdef WITH_MPI #ifdef WITH_OPENMP call mpi_recv(hv,nb,MPI_COMPLEX16,my_pe-1,2,mpi_comm,mpi_status,mpierr) #else call mpi_recv(hv,nb,MPI_COMPLEX16,my_pe-1,2,mpi_comm,MPI_STATUS_IGNORE,mpierr) #endif #else /* WITH_MPI */ #ifdef WITH_OPENMP hv(1:nb) = hv_s(1:nb) #else hv(1:nb) = hv_s(1:nb) #endif #endif /* WITH_MPI */ tau = hv(1) hv(1) = 1. 
endif endif na_s = na_s+1 if (na_s-n_off > nb) then ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb) ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0 n_off = n_off + nb endif #ifdef WITH_OPENMP if (max_threads > 1) then ! Codepath for OpenMP ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread! ! Every thread is one reduction cycle behind its predecessor and thus starts one step later. ! This simulates the behaviour of the MPI tasks which also work after each other. ! The code would be considerably easier, if the MPI communication would be made within ! the parallel region - this is avoided here since this would require ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program. hv_t(:,1) = hv tau_t(1) = tau do iter = 1, 2 ! iter=1 : work on first block ! iter=2 : work on remaining blocks ! This is done in 2 iterations so that we have a barrier in between: ! After the first iteration, it is guaranteed that the last row of the last block ! is completed by the next thread. ! After the first iteration it is also the place to exchange the last row ! with MPI calls #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, & !$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads) do my_thread = 1, max_threads if (iter == 1) then my_block_s = omp_block_limits(my_thread-1) + 1 my_block_e = my_block_s else my_block_s = omp_block_limits(my_thread-1) + 2 my_block_e = omp_block_limits(my_thread) endif do iblk = my_block_s, my_block_e ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block ne = ns+nb-1 ! last column in block if (istepna) exit hv = hv_t(:,my_thread) tau = tau_t(my_thread) ! Store Householder vector for back transformation hh_cnt(iblk) = hh_cnt(iblk) + 1 hh_gath(1 ,hh_cnt(iblk),iblk) = tau hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) nc = MIN(na-ns-n_off+1,nb) ! 
number of columns in diagonal block nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) ! Note that nr>=0 implies that diagonal block is full (nc==nb)! ! Transform diagonal block call ZHEMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,(0.d0,0.d0),hd,1) x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau) hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc) call ZHER2('L',nc,(-1.d0,0.d0),hd,1,hv,1,ab(1,ns),2*nb-1) hv_t(:,my_thread) = 0 tau_t(my_thread) = 0 if (nr<=0) cycle ! No subdiagonal block present any more ! Transform subdiagonal block call ZGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,(0.d0,0.d0),hs,1) if (nr>1) then ! complete (old) Householder transformation for first column ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 ! calculate new Householder transformation for first column ! (stored in hv_t(:,my_thread) and tau_t(my_thread)) vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2) call hh_transform_complex(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread)) hv_t(1 ,my_thread) = 1. hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf ab(nb+2:,ns) = 0 ! update subdiagonal block for old and new Householder transformation ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster call ZGEMV('C',nr,nb-1,tau_t(my_thread),ab(nb,ns+1),2*nb-1,hv_t(1,my_thread),1,(0.d0,0.d0),h(2),1) x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread) h(2:nb) = h(2:nb) - x*hv(2:nb) ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2") do i=2,nb ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) & - hv_t(1:nr,my_thread)*conjg(h(i)) - hs(1:nr)*conjg(hv(i)) enddo else ! No new Householder transformation for nr=1, just complete the old one ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1 do i=2,nb ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i)) enddo ! For safety: there is one remaining dummy transformation (but tau is 0 anyways) hv_t(1,my_thread) = 1. endif enddo enddo ! 
my_thread !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif if (iter==1) then ! We are at the end of the first block ! Send our first column to previous PE if (my_pe>0 .and. na_s <= na) then #ifdef WITH_MPI call mpi_wait(ireq_ab,mpi_status,mpierr) #endif ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) #ifdef WITH_MPI call mpi_isend(ab_s,nb+1,MPI_COMPLEX16,my_pe-1,1,mpi_comm,ireq_ab,mpierr) #endif endif ! Request last column from next PE ne = na_s + nblocks*nb - (max_threads-1) - 1 #ifdef WITH_MPI if (istep>=max_threads .and. ne <= na) then call mpi_recv(ab(1,ne-n_off),nb+1,MPI_COMPLEX16,my_pe+1,1,mpi_comm,mpi_status,mpierr) endif #else if (istep>=max_threads .and. ne <= na) then ab(1:nb+1,ne-n_off) = ab_s(1:nb+1) endif #endif else ! We are at the end of all blocks ! Send last HH vector and TAU to next PE if it has been calculated above ne = na_s + nblocks*nb - (max_threads-1) - 1 if (istep>=max_threads .and. ne < na) then #ifdef WITH_MPI call mpi_wait(ireq_hv,mpi_status,mpierr) #endif hv_s(1) = tau_t(max_threads) hv_s(2:) = hv_t(2:,max_threads) #ifdef WITH_MPI call mpi_isend(hv_s,nb,MPI_COMPLEX16,my_pe+1,2,mpi_comm,ireq_hv,mpierr) #endif endif ! "Send" HH vector and TAU to next OpenMP thread do my_thread = max_threads, 2, -1 hv_t(:,my_thread) = hv_t(:,my_thread-1) tau_t(my_thread) = tau_t(my_thread-1) enddo endif enddo ! iter else ! Codepath for 1 thread without OpenMP ! The following code is structured in a way to keep waiting times for ! other PEs at a minimum, especially if there is only one block. ! For this reason, it requests the last column as late as possible ! and sends the Householder vector and the first column as early ! as possible. #endif do iblk=1,nblocks ns = na_s + (iblk-1)*nb - n_off ! first column in block ne = ns+nb-1 ! last column in block if (ns+n_off>na) exit ! 
Store Householder vector for back transformation hh_cnt(iblk) = hh_cnt(iblk) + 1 hh_gath(1 ,hh_cnt(iblk),iblk) = tau hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) #ifndef WITH_OPENMP if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then ! Wait for last transfer to finish #ifdef WITH_MPI call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr) #endif ! Copy vectors into send buffer hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) ! Send to destination #ifdef WITH_MPI call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), MPI_COMPLEX16, & global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), & 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) #else startAddr = startAddr - hh_cnt(iblk) hh_trans_complex(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk) #endif ! Reset counter and increase destination row hh_cnt(iblk) = 0 hh_dst(iblk) = hh_dst(iblk)+1 endif ! The following code is structured in a way to keep waiting times for ! other PEs at a minimum, especially if there is only one block. ! For this reason, it requests the last column as late as possible ! and sends the Householder vector and the first column as early ! as possible. #endif nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) ! Note that nr>=0 implies that diagonal block is full (nc==nb)! ! Multiply diagonal block and subdiagonal block with Householder vector if (iblk==nblocks .and. nc==nb) then ! We need the last column from the next PE. ! First do the matrix multiplications without last column ... ! Diagonal block, the contribution of the last element is added below! ab(1,ne) = 0 call ZHEMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,(0.d0,0.d0),hd,1) ! Subdiagonal block if (nr>0) call ZGEMV('N',nr,nb-1,tau,ab(nb+1,ns),2*nb-1,hv,1,(0.d0,0.d0),hs,1) ! ... then request last column ... 
#ifdef WITH_MPI #ifdef WITH_OPENMP call mpi_recv(ab(1,ne),nb+1,MPI_COMPLEX16,my_pe+1,1,mpi_comm,mpi_status,mpierr) #else call mpi_recv(ab(1,ne),nb+1,MPI_COMPLEX16,my_pe+1,1,mpi_comm,MPI_STATUS_IGNORE,mpierr) #endif #else /* WITH_MPI */ #ifdef WITH_OPENMP ab(1:nb+1,ne) = ab_s(1:nb+1) #else ab(1:nb+1,ne) = ab_s(1:nb+1) #endif #endif /* WITH_MPI */ ! ... and complete the result hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb) hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau else ! Normal matrix multiply call ZHEMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,(0.d0,0.d0),hd,1) if (nr>0) call ZGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,(0.d0,0.d0),hs,1) endif ! Calculate first column of subdiagonal block and calculate new ! Householder transformation for this column hv_new(:) = 0 ! Needed, last rows must be 0 for nr < nb tau_new = 0 if (nr>0) then ! complete (old) Householder transformation for first column ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 ! calculate new Householder transformation ... if (nr>1) then vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2) call hh_transform_complex(ab(nb+1,ns),vnorm2,hf,tau_new) hv_new(1) = 1. hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf ab(nb+2:,ns) = 0 endif ! ... and send it away immediatly if this is the last block if (iblk==nblocks) then #ifdef WITH_MPI #ifdef WITH_OPENMP call mpi_wait(ireq_hv,mpi_status,mpierr) #else call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) #endif #endif hv_s(1) = tau_new hv_s(2:) = hv_new(2:) #ifdef WITH_MPI call mpi_isend(hv_s,nb,MPI_COMPLEX16,my_pe+1,2,mpi_comm,ireq_hv,mpierr) #endif endif endif ! Transform diagonal block x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau) hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc) if (my_pe>0 .and. iblk==1) then ! The first column of the diagonal block has to be send to the previous PE ! Calculate first column only ... ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*conjg(hv(1)) - hv(1:nc)*conjg(hd(1)) ! ... send it away ... 
#ifdef WITH_MPI #ifdef WITH_OPENMP call mpi_wait(ireq_ab,mpi_status,mpierr) #else call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) #endif #endif ab_s(1:nb+1) = ab(1:nb+1,ns) #ifdef WITH_MPI call mpi_isend(ab_s,nb+1,MPI_COMPLEX16,my_pe-1,1,mpi_comm,ireq_ab,mpierr) #endif ! ... and calculate remaining columns with rank-2 update if (nc>1) call ZHER2('L',nc-1,(-1.d0,0.d0),hd(2),1,hv(2),1,ab(1,ns+1),2*nb-1) else ! No need to send, just a rank-2 update call ZHER2('L',nc,(-1.d0,0.d0),hd,1,hv,1,ab(1,ns),2*nb-1) endif ! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... nb if (nr>0) then if (nr>1) then call ZGEMV('C',nr,nb-1,tau_new,ab(nb,ns+1),2*nb-1,hv_new,1,(0.d0,0.d0),h(2),1) x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new h(2:nb) = h(2:nb) - x*hv(2:nb) ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update do i=2,nb ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*conjg(h(i)) - hs(1:nr)*conjg(hv(i)) enddo else ! No double Householder transformation for nr=1, just complete the row do i=2,nb ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i)) enddo endif endif ! Use new HH vector for the next block hv(:) = hv_new(:) tau = tau_new enddo #ifdef WITH_OPENMP endif #endif #ifdef WITH_OPENMP do iblk = 1, nblocks if (hh_dst(iblk) >= np_rows) exit if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then ! Wait for last transfer to finish #ifdef WITH_MPI call mpi_wait(ireq_hhs(iblk), mpi_status, mpierr) #endif ! Copy vectors into send buffer hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) ! 
Send to destination #ifdef WITH_MPI call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_complex16, & global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), & 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) #else startAddr = startAddr - hh_cnt(iblk) hh_trans_complex(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk) #endif ! Reset counter and increase destination row hh_cnt(iblk) = 0 hh_dst(iblk) = hh_dst(iblk)+1 endif enddo #endif enddo ! Finish the last outstanding requests #ifdef WITH_MPI #ifdef WITH_OPENMP call mpi_wait(ireq_ab,mpi_status,mpierr) call mpi_wait(ireq_hv,mpi_status,mpierr) allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks))) call mpi_waitall(nblocks, ireq_hhs, mpi_statuses, mpierr) call mpi_waitall(num_chunks, ireq_hhr, mpi_statuses, mpierr) deallocate(mpi_statuses) #else call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr) call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr) #endif call mpi_barrier(mpi_comm,mpierr) #endif deallocate(ab) deallocate(ireq_hhr, ireq_hhs) deallocate(hh_cnt, hh_dst) deallocate(hh_gath, hh_send) deallocate(limits, snd_limits) deallocate(block_limits) deallocate(global_id) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("tridiag_band_complex") #endif end subroutine tridiag_band_complex subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, matrixCols, & hh_trans_complex, mpi_comm_rows, mpi_comm_cols, & wantDebug, success, THIS_COMPLEX_ELPA_KERNEL) !------------------------------------------------------------------------------- ! trans_ev_tridi_to_band_complex: ! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix ! ! Parameters ! ! na Order of matrix a, number of rows of matrix q ! ! nev Number eigenvectors to compute (= columns of matrix q) ! ! 
nblk blocksize of cyclic distribution, must be the same in both directions! ! ! nb semi bandwith ! ! q On input: Eigenvectors of tridiagonal matrix ! On output: Transformed eigenvectors ! Distribution is like in Scalapack. ! ! ldq Leading dimension of q ! matrixCols local columns of matrix q ! ! mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns/both ! !------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif use pack_unpack_complex use compute_hh_trafo_complex use precision implicit none integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols #ifdef DESPERATELY_WANT_ASSUMED_SIZE complex(kind=ck) :: q(ldq,*) #else complex(kind=ck) :: q(ldq,matrixCols) #endif complex(kind=ck) :: hh_trans_complex(:,:) integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2 integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count #ifdef WITH_OPENMP integer(kind=ik) :: thread_width, csw, b_off, b_len #endif integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd integer(kind=ik) :: a_off, current_tv_off, max_blk_size integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk logical :: flag #ifdef WITH_OPENMP complex(kind=ck), pointer :: a(:,:,:,:) #else complex(kind=ck), pointer :: a(:,:,:) #endif complex(kind=ck) :: a_complex complex(kind=ck), allocatable :: row(:) type(c_ptr) :: a_ptr #ifdef WITH_OPENMP complex(kind=ck), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:) complex(kind=ck), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:) 
#else complex(kind=ck), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:) complex(kind=ck), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:) #endif complex(kind=ck), allocatable :: result_buffer(:,:,:) complex(kind=ck), allocatable :: bcast_buffer(:,:) integer(kind=ik) :: n_off integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:) integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:) integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:) #ifdef WITH_OPENMP integer(kind=ik), allocatable :: mpi_statuses(:,:) #ifdef WITH_MPI integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) #endif #endif ! MPI send/recv tags, arbitrary integer(kind=ik), parameter :: bottom_recv_tag = 111 integer(kind=ik), parameter :: top_recv_tag = 222 integer(kind=ik), parameter :: result_recv_tag = 333 #ifdef WITH_OPENMP integer(kind=ik) :: max_threads, my_thread integer(kind=ik) :: omp_get_max_threads #endif ! Just for measuring the kernel performance real(kind=rk) :: kernel_time ! long integer integer(kind=lik) :: kernel_flops logical, intent(in) :: wantDebug logical :: success #ifndef WITH_MPI integer(kind=ik) :: j1 #endif #ifdef HAVE_DETAILED_TIMINGS call timer%start("trans_ev_tridi_to_band_complex") #endif kernel_time = 1.d-100 kernel_flops = 0 #ifdef WITH_OPENMP max_threads = 1 max_threads = omp_get_max_threads() #endif call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr) call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr) call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr) call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr) success = .true. if (mod(nbw,nblk)/=0) then if (my_prow==0 .and. my_pcol==0) then if (wantDebug) then write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: ERROR: nbw=',nbw,', nblk=',nblk write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: band backtransform works only for nbw==n*nblk' endif success = .false. 
return endif endif nfact = nbw / nblk ! local number of eigenvectors l_nev = local_index(nev, my_pcol, np_cols, nblk, -1) if (l_nev==0) then #ifdef WITH_OPENMP thread_width = 0 #endif stripe_width = 0 stripe_count = 0 last_stripe_width = 0 else ! Suggested stripe width is 48 - should this be reduced for the complex case ??? #ifdef WITH_OPENMP thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread #endif stripe_width = 48 ! Must be a multiple of 4 #ifdef WITH_OPENMP stripe_count = (thread_width-1)/stripe_width + 1 #else stripe_count = (l_nev-1)/stripe_width + 1 #endif ! Adapt stripe width so that last one doesn't get too small #ifdef WITH_OPENMP stripe_width = (thread_width-1)/stripe_count + 1 #else stripe_width = (l_nev-1)/stripe_count + 1 #endif stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 !!! #ifndef WITH_OPENMP last_stripe_width = l_nev - (stripe_count-1)*stripe_width #endif endif ! Determine the matrix distribution at the beginning allocate(limits(0:np_rows)) call determine_workload(na, nbw, np_rows, limits) max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) a_dim2 = max_blk_size + nbw #ifdef WITH_OPENMP if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a_complex)) /= 0) then #else if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a_complex)) /= 0) then #endif write(error_unit,*) "Cannot allocate memory" success = .false. return endif call c_f_pointer(a_ptr, a, & #ifdef WITH_OPENMP [stripe_width,a_dim2,stripe_count,max_threads] & #else [stripe_width,a_dim2,stripe_count] & #endif ) #ifndef WITH_OPENMP a(:,:,:) = 0 #endif allocate(row(l_nev)) row(:) = 0 ! Copy q from a block cyclic distribution into a distribution with contiguous rows, ! and transpose the matrix using stripes of given stripe_width for cache blocking. ! The peculiar way it is done below is due to the fact that the last row should be ! 
ready first since it is the first one to start below #ifdef WITH_OPENMP ! Please note about the OMP usage below: ! This is not for speed, but because we want the matrix a in the memory and ! in the cache of the correct thread (if possible) #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads a(:,:,:,my_thread) = 0 ! if possible, do first touch allocation! enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #endif do ip = np_rows-1, 0, -1 if (my_prow == ip) then ! Receive my rows which have not yet been received src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) do i=limits(ip)+1,limits(ip+1) src = mod((i-1)/nblk, np_rows) if (src < my_prow) then #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, mpi_status, mpierr) #else call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) #endif #else /* WITH_MPI */ #ifdef WITH_OPENMP row(1:l_nev) = row(1:l_nev) #else row(1:l_nev) = row(1:l_nev) #endif #endif /* WITH_MPI */ #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads call unpack_row_complex_cpu_openmp(a, row,i-limits(ip),my_thread, & stripe_count, thread_width, stripe_width, l_nev) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #else call unpack_row_complex_cpu(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) #endif elseif (src==my_prow) then src_offset = src_offset+1 row(:) = q(src_offset, 1:l_nev) #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads call unpack_row_complex_cpu_openmp(a, 
row,i-limits(ip),my_thread, & stripe_count, thread_width, stripe_width, l_nev) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #else call unpack_row_complex_cpu(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) #endif endif enddo ! Send all rows which have not yet been send src_offset = 0 do dst = 0, ip-1 do i=limits(dst)+1,limits(dst+1) if(mod((i-1)/nblk, np_rows) == my_prow) then src_offset = src_offset+1 row(:) = q(src_offset, 1:l_nev) #ifdef WITH_MPI call MPI_Send(row, l_nev, MPI_COMPLEX16, dst, 0, mpi_comm_rows, mpierr) #endif endif enddo enddo else if(my_prow < ip) then ! Send all rows going to PE ip src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) do i=limits(ip)+1,limits(ip+1) src = mod((i-1)/nblk, np_rows) if (src == my_prow) then src_offset = src_offset+1 row(:) = q(src_offset, 1:l_nev) #ifdef WITH_MPI call MPI_Send(row, l_nev, MPI_COMPLEX16, ip, 0, mpi_comm_rows, mpierr) #endif endif enddo ! Receive all rows from PE ip do i=limits(my_prow)+1,limits(my_prow+1) src = mod((i-1)/nblk, np_rows) if (src == ip) then #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, mpi_status, mpierr) #else call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) #endif #else /* WITH_MPI */ #ifdef WITH_OPENMP row(1:l_nev) = row(1:l_nev) #else row(1:l_nev) = row(1:l_nev) #endif #endif /* WITH_MPI */ #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads call unpack_row_complex_cpu_openmp(a, row,i-limits(my_prow),my_thread, & stripe_count, thread_width, stripe_width, l_nev) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #else call unpack_row_complex_cpu(a, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width) #endif endif 
enddo endif enddo ! Set up result buffer queue num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows num_result_buffers = 4*nfact allocate(result_buffer(l_nev,nblk,num_result_buffers)) allocate(result_send_request(num_result_buffers)) allocate(result_recv_request(num_result_buffers)) #ifdef WITH_MPI result_send_request(:) = MPI_REQUEST_NULL result_recv_request(:) = MPI_REQUEST_NULL #endif ! Queue up buffers #ifdef WITH_MPI if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends do j = 1, min(num_result_buffers, num_result_blocks) call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_COMPLEX16, 0, result_recv_tag, & mpi_comm_rows, result_recv_request(j), mpierr) enddo endif #else ! carefull the "recieve" has to be done at the corresponding wait or send !if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends ! do j = 1, min(num_result_buffers, num_result_blocks) ! result_buffer(1:l_nev*nblk,1,j) = result_buffer(1:l_nev*nblk,1,nbuf) ! enddo !endif #endif num_bufs_recvd = 0 ! No buffers received yet ! 
Initialize top/bottom requests allocate(top_send_request(stripe_count)) allocate(top_recv_request(stripe_count)) allocate(bottom_send_request(stripe_count)) allocate(bottom_recv_request(stripe_count)) #ifdef WITH_MPI top_send_request(:) = MPI_REQUEST_NULL top_recv_request(:) = MPI_REQUEST_NULL bottom_send_request(:) = MPI_REQUEST_NULL bottom_recv_request(:) = MPI_REQUEST_NULL #endif #ifdef WITH_OPENMP allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count)) allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count)) allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count)) allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count)) top_border_send_buffer(:,:) = 0 top_border_recv_buffer(:,:) = 0 bottom_border_send_buffer(:,:) = 0 bottom_border_recv_buffer(:,:) = 0 #else allocate(top_border_send_buffer(stripe_width, nbw, stripe_count)) allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count)) allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count)) allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count)) top_border_send_buffer(:,:,:) = 0 top_border_recv_buffer(:,:,:) = 0 bottom_border_send_buffer(:,:,:) = 0 bottom_border_recv_buffer(:,:,:) = 0 #endif ! Initialize broadcast buffer allocate(bcast_buffer(nbw, max_blk_size)) bcast_buffer = 0 current_tv_off = 0 ! Offset of next row to be broadcast ! ------------------- start of work loop ------------------- a_off = 0 ! 
offset in A (to avoid unnecessary shifts) top_msg_length = 0 bottom_msg_length = 0 do sweep = 0, (na-1)/nbw current_n = na - sweep*nbw call determine_workload(current_n, nbw, np_rows, limits) current_n_start = limits(my_prow) current_n_end = limits(my_prow+1) current_local_n = current_n_end - current_n_start next_n = max(current_n - nbw, 0) call determine_workload(next_n, nbw, np_rows, limits) next_n_start = limits(my_prow) next_n_end = limits(my_prow+1) next_local_n = next_n_end - next_n_start if (next_n_end < next_n) then bottom_msg_length = current_n_end - next_n_end else bottom_msg_length = 0 endif if (next_local_n > 0) then next_top_msg_length = current_n_start - next_n_start else next_top_msg_length = 0 endif if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then do i = 1, stripe_count #ifdef WITH_OPENMP csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width" b_len = csw*nbw*max_threads #ifdef WITH_MPI call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #else ! carefull the "recieve" has to be do done at the corresponding wait or send ! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #else ! carefull the "recieve" has to be do done at the corresponding wait or send ! 
bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i) #endif #endif /* WITH_OPENMP */ enddo endif if (current_local_n > 1) then if (my_pcol == mod(sweep,np_cols)) then bcast_buffer(:,1:current_local_n) = hh_trans_complex(:,current_tv_off+1:current_tv_off+current_local_n) current_tv_off = current_tv_off + current_local_n endif #ifdef WITH_MPI call mpi_bcast(bcast_buffer, nbw*current_local_n, MPI_COMPLEX16, mod(sweep,np_cols), mpi_comm_cols, mpierr) #endif else ! for current_local_n == 1 the one and only HH vector is 0 and not stored in hh_trans_complex bcast_buffer(:,1) = 0 endif if (l_nev == 0) cycle if (current_local_n > 0) then do i = 1, stripe_count #ifdef WITH_OPENMP ! Get real stripe width for strip i; ! The last OpenMP tasks may have an even smaller stripe with, ! but we don't care about this, i.e. we send/recv a bit too much in this case. ! csw: current_stripe_width csw = min(stripe_width, thread_width-(i-1)*stripe_width) #endif !wait_b if (current_n_end < current_n) then #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(bottom_recv_request(i), mpi_status, mpierr) #else call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr) #endif #endif /* WITH_MPI */ #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads n_off = current_local_n+a_off b_len = csw*nbw b_off = (my_thread-1)*b_len a(1:csw,n_off+1:n_off+nbw,i,my_thread) = & reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /)) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #else /* WITH_OPENMP */ n_off = current_local_n+a_off a(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i) #endif /* WITH_OPENMP */ if (next_n_end < next_n) then #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, & 
MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #else call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) #endif #else /* WITH_MPI */ #ifdef WITH_OPENMP ! carefull the "recieve" has to be do done at the corresponding wait or send ! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) #else ! carefull the "recieve" has to be do done at the corresponding wait or send ! bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i) #endif #endif /* WITH_MPI */ endif endif if (current_local_n <= bottom_msg_length + top_msg_length) then !wait_t if (top_msg_length>0) then #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(top_recv_request(i), mpi_status, mpierr) #else call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) #endif #endif /* WITH_MPI */ #ifndef WITH_OPENMP a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) #endif endif !compute #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads if (top_msg_length>0) then b_len = csw*top_msg_length b_off = (my_thread-1)*b_len a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) endif call compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & 0, current_local_n, i, my_thread, thread_width, & THIS_COMPLEX_ELPA_KERNEL) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #else /* WITH_OPENMP */ call compute_hh_trafo_complex_cpu(a, stripe_width, a_dim2, stripe_count, & a_off, 
nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & 0, current_local_n, i, last_stripe_width, & THIS_COMPLEX_ELPA_KERNEL) #endif /* WITH_OPENMP */ !send_b #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) #else call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) #endif #endif /* WITH_MPI */ if (bottom_msg_length>0) then n_off = current_local_n+nbw-bottom_msg_length+a_off #ifdef WITH_OPENMP b_len = csw*bottom_msg_length*max_threads bottom_border_send_buffer(1:b_len,i) = & reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) #ifdef WITH_MPI call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #else if (next_top_msg_length > 0) then top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = & bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) endif #endif #else /* WITH_OPENMP */ bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) #ifdef WITH_MPI call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX16, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #else if (next_top_msg_length > 0) then top_border_recv_buffer(1:next_top_msg_length*stripe_width,1,i) = & bottom_border_send_buffer(1:bottom_msg_length*stripe_width,1,i) endif #endif #endif /* WITH_OPENMP */ endif else !compute #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads call compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, & thread_width, THIS_COMPLEX_ELPA_KERNEL) enddo !$omp end parallel do #ifdef 
HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #else /* WITH_OPENMP */ call compute_hh_trafo_complex_cpu(a, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & current_local_n - bottom_msg_length, bottom_msg_length, i, & last_stripe_width, THIS_COMPLEX_ELPA_KERNEL) #endif /* WITH_OPENMP */ !send_b #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) #else call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) #endif #endif /* WITH_MPI */ if (bottom_msg_length > 0) then n_off = current_local_n+nbw-bottom_msg_length+a_off #ifdef WITH_OPENMP b_len = csw*bottom_msg_length*max_threads bottom_border_send_buffer(1:b_len,i) = & reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) #ifdef WITH_MPI call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #else if (next_top_msg_length > 0) then top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = & bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) endif #endif #else /* WITH_OPENMP */ bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) #ifdef WITH_MPI call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX16, my_prow+1, & top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) #else if (next_top_msg_length > 0) then top_border_recv_buffer(1:next_top_msg_length*stripe_width,1,i) = & bottom_border_send_buffer(1:bottom_msg_length*stripe_width,1,i) endif #endif #endif /* WITH_OPENMP */ endif !compute #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread), schedule(static, 1) do my_thread = 1, max_threads call compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, 
bcast_buffer, kernel_flops, & kernel_time, top_msg_length, & current_local_n-top_msg_length-bottom_msg_length, i, & my_thread, thread_width, THIS_COMPLEX_ELPA_KERNEL) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #else /* WITH_OPENMP */ call compute_hh_trafo_complex_cpu(a, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & last_stripe_width, THIS_COMPLEX_ELPA_KERNEL) #endif /* WITH_OPENMP */ !wait_t if (top_msg_length>0) then #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(top_recv_request(i), mpi_status, mpierr) #else call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) #endif #endif /* WITH_MPI */ #ifndef WITH_OPENMP a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) #endif endif !compute #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) do my_thread = 1, max_threads if (top_msg_length>0) then b_len = csw*top_msg_length b_off = (my_thread-1)*b_len a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) endif call compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & 0, top_msg_length, i, my_thread, thread_width, & THIS_COMPLEX_ELPA_KERNEL) enddo !$omp end parallel do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #else /* WITH_OPENMP */ call compute_hh_trafo_complex_cpu(a, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & 0, top_msg_length, i, last_stripe_width, & THIS_COMPLEX_ELPA_KERNEL) #endif /* WITH_OPENMP */ endif if (next_top_msg_length > 0) then !request 
top_border data #ifdef WITH_OPENMP b_len = csw*next_top_msg_length*max_threads #ifdef WITH_MPI call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_COMPLEX16, my_prow-1, & top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) #else ! carefull the "recieve" has to be done at the corresponding send or wait ! top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) #endif #else /* WITH_OPENMP */ #ifdef WITH_MPI call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, MPI_COMPLEX16, my_prow-1, & top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) #else ! carefull the "recieve" has to be done at the corresponding send or wait ! top_border_recv_buffer(1:next_top_msg_length*stripe_width,1,i) = & ! bottom_border_send_buffer(1:bottom_msg_length*stripe_width,1,i) #endif #endif /* WITH_OPENMP */ endif !send_t if (my_prow > 0) then #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(top_send_request(i), mpi_status, mpierr) #else call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) #endif #endif /* WITH_MPI */ #ifdef WITH_OPENMP b_len = csw*nbw*max_threads top_border_send_buffer(1:b_len,i) = reshape(a(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /)) #ifdef WITH_MPI call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_COMPLEX16, & my_prow-1, bottom_recv_tag, & mpi_comm_rows, top_send_request(i), mpierr) #else if (sweep==0 .and. current_n_end < current_n .and. 
l_nev > 0) then bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) endif if (next_n_end < next_n) then bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) endif #endif #else /* WITH_OPENMP */ top_border_send_buffer(:,1:nbw,i) = a(:,a_off+1:a_off+nbw,i) #ifdef WITH_MPI call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow-1, bottom_recv_tag, & mpi_comm_rows, top_send_request(i), mpierr) #else if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then bottom_border_recv_buffer(1:nbw,1:stripe_width,i) = top_border_send_buffer(1:nbw,1:stripe_width,i) endif if (next_n_end < next_n) then bottom_border_recv_buffer(1:nbw,1:stripe_width,i) = top_border_send_buffer(1:nbw,1:stripe_width,i) endif #endif #endif /* WITH_OPENMP */ endif ! Care that there are not too many outstanding top_recv_request's if (stripe_count > 1) then if (i>1) then #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(top_recv_request(i-1), mpi_status, mpierr) #else call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr) #endif #endif /* WITH_MPI */ else #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(top_recv_request(stripe_count), mpi_status, mpierr) #else call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr) #endif #endif /* WITH_MPI */ endif endif enddo top_msg_length = next_top_msg_length else ! wait for last top_send_request #ifdef WITH_MPI do i = 1, stripe_count #ifdef WITH_OPENMP call MPI_Wait(top_send_request(i), mpi_status, mpierr) #else call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) #endif enddo #endif /* WITH_MPI */ endif ! Care about the result if (my_prow == 0) then ! topmost process sends nbw rows to destination processes do j=0,nfact-1 num_blk = sweep*nfact+j ! global number of destination block, 0 based if (num_blk*nblk >= na) exit nbuf = mod(num_blk, num_result_buffers) + 1 ! 
buffer number to get this block #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(result_send_request(nbuf), mpi_status, mpierr) #else call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr) #endif #endif /* WITH_MPI */ dst = mod(num_blk, np_rows) if (dst == 0) then do i = 1, min(na - num_blk*nblk, nblk) #ifdef WITH_OPENMP call pack_row_complex_cpu_openmp(a, row, j*nblk+i+a_off, & stripe_width, stripe_count, max_threads, thread_width, l_nev) #else call pack_row_complex_cpu(a, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) #endif q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:) enddo else do i = 1, nblk #ifdef WITH_OPENMP call pack_row_complex_cpu_openmp(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, & stripe_width, stripe_count, max_threads, thread_width, l_nev) #else call pack_row_complex_cpu(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) #endif enddo #ifdef WITH_MPI call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX16, dst, & result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr) #else if (j+num_result_buffers < num_result_blocks) & result_buffer(1:l_nev,1:nblk,nbuf) = result_buffer(1:l_nev,1:nblk,nbuf) if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends do j1 = 1, min(num_result_buffers, num_result_blocks) result_buffer(1:l_nev,1:nblk,j1) = result_buffer(1:l_nev,1:nblk,nbuf) enddo endif #endif endif enddo else ! receive and store final result do j = num_bufs_recvd, num_result_blocks-1 nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block ! If there is still work to do, just test for the next result request ! and leave the loop if it is not ready, otherwise wait for all ! outstanding requests if (next_local_n > 0) then #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Test(result_recv_request(nbuf), flag, mpi_status, mpierr) #else call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr) #endif #else /* WITH_MPI */ flag = .true. 
#endif /* WITH_MPI */ if (.not.flag) exit else #ifdef WITH_MPI #ifdef WITH_OPENMP call MPI_Wait(result_recv_request(nbuf), mpi_status, mpierr) #else call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr) #endif #endif /* WITH_MPI */ endif ! Fill result buffer into q num_blk = j*np_rows + my_prow ! global number of current block, 0 based do i = 1, min(na - num_blk*nblk, nblk) q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf) enddo ! Queue result buffer again if there are outstanding blocks left #ifdef WITH_MPI if (j+num_result_buffers < num_result_blocks) & call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX16, 0, result_recv_tag, & mpi_comm_rows, result_recv_request(nbuf), mpierr) #else ! carefull "recieve" has to be done at corresponding wait or send ! if (j+num_result_buffers < num_result_blocks) & ! result_buffer(1:l_nev*nblk,1,nbuf) = result_buffer(1:l_nev*nblk,1,nbuf) #endif enddo num_bufs_recvd = j endif ! Shift the remaining rows to the front of A (if necessary) offset = nbw - top_msg_length if (offset<0) then if (wantDebug) then write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: internal error, offset for shifting = ',offset endif success = .false. return endif a_off = a_off + offset if (a_off + next_local_n + nbw > a_dim2) then #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$omp parallel do private(my_thread, i, j), schedule(static, 1) do my_thread = 1, max_threads do i = 1, stripe_count do j = top_msg_length+1, top_msg_length+next_local_n A(:,j,i,my_thread) = A(:,j+a_off,i,my_thread) enddo #else /* WITH_OPENMP */ do i = 1, stripe_count do j = top_msg_length+1, top_msg_length+next_local_n A(:,j,i) = A(:,j+a_off,i) #endif /* WITH_OPENMP */ enddo enddo #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%stop("OpenMP parallel") #endif #endif a_off = 0 endif enddo ! 
Just for safety: #ifdef WITH_MPI if (ANY(top_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_send_request ***',my_prow,my_pcol if (ANY(bottom_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_send_request ***',my_prow,my_pcol if (ANY(top_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_recv_request ***',my_prow,my_pcol if (ANY(bottom_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_recv_request ***',my_prow,my_pcol #endif if (my_prow == 0) then #ifdef WITH_MPI #ifdef WITH_OPENMP allocate(mpi_statuses(MPI_STATUS_SIZE,num_result_buffers)) call MPI_Waitall(num_result_buffers, result_send_request, mpi_statuses, mpierr) deallocate(mpi_statuses) #else call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr) #endif #endif /* WITH_MPI */ endif #ifdef WITH_MPI if (ANY(result_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_send_request ***',my_prow,my_pcol if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol #endif if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & write(error_unit,'(" Kernel time:",f10.3," MFlops: ",f10.3)') kernel_time, kernel_flops/kernel_time*1.d-6 ! deallocate all working space nullify(a) call free(a_ptr) deallocate(row) deallocate(limits) deallocate(result_send_request) deallocate(result_recv_request) deallocate(top_border_send_buffer) deallocate(top_border_recv_buffer) deallocate(bottom_border_send_buffer) deallocate(bottom_border_recv_buffer) deallocate(result_buffer) deallocate(bcast_buffer) deallocate(top_send_request) deallocate(top_recv_request) deallocate(bottom_send_request) deallocate(bottom_recv_request) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("trans_ev_tridi_to_band_complex") #endif return ! contains ! !#ifdef WITH_OPENMP ! 
subroutine compute_hh_trafo_complex(off, ncols, istripe, my_thread, THIS_COMPLEX_ELPA_KERNEL) !#else ! subroutine compute_hh_trafo_complex(off, ncols, istripe, THIS_COMPLEX_ELPA_KERNEL) !#endif ! use precision !#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) ! use complex_generic_simple_kernel, only : single_hh_trafo_complex_generic_simple !#endif !#if defined(WITH_COMPLEX_GENERIC_KERNEL) ! use complex_generic_kernel, only : single_hh_trafo_complex_generic !#endif !#ifdef HAVE_DETAILED_TIMINGS ! use timings !#endif ! implicit none ! integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL ! ! ! Private variables in OMP regions (my_thread) should better be in the argument list! ! ! integer(kind=ik) :: off, ncols, istripe, j, nl, jj !#ifdef WITH_OPENMP ! integer(kind=ik) :: my_thread, noff !#endif ! real(kind=rk) :: ttt ! ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! Currently (on Sandy Bridge), single is faster than double ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! complex(kind=ck) :: w(nbw,2) ! !#ifdef HAVE_DETAILED_TIMINGS ! call timer%start("compute_hh_trafo_complex") !#endif ! !#ifdef WITH_OPENMP ! if (istripe1) then do i=0,nblocks2-1 call mpi_irecv(ab2(1,i*nb2+1),2*nb2*nb2,mpi_real8,0,3,mpi_comm,ireq_ab2(i+1),mpierr) enddo endif #else ! carefull the "recieve" has to be done at the corresponding send or wait ! if (nb2>1) then ! do i=0,nblocks2-1 ! ab2(1:2*nb2*nb2,i*nb2+1:i*nb2+1+nb2-1) = ab_s2(1:2*nb2,i*nb2+1:nb2) ! enddo ! endif #endif ! n_off: Offset of ab within band n_off = block_limits(my_pe)*nb lwork = nb*nb2 dest = 0 #ifdef WITH_MPI ireq_ab = MPI_REQUEST_NULL ireq_hv = MPI_REQUEST_NULL #endif ! --------------------------------------------------------------------------- ! Start of calculations na_s = block_limits(my_pe)*nb + 1 if (my_pe>0 .and. na_s<=na) then ! send first nb2 columns to previous PE ! 
Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also) do i=1,nb2 ab_s(1:nb+1,i) = ab(1:nb+1,na_s-n_off+i-1) enddo #ifdef WITH_MPI call mpi_isend(ab_s,(nb+1)*nb2,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr) #endif endif do istep=1,na/nb2 if (my_pe==0) then n = MIN(na-na_s-nb2+1,nb) ! number of rows to be reduced hv(:,:) = 0 tau(:) = 0 ! The last step (istep=na-1) is only needed for sending the last HH vectors. ! We don't want the sign of the last element flipped (analogous to the other sweeps) if (istep < na/nb2) then ! Transform first block column of remaining matrix call dgeqrf(n, nb2, ab(1+nb2,na_s-n_off), 2*nb-1, tau, work, lwork, info); do i=1,nb2 hv(i,i) = 1.0 hv(i+1:n,i) = ab(1+nb2+1:1+nb2+n-i,na_s-n_off+i-1) ab(1+nb2+1:2*nb,na_s-n_off+i-1) = 0 enddo endif if (nb2==1) then d(istep) = ab(1,na_s-n_off) e(istep) = ab(2,na_s-n_off) if (istep == na) then e(na) = 0 endif else ab_s2 = 0 ab_s2(:,:) = ab(1:nb2+1,na_s-n_off:na_s-n_off+nb2-1) if (block_limits2(dest+1)1) then do i= 0,nblocks2-1 ab2(1:2*nb2*nb2,i*nb2+1:i+nb2+1+nb2-1) = ab_s2(1:2*nb2,1:nb2) enddo endif #endif endif else if (na>na_s+nb2-1) then ! Receive Householder vectors from previous task, from PE owning subdiagonal #ifdef WITH_MPI call mpi_recv(hv,nb*nb2,mpi_real8,my_pe-1,2,mpi_comm,mpi_status,mpierr) #else hv(1:nb,1:nb2) = hv_s(1:nb,1:nb2) #endif do i=1,nb2 tau(i) = hv(i,i) hv(i,i) = 1. enddo endif endif na_s = na_s+nb2 if (na_s-n_off > nb) then ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb) ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0 n_off = n_off + nb endif do iblk=1,nblocks ns = na_s + (iblk-1)*nb - n_off ! first column in block ne = ns+nb-nb2 ! last column in block if (ns+n_off>na) exit nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) ! Note that nr>=0 implies that diagonal block is full (nc==nb)! call wy_gen(nc,nb2,w,hv,tau,work,nb) if (iblk==nblocks .and. 
nc==nb) then !request last nb2 columns #ifdef WITH_MPI call mpi_recv(ab_r,(nb+1)*nb2,mpi_real8,my_pe+1,1,mpi_comm,mpi_status,mpierr) #else ab_r(1:nb+1,1:nb2) = ab_s(1:nb+1,1:nb2) #endif do i=1,nb2 ab(1:nb+1,ne+i-1) = ab_r(:,i) enddo endif hv_new(:,:) = 0 ! Needed, last rows must be 0 for nr < nb tau_new(:) = 0 if (nr>0) then call wy_right(nr,nb,nb2,ab(nb+1,ns),2*nb-1,w,hv,work,nb) call dgeqrf(nr,nb2,ab(nb+1,ns),2*nb-1,tau_new,work,lwork,info); do i=1,nb2 hv_new(i,i) = 1.0 hv_new(i+1:,i) = ab(nb+2:2*nb-i+1,ns+i-1) ab(nb+2:,ns+i-1) = 0 enddo !send hh-vector if (iblk==nblocks) then #ifdef WITH_MPI call mpi_wait(ireq_hv,mpi_status,mpierr) #endif hv_s = hv_new do i=1,nb2 hv_s(i,i) = tau_new(i) enddo #ifdef WITH_MPI call mpi_isend(hv_s,nb*nb2,mpi_real8,my_pe+1,2,mpi_comm,ireq_hv,mpierr) #endif endif endif call wy_symm(nc,nb2,ab(1,ns),2*nb-1,w,hv,work,work2,nb) if (my_pe>0 .and. iblk==1) then !send first nb2 columns to previous PE #ifdef WITH_MPI call mpi_wait(ireq_ab,mpi_status,mpierr) #endif do i=1,nb2 ab_s(1:nb+1,i) = ab(1:nb+1,ns+i-1) enddo #ifdef WITH_MPI call mpi_isend(ab_s,(nb+1)*nb2,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr) #endif endif if (nr>0) then call wy_gen(nr,nb2,w_new,hv_new,tau_new,work,nb) call wy_left(nb-nb2,nr,nb2,ab(nb+1-nb2,ns+nb2),2*nb-1,w_new,hv_new,work,nb) endif ! Use new HH vector for the next block hv(:,:) = hv_new(:,:) tau = tau_new enddo enddo ! 
Finish the last outstanding requests #ifdef WITH_MPI call mpi_wait(ireq_ab,mpi_status,mpierr) call mpi_wait(ireq_hv,mpi_status,mpierr) allocate(mpi_statuses(MPI_STATUS_SIZE,nblocks2)) call mpi_waitall(nblocks2,ireq_ab2,mpi_statuses,mpierr) deallocate(mpi_statuses) call mpi_barrier(mpi_comm,mpierr) #endif deallocate(block_limits) deallocate(block_limits2) deallocate(ireq_ab2) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("band_band_real") #endif end subroutine subroutine wy_gen(n, nb, W, Y, tau, mem, lda) #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik), intent(in) :: n !length of householder-vectors integer(kind=ik), intent(in) :: nb !number of householder-vectors integer(kind=ik), intent(in) :: lda !leading dimension of Y and W real(kind=rk), intent(in) :: Y(lda,nb) !matrix containing nb householder-vectors of length b real(kind=rk), intent(in) :: tau(nb) !tau values real(kind=rk), intent(out) :: W(lda,nb) !output matrix W real(kind=rk), intent(in) :: mem(nb) !memory for a temporary matrix of size nb integer(kind=ik) :: i #ifdef HAVE_DETAILED_TIMINGS call timer%start("wy_gen") #endif W(1:n,1) = tau(1)*Y(1:n,1) do i=2,nb W(1:n,i) = tau(i)*Y(1:n,i) call DGEMV('T',n,i-1,1.d0,Y,lda,W(1,i),1,0.d0,mem,1) call DGEMV('N',n,i-1,-1.d0,W,lda,mem,1,1.d0,W(1,i),1) enddo #ifdef HAVE_DETAILED_TIMINGS call timer%stop("wy_gen") #endif end subroutine subroutine wy_left(n, m, nb, A, lda, W, Y, mem, lda2) #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik), intent(in) :: n !width of the matrix A integer(kind=ik), intent(in) :: m !length of matrix W and Y integer(kind=ik), intent(in) :: nb !width of matrix W and Y integer(kind=ik), intent(in) :: lda !leading dimension of A integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! 
remove assumed size real(kind=rk), intent(in) :: W(m,nb) !blocked transformation matrix W real(kind=rk), intent(in) :: Y(m,nb) !blocked transformation matrix Y real(kind=rk), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb #ifdef HAVE_DETAILED_TIMINGS call timer%start("wy_left") #endif call DGEMM('T', 'N', nb, n, m, 1.d0, W, lda2, A, lda, 0.d0, mem, nb) call DGEMM('N', 'N', m, n, nb, -1.d0, Y, lda2, mem, nb, 1.d0, A, lda) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("wy_left") #endif end subroutine subroutine wy_right(n, m, nb, A, lda, W, Y, mem, lda2) #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik), intent(in) :: n !height of the matrix A integer(kind=ik), intent(in) :: m !length of matrix W and Y integer(kind=ik), intent(in) :: nb !width of matrix W and Y integer(kind=ik), intent(in) :: lda !leading dimension of A integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! 
remove assumed size real(kind=rk), intent(in) :: W(m,nb) !blocked transformation matrix W real(kind=rk), intent(in) :: Y(m,nb) !blocked transformation matrix Y real(kind=rk), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb #ifdef HAVE_DETAILED_TIMINGS call timer%start("wy_right") #endif call DGEMM('N', 'N', n, nb, m, 1.d0, A, lda, W, lda2, 0.d0, mem, n) call DGEMM('N', 'T', n, m, nb, -1.d0, mem, n, Y, lda2, 1.d0, A, lda) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("wy_right") #endif end subroutine subroutine wy_symm(n, nb, A, lda, W, Y, mem, mem2, lda2) #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik), intent(in) :: n !width/heigth of the matrix A; length of matrix W and Y integer(kind=ik), intent(in) :: nb !width of matrix W and Y integer(kind=ik), intent(in) :: lda !leading dimension of A integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! remove assumed size real(kind=rk), intent(in) :: W(n,nb) !blocked transformation matrix W real(kind=rk), intent(in) :: Y(n,nb) !blocked transformation matrix Y real(kind=rk) :: mem(n,nb) !memory for a temporary matrix of size n x nb real(kind=rk) :: mem2(nb,nb) !memory for a temporary matrix of size nb x nb #ifdef HAVE_DETAILED_TIMINGS call timer%start("wy_symm") #endif call DSYMM('L', 'L', n, nb, 1.d0, A, lda, W, lda2, 0.d0, mem, n) call DGEMM('T', 'N', nb, nb, n, 1.d0, mem, n, W, lda2, 0.d0, mem2, nb) call DGEMM('N', 'N', n, nb, nb, -0.5d0, Y, lda2, mem2, nb, 1.d0, mem, n) call DSYR2K('L', 'N', n, nb, -1.d0, Y, lda2, mem, n, 1.d0, A, lda) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("wy_symm") #endif end subroutine end module ELPA2_compute elpa-2016.05.001/src/elpa2.F900000644000312500001440000005277012717516040012112 00000000000000! This file is part of ELPA. ! ! The ELPA library was originally created by the ELPA consortium, ! consisting of the following organizations: ! ! 
- Max Planck Computing and Data Facility (MPCDF), formerly known as ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte ! Informatik, ! - Technische Universität München, Lehrstuhl für Informatik mit ! Schwerpunkt Wissenschaftliches Rechnen, ! - Fritz-Haber-Institut, Berlin, Abt. Theorie, ! - Max-Planck-Institut für Mathematik in den Naturwissenschaften, ! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, ! and ! - IBM Deutschland GmbH ! ! This particular source code file contains additions, changes and ! enhancements authored by Intel Corporation which is not part of ! the ELPA consortium. ! ! More information can be found here: ! http://elpa.mpcdf.mpg.de/ ! ! ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see <http://www.gnu.org/licenses/> ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines ! ! Copyright of the original code rests with the authors inside the ELPA ! consortium. The copyright of any additional modifications shall rest ! with their original authors, but shall adhere to the licensing terms ! 
distributed along with the original code in the file "COPYING". ! ELPA2 -- 2-stage solver for ELPA ! ! Copyright of the original code rests with the authors inside the ELPA ! consortium. The copyright of any additional modifications shall rest ! with their original authors, but shall adhere to the licensing terms ! distributed along with the original code in the file "COPYING". #include "config-f90.h" !> \brief Fortran module which provides the routines to use the two-stage ELPA solver module ELPA2 ! Version 1.1.2, 2011-02-21 use elpa_utilities use elpa1_compute use elpa1, only : elpa_print_times, time_evp_back, time_evp_fwd, time_evp_solve use elpa2_utilities use elpa2_compute use elpa_pdgeqrf use elpa_mpi implicit none PRIVATE ! By default, all routines contained are private ! The following routines are public: public :: solve_evp_real_2stage public :: solve_evp_complex_2stage !****** contains !------------------------------------------------------------------------------- !> \brief solve_evp_real_2stage: Fortran function to solve the real eigenvalue problem with a 2 stage approach !> !> Parameters !> !> \param na Order of matrix a !> !> \param nev Number of eigenvalues needed !> !> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. !> Distribution is like in Scalapack. !> The full matrix must be set (not only one half like in scalapack). !> Destroyed on exit (upper and lower half). !> !> \param lda Leading dimension of a !> !> \param ev(na) On output: eigenvalues of a, every processor gets the complete set !> !> \param q(ldq,matrixCols) On output: Eigenvectors of a !> Distribution is like in Scalapack. !> Must be always dimensioned to the full size (corresponding to (na,na)) !> even if only a part of the eigenvalues is needed. !> !> \param ldq Leading dimension of q !> !> \param nblk blocksize of cyclic distribution, must be the same in both directions! 
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \param useQR (optional)                     use QR decomposition
!>
!>  \result success                             logical, false if error occurred
!>
!>  Every exit path (including the early returns taken when a stage reports
!>  failure) closes the "solve_evp_real_2stage" timing region when the library
!>  is built with HAVE_DETAILED_TIMINGS.
!-------------------------------------------------------------------------------
function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk,        &
                               matrixCols,                               &
                               mpi_comm_rows, mpi_comm_cols,             &
                               mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,  &
                               useQR) result(success)

#ifdef HAVE_DETAILED_TIMINGS
   use timings
#endif
   use precision
   implicit none
   logical, intent(in), optional          :: useQR
   logical                                :: useQRActual, useQREnvironment
   integer(kind=ik), intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
   integer(kind=ik)                       :: THIS_REAL_ELPA_KERNEL

   integer(kind=ik), intent(in)           :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
                                             mpi_comm_cols, mpi_comm_all
   integer(kind=ik), intent(in)           :: nblk
   real(kind=rk), intent(inout)           :: a(lda,matrixCols), ev(na), q(ldq,matrixCols)
   ! was
   ! real a(lda,*), q(ldq,*)
   real(kind=rk), allocatable             :: hh_trans_real(:,:)

   integer(kind=ik)                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer(kind=ik)                       :: nbw, num_blocks
   real(kind=rk), allocatable             :: tmat(:,:,:), e(:)
   real(kind=rk)                          :: ttt0, ttt1, ttts
   integer(kind=ik)                       :: i
   logical                                :: success
   logical, save                          :: firstCall = .true.
   logical                                :: wantDebug

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_real_2stage")
#endif

   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   ! The environment is only queried on the very first call (firstCall is
   ! SAVEd); on all later calls wantDebug stays .false.
   wantDebug = .false.
   if (firstCall) then
     ! are debug messages desired?
     wantDebug = debug_messages_via_environment_variable()
     firstCall = .false.
   endif

   success = .true.

   useQRActual = .false.

   ! set usage of qr decomposition via API call
   if (present(useQR)) then
     useQRActual = useQR
   endif

   ! overwrite this with environment variable settings
   if (qr_decomposition_via_environment_variable(useQREnvironment)) then
     useQRActual = useQREnvironment
   endif

   if (useQRActual) then
     if (mod(na,2) .ne. 0) then
       if (wantDebug) then
         write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
       endif
       print *, "Do not use QR-decomposition for this matrix and blocksize."
       success = .false.
       ! close the timing region opened above before leaving early
#ifdef HAVE_DETAILED_TIMINGS
       call timer%stop("solve_evp_real_2stage")
#endif
       return
     endif
   endif

   if (present(THIS_REAL_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
   else
     ! if kernel is not choosen via api
     ! check whether set by environment variable
     THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
   endif

   ! check whether choosen kernel is allowed: function returns true if NOT allowed! change this
   if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then

     ! only the process with rank 0 in mpi_comm_all reports the fallback
     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The choosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
         endif
       enddo

       write(error_unit,*) " "
       ! check whether generic kernel is defined
       if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
         write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used !"
       else
         write(error_unit,*) "As default kernel ",REAL_ELPA_KERNEL_NAMES(DEFAULT_REAL_ELPA_KERNEL)," will be used"
       endif
     endif  ! my_pe == 0

     ! every process switches to the fallback kernel, not only rank 0
     if (AVAILABLE_REAL_ELPA_KERNELS(REAL_ELPA_KERNEL_GENERIC) .eq. 1) then
       THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
     else
       THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL
     endif
   endif

   ! Choose bandwidth, must be a multiple of nblk.
   ! The expression below yields the smallest multiple of nblk that is >= 64.
   ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
   ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
   ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
   ! on this and maybe allow a run-time optimization here
   nbw = (63/nblk+1)*nblk

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                     tmat, wantDebug, success, useQRActual)
   if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop("solve_evp_real_2stage")
#endif
     return
   endif
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_real :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
                          mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_real :',ttt1-ttt0

#ifdef WITH_MPI
   ! distribute diagonal (ev) and off-diagonal (e) from rank 0 to all processes
   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
#endif

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows,  &
                    mpi_comm_cols, wantDebug, success)
   if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop("solve_evp_real_2stage")
#endif
     return
   endif

   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   deallocate(e)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
                                    mpi_comm_rows, mpi_comm_cols, wantDebug, success,      &
                                    THIS_REAL_ELPA_KERNEL)
   if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop("solve_evp_real_2stage")
#endif
     return
   endif
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_real)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
                                   mpi_comm_cols, useQRActual)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_real_2stage")
#endif
1  format(a,f10.3)

end function solve_evp_real_2stage

!-------------------------------------------------------------------------------
!>  \brief solve_evp_complex_2stage: Fortran function to solve the complex eigenvalue problem with a 2 stage approach
!>
!>  Parameters
!>
!>  \param na                                   Order of matrix a
!>
!>  \param nev                                  Number of eigenvalues needed
!>
!>  \param a(lda,matrixCols)                    Distributed matrix for which eigenvalues are to be computed.
!>                                              Distribution is like in Scalapack.
!>                                              The full matrix must be set (not only one half like in scalapack).
!>                                              Destroyed on exit (upper and lower half).
!>
!>  \param lda                                  Leading dimension of a
!>
!>  \param ev(na)                               On output: eigenvalues of a, every processor gets the complete set
!>
!>  \param q(ldq,matrixCols)                    On output: Eigenvectors of a
!>                                              Distribution is like in Scalapack.
!>                                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                                              even if only a part of the eigenvalues is needed.
!>
!>  \param ldq                                  Leading dimension of q
!>
!>  \param nblk                                 blocksize of cyclic distribution, must be the same in both directions!
!>
!>  \param matrixCols                           local columns of matrix a and q
!>
!>  \param mpi_comm_rows                        MPI communicator for rows
!>  \param mpi_comm_cols                        MPI communicator for columns
!>  \param mpi_comm_all                         MPI communicator for the total processor set
!>
!>  \param THIS_COMPLEX_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!>
!>  \result success                             logical, false if error occurred
!>
!>  Every exit path (including the early returns taken when a stage reports
!>  failure) closes the "solve_evp_complex_2stage" timing region when the
!>  library is built with HAVE_DETAILED_TIMINGS.
!-------------------------------------------------------------------------------
function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
                                  matrixCols, mpi_comm_rows, mpi_comm_cols,      &
                                  mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)

#ifdef HAVE_DETAILED_TIMINGS
   use timings
#endif
   use precision
   implicit none
   integer(kind=ik), intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer(kind=ik)                       :: THIS_COMPLEX_ELPA_KERNEL
   integer(kind=ik), intent(in)           :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
   complex(kind=ck), intent(inout)        :: a(lda,matrixCols), q(ldq,matrixCols)
   ! was
   ! complex a(lda,*), q(ldq,*)
   real(kind=rk), intent(inout)           :: ev(na)
   complex(kind=ck), allocatable          :: hh_trans_complex(:,:)

   integer(kind=ik)                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer(kind=ik)                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   complex(kind=ck), allocatable          :: tmat(:,:,:)
   real(kind=rk), allocatable             :: q_real(:,:), e(:)
   real(kind=rk)                          :: ttt0, ttt1, ttts
   integer(kind=ik)                       :: i

   logical                                :: success, wantDebug
   logical, save                          :: firstCall = .true.

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_complex_2stage")
#endif

   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   ! The environment is only queried on the very first call (firstCall is
   ! SAVEd); on all later calls wantDebug stays .false.
   wantDebug = .false.
   if (firstCall) then
     ! are debug messages desired?
     wantDebug = debug_messages_via_environment_variable()
     firstCall = .false.
   endif

   success = .true.

   if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
   else
     ! if kernel is not choosen via api
     ! check whether set by environment variable
     THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
   endif

   ! check whether choosen kernel is allowed
   ! (the branch below is the "not allowed" fallback path)
   if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then

     ! only the process with rank 0 in mpi_comm_all reports the fallback
     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The choosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
         endif
       enddo

       write(error_unit,*) " "
       ! check whether generic kernel is defined
       if (AVAILABLE_COMPLEX_ELPA_KERNELS(COMPLEX_ELPA_KERNEL_GENERIC) .eq. 1) then
         write(error_unit,*) "The default kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
       else
         write(error_unit,*) "As default kernel ",COMPLEX_ELPA_KERNEL_NAMES(DEFAULT_COMPLEX_ELPA_KERNEL)," will be used"
       endif
     endif  ! my_pe == 0

     ! every process switches to the fallback kernel, not only rank 0
     if (AVAILABLE_COMPLEX_ELPA_KERNELS(COMPLEX_ELPA_KERNEL_GENERIC) .eq. 1) then
       THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
     else
       THIS_COMPLEX_ELPA_KERNEL = DEFAULT_COMPLEX_ELPA_KERNEL
     endif
   endif

   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
   ! (smallest multiple of nblk that is >= 32)
   nbw = (31/nblk+1)*nblk

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_complex(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                        tmat, wantDebug, success)
   if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop("solve_evp_complex_2stage")
#endif
     return
   endif
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_complex :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_complex, &
                             mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_complex :',ttt1-ttt0

#ifdef WITH_MPI
   ! distribute diagonal (ev) and off-diagonal (e) from rank 0 to all processes
   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
#endif

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
   l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

   ! the tridiagonal solver works on a real matrix; its result is copied
   ! into the complex q afterwards
   allocate(q_real(l_rows,l_cols))

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, &
                    mpi_comm_rows, mpi_comm_cols, wantDebug, success)
   if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop("solve_evp_complex_2stage")
#endif
     return
   endif

   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times)  &
      write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

   deallocate(e, q_real)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq,  &
                                       matrixCols, hh_trans_complex, &
                                       mpi_comm_rows, mpi_comm_cols, &
                                       wantDebug, success,THIS_COMPLEX_ELPA_KERNEL)
   if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop("solve_evp_complex_2stage")
#endif
     return
   endif
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_complex)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, &
                                      mpi_comm_rows, mpi_comm_cols)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_complex_2stage")
#endif

1  format(a,f10.3)

end function solve_evp_complex_2stage

end module ELPA2
elpa-2016.05.001/src/mod_pack_unpack_real.F900000644000312500001440000001365012717516040015222 00000000000000! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see
!
! ELPA reflects a substantial effort on the part of the original
!
ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! Author: Andreas Marek, MPCDF module pack_unpack_real #include "config-f90.h" implicit none #ifdef WITH_OPENMP public pack_row_real_cpu_openmp, unpack_row_real_cpu_openmp #else public pack_row_real_cpu, unpack_row_real_cpu #endif contains #ifdef WITH_OPENMP subroutine pack_row_real_cpu_openmp(a, row, n, stripe_width, stripe_count, max_threads, thread_width, l_nev) #else subroutine pack_row_real_cpu(a, row, n, stripe_width, last_stripe_width, stripe_count) #endif #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik), intent(in) :: n, stripe_count, stripe_width #ifdef WITH_OPENMP integer(kind=ik), intent(in) :: max_threads, thread_width, l_nev real(kind=rk), intent(in) :: a(:,:,:,:) #else integer(kind=ik), intent(in) :: last_stripe_width real(kind=rk), intent(in) :: a(:,:,:) #endif real(kind=rk) :: row(:) integer(kind=ik) :: i, noff, nl #ifdef WITH_OPENMP integer(kind=ik) :: nt #endif #ifdef HAVE_DETAILED_TIMINGS #ifdef WITH_OPENMP call timer%start("pack_row_real_cpu_openmp") #else call timer%start("pack_row_real_cpu") #endif #endif #ifdef WITH_OPENMP do nt = 1, max_threads do i = 1, stripe_count noff = (nt-1)*thread_width + (i-1)*stripe_width nl = min(stripe_width, nt*thread_width-noff, l_nev-noff) if (nl<=0) exit row(noff+1:noff+nl) = a(1:nl,n,i,nt) enddo enddo #else do i=1,stripe_count nl = merge(stripe_width, last_stripe_width, i ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! 
any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! Author: Andreas Marek, MPCDF module pack_unpack_complex #include "config-f90.h" implicit none #ifdef WITH_OPENMP public pack_row_complex_cpu_openmp #else public pack_row_complex_cpu #endif contains #ifdef WITH_OPENMP subroutine pack_row_complex_cpu_openmp(a, row, n, stripe_width, stripe_count, max_threads, thread_width, l_nev) #else subroutine pack_row_complex_cpu(a, row, n, stripe_width, last_stripe_width, stripe_count) #endif #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none #ifdef WITH_OPENMP integer(kind=ik), intent(in) :: stripe_width, stripe_count, max_threads, thread_width, l_nev complex(kind=ck), intent(in) :: a(:,:,:,:) #else integer(kind=ik), intent(in) :: stripe_width, last_stripe_width, stripe_count complex(kind=ck), intent(in) :: a(:,:,:) #endif complex(kind=ck) :: row(:) integer(kind=ik) :: n, i, noff, nl, nt #ifdef HAVE_DETAILED_TIMINGS #ifdef WITH_OPENMP call timer%start("pack_row_complex_cpu_openmp") #else call timer%start("pack_row_complex_cpu") #endif #endif #ifdef WITH_OPENMP do nt = 1, max_threads do i = 1, stripe_count noff = (nt-1)*thread_width + (i-1)*stripe_width nl = min(stripe_width, nt*thread_width-noff, l_nev-noff) if (nl<=0) exit row(noff+1:noff+nl) = a(1:nl,n,i,nt) enddo enddo #else do i=1,stripe_count nl = merge(stripe_width, last_stripe_width, i ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! 
Author: Lorenz Huedepohl, MPCDF

#include "config-f90.h"

!> Fortran bindings for two C timing routines.
!> Both are plain interfaces; the implementations live in C and are resolved
!> at link time through the bind(C) names given below.
module time_c
  use precision
  use, intrinsic :: iso_c_binding

  interface

    !> Binds to the C symbol "ftimings_microseconds_since_epoch";
    !> returns a 64-bit C integer.
    function microseconds_since_epoch() result(ms) bind(C, name="ftimings_microseconds_since_epoch")
      import :: C_INT64_T
      implicit none
      integer(kind=C_INT64_T) :: ms
    end function microseconds_since_epoch

    !> Binds to the C symbol "seconds"; returns a C double.
    function seconds() result(s) bind(C, name="seconds")
      import :: C_DOUBLE
      implicit none
      real(kind=C_DOUBLE) :: s
    end function seconds

  end interface

end module time_c
elpa-2016.05.001/src/elpa2_kernels/0000755000312500001440000000000012717541041013421 500000000000000elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_simple.F900000644000312500001440000001102312717516040020721 00000000000000! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
!
along with ELPA. If not, see
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! --------------------------------------------------------------------------------------------------
!
! This file contains the compute intensive kernels for the Householder transformations.
!
! This is the small and simple version (no hand unrolling of loops etc.) but for some
! compilers this performs better than a sophisticated version with transformed and unrolled loops.
!
! It should be compiled with the highest possible optimization level.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------
#include "config-f90.h"

module real_generic_simple_kernel

  private
  public double_hh_trafo_generic_simple

contains

  !> Applies two Householder reflections at once to the first nq rows of q.
  !>
  !> \param q    matrix block, columns 1..nb+1 are read and updated in place
  !> \param hh   Householder data: hh(1,1) and hh(1,2) are used as the two tau
  !>             factors, hh(2:nb,1) and hh(2:nb,2) as vector entries
  !> \param nb   number of Householder vector entries used
  !> \param nq   number of rows of q that are transformed
  !> \param ldq  leading dimension of q
  !> \param ldh  leading dimension of hh
  subroutine double_hh_trafo_generic_simple(q, hh, nb, nq, ldq, ldh)

    use precision
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    implicit none

    integer(kind=ik), intent(in) :: nb, nq, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
    real(kind=rk), intent(inout) :: q(ldq,*)
    real(kind=rk), intent(in)    :: hh(ldh,*)
#else
    real(kind=rk), intent(inout) :: q(ldq,1:nb+1)
    real(kind=rk), intent(in)    :: hh(ldh,2)
#endif

    real(kind=rk)    :: overlap, tau_first, tau_second, wgt_x, wgt_y
    real(kind=rk)    :: xvec(nq), yvec(nq)
    integer(kind=ik) :: col

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("kernel generic simple: double_hh_trafo_generic_simple")
#endif

    ! Dot product of the two (shifted) Householder vectors; the leading
    ! entry of the first vector is an implicit 1.
    overlap = hh(2,2)
    do col = 3, nb
      overlap = overlap + hh(col,2)*hh(col-1,1)
    enddo

    ! Accumulate the projections of q onto both vectors.
    xvec = q(1:nq,2)
    yvec = q(1:nq,1) + q(1:nq,2)*hh(2,2)
    do col = 3, nb
      xvec = xvec + q(1:nq,col)*hh(col-1,1)
      yvec = yvec + q(1:nq,col)*hh(col,2)
    enddo
    xvec = xvec + q(1:nq,nb+1)*hh(nb,1)

    ! Scale by the tau factors; yvec additionally receives the coupling
    ! term through the dot product computed above.
    tau_first  = hh(1,1)
    tau_second = hh(1,2)

    xvec  = xvec*(-tau_first)
    wgt_y = -tau_second
    wgt_x = -tau_second*overlap
    yvec  = yvec*wgt_y + xvec*wgt_x

    ! Rank-2 update of the columns of q.
    q(1:nq,1) = q(1:nq,1) + yvec
    q(1:nq,2) = q(1:nq,2) + xvec + yvec*hh(2,2)
    do col = 3, nb
      q(1:nq,col) = q(1:nq,col) + xvec*hh(col-1,1) + yvec*hh(col,2)
    enddo
    q(1:nq,nb+1) = q(1:nq,nb+1) + xvec*hh(nb,1)

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("kernel generic simple: double_hh_trafo_generic_simple")
#endif

  end subroutine double_hh_trafo_generic_simple

end module real_generic_simple_kernel
! --------------------------------------------------------------------------------------------------
elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_bgp.f900000644000312500001440000004657712717516040020266 00000000000000! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
!
GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! --------------------------------------------------------------------------------------------------
!
! This file contains the compute intensive kernels for the Householder transformations.
!
! *** Special IBM BlueGene/P version with BlueGene assembler instructions in Fortran ***
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------
!module real_bgp_kernel
! private
! public double_hh_trafo_bgp
!contains

! Driver for the BlueGene/P double Householder transformation.
!
! Computes the dot product s of the two Householder vectors stored in hh and
! then hands row slabs of q to the fixed-size kernels below: slabs of 20 rows
! go to two 10-row kernel calls, the remaining rows (at most 16) are covered
! by 8- and 4-row kernels.  The remainder is rounded UP to a multiple of 4
! rows, so rows beyond nq may be touched.
! NOTE(review): this presumably relies on q being padded accordingly - confirm
! against the caller.
!
!   q   (inout) matrix block, leading dimension ldq
!   hh  (in)    Householder data, leading dimension ldh
!   nb  (in)    number of Householder vector entries used
!   nq  (in)    number of rows of q to transform
!
! The routine STOPs when ldq is not a multiple of 4 or when loc(q) is not a
! multiple of 16.
subroutine double_hh_trafo_bgp(q, hh, nb, nq, ldq, ldh)
   use precision
   implicit none

   integer(kind=ik), intent(in) :: nb, nq, ldq, ldh
   real(kind=rk), intent(inout) :: q(ldq,*)
   real(kind=rk), intent(in) :: hh(ldh,*)

   real(kind=rk) :: s
   integer(kind=ik) :: i

   ! Safety only:
   if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'
   ! loc() is not standard Fortran; NOTE(review): presumably a compiler
   ! extension returning the address of q - confirm for the target compiler.
   if(mod(loc(q),16) /= 0) STOP 'Q unaligned!'

   ! Calculate dot product of the two Householder vectors
   s = hh(2,2)*1
   do i=3,nb
      s = s+hh(i,2)*hh(i-1,1)
   enddo

   ! main part: 20 rows per iteration, as two 10-row kernel calls
   do i=1,nq-16,20
      call hh_trafo_kernel_10_bgp(q(i ,1), hh, nb, ldq, ldh, s)
      call hh_trafo_kernel_10_bgp(q(i+10,1), hh, nb, ldq, ldh, s)
   enddo

   ! i > nq-16 now, i.e. at most 16 rows remain
   ! (nq-i+1 is the number of rows still to be processed)
   if(nq-i+1 > 12) then
      call hh_trafo_kernel_8_bgp(q(i ,1), hh, nb, ldq, ldh, s)
      call hh_trafo_kernel_8_bgp(q(i+8,1), hh, nb, ldq, ldh, s)
   else if(nq-i+1 > 8) then
      call hh_trafo_kernel_8_bgp(q(i ,1), hh, nb, ldq, ldh, s)
      call hh_trafo_kernel_4_bgp(q(i+8,1), hh, nb, ldq, ldh, s)
   else if(nq-i+1 > 4) then
      call hh_trafo_kernel_8_bgp(q(i ,1), hh, nb, ldq, ldh, s)
   else if(nq-i+1 > 0) then
      call hh_trafo_kernel_4_bgp(q(i ,1), hh, nb, ldq, ldh, s)
   endif

end subroutine double_hh_trafo_bgp

! --------------------------------------------------------------------------------------------------
! The following kernels perform the Householder transformation on Q for 10/8/4 rows.
! Please note that Q is declared complex*16 here.
! --------------------------------------------------------------------------------------------------

! 10-row kernel: q is viewed as complex(kind=ck) with leading dimension ldq/2,
! so the 5 complex entries q(1:5,j) hold the 10 rows of column j.
! s is the dot product of the two Householder vectors computed by the driver.
! loadfp, fxcpmadd, fxpmul, fpadd, storefp and alignx are not defined in this
! file; NOTE(review): presumably compiler-provided BlueGene built-ins - the
! commented-out statement functions below give their reference semantics.
! Loads and arithmetic are interleaved by hand; the statement order is
! deliberate and must not be changed.
subroutine hh_trafo_kernel_10_bgp(q, hh, nb, ldq, ldh, s)
   use precision
   use elpa_mpi
   implicit none

   integer(kind=ik), intent(in) :: nb, ldq, ldh
   complex(kind=ck), intent(inout) :: q(ldq/2,*)
   real(kind=rk), intent(in) :: hh(ldh,*), s

   complex(kind=ck) :: x1, x2, x3, x4, x5, y1, y2, y3, y4, y5, q1, q2, q3, q4, q5, p1, p2, p3, p4, p5
   real(kind=rk) :: h1, h2
   integer(kind=ik) :: i

   ! complex*16 loadfp, fxcpmadd, fxpmul, fpadd, a, b
   ! real*8 x
   ! loadfp(a) = a
   ! fxcpmadd(a,b,x) = a + b*x
   ! fxpmul(a,x) = a*x
   ! fpadd(a,b) = a+b
   !
   call alignx(16,q)

   ! --- accumulation phase: x starts from column 2, y from column 1 ---
   x1 = loadfp(q(1,2))
   x2 = loadfp(q(2,2))
   x3 = loadfp(q(3,2))
   x4 = loadfp(q(4,2))
   x5 = loadfp(q(5,2))

   h2 = hh(2,2)
   y1 = loadfp(q(1,1))
   y2 = loadfp(q(2,1))
   y3 = loadfp(q(3,1))
   y4 = loadfp(q(4,1))
   y5 = loadfp(q(5,1))
   y1 = fxcpmadd(y1,x1,h2)
   q1 = loadfp(q(1,3))
   y2 = fxcpmadd(y2,x2,h2)
   q2 = loadfp(q(2,3))
   y3 = fxcpmadd(y3,x3,h2)
   q3 = loadfp(q(3,3))
   y4 = fxcpmadd(y4,x4,h2)
   q4 = loadfp(q(4,3))
   y5 = fxcpmadd(y5,x5,h2)
   q5 = loadfp(q(5,3))

   h1 = hh(3-1,1)

   ! loop unrolled by two columns; leaves early when nb is odd
   do i=3,nb,2

      h2 = hh(i,2)

      x1 = fxcpmadd(x1,q1,h1)
      x2 = fxcpmadd(x2,q2,h1)
      x3 = fxcpmadd(x3,q3,h1)
      x4 = fxcpmadd(x4,q4,h1)
      x5 = fxcpmadd(x5,q5,h1)

      h1 = hh(i ,1)

      y1 = fxcpmadd(y1,q1,h2)
      q1 = loadfp(q(1,i+1))
      y2 = fxcpmadd(y2,q2,h2)
      q2 = loadfp(q(2,i+1))
      y3 = fxcpmadd(y3,q3,h2)
      q3 = loadfp(q(3,i+1))
      y4 = fxcpmadd(y4,q4,h2)
      q4 = loadfp(q(4,i+1))
      y5 = fxcpmadd(y5,q5,h2)
      q5 = loadfp(q(5,i+1))

      if(i==nb) exit

      h2 = hh(i+1,2)

      x1 = fxcpmadd(x1,q1,h1)
      x2 = fxcpmadd(x2,q2,h1)
      x3 = fxcpmadd(x3,q3,h1)
      x4 = fxcpmadd(x4,q4,h1)
      x5 = fxcpmadd(x5,q5,h1)

      h1 = hh(i+1,1)

      y1 = fxcpmadd(y1,q1,h2)
      q1 = loadfp(q(1,i+2))
      y2 = fxcpmadd(y2,q2,h2)
      q2 = loadfp(q(2,i+2))
      y3 = fxcpmadd(y3,q3,h2)
      q3 = loadfp(q(3,i+2))
      y4 = fxcpmadd(y4,q4,h2)
      q4 = loadfp(q(4,i+2))
      y5 = fxcpmadd(y5,q5,h2)
      q5 = loadfp(q(5,i+2))

   enddo

   ! contribution of column nb+1 (loaded last inside the loop) to x
   x1 = fxcpmadd(x1,q1,h1)
   x2 = fxcpmadd(x2,q2,h1)
   x3 = fxcpmadd(x3,q3,h1)
   x4 = fxcpmadd(x4,q4,h1)
   x5 = fxcpmadd(x5,q5,h1)

   ! --- scaling phase: x by -hh(1,1); y by -hh(1,2) plus the coupling through s ---
   h1 = -hh(1,1) ! for below
   h2 = -hh(1,2)
   x1 = fxpmul(x1,h1)
   x2 = fxpmul(x2,h1)
   x3 = fxpmul(x3,h1)
   x4 = fxpmul(x4,h1)
   x5 = fxpmul(x5,h1)
   h1 = -hh(1,2)*s
   y1 = fxpmul(y1,h2)
   y2 = fxpmul(y2,h2)
   y3 = fxpmul(y3,h2)
   y4 = fxpmul(y4,h2)
   y5 = fxpmul(y5,h2)
   y1 = fxcpmadd(y1,x1,h1)
   q1 = loadfp(q(1,1))
   y2 = fxcpmadd(y2,x2,h1)
   q2 = loadfp(q(2,1))
   y3 = fxcpmadd(y3,x3,h1)
   q3 = loadfp(q(3,1))
   y4 = fxcpmadd(y4,x4,h1)
   q4 = loadfp(q(4,1))
   y5 = fxcpmadd(y5,x5,h1)
   q5 = loadfp(q(5,1))

   ! --- update phase: write the transformed columns back to q ---
   q1 = fpadd(q1,y1)
   p1 = loadfp(q(1,2))
   q2 = fpadd(q2,y2)
   p2 = loadfp(q(2,2))
   q3 = fpadd(q3,y3)
   p3 = loadfp(q(3,2))
   q4 = fpadd(q4,y4)
   p4 = loadfp(q(4,2))
   q5 = fpadd(q5,y5)
   p5 = loadfp(q(5,2))

   h2 = hh(2,2)

   call storefp(q(1,1),q1)
   p1 = fpadd(p1,x1)
   call storefp(q(2,1),q2)
   p2 = fpadd(p2,x2)
   call storefp(q(3,1),q3)
   p3 = fpadd(p3,x3)
   call storefp(q(4,1),q4)
   p4 = fpadd(p4,x4)
   call storefp(q(5,1),q5)
   p5 = fpadd(p5,x5)

   p1 = fxcpmadd(p1,y1,h2)
   q1 = loadfp(q(1,3))
   p2 = fxcpmadd(p2,y2,h2)
   q2 = loadfp(q(2,3))
   p3 = fxcpmadd(p3,y3,h2)
   q3 = loadfp(q(3,3))
   p4 = fxcpmadd(p4,y4,h2)
   q4 = loadfp(q(4,3))
   p5 = fxcpmadd(p5,y5,h2)
   q5 = loadfp(q(5,3))

   h1 = hh(3-1,1)

   ! p* and q* alternate as "column being finished" / "column being loaded"
   do i=3,nb,2

      h2 = hh(i,2)

      call storefp(q(1,i-1),p1)
      q1 = fxcpmadd(q1,x1,h1)
      call storefp(q(2,i-1),p2)
      q2 = fxcpmadd(q2,x2,h1)
      call storefp(q(3,i-1),p3)
      q3 = fxcpmadd(q3,x3,h1)
      call storefp(q(4,i-1),p4)
      q4 = fxcpmadd(q4,x4,h1)
      call storefp(q(5,i-1),p5)
      q5 = fxcpmadd(q5,x5,h1)

      h1 = hh(i,1)

      q1 = fxcpmadd(q1,y1,h2)
      p1 = loadfp(q(1,i+1))
      q2 = fxcpmadd(q2,y2,h2)
      p2 = loadfp(q(2,i+1))
      q3 = fxcpmadd(q3,y3,h2)
      p3 = loadfp(q(3,i+1))
      q4 = fxcpmadd(q4,y4,h2)
      p4 = loadfp(q(4,i+1))
      q5 = fxcpmadd(q5,y5,h2)
      p5 = loadfp(q(5,i+1))

      if(i==nb) exit

      h2 = hh(i+1,2)

      call storefp(q(1,i),q1)
      p1 = fxcpmadd(p1,x1,h1)
      call storefp(q(2,i),q2)
      p2 = fxcpmadd(p2,x2,h1)
      call storefp(q(3,i),q3)
      p3 = fxcpmadd(p3,x3,h1)
      call storefp(q(4,i),q4)
      p4 = fxcpmadd(p4,x4,h1)
      call storefp(q(5,i),q5)
      p5 = fxcpmadd(p5,x5,h1)

      h1 = hh(i+1,1)

      p1 = fxcpmadd(p1,y1,h2)
      q1 = loadfp(q(1,i+2))
      p2 = fxcpmadd(p2,y2,h2)
      q2 = loadfp(q(2,i+2))
      p3 = fxcpmadd(p3,y3,h2)
      q3 = loadfp(q(3,i+2))
      p4 = fxcpmadd(p4,y4,h2)
      q4 = loadfp(q(4,i+2))
      p5 = fxcpmadd(p5,y5,h2)
      q5 = loadfp(q(5,i+2))

   enddo

   ! i==nb: the loop was left through the exit (nb odd), so q* holds column nb
   ! and p* holds column nb+1; otherwise the roles are swapped.
   if(i==nb) then
      call storefp(q(1,nb),q1)
      p1 = fxcpmadd(p1,x1,h1)
      call storefp(q(2,nb),q2)
      p2 = fxcpmadd(p2,x2,h1)
      call storefp(q(3,nb),q3)
      p3 = fxcpmadd(p3,x3,h1)
      call storefp(q(4,nb),q4)
      p4 = fxcpmadd(p4,x4,h1)
      call storefp(q(5,nb),q5)
      p5 = fxcpmadd(p5,x5,h1)

      call storefp(q(1,nb+1),p1)
      call storefp(q(2,nb+1),p2)
      call storefp(q(3,nb+1),p3)
      call storefp(q(4,nb+1),p4)
      call storefp(q(5,nb+1),p5)
   else
      call storefp(q(1,nb),p1)
      q1 = fxcpmadd(q1,x1,h1)
      call storefp(q(2,nb),p2)
      q2 = fxcpmadd(q2,x2,h1)
      call storefp(q(3,nb),p3)
      q3 = fxcpmadd(q3,x3,h1)
      call storefp(q(4,nb),p4)
      q4 = fxcpmadd(q4,x4,h1)
      call storefp(q(5,nb),p5)
      q5 = fxcpmadd(q5,x5,h1)

      call storefp(q(1,nb+1),q1)
      call storefp(q(2,nb+1),q2)
      call storefp(q(3,nb+1),q3)
      call storefp(q(4,nb+1),q4)
      call storefp(q(5,nb+1),q5)
   endif

!contains
!
! subroutine storefp(a,b)
! complex*16 a, b
!
! a = b
! end subroutine
! subroutine alignx(n, x)
! integer n
! complex*16 x(ldq/2,*)
! end subroutine

end subroutine hh_trafo_kernel_10_bgp

! --------------------------------------------------------------------------------------------------

! 8-row kernel: same scheme as the 10-row kernel above, operating on the
! 4 complex entries q(1:4,j) of every column j.
! Loads and arithmetic are interleaved by hand; the statement order is
! deliberate and must not be changed.
subroutine hh_trafo_kernel_8_bgp(q, hh, nb, ldq, ldh, s)
   use precision
   use elpa_mpi
   implicit none

   integer(kind=ik), intent(in) :: nb, ldq, ldh
   complex(kind=ck), intent(inout) :: q(ldq/2,*)
   real(kind=rk), intent(in) :: hh(ldh,*), s

   complex(kind=ck) :: x1, x2, x3, x4, y1, y2, y3, y4, q1, q2, q3, q4, p1, p2, p3, p4
   real(kind=rk) :: h1, h2
   integer(kind=ik) :: i

   ! complex*16 loadfp, fxcpmadd, fxpmul, fpadd, a, b
   ! real*8 x
   ! loadfp(a) = a
   ! fxcpmadd(a,b,x) = a + b*x
   ! fxpmul(a,x) = a*x
   ! fpadd(a,b) = a+b
   call alignx(16,q)

   ! --- accumulation phase ---
   x1 = loadfp(q(1,2))
   x2 = loadfp(q(2,2))
   x3 = loadfp(q(3,2))
   x4 = loadfp(q(4,2))

   h2 = hh(2,2)
   y1 = loadfp(q(1,1))
   y2 = loadfp(q(2,1))
   y3 = loadfp(q(3,1))
   y4 = loadfp(q(4,1))
   y1 = fxcpmadd(y1,x1,h2)
   q1 = loadfp(q(1,3))
   y2 = fxcpmadd(y2,x2,h2)
   q2 = loadfp(q(2,3))
   y3 = fxcpmadd(y3,x3,h2)
   q3 = loadfp(q(3,3))
   y4 = fxcpmadd(y4,x4,h2)
   q4 = loadfp(q(4,3))

   h1 = hh(3-1,1)

   do i=3,nb,2

      h2 = hh(i,2)

      x1 = fxcpmadd(x1,q1,h1)
      x2 = fxcpmadd(x2,q2,h1)
      x3 = fxcpmadd(x3,q3,h1)
      x4 = fxcpmadd(x4,q4,h1)

      h1 = hh(i ,1)

      y1 = fxcpmadd(y1,q1,h2)
      q1 = loadfp(q(1,i+1))
      y2 = fxcpmadd(y2,q2,h2)
      q2 = loadfp(q(2,i+1))
      y3 = fxcpmadd(y3,q3,h2)
      q3 = loadfp(q(3,i+1))
      y4 = fxcpmadd(y4,q4,h2)
      q4 = loadfp(q(4,i+1))

      if(i==nb) exit

      h2 = hh(i+1,2)

      x1 = fxcpmadd(x1,q1,h1)
      x2 = fxcpmadd(x2,q2,h1)
      x3 = fxcpmadd(x3,q3,h1)
      x4 = fxcpmadd(x4,q4,h1)

      h1 = hh(i+1,1)

      y1 = fxcpmadd(y1,q1,h2)
      q1 = loadfp(q(1,i+2))
      y2 = fxcpmadd(y2,q2,h2)
      q2 = loadfp(q(2,i+2))
      y3 = fxcpmadd(y3,q3,h2)
      q3 = loadfp(q(3,i+2))
      y4 = fxcpmadd(y4,q4,h2)
      q4 = loadfp(q(4,i+2))

   enddo

   x1 = fxcpmadd(x1,q1,h1)
   x2 = fxcpmadd(x2,q2,h1)
   x3 = fxcpmadd(x3,q3,h1)
   x4 = fxcpmadd(x4,q4,h1)

   ! --- scaling phase ---
   h1 = -hh(1,1) ! for below
   h2 = -hh(1,2)
   x1 = fxpmul(x1,h1)
   x2 = fxpmul(x2,h1)
   x3 = fxpmul(x3,h1)
   x4 = fxpmul(x4,h1)
   h1 = -hh(1,2)*s
   y1 = fxpmul(y1,h2)
   y2 = fxpmul(y2,h2)
   y3 = fxpmul(y3,h2)
   y4 = fxpmul(y4,h2)
   y1 = fxcpmadd(y1,x1,h1)
   q1 = loadfp(q(1,1))
   y2 = fxcpmadd(y2,x2,h1)
   q2 = loadfp(q(2,1))
   y3 = fxcpmadd(y3,x3,h1)
   q3 = loadfp(q(3,1))
   y4 = fxcpmadd(y4,x4,h1)
   q4 = loadfp(q(4,1))

   ! --- update phase ---
   q1 = fpadd(q1,y1)
   p1 = loadfp(q(1,2))
   q2 = fpadd(q2,y2)
   p2 = loadfp(q(2,2))
   q3 = fpadd(q3,y3)
   p3 = loadfp(q(3,2))
   q4 = fpadd(q4,y4)
   p4 = loadfp(q(4,2))

   h2 = hh(2,2)

   call storefp(q(1,1),q1)
   p1 = fpadd(p1,x1)
   call storefp(q(2,1),q2)
   p2 = fpadd(p2,x2)
   call storefp(q(3,1),q3)
   p3 = fpadd(p3,x3)
   call storefp(q(4,1),q4)
   p4 = fpadd(p4,x4)

   p1 = fxcpmadd(p1,y1,h2)
   q1 = loadfp(q(1,3))
   p2 = fxcpmadd(p2,y2,h2)
   q2 = loadfp(q(2,3))
   p3 = fxcpmadd(p3,y3,h2)
   q3 = loadfp(q(3,3))
   p4 = fxcpmadd(p4,y4,h2)
   q4 = loadfp(q(4,3))

   h1 = hh(3-1,1)

   do i=3,nb,2

      h2 = hh(i,2)

      call storefp(q(1,i-1),p1)
      q1 = fxcpmadd(q1,x1,h1)
      call storefp(q(2,i-1),p2)
      q2 = fxcpmadd(q2,x2,h1)
      call storefp(q(3,i-1),p3)
      q3 = fxcpmadd(q3,x3,h1)
      call storefp(q(4,i-1),p4)
      q4 = fxcpmadd(q4,x4,h1)

      h1 = hh(i,1)

      q1 = fxcpmadd(q1,y1,h2)
      p1 = loadfp(q(1,i+1))
      q2 = fxcpmadd(q2,y2,h2)
      p2 = loadfp(q(2,i+1))
      q3 = fxcpmadd(q3,y3,h2)
      p3 = loadfp(q(3,i+1))
      q4 = fxcpmadd(q4,y4,h2)
      p4 = loadfp(q(4,i+1))

      if(i==nb) exit

      h2 = hh(i+1,2)

      call storefp(q(1,i),q1)
      p1 = fxcpmadd(p1,x1,h1)
      call storefp(q(2,i),q2)
      p2 = fxcpmadd(p2,x2,h1)
      call storefp(q(3,i),q3)
      p3 = fxcpmadd(p3,x3,h1)
      call storefp(q(4,i),q4)
      p4 = fxcpmadd(p4,x4,h1)

      h1 = hh(i+1,1)

      p1 = fxcpmadd(p1,y1,h2)
      q1 = loadfp(q(1,i+2))
      p2 = fxcpmadd(p2,y2,h2)
      q2 = loadfp(q(2,i+2))
      p3 = fxcpmadd(p3,y3,h2)
      q3 = loadfp(q(3,i+2))
      p4 = fxcpmadd(p4,y4,h2)
      q4 = loadfp(q(4,i+2))

   enddo

   ! i==nb: loop left through the exit, q* holds column nb and p* column nb+1;
   ! otherwise the roles are swapped.
   if(i==nb) then
      call storefp(q(1,nb),q1)
      p1 = fxcpmadd(p1,x1,h1)
      call storefp(q(2,nb),q2)
      p2 = fxcpmadd(p2,x2,h1)
      call storefp(q(3,nb),q3)
      p3 = fxcpmadd(p3,x3,h1)
      call storefp(q(4,nb),q4)
      p4 = fxcpmadd(p4,x4,h1)

      call storefp(q(1,nb+1),p1)
      call storefp(q(2,nb+1),p2)
      call storefp(q(3,nb+1),p3)
      call storefp(q(4,nb+1),p4)
   else
      call storefp(q(1,nb),p1)
      q1 = fxcpmadd(q1,x1,h1)
      call storefp(q(2,nb),p2)
      q2 = fxcpmadd(q2,x2,h1)
      call storefp(q(3,nb),p3)
      q3 = fxcpmadd(q3,x3,h1)
      call storefp(q(4,nb),p4)
      q4 = fxcpmadd(q4,x4,h1)

      call storefp(q(1,nb+1),q1)
      call storefp(q(2,nb+1),q2)
      call storefp(q(3,nb+1),q3)
      call storefp(q(4,nb+1),q4)
   endif

!contains
!
! subroutine storefp(a,b)
! complex*16 a, b
!
! a = b
! end subroutine
! subroutine alignx(n, x)
! integer n
! complex*16 x(ldq/2,*)
! end subroutine

end subroutine hh_trafo_kernel_8_bgp

! --------------------------------------------------------------------------------------------------

subroutine hh_trafo_kernel_4_bgp(q, hh, nb, ldq, ldh, s)
   use precision
   use elpa_mpi
   implicit none

   integer(kind=ik), intent(in) :: nb, ldq, ldh
   complex(kind=ck), intent(inout) :: q(ldq/2,*)
   real(kind=rk), intent(in) :: hh(ldh,*), s

   complex(kind=ck) :: x1, x2, y1, y2, q1, q2, p1, p2
   real(kind=rk) :: h1, h2
   integer(kind=ik) :: i

   ! complex*16 loadfp, fxcpmadd, fxpmul, fpadd, a, b
   ! real*8 x
   ! loadfp(a) = a
   ! fxcpmadd(a,b,x) = a + b*x
   ! fxpmul(a,x) = a*x
   ! fpadd(a,b) = a+b
   call alignx(16,q)

   x1 = loadfp(q(1,2))
   x2 = loadfp(q(2,2))

   h2 = hh(2,2)
   y1 = loadfp(q(1,1))
   y2 = loadfp(q(2,1))
   y1 = fxcpmadd(y1,x1,h2)
   q1 = loadfp(q(1,3))
   y2 = fxcpmadd(y2,x2,h2)
   q2 = loadfp(q(2,3))

   h1 = hh(3-1,1)

   do i=3,nb,2

      h2 = hh(i,2)

      x1 = fxcpmadd(x1,q1,h1)
      x2 = fxcpmadd(x2,q2,h1)

      h1 = hh(i ,1)

      y1 = fxcpmadd(y1,q1,h2)
      q1 = loadfp(q(1,i+1))
      y2 = fxcpmadd(y2,q2,h2)
      q2 = loadfp(q(2,i+1))

      if(i==nb) exit

      h2 = hh(i+1,2)

      x1 = fxcpmadd(x1,q1,h1)
      x2 = fxcpmadd(x2,q2,h1)

      h1 = hh(i+1,1)

      y1 = fxcpmadd(y1,q1,h2)
      q1 = loadfp(q(1,i+2))
      y2 = fxcpmadd(y2,q2,h2)
      q2 = loadfp(q(2,i+2))

   enddo

   x1 = fxcpmadd(x1,q1,h1)
   x2 = fxcpmadd(x2,q2,h1)

   h1 = -hh(1,1) !
for below h2 = -hh(1,2) x1 = fxpmul(x1,h1) x2 = fxpmul(x2,h1) h1 = -hh(1,2)*s y1 = fxpmul(y1,h2) y2 = fxpmul(y2,h2) y1 = fxcpmadd(y1,x1,h1) q1 = loadfp(q(1,1)) y2 = fxcpmadd(y2,x2,h1) q2 = loadfp(q(2,1)) q1 = fpadd(q1,y1) p1 = loadfp(q(1,2)) q2 = fpadd(q2,y2) p2 = loadfp(q(2,2)) h2 = hh(2,2) call storefp(q(1,1),q1) p1 = fpadd(p1,x1) call storefp(q(2,1),q2) p2 = fpadd(p2,x2) p1 = fxcpmadd(p1,y1,h2) q1 = loadfp(q(1,3)) p2 = fxcpmadd(p2,y2,h2) q2 = loadfp(q(2,3)) h1 = hh(3-1,1) do i=3,nb,2 h2 = hh(i,2) call storefp(q(1,i-1),p1) q1 = fxcpmadd(q1,x1,h1) call storefp(q(2,i-1),p2) q2 = fxcpmadd(q2,x2,h1) h1 = hh(i,1) q1 = fxcpmadd(q1,y1,h2) p1 = loadfp(q(1,i+1)) q2 = fxcpmadd(q2,y2,h2) p2 = loadfp(q(2,i+1)) if(i==nb) exit h2 = hh(i+1,2) call storefp(q(1,i),q1) p1 = fxcpmadd(p1,x1,h1) call storefp(q(2,i),q2) p2 = fxcpmadd(p2,x2,h1) h1 = hh(i+1,1) p1 = fxcpmadd(p1,y1,h2) q1 = loadfp(q(1,i+2)) p2 = fxcpmadd(p2,y2,h2) q2 = loadfp(q(2,i+2)) enddo if(i==nb) then call storefp(q(1,nb),q1) p1 = fxcpmadd(p1,x1,h1) call storefp(q(2,nb),q2) p2 = fxcpmadd(p2,x2,h1) call storefp(q(1,nb+1),p1) call storefp(q(2,nb+1),p2) else call storefp(q(1,nb),p1) q1 = fxcpmadd(q1,x1,h1) call storefp(q(2,nb),p2) q2 = fxcpmadd(q2,x2,h1) call storefp(q(1,nb+1),q1) call storefp(q(2,nb+1),q2) endif !contains ! ! subroutine storefp(a,b) ! complex*16 a, b ! ! a = b ! end subroutine ! subroutine alignx(n, x) ! integer n ! complex*16 x(ldq/2,*) ! end subroutine end subroutine hh_trafo_kernel_4_bgp !end module real_bgp_kernel ! -------------------------------------------------------------------------------------------------- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_complex.F900000644000312500001440000006060312717516040020104 00000000000000! This file is part of ELPA. ! ! The ELPA library was originally created by the ELPA consortium, ! consisting of the following organizations: ! ! - Max Planck Computing and Data Facility (MPCDF), formerly known as ! 
Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte ! Informatik, ! - Technische Universität München, Lehrstuhl für Informatik mit ! Schwerpunkt Wissenschaftliches Rechnen, ! - Fritz-Haber-Institut, Berlin, Abt. Theorie, ! - Max-Planck-Institut für Mathematik in den Naturwissenschaften, ! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, ! and ! - IBM Deutschland GmbH ! ! ! More information can be found here: ! http://elpa.mpcdf.mpg.de/ ! ! ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see <http://www.gnu.org/licenses/> ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! ! -------------------------------------------------------------------------------------------------- ! ! This file contains the compute intensive kernels for the Householder transformations. ! It should be compiled with the highest possible optimization level. ! ! On Intel use -O3 -xSSE4.2 (or the SSE level fitting to your CPU) ! ! Copyright of the original code rests with the authors inside the ELPA ! consortium. The copyright of any additional modifications shall rest ! 
with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------

#include "config-f90.h"

module complex_generic_kernel

  private
  public single_hh_trafo_complex_generic

contains

  ! Applies one complex Householder vector to the rows of q by handing blocks
  ! of rows to the fixed-width kernels of this module.
  !
  !   q   : in/out matrix, leading dimension ldq; columns 1..nb are used.
  !   hh  : Householder vector of length nb; hh(1) is used by the kernels as
  !         the scale factor, hh(2:nb) as the vector entries.
  !   nb  : length of the Householder vector / number of columns touched.
  !   nq  : number of rows that have to be transformed.
  !   ldq : leading dimension of q; the routine stops if it is not a
  !         multiple of 4.
  !
  ! Rows are processed in blocks of 12, then one block of 8 or 4 for the
  ! remainder. A full block is always transformed, so rows beyond nq (up to
  ! the next multiple of 4) are touched as well.
  subroutine single_hh_trafo_complex_generic(q, hh, nb, nq, ldq)

    use precision
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    implicit none

    integer(kind=ik), intent(in)    :: nb, nq, ldq
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
    complex(kind=ck), intent(inout) :: q(ldq,*)
    complex(kind=ck), intent(in)    :: hh(*)
#else
    complex(kind=ck), intent(inout) :: q(1:ldq,1:nb)
    complex(kind=ck), intent(in)    :: hh(1:nb)
#endif

    integer(kind=ik)                :: i

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("kernel generic: single_hh_trafo_complex_generic")
#endif

    ! Safety only. The message names this routine; it previously carried the
    ! text of the double-vector driver.
    if(mod(ldq,4) /= 0) STOP 'single_hh_trafo: ldq not divisible by 4!'

    ! Do the Householder transformations

    ! Always a multiple of 4 Q-rows is transformed, even if nq is smaller

    do i=1,nq-8,12
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
      call hh_trafo_complex_kernel_12(q(i,1),hh, nb, ldq)
#else
      call hh_trafo_complex_kernel_12(q(i:ldq,1:nb),hh(1:nb), nb, ldq)
#endif
    enddo

    ! i > nq-8 now, i.e. at most 8 rows remain; i is the first untouched row

    if(nq-i+1 > 4) then
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
      call hh_trafo_complex_kernel_8(q(i,1),hh, nb, ldq)
#else
      call hh_trafo_complex_kernel_8(q(i:ldq,1:nb),hh(1:nb), nb, ldq)
#endif
    else if(nq-i+1 > 0) then
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
      call hh_trafo_complex_kernel_4(q(i,1),hh, nb, ldq)
#else
      call hh_trafo_complex_kernel_4(q(i:ldq,1:nb),hh(1:nb), nb, ldq)
#endif
    endif

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("kernel generic: single_hh_trafo_complex_generic")
#endif

  end subroutine single_hh_trafo_complex_generic

  !
--------------------------------------------------------------------------------------------------

  ! Applies two complex Householder vectors (hh(:,1) and hh(:,2)) to the rows
  ! of q, four rows per kernel call.
  !
  !   q   : in/out matrix, leading dimension ldq; columns 1..nb+1 are used.
  !   hh  : hh(1:nb,1) and hh(1:nb,2) hold the two vectors; element 1 of each
  !         column is used by the kernel as the scale factor.
  !   nb  : length of each Householder vector.
  !   nq  : number of rows that have to be transformed.
  !   ldq : leading dimension of q; the routine stops if it is not a
  !         multiple of 4.
  !   ldh : leading dimension of hh.
  !
  ! Only hh_trafo_complex_kernel_4_2hv is dispatched here. The former
  ! commented-out dispatch to 12- and 8-row two-vector kernels was removed
  ! together with the preprocessor conditionals that wrapped nothing but
  ! those comments.
  subroutine double_hh_trafo_complex_generic(q, hh, nb, nq, ldq, ldh)

    use precision
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    implicit none

    integer(kind=ik), intent(in)    :: nb, nq, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
    complex(kind=ck), intent(inout) :: q(ldq,*)
    complex(kind=ck), intent(in)    :: hh(ldh,*)
#else
    complex(kind=ck), intent(inout) :: q(1:ldq,1:nb+1)
    complex(kind=ck), intent(in)    :: hh(1:ldh,1:2)
#endif
    complex(kind=ck)                :: s

    integer(kind=ik)                :: i

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("kernel generic: double_hh_trafo_complex_generic")
#endif

    ! Safety only:
    if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'

    ! Calculate dot product of the two Householder vectors.
    ! The vectors are offset by one entry against each other, hence the
    ! pairing hh(i,2) with hh(i-1,1).

    s = conjg(hh(2,2)*1)
    do i=3,nb
      s = s+(conjg(hh(i,2))*hh(i-1,1))
    enddo

    ! Do the Householder transformations

    ! Always a multiple of 4 Q-rows is transformed, even if nq is smaller

    do i=1,nq,4
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
      call hh_trafo_complex_kernel_4_2hv(q(i,1),hh, nb, ldq, ldh, s)
#else
      call hh_trafo_complex_kernel_4_2hv(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s)
#endif
    enddo

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("kernel generic: double_hh_trafo_complex_generic")
#endif

  end subroutine double_hh_trafo_complex_generic

  ! --------------------------------------------------------------------------------------------------

  ! Applies one Householder vector to the first 12 rows of q:
  !   w = -hh(1) * ( q(:,1) + sum_{j=2..nb} q(:,j)*conjg(hh(j)) )
  !   q(:,1) = q(:,1) + w ;  q(:,j) = q(:,j) + w*hh(j)  for j = 2..nb
  !
  !   q   : in/out; rows 1..12 and columns 1..nb are read and written.
  !   hh  : Householder vector, hh(1) is the scale factor.
  !   nb  : number of columns / vector length.
  !   ldq : leading dimension of q (only used in the assumed-size declaration).
  subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq)

    use precision
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    implicit none

    integer(kind=ik), intent(in)    :: nb, ldq
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
    complex(kind=ck), intent(inout) :: q(ldq,*)
    complex(kind=ck), intent(in)    :: hh(*)
#else
    complex(kind=ck), intent(inout) :: q(:,:)
    complex(kind=ck), intent(in)    :: hh(1:nb)
#endif

    integer(kind=ik), parameter     :: nrows = 12
    complex(kind=ck)                :: w(nrows)
    complex(kind=ck)                :: coeff
    integer(kind=ik)                :: col

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("kernel generic: hh_trafo_complex_kernel_12")
#endif

    ! accumulate the projection of the 12 rows onto the Householder vector
    w = q(1:nrows,1)

!DEC$ VECTOR ALIGNED
    do col = 2, nb
      coeff = conjg(hh(col))
      w = w + q(1:nrows,col)*coeff
    enddo

    ! scale with -tau
    coeff = -hh(1)
    w = w*coeff

    ! rank-1 update of the 12 rows
    q(1:nrows,1) = q(1:nrows,1) + w

!DEC$ VECTOR ALIGNED
    do col = 2, nb
      coeff = hh(col)
      q(1:nrows,col) = q(1:nrows,col) + w*coeff
    enddo

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("kernel generic: hh_trafo_complex_kernel_12")
#endif

  end subroutine hh_trafo_complex_kernel_12

  ! --------------------------------------------------------------------------------------------------

  ! Applies one Householder vector to the first 8 rows of q; same update as
  ! the 12-row kernel:
  !   w = -hh(1) * ( q(:,1) + sum_{j=2..nb} q(:,j)*conjg(hh(j)) )
  !   q(:,1) = q(:,1) + w ;  q(:,j) = q(:,j) + w*hh(j)  for j = 2..nb
  !
  !   q   : in/out; rows 1..8 and columns 1..nb are read and written.
  !   hh  : Householder vector, hh(1) is the scale factor.
  !   nb  : number of columns / vector length.
  !   ldq : leading dimension of q (only used in the assumed-size declaration).
  subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq)

    use precision
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    implicit none

    integer(kind=ik), intent(in)    :: nb, ldq
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
    complex(kind=ck), intent(inout) :: q(ldq,*)
    complex(kind=ck), intent(in)    :: hh(*)
#else
    complex(kind=ck), intent(inout) :: q(:,:)
    complex(kind=ck), intent(in)    :: hh(1:nb)
#endif

    integer(kind=ik), parameter     :: nrows = 8
    complex(kind=ck)                :: w(nrows)
    complex(kind=ck)                :: coeff
    integer(kind=ik)                :: col

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("kernel generic: hh_trafo_complex_kernel_8")
#endif

    ! accumulate the projection of the 8 rows onto the Householder vector
    w = q(1:nrows,1)

!DEC$ VECTOR ALIGNED
    do col = 2, nb
      coeff = conjg(hh(col))
      w = w + q(1:nrows,col)*coeff
    enddo

    ! scale with -tau
    coeff = -hh(1)
    w = w*coeff

    ! rank-1 update of the 8 rows
    q(1:nrows,1) = q(1:nrows,1) + w

!DEC$ VECTOR ALIGNED
    do col = 2, nb
      coeff = hh(col)
      q(1:nrows,col) = q(1:nrows,col) + w*coeff
    enddo

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("kernel generic: hh_trafo_complex_kernel_8")
#endif

  end subroutine hh_trafo_complex_kernel_8

  !
--------------------------------------------------------------------------------------------------

  ! Applies one Householder vector to the first 4 rows of q:
  !   w = -hh(1) * ( q(:,1) + sum_{j=2..nb} q(:,j)*conjg(hh(j)) )
  !   q(:,1) = q(:,1) + w ;  q(:,j) = q(:,j) + w*hh(j)  for j = 2..nb
  !
  !   q   : in/out; rows 1..4 and columns 1..nb are read and written.
  !   hh  : Householder vector, hh(1) is the scale factor.
  !   nb  : number of columns / vector length.
  !   ldq : leading dimension of q (only used in the assumed-size declaration).
  subroutine hh_trafo_complex_kernel_4(q, hh, nb, ldq)

    use precision
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    implicit none

    integer(kind=ik), intent(in)    :: nb, ldq
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
    complex(kind=ck), intent(inout) :: q(ldq,*)
    complex(kind=ck), intent(in)    :: hh(*)
#else
    complex(kind=ck), intent(inout) :: q(:,:)
    complex(kind=ck), intent(in)    :: hh(1:nb)
#endif

    integer(kind=ik), parameter     :: nrows = 4
    complex(kind=ck)                :: w(nrows)
    complex(kind=ck)                :: coeff
    integer(kind=ik)                :: col

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("kernel generic: hh_trafo_complex_kernel_4")
#endif

    ! accumulate the projection of the 4 rows onto the Householder vector
    w = q(1:nrows,1)

!DEC$ VECTOR ALIGNED
    do col = 2, nb
      coeff = conjg(hh(col))
      w = w + q(1:nrows,col)*coeff
    enddo

    ! scale with -tau
    coeff = -hh(1)
    w = w*coeff

    ! rank-1 update of the 4 rows
    q(1:nrows,1) = q(1:nrows,1) + w

!DEC$ VECTOR ALIGNED
    do col = 2, nb
      coeff = hh(col)
      q(1:nrows,col) = q(1:nrows,col) + w*coeff
    enddo

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("kernel generic: hh_trafo_complex_kernel_4")
#endif

  end subroutine hh_trafo_complex_kernel_4

  !
--------------------------------------------------------------------------------------------------

  ! Applies two Householder vectors at once to the first 4 rows of q.
  ! The two vectors are offset by one entry: hh(j,2) pairs with column j and
  ! hh(j-1,1) pairs with column j, so columns 1..nb+1 of q are touched.
  !
  !   q   : in/out; rows 1..4, columns 1..nb+1 are read and written.
  !   hh  : hh(:,1), hh(:,2) are the two vectors; hh(1,1), hh(1,2) the
  !         scale factors.
  !   nb  : length of each Householder vector.
  !   ldq : leading dimension of q (only used in the assumed-size declaration).
  !   ldh : leading dimension of hh.
  !   s   : coupling term supplied by the caller, enters as -hh(1,2)*s.
  subroutine hh_trafo_complex_kernel_4_2hv(q, hh, nb, ldq, ldh, s)

    use precision
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    implicit none

    integer(kind=ik), intent(in)    :: nb, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
    complex(kind=ck), intent(inout) :: q(ldq,*)
    complex(kind=ck), intent(in)    :: hh(ldh,*)
#else
    complex(kind=ck), intent(inout) :: q(:,:)
    complex(kind=ck), intent(in)    :: hh(1:ldh,1:2)
#endif
    complex(kind=ck), intent(in)    :: s

    integer(kind=ik), parameter     :: nrows = 4
    complex(kind=ck)                :: xv(nrows), yv(nrows)
    complex(kind=ck)                :: c1, c2
    integer(kind=ik)                :: col

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("kernel generic: hh_trafo_complex_kernel_4_2hv")
#endif

    ! projections onto the first (xv) and second (yv) vector
    xv = q(1:nrows,2)
    yv = q(1:nrows,1) + q(1:nrows,2)*conjg(hh(2,2))

!DEC$ VECTOR ALIGNED
    do col = 3, nb
      c1 = conjg(hh(col-1,1))
      c2 = conjg(hh(col,2))
      xv = xv + q(1:nrows,col)*c1
      yv = yv + q(1:nrows,col)*c2
    enddo

    ! column nb+1 only meets the first vector
    xv = xv + q(1:nrows,nb+1)*conjg(hh(nb,1))

    ! scale: xv with -tau1, yv with -tau2 plus the coupling term
    c1 = -hh(1,1)
    xv = xv*c1

    c1 = -hh(1,2)
    c2 = -hh(1,2)*s
    yv = yv*c1 + xv*c2

    ! rank-2 update
    q(1:nrows,1) = q(1:nrows,1) + yv
    q(1:nrows,2) = q(1:nrows,2) + xv + yv*hh(2,2)

!DEC$ VECTOR ALIGNED
    do col = 3, nb
      c1 = hh(col-1,1)
      c2 = hh(col,2)
      q(1:nrows,col) = q(1:nrows,col) + xv*c1 + yv*c2
    enddo

    q(1:nrows,nb+1) = q(1:nrows,nb+1) + xv*hh(nb,1)

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("kernel generic: hh_trafo_complex_kernel_4_2hv")
#endif

  end subroutine hh_trafo_complex_kernel_4_2hv

  ! --------------------------------------------------------------------------------------------------

  ! Applies two Householder vectors at once to the first 8 rows of q.
  ! Same update as the 4-row two-vector kernel, columns 1..nb+1 are touched.
  !
  !   q   : in/out; rows 1..8, columns 1..nb+1 are read and written.
  !   hh  : hh(:,1), hh(:,2) are the two vectors; hh(1,1), hh(1,2) the
  !         scale factors.
  !   nb  : length of each Householder vector.
  !   ldq : leading dimension of q (only used in the assumed-size declaration).
  !   ldh : leading dimension of hh.
  !   s   : coupling term supplied by the caller, enters as -hh(1,2)*s.
  subroutine hh_trafo_complex_kernel_8_2hv(q, hh, nb, ldq, ldh, s)

    use precision
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    implicit none

    integer(kind=ik), intent(in)    :: nb, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
    complex(kind=ck), intent(inout) :: q(ldq,*)
    complex(kind=ck), intent(in)    :: hh(ldh,*)
#else
    complex(kind=ck), intent(inout) :: q(:,:)
    complex(kind=ck), intent(in)    :: hh(1:ldh,1:2)
#endif
    complex(kind=ck), intent(in)    :: s

    integer(kind=ik), parameter     :: nrows = 8
    complex(kind=ck)                :: xv(nrows), yv(nrows)
    complex(kind=ck)                :: c1, c2
    integer(kind=ik)                :: col

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("kernel generic: hh_trafo_complex_kernel_8_2hv")
#endif

    ! projections onto the first (xv) and second (yv) vector
    xv = q(1:nrows,2)
    yv = q(1:nrows,1) + q(1:nrows,2)*conjg(hh(2,2))

!DEC$ VECTOR ALIGNED
    do col = 3, nb
      c1 = conjg(hh(col-1,1))
      c2 = conjg(hh(col,2))
      xv = xv + q(1:nrows,col)*c1
      yv = yv + q(1:nrows,col)*c2
    enddo

    ! column nb+1 only meets the first vector
    xv = xv + q(1:nrows,nb+1)*conjg(hh(nb,1))

    ! scale: xv with -tau1, yv with -tau2 plus the coupling term
    c1 = -hh(1,1)
    xv = xv*c1

    c1 = -hh(1,2)
    c2 = -hh(1,2)*s
    yv = yv*c1 + xv*c2

    ! rank-2 update
    q(1:nrows,1) = q(1:nrows,1) + yv
    q(1:nrows,2) = q(1:nrows,2) + xv + yv*hh(2,2)

!DEC$ VECTOR ALIGNED
    do col = 3, nb
      c1 = hh(col-1,1)
      c2 = hh(col,2)
      q(1:nrows,col) = q(1:nrows,col) + xv*c1 + yv*c2
    enddo

    q(1:nrows,nb+1) = q(1:nrows,nb+1) + xv*hh(nb,1)

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("kernel generic: hh_trafo_complex_kernel_8_2hv")
#endif

  end subroutine hh_trafo_complex_kernel_8_2hv

  !
--------------------------------------------------------------------------------------------------

  ! Applies two Householder vectors at once to the first 12 rows of q.
  ! The two vectors are offset by one entry: hh(j,2) pairs with column j and
  ! hh(j-1,1) pairs with column j, so columns 1..nb+1 of q are touched.
  !
  !   q   : in/out; rows 1..12, columns 1..nb+1 are read and written.
  !   hh  : hh(:,1), hh(:,2) are the two vectors; hh(1,1), hh(1,2) the
  !         scale factors.
  !   nb  : length of each Householder vector.
  !   ldq : leading dimension of q (only used in the assumed-size declaration).
  !   ldh : leading dimension of hh.
  !   s   : coupling term supplied by the caller, enters as -hh(1,2)*s.
  subroutine hh_trafo_complex_kernel_12_2hv(q, hh, nb, ldq, ldh, s)

    use precision
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    implicit none

    integer(kind=ik), intent(in)    :: nb, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
    complex(kind=ck), intent(inout) :: q(ldq,*)
    complex(kind=ck), intent(in)    :: hh(ldh,*)
#else
    complex(kind=ck), intent(inout) :: q(:,:)
    complex(kind=ck), intent(in)    :: hh(1:ldh,1:2)
#endif
    complex(kind=ck), intent(in)    :: s

    integer(kind=ik), parameter     :: nrows = 12
    complex(kind=ck)                :: xv(nrows), yv(nrows)
    complex(kind=ck)                :: c1, c2
    integer(kind=ik)                :: col

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("kernel generic: hh_trafo_complex_kernel_12_2hv")
#endif

    ! projections onto the first (xv) and second (yv) vector
    xv = q(1:nrows,2)
    yv = q(1:nrows,1) + q(1:nrows,2)*conjg(hh(2,2))

!DEC$ VECTOR ALIGNED
    do col = 3, nb
      c1 = conjg(hh(col-1,1))
      c2 = conjg(hh(col,2))
      xv = xv + q(1:nrows,col)*c1
      yv = yv + q(1:nrows,col)*c2
    enddo

    ! column nb+1 only meets the first vector
    xv = xv + q(1:nrows,nb+1)*conjg(hh(nb,1))

    ! scale: xv with -tau1, yv with -tau2 plus the coupling term
    c1 = -hh(1,1)
    xv = xv*c1

    c1 = -hh(1,2)
    c2 = -hh(1,2)*s
    yv = yv*c1 + xv*c2

    ! rank-2 update
    q(1:nrows,1) = q(1:nrows,1) + yv
    q(1:nrows,2) = q(1:nrows,2) + xv + yv*hh(2,2)

!DEC$ VECTOR ALIGNED
    do col = 3, nb
      c1 = hh(col-1,1)
      c2 = hh(col,2)
      q(1:nrows,col) = q(1:nrows,col) + xv*c1 + yv*c2
    enddo

    q(1:nrows,nb+1) = q(1:nrows,nb+1) + xv*hh(nb,1)

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("kernel generic: hh_trafo_complex_kernel_12_2hv")
#endif

  end subroutine hh_trafo_complex_kernel_12_2hv

end module complex_generic_kernel
! --------------------------------------------------------------------------------------------------
elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c0000644000312500001440000010004412717516040021551 00000000000000// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
// // More information can be found here: // http://elpa.mpcdf.mpg.de/ // // ELPA is free software: you can redistribute it and/or modify // it under the terms of the version 3 of the license of the // GNU Lesser General Public License as published by the Free // Software Foundation. // // ELPA is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with ELPA. If not, see <http://www.gnu.org/licenses/> // // ELPA reflects a substantial effort on the part of the original // ELPA consortium, and we ask you to respect the spirit of the // license that we chose: i.e., please contribute any changes you // may have back to the original ELPA library distribution, and keep // any derivatives of ELPA under the same license that we chose for // the original distribution, the GNU Lesser General Public License. // // // -------------------------------------------------------------------------------------------------- // // This file contains the compute intensive kernels for the Householder transformations. // It should be compiled with the highest possible optimization level. // // On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 // On Intel Sandy Bridge use -O3 -mavx // // Copyright of the original code rests with the authors inside the ELPA // consortium. The copyright of any additional modifications shall rest // with their original authors, but shall adhere to the licensing terms // distributed along with the original code in the file "COPYING". 
// // Author: Alexander Heinecke (alexander.heinecke@mytum.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // -------------------------------------------------------------------------------------------------- #include "config-f90.h" #include #define __forceinline __attribute__((always_inline)) static #ifdef HAVE_AVX2 #ifdef __FMA4__ #define __ELPA_USE_FMA__ #define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) #endif #ifdef __AVX2__ #define __ELPA_USE_FMA__ #define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c) #endif #endif //Forward declaration __forceinline void hh_trafo_kernel_4_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); __forceinline void hh_trafo_kernel_8_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); __forceinline void hh_trafo_kernel_16_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); __forceinline void hh_trafo_kernel_24_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); /* !f>#ifdef HAVE_AVX !f> interface !f> subroutine double_hh_trafo_real_avx_avx2_2hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="double_hh_trafo_real_avx_avx2_2hv") !f> use, intrinsic :: iso_c_binding !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh !f> real(kind=c_double) :: q(*) !f> real(kind=c_double) :: hh(pnb,6) !f> end subroutine !f> end interface !f>#endif */ void double_hh_trafo_real_avx_avx2_2hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); void double_hh_trafo_real_avx_avx2_2hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; int nq = *pldq; int ldq = *pldq; int ldh = *pldh; // calculating scalar product to compute // 2 householder vectors simultaneously double s = hh[(ldh)+1]*1.0; #pragma ivdep for (i = 2; i < nb; i++) { s += hh[i-1] * hh[(i+ldh)]; } // Production level kernel calls with padding for (i = 0; i < nq-20; i+=24) { hh_trafo_kernel_24_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); } if (nq == i) { 
return; } if (nq-i == 20) { hh_trafo_kernel_16_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_4_AVX_2hv(&q[i+16], hh, nb, ldq, ldh, s); } else if (nq-i == 16) { hh_trafo_kernel_16_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); } else if (nq-i == 12) { hh_trafo_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_4_AVX_2hv(&q[i+8], hh, nb, ldq, ldh, s); } else if (nq-i == 8) { hh_trafo_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); } else { hh_trafo_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); } } /** * Unrolled kernel that computes * 24 rows of Q simultaneously, a * matrix vector product with two householder * vectors + a rank 2 update is performed */ __forceinline void hh_trafo_kernel_24_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [24 x nb+1] * hh // hh contains two householder vectors, with offset 1 ///////////////////////////////////////////////////// int i; // Needed bit mask for floating point sign flip __m256d sign = (__m256d)_mm256_set1_epi64x(0x8000000000000000); __m256d x1 = _mm256_load_pd(&q[ldq]); __m256d x2 = _mm256_load_pd(&q[ldq+4]); __m256d x3 = _mm256_load_pd(&q[ldq+8]); __m256d x4 = _mm256_load_pd(&q[ldq+12]); __m256d x5 = _mm256_load_pd(&q[ldq+16]); __m256d x6 = _mm256_load_pd(&q[ldq+20]); __m256d h1 = _mm256_broadcast_sd(&hh[ldh+1]); __m256d h2; #ifdef __ELPA_USE_FMA__ __m256d q1 = _mm256_load_pd(q); __m256d y1 = _mm256_FMA_pd(x1, h1, q1); __m256d q2 = _mm256_load_pd(&q[4]); __m256d y2 = _mm256_FMA_pd(x2, h1, q2); __m256d q3 = _mm256_load_pd(&q[8]); __m256d y3 = _mm256_FMA_pd(x3, h1, q3); __m256d q4 = _mm256_load_pd(&q[12]); __m256d y4 = _mm256_FMA_pd(x4, h1, q4); __m256d q5 = _mm256_load_pd(&q[16]); __m256d y5 = _mm256_FMA_pd(x5, h1, q5); __m256d q6 = _mm256_load_pd(&q[20]); __m256d y6 = _mm256_FMA_pd(x6, h1, q6); #else __m256d q1 = _mm256_load_pd(q); __m256d y1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); __m256d q2 = 
_mm256_load_pd(&q[4]); __m256d y2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1)); __m256d q3 = _mm256_load_pd(&q[8]); __m256d y3 = _mm256_add_pd(q3, _mm256_mul_pd(x3, h1)); __m256d q4 = _mm256_load_pd(&q[12]); __m256d y4 = _mm256_add_pd(q4, _mm256_mul_pd(x4, h1)); __m256d q5 = _mm256_load_pd(&q[16]); __m256d y5 = _mm256_add_pd(q5, _mm256_mul_pd(x5, h1)); __m256d q6 = _mm256_load_pd(&q[20]); __m256d y6 = _mm256_add_pd(q6, _mm256_mul_pd(x6, h1)); #endif for(i = 2; i < nb; i++) { h1 = _mm256_broadcast_sd(&hh[i-1]); h2 = _mm256_broadcast_sd(&hh[ldh+i]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[i*ldq]); x1 = _mm256_FMA_pd(q1, h1, x1); y1 = _mm256_FMA_pd(q1, h2, y1); q2 = _mm256_load_pd(&q[(i*ldq)+4]); x2 = _mm256_FMA_pd(q2, h1, x2); y2 = _mm256_FMA_pd(q2, h2, y2); q3 = _mm256_load_pd(&q[(i*ldq)+8]); x3 = _mm256_FMA_pd(q3, h1, x3); y3 = _mm256_FMA_pd(q3, h2, y3); q4 = _mm256_load_pd(&q[(i*ldq)+12]); x4 = _mm256_FMA_pd(q4, h1, x4); y4 = _mm256_FMA_pd(q4, h2, y4); q5 = _mm256_load_pd(&q[(i*ldq)+16]); x5 = _mm256_FMA_pd(q5, h1, x5); y5 = _mm256_FMA_pd(q5, h2, y5); q6 = _mm256_load_pd(&q[(i*ldq)+20]); x6 = _mm256_FMA_pd(q6, h1, x6); y6 = _mm256_FMA_pd(q6, h2, y6); #else q1 = _mm256_load_pd(&q[i*ldq]); x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); q2 = _mm256_load_pd(&q[(i*ldq)+4]); x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); q3 = _mm256_load_pd(&q[(i*ldq)+8]); x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2)); q4 = _mm256_load_pd(&q[(i*ldq)+12]); x4 = _mm256_add_pd(x4, _mm256_mul_pd(q4,h1)); y4 = _mm256_add_pd(y4, _mm256_mul_pd(q4,h2)); q5 = _mm256_load_pd(&q[(i*ldq)+16]); x5 = _mm256_add_pd(x5, _mm256_mul_pd(q5,h1)); y5 = _mm256_add_pd(y5, _mm256_mul_pd(q5,h2)); q6 = _mm256_load_pd(&q[(i*ldq)+20]); x6 = _mm256_add_pd(x6, _mm256_mul_pd(q6,h1)); y6 = _mm256_add_pd(y6, _mm256_mul_pd(q6,h2)); #endif } h1 = _mm256_broadcast_sd(&hh[nb-1]); 
#ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[nb*ldq]); x1 = _mm256_FMA_pd(q1, h1, x1); q2 = _mm256_load_pd(&q[(nb*ldq)+4]); x2 = _mm256_FMA_pd(q2, h1, x2); q3 = _mm256_load_pd(&q[(nb*ldq)+8]); x3 = _mm256_FMA_pd(q3, h1, x3); q4 = _mm256_load_pd(&q[(nb*ldq)+12]); x4 = _mm256_FMA_pd(q4, h1, x4); q5 = _mm256_load_pd(&q[(nb*ldq)+16]); x5 = _mm256_FMA_pd(q5, h1, x5); q6 = _mm256_load_pd(&q[(nb*ldq)+20]); x6 = _mm256_FMA_pd(q6, h1, x6); #else q1 = _mm256_load_pd(&q[nb*ldq]); x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); q2 = _mm256_load_pd(&q[(nb*ldq)+4]); x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); q3 = _mm256_load_pd(&q[(nb*ldq)+8]); x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); q4 = _mm256_load_pd(&q[(nb*ldq)+12]); x4 = _mm256_add_pd(x4, _mm256_mul_pd(q4,h1)); q5 = _mm256_load_pd(&q[(nb*ldq)+16]); x5 = _mm256_add_pd(x5, _mm256_mul_pd(q5,h1)); q6 = _mm256_load_pd(&q[(nb*ldq)+20]); x6 = _mm256_add_pd(x6, _mm256_mul_pd(q6,h1)); #endif ///////////////////////////////////////////////////// // Rank-2 update of Q [24 x nb+1] ///////////////////////////////////////////////////// __m256d tau1 = _mm256_broadcast_sd(hh); __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); __m256d vs = _mm256_broadcast_sd(&s); h1 = _mm256_xor_pd(tau1, sign); x1 = _mm256_mul_pd(x1, h1); x2 = _mm256_mul_pd(x2, h1); x3 = _mm256_mul_pd(x3, h1); x4 = _mm256_mul_pd(x4, h1); x5 = _mm256_mul_pd(x5, h1); x6 = _mm256_mul_pd(x6, h1); h1 = _mm256_xor_pd(tau2, sign); h2 = _mm256_mul_pd(h1, vs); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMA_pd(y1, h1, _mm256_mul_pd(x1,h2)); y2 = _mm256_FMA_pd(y2, h1, _mm256_mul_pd(x2,h2)); y3 = _mm256_FMA_pd(y3, h1, _mm256_mul_pd(x3,h2)); y4 = _mm256_FMA_pd(y4, h1, _mm256_mul_pd(x4,h2)); y5 = _mm256_FMA_pd(y5, h1, _mm256_mul_pd(x5,h2)); y6 = _mm256_FMA_pd(y6, h1, _mm256_mul_pd(x6,h2)); #else y1 = _mm256_add_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); y2 = _mm256_add_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2)); y3 = _mm256_add_pd(_mm256_mul_pd(y3,h1), _mm256_mul_pd(x3,h2)); y4 = 
_mm256_add_pd(_mm256_mul_pd(y4,h1), _mm256_mul_pd(x4,h2)); y5 = _mm256_add_pd(_mm256_mul_pd(y5,h1), _mm256_mul_pd(x5,h2)); y6 = _mm256_add_pd(_mm256_mul_pd(y6,h1), _mm256_mul_pd(x6,h2)); #endif q1 = _mm256_load_pd(q); q1 = _mm256_add_pd(q1, y1); _mm256_store_pd(q,q1); q2 = _mm256_load_pd(&q[4]); q2 = _mm256_add_pd(q2, y2); _mm256_store_pd(&q[4],q2); q3 = _mm256_load_pd(&q[8]); q3 = _mm256_add_pd(q3, y3); _mm256_store_pd(&q[8],q3); q4 = _mm256_load_pd(&q[12]); q4 = _mm256_add_pd(q4, y4); _mm256_store_pd(&q[12],q4); q5 = _mm256_load_pd(&q[16]); q5 = _mm256_add_pd(q5, y5); _mm256_store_pd(&q[16],q5); q6 = _mm256_load_pd(&q[20]); q6 = _mm256_add_pd(q6, y6); _mm256_store_pd(&q[20],q6); h2 = _mm256_broadcast_sd(&hh[ldh+1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[ldq]); q1 = _mm256_add_pd(q1, _mm256_FMA_pd(y1, h2, x1)); _mm256_store_pd(&q[ldq],q1); q2 = _mm256_load_pd(&q[ldq+4]); q2 = _mm256_add_pd(q2, _mm256_FMA_pd(y2, h2, x2)); _mm256_store_pd(&q[ldq+4],q2); q3 = _mm256_load_pd(&q[ldq+8]); q3 = _mm256_add_pd(q3, _mm256_FMA_pd(y3, h2, x3)); _mm256_store_pd(&q[ldq+8],q3); q4 = _mm256_load_pd(&q[ldq+12]); q4 = _mm256_add_pd(q4, _mm256_FMA_pd(y4, h2, x4)); _mm256_store_pd(&q[ldq+12],q4); q5 = _mm256_load_pd(&q[ldq+16]); q5 = _mm256_add_pd(q5, _mm256_FMA_pd(y5, h2, x5)); _mm256_store_pd(&q[ldq+16],q5); q6 = _mm256_load_pd(&q[ldq+20]); q6 = _mm256_add_pd(q6, _mm256_FMA_pd(y6, h2, x6)); _mm256_store_pd(&q[ldq+20],q6); #else q1 = _mm256_load_pd(&q[ldq]); q1 = _mm256_add_pd(q1, _mm256_add_pd(x1, _mm256_mul_pd(y1, h2))); _mm256_store_pd(&q[ldq],q1); q2 = _mm256_load_pd(&q[ldq+4]); q2 = _mm256_add_pd(q2, _mm256_add_pd(x2, _mm256_mul_pd(y2, h2))); _mm256_store_pd(&q[ldq+4],q2); q3 = _mm256_load_pd(&q[ldq+8]); q3 = _mm256_add_pd(q3, _mm256_add_pd(x3, _mm256_mul_pd(y3, h2))); _mm256_store_pd(&q[ldq+8],q3); q4 = _mm256_load_pd(&q[ldq+12]); q4 = _mm256_add_pd(q4, _mm256_add_pd(x4, _mm256_mul_pd(y4, h2))); _mm256_store_pd(&q[ldq+12],q4); q5 = _mm256_load_pd(&q[ldq+16]); q5 = 
_mm256_add_pd(q5, _mm256_add_pd(x5, _mm256_mul_pd(y5, h2))); _mm256_store_pd(&q[ldq+16],q5); q6 = _mm256_load_pd(&q[ldq+20]); q6 = _mm256_add_pd(q6, _mm256_add_pd(x6, _mm256_mul_pd(y6, h2))); _mm256_store_pd(&q[ldq+20],q6); #endif for (i = 2; i < nb; i++) { h1 = _mm256_broadcast_sd(&hh[i-1]); h2 = _mm256_broadcast_sd(&hh[ldh+i]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[i*ldq]); q1 = _mm256_FMA_pd(x1, h1, q1); q1 = _mm256_FMA_pd(y1, h2, q1); _mm256_store_pd(&q[i*ldq],q1); q2 = _mm256_load_pd(&q[(i*ldq)+4]); q2 = _mm256_FMA_pd(x2, h1, q2); q2 = _mm256_FMA_pd(y2, h2, q2); _mm256_store_pd(&q[(i*ldq)+4],q2); q3 = _mm256_load_pd(&q[(i*ldq)+8]); q3 = _mm256_FMA_pd(x3, h1, q3); q3 = _mm256_FMA_pd(y3, h2, q3); _mm256_store_pd(&q[(i*ldq)+8],q3); q4 = _mm256_load_pd(&q[(i*ldq)+12]); q4 = _mm256_FMA_pd(x4, h1, q4); q4 = _mm256_FMA_pd(y4, h2, q4); _mm256_store_pd(&q[(i*ldq)+12],q4); q5 = _mm256_load_pd(&q[(i*ldq)+16]); q5 = _mm256_FMA_pd(x5, h1, q5); q5 = _mm256_FMA_pd(y5, h2, q5); _mm256_store_pd(&q[(i*ldq)+16],q5); q6 = _mm256_load_pd(&q[(i*ldq)+20]); q6 = _mm256_FMA_pd(x6, h1, q6); q6 = _mm256_FMA_pd(y6, h2, q6); _mm256_store_pd(&q[(i*ldq)+20],q6); #else q1 = _mm256_load_pd(&q[i*ldq]); q1 = _mm256_add_pd(q1, _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2))); _mm256_store_pd(&q[i*ldq],q1); q2 = _mm256_load_pd(&q[(i*ldq)+4]); q2 = _mm256_add_pd(q2, _mm256_add_pd(_mm256_mul_pd(x2,h1), _mm256_mul_pd(y2, h2))); _mm256_store_pd(&q[(i*ldq)+4],q2); q3 = _mm256_load_pd(&q[(i*ldq)+8]); q3 = _mm256_add_pd(q3, _mm256_add_pd(_mm256_mul_pd(x3,h1), _mm256_mul_pd(y3, h2))); _mm256_store_pd(&q[(i*ldq)+8],q3); q4 = _mm256_load_pd(&q[(i*ldq)+12]); q4 = _mm256_add_pd(q4, _mm256_add_pd(_mm256_mul_pd(x4,h1), _mm256_mul_pd(y4, h2))); _mm256_store_pd(&q[(i*ldq)+12],q4); q5 = _mm256_load_pd(&q[(i*ldq)+16]); q5 = _mm256_add_pd(q5, _mm256_add_pd(_mm256_mul_pd(x5,h1), _mm256_mul_pd(y5, h2))); _mm256_store_pd(&q[(i*ldq)+16],q5); q6 = _mm256_load_pd(&q[(i*ldq)+20]); q6 = 
_mm256_add_pd(q6, _mm256_add_pd(_mm256_mul_pd(x6,h1), _mm256_mul_pd(y6, h2))); _mm256_store_pd(&q[(i*ldq)+20],q6); #endif } h1 = _mm256_broadcast_sd(&hh[nb-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[nb*ldq]); q1 = _mm256_FMA_pd(x1, h1, q1); _mm256_store_pd(&q[nb*ldq],q1); q2 = _mm256_load_pd(&q[(nb*ldq)+4]); q2 = _mm256_FMA_pd(x2, h1, q2); _mm256_store_pd(&q[(nb*ldq)+4],q2); q3 = _mm256_load_pd(&q[(nb*ldq)+8]); q3 = _mm256_FMA_pd(x3, h1, q3); _mm256_store_pd(&q[(nb*ldq)+8],q3); q4 = _mm256_load_pd(&q[(nb*ldq)+12]); q4 = _mm256_FMA_pd(x4, h1, q4); _mm256_store_pd(&q[(nb*ldq)+12],q4); q5 = _mm256_load_pd(&q[(nb*ldq)+16]); q5 = _mm256_FMA_pd(x5, h1, q5); _mm256_store_pd(&q[(nb*ldq)+16],q5); q6 = _mm256_load_pd(&q[(nb*ldq)+20]); q6 = _mm256_FMA_pd(x6, h1, q6); _mm256_store_pd(&q[(nb*ldq)+20],q6); #else q1 = _mm256_load_pd(&q[nb*ldq]); q1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); _mm256_store_pd(&q[nb*ldq],q1); q2 = _mm256_load_pd(&q[(nb*ldq)+4]); q2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1)); _mm256_store_pd(&q[(nb*ldq)+4],q2); q3 = _mm256_load_pd(&q[(nb*ldq)+8]); q3 = _mm256_add_pd(q3, _mm256_mul_pd(x3, h1)); _mm256_store_pd(&q[(nb*ldq)+8],q3); q4 = _mm256_load_pd(&q[(nb*ldq)+12]); q4 = _mm256_add_pd(q4, _mm256_mul_pd(x4, h1)); _mm256_store_pd(&q[(nb*ldq)+12],q4); q5 = _mm256_load_pd(&q[(nb*ldq)+16]); q5 = _mm256_add_pd(q5, _mm256_mul_pd(x5, h1)); _mm256_store_pd(&q[(nb*ldq)+16],q5); q6 = _mm256_load_pd(&q[(nb*ldq)+20]); q6 = _mm256_add_pd(q6, _mm256_mul_pd(x6, h1)); _mm256_store_pd(&q[(nb*ldq)+20],q6); #endif } /** * Unrolled kernel that computes * 16 rows of Q simultaneously, a * matrix vector product with two householder * vectors + a rank 2 update is performed */ __forceinline void hh_trafo_kernel_16_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [16 x nb+1] * hh // hh contains two householder vectors, with offset 1 
///////////////////////////////////////////////////// int i; // Needed bit mask for floating point sign flip __m256d sign = (__m256d)_mm256_set1_epi64x(0x8000000000000000); __m256d x1 = _mm256_load_pd(&q[ldq]); __m256d x2 = _mm256_load_pd(&q[ldq+4]); __m256d x3 = _mm256_load_pd(&q[ldq+8]); __m256d x4 = _mm256_load_pd(&q[ldq+12]); __m256d h1 = _mm256_broadcast_sd(&hh[ldh+1]); __m256d h2; #ifdef __ELPA_USE_FMA__ __m256d q1 = _mm256_load_pd(q); __m256d y1 = _mm256_FMA_pd(x1, h1, q1); __m256d q2 = _mm256_load_pd(&q[4]); __m256d y2 = _mm256_FMA_pd(x2, h1, q2); __m256d q3 = _mm256_load_pd(&q[8]); __m256d y3 = _mm256_FMA_pd(x3, h1, q3); __m256d q4 = _mm256_load_pd(&q[12]); __m256d y4 = _mm256_FMA_pd(x4, h1, q4); #else __m256d q1 = _mm256_load_pd(q); __m256d y1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); __m256d q2 = _mm256_load_pd(&q[4]); __m256d y2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1)); __m256d q3 = _mm256_load_pd(&q[8]); __m256d y3 = _mm256_add_pd(q3, _mm256_mul_pd(x3, h1)); __m256d q4 = _mm256_load_pd(&q[12]); __m256d y4 = _mm256_add_pd(q4, _mm256_mul_pd(x4, h1)); #endif for(i = 2; i < nb; i++) { h1 = _mm256_broadcast_sd(&hh[i-1]); h2 = _mm256_broadcast_sd(&hh[ldh+i]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[i*ldq]); x1 = _mm256_FMA_pd(q1, h1, x1); y1 = _mm256_FMA_pd(q1, h2, y1); q2 = _mm256_load_pd(&q[(i*ldq)+4]); x2 = _mm256_FMA_pd(q2, h1, x2); y2 = _mm256_FMA_pd(q2, h2, y2); q3 = _mm256_load_pd(&q[(i*ldq)+8]); x3 = _mm256_FMA_pd(q3, h1, x3); y3 = _mm256_FMA_pd(q3, h2, y3); q4 = _mm256_load_pd(&q[(i*ldq)+12]); x4 = _mm256_FMA_pd(q4, h1, x4); y4 = _mm256_FMA_pd(q4, h2, y4); #else q1 = _mm256_load_pd(&q[i*ldq]); x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); q2 = _mm256_load_pd(&q[(i*ldq)+4]); x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); q3 = _mm256_load_pd(&q[(i*ldq)+8]); x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2)); q4 
= _mm256_load_pd(&q[(i*ldq)+12]); x4 = _mm256_add_pd(x4, _mm256_mul_pd(q4,h1)); y4 = _mm256_add_pd(y4, _mm256_mul_pd(q4,h2)); #endif } h1 = _mm256_broadcast_sd(&hh[nb-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[nb*ldq]); x1 = _mm256_FMA_pd(q1, h1, x1); q2 = _mm256_load_pd(&q[(nb*ldq)+4]); x2 = _mm256_FMA_pd(q2, h1, x2); q3 = _mm256_load_pd(&q[(nb*ldq)+8]); x3 = _mm256_FMA_pd(q3, h1, x3); q4 = _mm256_load_pd(&q[(nb*ldq)+12]); x4 = _mm256_FMA_pd(q4, h1, x4); #else q1 = _mm256_load_pd(&q[nb*ldq]); x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); q2 = _mm256_load_pd(&q[(nb*ldq)+4]); x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); q3 = _mm256_load_pd(&q[(nb*ldq)+8]); x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); q4 = _mm256_load_pd(&q[(nb*ldq)+12]); x4 = _mm256_add_pd(x4, _mm256_mul_pd(q4,h1)); #endif ///////////////////////////////////////////////////// // Rank-2 update of Q [16 x nb+1] ///////////////////////////////////////////////////// __m256d tau1 = _mm256_broadcast_sd(hh); __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); __m256d vs = _mm256_broadcast_sd(&s); h1 = _mm256_xor_pd(tau1, sign); x1 = _mm256_mul_pd(x1, h1); x2 = _mm256_mul_pd(x2, h1); x3 = _mm256_mul_pd(x3, h1); x4 = _mm256_mul_pd(x4, h1); h1 = _mm256_xor_pd(tau2, sign); h2 = _mm256_mul_pd(h1, vs); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMA_pd(y1, h1, _mm256_mul_pd(x1,h2)); y2 = _mm256_FMA_pd(y2, h1, _mm256_mul_pd(x2,h2)); y3 = _mm256_FMA_pd(y3, h1, _mm256_mul_pd(x3,h2)); y4 = _mm256_FMA_pd(y4, h1, _mm256_mul_pd(x4,h2)); #else y1 = _mm256_add_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); y2 = _mm256_add_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2)); y3 = _mm256_add_pd(_mm256_mul_pd(y3,h1), _mm256_mul_pd(x3,h2)); y4 = _mm256_add_pd(_mm256_mul_pd(y4,h1), _mm256_mul_pd(x4,h2)); #endif q1 = _mm256_load_pd(q); q1 = _mm256_add_pd(q1, y1); _mm256_store_pd(q,q1); q2 = _mm256_load_pd(&q[4]); q2 = _mm256_add_pd(q2, y2); _mm256_store_pd(&q[4],q2); q3 = _mm256_load_pd(&q[8]); q3 = _mm256_add_pd(q3, y3); 
_mm256_store_pd(&q[8],q3); q4 = _mm256_load_pd(&q[12]); q4 = _mm256_add_pd(q4, y4); _mm256_store_pd(&q[12],q4); h2 = _mm256_broadcast_sd(&hh[ldh+1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[ldq]); q1 = _mm256_add_pd(q1, _mm256_FMA_pd(y1, h2, x1)); _mm256_store_pd(&q[ldq],q1); q2 = _mm256_load_pd(&q[ldq+4]); q2 = _mm256_add_pd(q2, _mm256_FMA_pd(y2, h2, x2)); _mm256_store_pd(&q[ldq+4],q2); q3 = _mm256_load_pd(&q[ldq+8]); q3 = _mm256_add_pd(q3, _mm256_FMA_pd(y3, h2, x3)); _mm256_store_pd(&q[ldq+8],q3); q4 = _mm256_load_pd(&q[ldq+12]); q4 = _mm256_add_pd(q4, _mm256_FMA_pd(y4, h2, x4)); _mm256_store_pd(&q[ldq+12],q4); #else q1 = _mm256_load_pd(&q[ldq]); q1 = _mm256_add_pd(q1, _mm256_add_pd(x1, _mm256_mul_pd(y1, h2))); _mm256_store_pd(&q[ldq],q1); q2 = _mm256_load_pd(&q[ldq+4]); q2 = _mm256_add_pd(q2, _mm256_add_pd(x2, _mm256_mul_pd(y2, h2))); _mm256_store_pd(&q[ldq+4],q2); q3 = _mm256_load_pd(&q[ldq+8]); q3 = _mm256_add_pd(q3, _mm256_add_pd(x3, _mm256_mul_pd(y3, h2))); _mm256_store_pd(&q[ldq+8],q3); q4 = _mm256_load_pd(&q[ldq+12]); q4 = _mm256_add_pd(q4, _mm256_add_pd(x4, _mm256_mul_pd(y4, h2))); _mm256_store_pd(&q[ldq+12],q4); #endif for (i = 2; i < nb; i++) { h1 = _mm256_broadcast_sd(&hh[i-1]); h2 = _mm256_broadcast_sd(&hh[ldh+i]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[i*ldq]); q1 = _mm256_FMA_pd(x1, h1, q1); q1 = _mm256_FMA_pd(y1, h2, q1); _mm256_store_pd(&q[i*ldq],q1); q2 = _mm256_load_pd(&q[(i*ldq)+4]); q2 = _mm256_FMA_pd(x2, h1, q2); q2 = _mm256_FMA_pd(y2, h2, q2); _mm256_store_pd(&q[(i*ldq)+4],q2); q3 = _mm256_load_pd(&q[(i*ldq)+8]); q3 = _mm256_FMA_pd(x3, h1, q3); q3 = _mm256_FMA_pd(y3, h2, q3); _mm256_store_pd(&q[(i*ldq)+8],q3); q4 = _mm256_load_pd(&q[(i*ldq)+12]); q4 = _mm256_FMA_pd(x4, h1, q4); q4 = _mm256_FMA_pd(y4, h2, q4); _mm256_store_pd(&q[(i*ldq)+12],q4); #else q1 = _mm256_load_pd(&q[i*ldq]); q1 = _mm256_add_pd(q1, _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2))); _mm256_store_pd(&q[i*ldq],q1); q2 = 
_mm256_load_pd(&q[(i*ldq)+4]); q2 = _mm256_add_pd(q2, _mm256_add_pd(_mm256_mul_pd(x2,h1), _mm256_mul_pd(y2, h2))); _mm256_store_pd(&q[(i*ldq)+4],q2); q3 = _mm256_load_pd(&q[(i*ldq)+8]); q3 = _mm256_add_pd(q3, _mm256_add_pd(_mm256_mul_pd(x3,h1), _mm256_mul_pd(y3, h2))); _mm256_store_pd(&q[(i*ldq)+8],q3); q4 = _mm256_load_pd(&q[(i*ldq)+12]); q4 = _mm256_add_pd(q4, _mm256_add_pd(_mm256_mul_pd(x4,h1), _mm256_mul_pd(y4, h2))); _mm256_store_pd(&q[(i*ldq)+12],q4); #endif } h1 = _mm256_broadcast_sd(&hh[nb-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[nb*ldq]); q1 = _mm256_FMA_pd(x1, h1, q1); _mm256_store_pd(&q[nb*ldq],q1); q2 = _mm256_load_pd(&q[(nb*ldq)+4]); q2 = _mm256_FMA_pd(x2, h1, q2); _mm256_store_pd(&q[(nb*ldq)+4],q2); q3 = _mm256_load_pd(&q[(nb*ldq)+8]); q3 = _mm256_FMA_pd(x3, h1, q3); _mm256_store_pd(&q[(nb*ldq)+8],q3); q4 = _mm256_load_pd(&q[(nb*ldq)+12]); q4 = _mm256_FMA_pd(x4, h1, q4); _mm256_store_pd(&q[(nb*ldq)+12],q4); #else q1 = _mm256_load_pd(&q[nb*ldq]); q1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); _mm256_store_pd(&q[nb*ldq],q1); q2 = _mm256_load_pd(&q[(nb*ldq)+4]); q2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1)); _mm256_store_pd(&q[(nb*ldq)+4],q2); q3 = _mm256_load_pd(&q[(nb*ldq)+8]); q3 = _mm256_add_pd(q3, _mm256_mul_pd(x3, h1)); _mm256_store_pd(&q[(nb*ldq)+8],q3); q4 = _mm256_load_pd(&q[(nb*ldq)+12]); q4 = _mm256_add_pd(q4, _mm256_mul_pd(x4, h1)); _mm256_store_pd(&q[(nb*ldq)+12],q4); #endif } /** * Unrolled kernel that computes * 8 rows of Q simultaneously, a * matrix vector product with two householder * vectors + a rank 2 update is performed */ __forceinline void hh_trafo_kernel_8_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [8 x nb+1] * hh // hh contains two householder vectors, with offset 1 ///////////////////////////////////////////////////// int i; // Needed bit mask for floating point sign flip __m256d sign = 
(__m256d)_mm256_set1_epi64x(0x8000000000000000); __m256d x1 = _mm256_load_pd(&q[ldq]); __m256d x2 = _mm256_load_pd(&q[ldq+4]); __m256d h1 = _mm256_broadcast_sd(&hh[ldh+1]); __m256d h2; #ifdef __ELPA_USE_FMA__ __m256d q1 = _mm256_load_pd(q); __m256d y1 = _mm256_FMA_pd(x1, h1, q1); __m256d q2 = _mm256_load_pd(&q[4]); __m256d y2 = _mm256_FMA_pd(x2, h1, q2); #else __m256d q1 = _mm256_load_pd(q); __m256d y1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); __m256d q2 = _mm256_load_pd(&q[4]); __m256d y2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1)); #endif for(i = 2; i < nb; i++) { h1 = _mm256_broadcast_sd(&hh[i-1]); h2 = _mm256_broadcast_sd(&hh[ldh+i]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[i*ldq]); x1 = _mm256_FMA_pd(q1, h1, x1); y1 = _mm256_FMA_pd(q1, h2, y1); q2 = _mm256_load_pd(&q[(i*ldq)+4]); x2 = _mm256_FMA_pd(q2, h1, x2); y2 = _mm256_FMA_pd(q2, h2, y2); #else q1 = _mm256_load_pd(&q[i*ldq]); x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); q2 = _mm256_load_pd(&q[(i*ldq)+4]); x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); #endif } h1 = _mm256_broadcast_sd(&hh[nb-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[nb*ldq]); x1 = _mm256_FMA_pd(q1, h1, x1); q2 = _mm256_load_pd(&q[(nb*ldq)+4]); x2 = _mm256_FMA_pd(q2, h1, x2); #else q1 = _mm256_load_pd(&q[nb*ldq]); x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); q2 = _mm256_load_pd(&q[(nb*ldq)+4]); x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); #endif ///////////////////////////////////////////////////// // Rank-2 update of Q [8 x nb+1] ///////////////////////////////////////////////////// __m256d tau1 = _mm256_broadcast_sd(hh); __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); __m256d vs = _mm256_broadcast_sd(&s); h1 = _mm256_xor_pd(tau1, sign); x1 = _mm256_mul_pd(x1, h1); x2 = _mm256_mul_pd(x2, h1); h1 = _mm256_xor_pd(tau2, sign); h2 = _mm256_mul_pd(h1, vs); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMA_pd(y1, h1, _mm256_mul_pd(x1,h2)); y2 = 
_mm256_FMA_pd(y2, h1, _mm256_mul_pd(x2,h2)); #else y1 = _mm256_add_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); y2 = _mm256_add_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2)); #endif q1 = _mm256_load_pd(q); q1 = _mm256_add_pd(q1, y1); _mm256_store_pd(q,q1); q2 = _mm256_load_pd(&q[4]); q2 = _mm256_add_pd(q2, y2); _mm256_store_pd(&q[4],q2); h2 = _mm256_broadcast_sd(&hh[ldh+1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[ldq]); q1 = _mm256_add_pd(q1, _mm256_FMA_pd(y1, h2, x1)); _mm256_store_pd(&q[ldq],q1); q2 = _mm256_load_pd(&q[ldq+4]); q2 = _mm256_add_pd(q2, _mm256_FMA_pd(y2, h2, x2)); _mm256_store_pd(&q[ldq+4],q2); #else q1 = _mm256_load_pd(&q[ldq]); q1 = _mm256_add_pd(q1, _mm256_add_pd(x1, _mm256_mul_pd(y1, h2))); _mm256_store_pd(&q[ldq],q1); q2 = _mm256_load_pd(&q[ldq+4]); q2 = _mm256_add_pd(q2, _mm256_add_pd(x2, _mm256_mul_pd(y2, h2))); _mm256_store_pd(&q[ldq+4],q2); #endif for (i = 2; i < nb; i++) { h1 = _mm256_broadcast_sd(&hh[i-1]); h2 = _mm256_broadcast_sd(&hh[ldh+i]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[i*ldq]); q1 = _mm256_FMA_pd(x1, h1, q1); q1 = _mm256_FMA_pd(y1, h2, q1); _mm256_store_pd(&q[i*ldq],q1); q2 = _mm256_load_pd(&q[(i*ldq)+4]); q2 = _mm256_FMA_pd(x2, h1, q2); q2 = _mm256_FMA_pd(y2, h2, q2); _mm256_store_pd(&q[(i*ldq)+4],q2); #else q1 = _mm256_load_pd(&q[i*ldq]); q1 = _mm256_add_pd(q1, _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2))); _mm256_store_pd(&q[i*ldq],q1); q2 = _mm256_load_pd(&q[(i*ldq)+4]); q2 = _mm256_add_pd(q2, _mm256_add_pd(_mm256_mul_pd(x2,h1), _mm256_mul_pd(y2, h2))); _mm256_store_pd(&q[(i*ldq)+4],q2); #endif } h1 = _mm256_broadcast_sd(&hh[nb-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[nb*ldq]); q1 = _mm256_FMA_pd(x1, h1, q1); _mm256_store_pd(&q[nb*ldq],q1); q2 = _mm256_load_pd(&q[(nb*ldq)+4]); q2 = _mm256_FMA_pd(x2, h1, q2); _mm256_store_pd(&q[(nb*ldq)+4],q2); #else q1 = _mm256_load_pd(&q[nb*ldq]); q1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); _mm256_store_pd(&q[nb*ldq],q1); q2 = 
_mm256_load_pd(&q[(nb*ldq)+4]); q2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1)); _mm256_store_pd(&q[(nb*ldq)+4],q2); #endif } /** * Unrolled kernel that computes * 4 rows of Q simultaneously, a * matrix vector product with two householder * vectors + a rank 2 update is performed */ __forceinline void hh_trafo_kernel_4_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [4 x nb+1] * hh // hh contains two householder vectors, with offset 1 ///////////////////////////////////////////////////// int i; // Needed bit mask for floating point sign flip __m256d sign = (__m256d)_mm256_set1_epi64x(0x8000000000000000); __m256d x1 = _mm256_load_pd(&q[ldq]); __m256d h1 = _mm256_broadcast_sd(&hh[ldh+1]); __m256d h2; #ifdef __ELPA_USE_FMA__ __m256d q1 = _mm256_load_pd(q); __m256d y1 = _mm256_FMA_pd(x1, h1, q1); #else __m256d q1 = _mm256_load_pd(q); __m256d y1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); #endif for(i = 2; i < nb; i++) { h1 = _mm256_broadcast_sd(&hh[i-1]); h2 = _mm256_broadcast_sd(&hh[ldh+i]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[i*ldq]); x1 = _mm256_FMA_pd(q1, h1, x1); y1 = _mm256_FMA_pd(q1, h2, y1); #else q1 = _mm256_load_pd(&q[i*ldq]); x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); #endif } h1 = _mm256_broadcast_sd(&hh[nb-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[nb*ldq]); x1 = _mm256_FMA_pd(q1, h1, x1); #else q1 = _mm256_load_pd(&q[nb*ldq]); x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); #endif ///////////////////////////////////////////////////// // Rank-2 update of Q [4 x nb+1] ///////////////////////////////////////////////////// __m256d tau1 = _mm256_broadcast_sd(hh); __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); __m256d vs = _mm256_broadcast_sd(&s); h1 = _mm256_xor_pd(tau1, sign); x1 = _mm256_mul_pd(x1, h1); h1 = _mm256_xor_pd(tau2, sign); h2 = _mm256_mul_pd(h1, vs); #ifdef __ELPA_USE_FMA__ y1 
= _mm256_FMA_pd(y1, h1, _mm256_mul_pd(x1,h2)); #else y1 = _mm256_add_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); #endif q1 = _mm256_load_pd(q); q1 = _mm256_add_pd(q1, y1); _mm256_store_pd(q,q1); h2 = _mm256_broadcast_sd(&hh[ldh+1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[ldq]); q1 = _mm256_add_pd(q1, _mm256_FMA_pd(y1, h2, x1)); _mm256_store_pd(&q[ldq],q1); #else q1 = _mm256_load_pd(&q[ldq]); q1 = _mm256_add_pd(q1, _mm256_add_pd(x1, _mm256_mul_pd(y1, h2))); _mm256_store_pd(&q[ldq],q1); #endif for (i = 2; i < nb; i++) { h1 = _mm256_broadcast_sd(&hh[i-1]); h2 = _mm256_broadcast_sd(&hh[ldh+i]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[i*ldq]); q1 = _mm256_FMA_pd(x1, h1, q1); q1 = _mm256_FMA_pd(y1, h2, q1); _mm256_store_pd(&q[i*ldq],q1); #else q1 = _mm256_load_pd(&q[i*ldq]); q1 = _mm256_add_pd(q1, _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2))); _mm256_store_pd(&q[i*ldq],q1); #endif } h1 = _mm256_broadcast_sd(&hh[nb-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_load_pd(&q[nb*ldq]); q1 = _mm256_FMA_pd(x1, h1, q1); _mm256_store_pd(&q[nb*ldq],q1); #else q1 = _mm256_load_pd(&q[nb*ldq]); q1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1)); _mm256_store_pd(&q[nb*ldq],q1); #endif } elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.c0000644000312500001440000014460012717516040022303 00000000000000// This file is part of ELPA. // // The ELPA library was originally created by the ELPA consortium, // consisting of the following organizations: // // - Max Planck Computing and Data Facility (MPCDF), formerly known as // Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), // - Bergische Universität Wuppertal, Lehrstuhl für angewandte // Informatik, // - Technische Universität München, Lehrstuhl für Informatik mit // Schwerpunkt Wissenschaftliches Rechnen , // - Fritz-Haber-Institut, Berlin, Abt. Theorie, // - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, // Leipzig, Abt. 
Komplexe Strukturen in Biologie und Kognition, // and // - IBM Deutschland GmbH // // This particular source code file contains additions, changes and // enhancements authored by Intel Corporation which is not part of // the ELPA consortium. // // More information can be found here: // http://elpa.mpcdf.mpg.de/ // // ELPA is free software: you can redistribute it and/or modify // it under the terms of the version 3 of the license of the // GNU Lesser General Public License as published by the Free // Software Foundation. // // ELPA is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with ELPA. If not, see <http://www.gnu.org/licenses/> // // ELPA reflects a substantial effort on the part of the original // ELPA consortium, and we ask you to respect the spirit of the // license that we chose: i.e., please contribute any changes you // may have back to the original ELPA library distribution, and keep // any derivatives of ELPA under the same license that we chose for // the original distribution, the GNU Lesser General Public License. // // // -------------------------------------------------------------------------------------------------- // // This file contains the compute intensive kernels for the Householder transformations. // It should be compiled with the highest possible optimization level. // // On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 // On Intel Sandy Bridge use -O3 -mavx // // Copyright of the original code rests with the authors inside the ELPA // consortium. The copyright of any additional modifications shall rest // with their original authors, but shall adhere to the licensing terms // distributed along with the original code in the file "COPYING".
// // Author: Alexander Heinecke (alexander.heinecke@mytum.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // -------------------------------------------------------------------------------------------------- #include "config-f90.h" #include #include #define __forceinline __attribute__((always_inline)) #ifdef HAVE_AVX2 #ifdef __FMA4__ #define __ELPA_USE_FMA__ #define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c) #define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c) #endif #ifdef __AVX2__ #define __ELPA_USE_FMA__ #define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c) #define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c) #endif #endif //Forward declaration static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s); static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s); static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s); static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s); /* !f>#ifdef HAVE_AVX !f> interface !f> subroutine double_hh_trafo_complex_avx_avx2_2hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="double_hh_trafo_complex_avx_avx2_2hv") !f> use, intrinsic :: iso_c_binding !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh !f> complex(kind=c_double) :: q(*) !f> complex(kind=c_double) :: hh(pnb,2) !f> end subroutine !f> end interface !f>#endif */ void double_hh_trafo_complex_avx_avx2_2hv(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; int nq = *pldq; int ldq = *pldq; int ldh = *pldh; double complex s = conj(hh[(ldh)+1])*1.0; for (i = 2; i < nb; i++) { s += hh[i-1] * conj(hh[(i+ldh)]); } #if 1 
for (i = 0; i < nq-4; i+=8) { hh_trafo_complex_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); } if (nq-i > 0) { hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); } #else for (i = 0; i < nq-4; i+=6) { hh_trafo_complex_kernel_6_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); } if (nq-i > 2) { hh_trafo_complex_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); } else if (nq-i > 0) { hh_trafo_complex_kernel_2_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); } #endif } static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; double* s_dbl = (double*)(&s); __m256d x1, x2, x3, x4; __m256d y1, y2, y3, y4; __m256d q1, q2, q3, q4; __m256d h1_real, h1_imag, h2_real, h2_imag; __m256d tmp1, tmp2, tmp3, tmp4; int i=0; __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]); x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]); x3 = _mm256_load_pd(&q_dbl[(2*ldq)+8]); x4 = _mm256_load_pd(&q_dbl[(2*ldq)+12]); h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h2_imag = _mm256_xor_pd(h2_imag, sign); #endif y1 = _mm256_load_pd(&q_dbl[0]); y2 = _mm256_load_pd(&q_dbl[4]); y3 = _mm256_load_pd(&q_dbl[8]); y4 = _mm256_load_pd(&q_dbl[12]); tmp1 = _mm256_mul_pd(h2_imag, x1); #ifdef __ELPA_USE_FMA__ y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h2_imag, x2); #ifdef __ELPA_USE_FMA__ y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif tmp3 = 
_mm256_mul_pd(h2_imag, x3); #ifdef __ELPA_USE_FMA__ y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #else y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #endif tmp4 = _mm256_mul_pd(h2_imag, x4); #ifdef __ELPA_USE_FMA__ y4 = _mm256_add_pd(y4, _mm256_FMSUBADD_pd(h2_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); #else y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); #endif for (i = 2; i < nb; i++) { q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _mm256_xor_pd(h1_imag, sign); #endif tmp1 = _mm256_mul_pd(h1_imag, q1); #ifdef __ELPA_USE_FMA__ x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h1_imag, q2); #ifdef __ELPA_USE_FMA__ x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif tmp3 = _mm256_mul_pd(h1_imag, q3); #ifdef __ELPA_USE_FMA__ x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #else x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #endif tmp4 = _mm256_mul_pd(h1_imag, q4); #ifdef __ELPA_USE_FMA__ x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); #else x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 
0x5))); #endif h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h2_imag = _mm256_xor_pd(h2_imag, sign); #endif tmp1 = _mm256_mul_pd(h2_imag, q1); #ifdef __ELPA_USE_FMA__ y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h2_imag, q2); #ifdef __ELPA_USE_FMA__ y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif tmp3 = _mm256_mul_pd(h2_imag, q3); #ifdef __ELPA_USE_FMA__ y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #else y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #endif tmp4 = _mm256_mul_pd(h2_imag, q4); #ifdef __ELPA_USE_FMA__ y4 = _mm256_add_pd(y4, _mm256_FMSUBADD_pd(h2_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); #else y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); #endif } h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _mm256_xor_pd(h1_imag, sign); #endif q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]); q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]); tmp1 = _mm256_mul_pd(h1_imag, q1); #ifdef __ELPA_USE_FMA__ x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h1_imag, q2); #ifdef __ELPA_USE_FMA__ x2 = 
_mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif tmp3 = _mm256_mul_pd(h1_imag, q3); #ifdef __ELPA_USE_FMA__ x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #else x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #endif tmp4 = _mm256_mul_pd(h1_imag, q4); #ifdef __ELPA_USE_FMA__ x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); #else x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); #endif h1_real = _mm256_broadcast_sd(&hh_dbl[0]); h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); h1_real = _mm256_xor_pd(h1_real, sign); h1_imag = _mm256_xor_pd(h1_imag, sign); tmp1 = _mm256_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); #else x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); #endif tmp2 = _mm256_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); #else x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); #endif tmp3 = _mm256_mul_pd(h1_imag, x3); #ifdef __ELPA_USE_FMA__ x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); #else x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); #endif tmp4 = _mm256_mul_pd(h1_imag, x4); #ifdef __ELPA_USE_FMA__ x4 = _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)); #else x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)); #endif h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); 
h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); h1_real = _mm256_xor_pd(h1_real, sign); h1_imag = _mm256_xor_pd(h1_imag, sign); h2_real = _mm256_xor_pd(h2_real, sign); h2_imag = _mm256_xor_pd(h2_imag, sign); __m128d tmp_s_128 = _mm_loadu_pd(s_dbl); tmp2 = _mm256_broadcast_pd(&tmp_s_128); tmp1 = _mm256_mul_pd(h2_imag, tmp2); #ifdef __ELPA_USE_FMA__ tmp2 = _mm256_FMADDSUB_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); #else tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); #endif _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2)); h2_real = _mm256_broadcast_sd(&s_dbl[0]); h2_imag = _mm256_broadcast_sd(&s_dbl[1]); tmp1 = _mm256_mul_pd(h1_imag, y1); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMADDSUB_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); #else y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); #endif tmp2 = _mm256_mul_pd(h1_imag, y2); #ifdef __ELPA_USE_FMA__ y2 = _mm256_FMADDSUB_pd(h1_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); #else y2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); #endif tmp3 = _mm256_mul_pd(h1_imag, y3); #ifdef __ELPA_USE_FMA__ y3 = _mm256_FMADDSUB_pd(h1_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); #else y3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); #endif tmp4 = _mm256_mul_pd(h1_imag, y4); #ifdef __ELPA_USE_FMA__ y4 = _mm256_FMADDSUB_pd(h1_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)); #else y4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)); #endif tmp1 = _mm256_mul_pd(h2_imag, x1); #ifdef __ELPA_USE_FMA__ y1 = _mm256_add_pd(y1, _mm256_FMADDSUB_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h2_imag, x2); #ifdef __ELPA_USE_FMA__ y2 = _mm256_add_pd(y2, _mm256_FMADDSUB_pd(h2_real, x2, 
_mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif tmp3 = _mm256_mul_pd(h2_imag, x3); #ifdef __ELPA_USE_FMA__ y3 = _mm256_add_pd(y3, _mm256_FMADDSUB_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #else y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #endif tmp4 = _mm256_mul_pd(h2_imag, x4); #ifdef __ELPA_USE_FMA__ y4 = _mm256_add_pd(y4, _mm256_FMADDSUB_pd(h2_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); #else y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); #endif q1 = _mm256_load_pd(&q_dbl[0]); q2 = _mm256_load_pd(&q_dbl[4]); q3 = _mm256_load_pd(&q_dbl[8]); q4 = _mm256_load_pd(&q_dbl[12]); q1 = _mm256_add_pd(q1, y1); q2 = _mm256_add_pd(q2, y2); q3 = _mm256_add_pd(q3, y3); q4 = _mm256_add_pd(q4, y4); _mm256_store_pd(&q_dbl[0], q1); _mm256_store_pd(&q_dbl[4], q2); _mm256_store_pd(&q_dbl[8], q3); _mm256_store_pd(&q_dbl[12], q4); h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]); q2 = _mm256_load_pd(&q_dbl[(ldq*2)+4]); q3 = _mm256_load_pd(&q_dbl[(ldq*2)+8]); q4 = _mm256_load_pd(&q_dbl[(ldq*2)+12]); q1 = _mm256_add_pd(q1, x1); q2 = _mm256_add_pd(q2, x2); q3 = _mm256_add_pd(q3, x3); q4 = _mm256_add_pd(q4, x4); tmp1 = _mm256_mul_pd(h2_imag, y1); #ifdef __ELPA_USE_FMA__ q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h2_imag, y2); #ifdef __FMA4_ q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif tmp3 = 
_mm256_mul_pd(h2_imag, y3); #ifdef __ELPA_USE_FMA__ q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #else q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #endif tmp4 = _mm256_mul_pd(h2_imag, y4); #ifdef __ELPA_USE_FMA__ q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h2_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); #else q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); #endif _mm256_store_pd(&q_dbl[(ldq*2)+0], q1); _mm256_store_pd(&q_dbl[(ldq*2)+4], q2); _mm256_store_pd(&q_dbl[(ldq*2)+8], q3); _mm256_store_pd(&q_dbl[(ldq*2)+12], q4); for (i = 2; i < nb; i++) { q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]); h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); tmp1 = _mm256_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif tmp3 = _mm256_mul_pd(h1_imag, x3); #ifdef __ELPA_USE_FMA__ q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #else q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #endif tmp4 = _mm256_mul_pd(h1_imag, x4); #ifdef __ELPA_USE_FMA__ q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); #else q4 = _mm256_add_pd(q4, 
_mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); #endif h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); tmp1 = _mm256_mul_pd(h2_imag, y1); #ifdef __ELPA_USE_FMA__ q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h2_imag, y2); #ifdef __ELPA_USE_FMA__ q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif tmp3 = _mm256_mul_pd(h2_imag, y3); #ifdef __ELPA_USE_FMA__ q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #else q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #endif tmp4 = _mm256_mul_pd(h2_imag, y4); #ifdef __ELPA_USE_FMA__ q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h2_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); #else q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); #endif _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3); _mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4); } h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]); q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]); tmp1 = _mm256_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); 
#endif tmp2 = _mm256_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif tmp3 = _mm256_mul_pd(h1_imag, x3); #ifdef __ELPA_USE_FMA__ q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #else q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #endif tmp4 = _mm256_mul_pd(h1_imag, x4); #ifdef __ELPA_USE_FMA__ q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5))); #else q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5))); #endif _mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1); _mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2); _mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3); _mm256_store_pd(&q_dbl[(2*nb*ldq)+12], q4); } static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; double* s_dbl = (double*)(&s); __m256d x1, x2, x3; __m256d y1, y2, y3; __m256d q1, q2, q3; __m256d h1_real, h1_imag, h2_real, h2_imag; __m256d tmp1, tmp2, tmp3; int i=0; __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]); x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]); x3 = _mm256_load_pd(&q_dbl[(2*ldq)+8]); h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h2_imag = _mm256_xor_pd(h2_imag, sign); #endif y1 = _mm256_load_pd(&q_dbl[0]); y2 = _mm256_load_pd(&q_dbl[4]); y3 = _mm256_load_pd(&q_dbl[8]); tmp1 = _mm256_mul_pd(h2_imag, x1); #ifdef __ELPA_USE_FMA__ y1 = _mm256_add_pd(y1, 
_mm256_FMSUBADD_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h2_imag, x2); #ifdef __ELPA_USE_FMA__ y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif tmp3 = _mm256_mul_pd(h2_imag, x3); #ifdef __ELPA_USE_FMA__ y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #else y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #endif for (i = 2; i < nb; i++) { q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _mm256_xor_pd(h1_imag, sign); #endif tmp1 = _mm256_mul_pd(h1_imag, q1); #ifdef __ELPA_USE_FMA__ x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h1_imag, q2); #ifdef __ELPA_USE_FMA__ x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif tmp3 = _mm256_mul_pd(h1_imag, q3); #ifdef __ELPA_USE_FMA__ x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #else x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #endif h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); 
#ifndef __ELPA_USE_FMA__ // conjugate h2_imag = _mm256_xor_pd(h2_imag, sign); #endif tmp1 = _mm256_mul_pd(h2_imag, q1); #ifdef __ELPA_USE_FMA__ y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h2_imag, q2); #ifdef __ELPA_USE_FMA__ y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif tmp3 = _mm256_mul_pd(h2_imag, q3); #ifdef __ELPA_USE_FMA__ y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #else y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #endif } h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _mm256_xor_pd(h1_imag, sign); #endif q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]); tmp1 = _mm256_mul_pd(h1_imag, q1); #ifdef __ELPA_USE_FMA__ x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h1_imag, q2); #ifdef __ELPA_USE_FMA__ x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif tmp3 = _mm256_mul_pd(h1_imag, q3); #ifdef __ELPA_USE_FMA__ x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #else x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), 
_mm256_shuffle_pd(tmp3, tmp3, 0x5))); #endif h1_real = _mm256_broadcast_sd(&hh_dbl[0]); h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); h1_real = _mm256_xor_pd(h1_real, sign); h1_imag = _mm256_xor_pd(h1_imag, sign); tmp1 = _mm256_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); #else x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); #endif tmp2 = _mm256_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); #else x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); #endif tmp3 = _mm256_mul_pd(h1_imag, x3); #ifdef __ELPA_USE_FMA__ x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); #else x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); #endif h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); h1_real = _mm256_xor_pd(h1_real, sign); h1_imag = _mm256_xor_pd(h1_imag, sign); h2_real = _mm256_xor_pd(h2_real, sign); h2_imag = _mm256_xor_pd(h2_imag, sign); __m128d tmp_s_128 = _mm_loadu_pd(s_dbl); tmp2 = _mm256_broadcast_pd(&tmp_s_128); tmp1 = _mm256_mul_pd(h2_imag, tmp2); #ifdef __ELPA_USE_FMA__ tmp2 = _mm256_FMADDSUB_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); #else tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); #endif _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2)); h2_real = _mm256_broadcast_sd(&s_dbl[0]); h2_imag = _mm256_broadcast_sd(&s_dbl[1]); tmp1 = _mm256_mul_pd(h1_imag, y1); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMADDSUB_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); #else y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); #endif tmp2 = _mm256_mul_pd(h1_imag, y2); 
#ifdef __ELPA_USE_FMA__ y2 = _mm256_FMADDSUB_pd(h1_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); #else y2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); #endif tmp3 = _mm256_mul_pd(h1_imag, y3); #ifdef __ELPA_USE_FMA__ y3 = _mm256_FMADDSUB_pd(h1_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)); #else y3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)); #endif tmp1 = _mm256_mul_pd(h2_imag, x1); #ifdef __ELPA_USE_FMA__ y1 = _mm256_add_pd(y1, _mm256_FMADDSUB_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h2_imag, x2); #ifdef __ELPA_USE_FMA__ y2 = _mm256_add_pd(y2, _mm256_FMADDSUB_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif tmp3 = _mm256_mul_pd(h2_imag, x3); #ifdef __ELPA_USE_FMA__ y3 = _mm256_add_pd(y3, _mm256_FMADDSUB_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #else y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #endif q1 = _mm256_load_pd(&q_dbl[0]); q2 = _mm256_load_pd(&q_dbl[4]); q3 = _mm256_load_pd(&q_dbl[8]); q1 = _mm256_add_pd(q1, y1); q2 = _mm256_add_pd(q2, y2); q3 = _mm256_add_pd(q3, y3); _mm256_store_pd(&q_dbl[0], q1); _mm256_store_pd(&q_dbl[4], q2); _mm256_store_pd(&q_dbl[8], q3); h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]); q2 = _mm256_load_pd(&q_dbl[(ldq*2)+4]); q3 = _mm256_load_pd(&q_dbl[(ldq*2)+8]); q1 = _mm256_add_pd(q1, x1); q2 = _mm256_add_pd(q2, x2); q3 = _mm256_add_pd(q3, x3); tmp1 = _mm256_mul_pd(h2_imag, y1); #ifdef __ELPA_USE_FMA__ q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else q1 = 
_mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h2_imag, y2); #ifdef __FMA4_ q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif tmp3 = _mm256_mul_pd(h2_imag, y3); #ifdef __ELPA_USE_FMA__ q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #else q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #endif _mm256_store_pd(&q_dbl[(ldq*2)+0], q1); _mm256_store_pd(&q_dbl[(ldq*2)+4], q2); _mm256_store_pd(&q_dbl[(ldq*2)+8], q3); for (i = 2; i < nb; i++) { q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]); h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); tmp1 = _mm256_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif tmp3 = _mm256_mul_pd(h1_imag, x3); #ifdef __ELPA_USE_FMA__ q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #else q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #endif h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); tmp1 = _mm256_mul_pd(h2_imag, y1); #ifdef __ELPA_USE_FMA__ 
q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h2_imag, y2); #ifdef __ELPA_USE_FMA__ q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif tmp3 = _mm256_mul_pd(h2_imag, y3); #ifdef __ELPA_USE_FMA__ q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #else q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #endif _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3); } h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]); tmp1 = _mm256_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif tmp3 = _mm256_mul_pd(h1_imag, x3); #ifdef __ELPA_USE_FMA__ q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #else q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5))); #endif _mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1); 
_mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2); _mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3); } static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; double* s_dbl = (double*)(&s); __m256d x1, x2; __m256d y1, y2; __m256d q1, q2; __m256d h1_real, h1_imag, h2_real, h2_imag; __m256d tmp1, tmp2; int i=0; __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]); x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]); h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h2_imag = _mm256_xor_pd(h2_imag, sign); #endif y1 = _mm256_load_pd(&q_dbl[0]); y2 = _mm256_load_pd(&q_dbl[4]); tmp1 = _mm256_mul_pd(h2_imag, x1); #ifdef __ELPA_USE_FMA__ y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h2_imag, x2); #ifdef __ELPA_USE_FMA__ y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif for (i = 2; i < nb; i++) { q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _mm256_xor_pd(h1_imag, sign); #endif tmp1 = _mm256_mul_pd(h1_imag, q1); #ifdef __ELPA_USE_FMA__ x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 
0x5))); #endif tmp2 = _mm256_mul_pd(h1_imag, q2); #ifdef __ELPA_USE_FMA__ x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h2_imag = _mm256_xor_pd(h2_imag, sign); #endif tmp1 = _mm256_mul_pd(h2_imag, q1); #ifdef __ELPA_USE_FMA__ y1 = _mm256_add_pd(y1, _mm256_FMSUBADD_pd(h2_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else y1 = _mm256_add_pd(y1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h2_imag, q2); #ifdef __ELPA_USE_FMA__ y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif } h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _mm256_xor_pd(h1_imag, sign); #endif q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); tmp1 = _mm256_mul_pd(h1_imag, q1); #ifdef __ELPA_USE_FMA__ x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h1_imag, q2); #ifdef __ELPA_USE_FMA__ x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif h1_real = _mm256_broadcast_sd(&hh_dbl[0]); h1_imag = _mm256_broadcast_sd(&hh_dbl[1]); h1_real = _mm256_xor_pd(h1_real, sign); h1_imag = 
_mm256_xor_pd(h1_imag, sign); tmp1 = _mm256_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); #else x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); #endif tmp2 = _mm256_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); #else x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); #endif h1_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); h1_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); h2_real = _mm256_broadcast_sd(&hh_dbl[ldh*2]); h2_imag = _mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]); h1_real = _mm256_xor_pd(h1_real, sign); h1_imag = _mm256_xor_pd(h1_imag, sign); h2_real = _mm256_xor_pd(h2_real, sign); h2_imag = _mm256_xor_pd(h2_imag, sign); __m128d tmp_s_128 = _mm_loadu_pd(s_dbl); tmp2 = _mm256_broadcast_pd(&tmp_s_128); tmp1 = _mm256_mul_pd(h2_imag, tmp2); #ifdef __ELPA_USE_FMA__ tmp2 = _mm256_FMADDSUB_pd(h2_real, tmp2, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); #else tmp2 = _mm256_addsub_pd( _mm256_mul_pd(h2_real, tmp2), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); #endif _mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2)); h2_real = _mm256_broadcast_sd(&s_dbl[0]); h2_imag = _mm256_broadcast_sd(&s_dbl[1]); tmp1 = _mm256_mul_pd(h1_imag, y1); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMADDSUB_pd(h1_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)); #else y1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)); #endif tmp2 = _mm256_mul_pd(h1_imag, y2); #ifdef __ELPA_USE_FMA__ y2 = _mm256_FMADDSUB_pd(h1_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)); #else y2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)); #endif tmp1 = _mm256_mul_pd(h2_imag, x1); #ifdef __ELPA_USE_FMA__ y1 = _mm256_add_pd(y1, _mm256_FMADDSUB_pd(h2_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else y1 = _mm256_add_pd(y1, _mm256_addsub_pd( 
_mm256_mul_pd(h2_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h2_imag, x2); #ifdef __ELPA_USE_FMA__ y2 = _mm256_add_pd(y2, _mm256_FMADDSUB_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif q1 = _mm256_load_pd(&q_dbl[0]); q2 = _mm256_load_pd(&q_dbl[4]); q1 = _mm256_add_pd(q1, y1); q2 = _mm256_add_pd(q2, y2); _mm256_store_pd(&q_dbl[0], q1); _mm256_store_pd(&q_dbl[4], q2); h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+1)*2]); h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+1)*2)+1]); q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]); q2 = _mm256_load_pd(&q_dbl[(ldq*2)+4]); q1 = _mm256_add_pd(q1, x1); q2 = _mm256_add_pd(q2, x2); tmp1 = _mm256_mul_pd(h2_imag, y1); #ifdef __ELPA_USE_FMA__ q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h2_imag, y2); #ifdef __FMA4_ q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif _mm256_store_pd(&q_dbl[(ldq*2)+0], q1); _mm256_store_pd(&q_dbl[(ldq*2)+4], q2); for (i = 2; i < nb; i++) { q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]); q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]); h1_real = _mm256_broadcast_sd(&hh_dbl[(i-1)*2]); h1_imag = _mm256_broadcast_sd(&hh_dbl[((i-1)*2)+1]); tmp1 = _mm256_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, 
_mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif h2_real = _mm256_broadcast_sd(&hh_dbl[(ldh+i)*2]); h2_imag = _mm256_broadcast_sd(&hh_dbl[((ldh+i)*2)+1]); tmp1 = _mm256_mul_pd(h2_imag, y1); #ifdef __ELPA_USE_FMA__ q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h2_real, y1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h2_imag, y2); #ifdef __ELPA_USE_FMA__ q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h2_real, y2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1); _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); } h1_real = _mm256_broadcast_sd(&hh_dbl[(nb-1)*2]); h1_imag = _mm256_broadcast_sd(&hh_dbl[((nb-1)*2)+1]); q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]); q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]); tmp1 = _mm256_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #else q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5))); #endif tmp2 = _mm256_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #else q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5))); #endif _mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1); _mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2); }

/*
 * Lane-wise complex product (re + i*im) * v.
 *
 * v holds two interleaved (real, imag) double pairs; re and im are the real
 * and imaginary part of one complex scalar, each broadcast to all four lanes.
 * In the non-FMA branch the result per pair is
 *   (re*v.real - im*v.imag, re*v.imag + im*v.real).
 * NOTE(review): _mm256_FMADDSUB_pd is a macro defined elsewhere in this file;
 * presumably it is the fused equivalent of the addsub form -- confirm.
 */
static __forceinline __m256d c2avx_cplx_mul(__m256d re, __m256d im, __m256d v)
{
	__m256d cross = _mm256_mul_pd(im, v);
#ifdef __ELPA_USE_FMA__
	return _mm256_FMADDSUB_pd(re, v, _mm256_shuffle_pd(cross, cross, 0x5));
#else
	return _mm256_addsub_pd(_mm256_mul_pd(re, v), _mm256_shuffle_pd(cross, cross, 0x5));
#endif
}

/*
 * Complex product c * v, where c points at one complex scalar stored as
 * (real, imag) in two consecutive doubles.
 */
static __forceinline __m256d c2avx_cplx_mul_at(const double* c, __m256d v)
{
	return c2avx_cplx_mul(_mm256_broadcast_sd(&c[0]), _mm256_broadcast_sd(&c[1]), v);
}

/*
 * Product with the conjugated scalar: conj(c) * v.
 *
 * Without FMA the imaginary part is negated by flipping its sign bit (sign
 * must have only the sign bit set in every lane) before the addsub product.
 * With FMA the un-negated imaginary part is fed to _mm256_FMSUBADD_pd.
 * NOTE(review): _mm256_FMSUBADD_pd is a macro defined elsewhere in this file;
 * presumably it yields the conjugated product -- confirm.
 */
static __forceinline __m256d c2avx_cplx_conj_mul_at(const double* c, __m256d v, __m256d sign)
{
	__m256d re = _mm256_broadcast_sd(&c[0]);
	__m256d im = _mm256_broadcast_sd(&c[1]);
	__m256d cross;
#ifdef __ELPA_USE_FMA__
	(void)sign;
	cross = _mm256_mul_pd(im, v);
	return _mm256_FMSUBADD_pd(re, v, _mm256_shuffle_pd(cross, cross, 0x5));
#else
	cross = _mm256_mul_pd(_mm256_xor_pd(im, sign), v);
	return _mm256_addsub_pd(_mm256_mul_pd(re, v), _mm256_shuffle_pd(cross, cross, 0x5));
#endif
}

/*
 * Applies two complex Householder vectors to two complex entries (one
 * __m256d) of each of the nb+1 strided slices q[0], q[ldq], ..., q[nb*ldq].
 *
 *   q    complex matrix data, accessed with aligned 256-bit loads and stores,
 *        so every accessed address must be 32-byte aligned
 *   hh   Householder data; vector 1 starts at hh[0], vector 2 at hh[ldh];
 *        element 0 of each is used as that vector's scaling factor
 *   nb   length of the Householder vectors
 *   ldq  stride (in complex elements) between consecutive slices of q
 *   ldh  stride (in complex elements) between the two vectors in hh
 *   s    complex scalar supplied by the caller; modified only locally
 *        NOTE(review): presumably the scalar product of the two Householder
 *        vectors -- confirm against the caller.
 */
static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
{
	double* q_dbl = (double*)q;
	double* hh_dbl = (double*)hh;
	double* s_dbl = (double*)(&s);

	/* Mask with only the sign bit set in each lane: XOR negates a double. */
	const __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);

	__m256d x1, y1, q1;
	__m256d neg_re, neg_im;
	__m256d s_vec;
	__m128d s_pair;
	int i;

	/* ---- accumulate x1 (vector 1) and y1 (vector 2) over the slices of q ---- */
	x1 = _mm256_load_pd(&q_dbl[(2*ldq)+0]);
	y1 = _mm256_load_pd(&q_dbl[0]);
	y1 = _mm256_add_pd(y1, c2avx_cplx_conj_mul_at(&hh_dbl[(ldh+1)*2], x1, sign));

	for (i = 2; i < nb; i++)
	{
		q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
		x1 = _mm256_add_pd(x1, c2avx_cplx_conj_mul_at(&hh_dbl[(i-1)*2], q1, sign));
		y1 = _mm256_add_pd(y1, c2avx_cplx_conj_mul_at(&hh_dbl[(ldh+i)*2], q1, sign));
	}

	/* The last slice only contributes to x1. */
	q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]);
	x1 = _mm256_add_pd(x1, c2avx_cplx_conj_mul_at(&hh_dbl[(nb-1)*2], q1, sign));

	/* ---- scale: x1 = (-hh[0]) * x1 ---- */
	neg_re = _mm256_xor_pd(_mm256_broadcast_sd(&hh_dbl[0]), sign);
	neg_im = _mm256_xor_pd(_mm256_broadcast_sd(&hh_dbl[1]), sign);
	x1 = c2avx_cplx_mul(neg_re, neg_im, x1);

	/* ---- s = (-hh[ldh]) * s, written back to the local copy of s ---- */
	neg_re = _mm256_xor_pd(_mm256_broadcast_sd(&hh_dbl[ldh*2]), sign);
	neg_im = _mm256_xor_pd(_mm256_broadcast_sd(&hh_dbl[(ldh*2)+1]), sign);

	s_pair = _mm_loadu_pd(s_dbl);
	s_vec = _mm256_broadcast_pd(&s_pair);
	s_vec = c2avx_cplx_mul(neg_re, neg_im, s_vec);
	_mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(s_vec));

	/* ---- y1 = (-hh[ldh]) * y1 + s * x1 ---- */
	y1 = c2avx_cplx_mul(neg_re, neg_im, y1);
	y1 = _mm256_add_pd(y1, c2avx_cplx_mul_at(&s_dbl[0], x1));

	/* ---- add the two corrections back into q ---- */
	q1 = _mm256_load_pd(&q_dbl[0]);
	q1 = _mm256_add_pd(q1, y1);
	_mm256_store_pd(&q_dbl[0], q1);

	q1 = _mm256_load_pd(&q_dbl[(ldq*2)+0]);
	q1 = _mm256_add_pd(q1, x1);
	q1 = _mm256_add_pd(q1, c2avx_cplx_mul_at(&hh_dbl[(ldh+1)*2], y1));
	_mm256_store_pd(&q_dbl[(ldq*2)+0], q1);

	for (i = 2; i < nb; i++)
	{
		q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
		q1 = _mm256_add_pd(q1, c2avx_cplx_mul_at(&hh_dbl[(i-1)*2], x1));
		q1 = _mm256_add_pd(q1, c2avx_cplx_mul_at(&hh_dbl[(ldh+i)*2], y1));
		_mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1);
	}

	/* The last slice only receives the vector-1 correction. */
	q1 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+0]);
	q1 = _mm256_add_pd(q1, c2avx_cplx_mul_at(&hh_dbl[(nb-1)*2], x1));
	_mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1);
}
elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c0000644000312500001440000011714112717516040020701 00000000000000// 
This file is part of ELPA. // // The ELPA library was originally created by the ELPA consortium, // consisting of the following organizations: // // - Max Planck Computing and Data Facility (MPCDF), formerly known as // Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), // - Bergische Universität Wuppertal, Lehrstuhl für angewandte // Informatik, // - Technische Universität München, Lehrstuhl für Informatik mit // Schwerpunkt Wissenschaftliches Rechnen, // - Fritz-Haber-Institut, Berlin, Abt. Theorie, // - Max-Planck-Institut für Mathematik in den Naturwissenschaften, // Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, // and // - IBM Deutschland GmbH // // This particular source code file contains additions, changes and // enhancements authored by Intel Corporation which is not part of // the ELPA consortium. // // More information can be found here: // http://elpa.mpcdf.mpg.de/ // // ELPA is free software: you can redistribute it and/or modify // it under the terms of the version 3 of the license of the // GNU Lesser General Public License as published by the Free // Software Foundation. // // ELPA is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with ELPA. If not, see <http://www.gnu.org/licenses/> // // ELPA reflects a substantial effort on the part of the original // ELPA consortium, and we ask you to respect the spirit of the // license that we chose: i.e., please contribute any changes you // may have back to the original ELPA library distribution, and keep // any derivatives of ELPA under the same license that we chose for // the original distribution, the GNU Lesser General Public License. 
// // // -------------------------------------------------------------------------------------------------- // // This file contains the compute intensive kernels for the Householder transformations. // It should be compiled with the highest possible optimization level. // // On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 // On Intel Sandy Bridge use -O3 -mavx // // Copyright of the original code rests with the authors inside the ELPA // consortium. The copyright of any additional modifications shall rest // with their original authors, but shall adhere to the licensing terms // distributed along with the original code in the file "COPYING". // // Author: Alexander Heinecke (alexander.heinecke@mytum.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // -------------------------------------------------------------------------------------------------- #include "config-f90.h" #include #define __forceinline __attribute__((always_inline)) static #ifdef HAVE_SSE_INTRINSICS #undef __AVX__ #endif //Forward declaration static void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); static void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); /* !f>#ifdef HAVE_SSE_INTRINSICS !f> interface !f> subroutine hexa_hh_trafo_real_sse_6hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="hexa_hh_trafo_real_sse_6hv") !f> use, intrinsic :: iso_c_binding !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh !f> real(kind=c_double) :: q(*) !f> real(kind=c_double) :: hh(pnb,6) !f> end subroutine !f> end interface !f>#endif */ void hexa_hh_trafo_real_sse_6hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); void hexa_hh_trafo_real_sse_6hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; int nq = *pldq; int ldq = *pldq; int ldh = *pldh; // calculating scalar products to compute // 6 
householder vectors simultaneously double scalarprods[15]; // scalarprods[0] = s_1_2; // scalarprods[1] = s_1_3; // scalarprods[2] = s_2_3; // scalarprods[3] = s_1_4; // scalarprods[4] = s_2_4; // scalarprods[5] = s_3_4; // scalarprods[6] = s_1_5; // scalarprods[7] = s_2_5; // scalarprods[8] = s_3_5; // scalarprods[9] = s_4_5; // scalarprods[10] = s_1_6; // scalarprods[11] = s_2_6; // scalarprods[12] = s_3_6; // scalarprods[13] = s_4_6; // scalarprods[14] = s_5_6; scalarprods[0] = hh[(ldh+1)]; scalarprods[1] = hh[(ldh*2)+2]; scalarprods[2] = hh[(ldh*2)+1]; scalarprods[3] = hh[(ldh*3)+3]; scalarprods[4] = hh[(ldh*3)+2]; scalarprods[5] = hh[(ldh*3)+1]; scalarprods[6] = hh[(ldh*4)+4]; scalarprods[7] = hh[(ldh*4)+3]; scalarprods[8] = hh[(ldh*4)+2]; scalarprods[9] = hh[(ldh*4)+1]; scalarprods[10] = hh[(ldh*5)+5]; scalarprods[11] = hh[(ldh*5)+4]; scalarprods[12] = hh[(ldh*5)+3]; scalarprods[13] = hh[(ldh*5)+2]; scalarprods[14] = hh[(ldh*5)+1]; // calculate scalar product of first and fourth householder vector // loop counter = 2 scalarprods[0] += hh[1] * hh[(2+ldh)]; scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)]; // loop counter = 3 scalarprods[0] += hh[2] * hh[(3+ldh)]; scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)]; scalarprods[1] += hh[1] * hh[3+(ldh*2)]; scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)]; scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)]; scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)]; // loop counter = 4 scalarprods[0] += hh[3] * hh[(4+ldh)]; scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; scalarprods[1] += hh[2] * 
hh[4+(ldh*2)]; scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)]; scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)]; scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; scalarprods[3] += hh[1] * hh[4+(ldh*3)]; scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)]; scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; // loop counter = 5 scalarprods[0] += hh[4] * hh[(5+ldh)]; scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)]; scalarprods[1] += hh[3] * hh[5+(ldh*2)]; scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)]; scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)]; scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; scalarprods[3] += hh[2] * hh[5+(ldh*3)]; scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)]; scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; scalarprods[6] += hh[1] * hh[5+(ldh*4)]; scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)]; #pragma ivdep for (i = 6; i < nb; i++) { scalarprods[0] += hh[i-1] * hh[(i+ldh)]; scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)]; scalarprods[1] += hh[i-2] * hh[i+(ldh*2)]; scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)]; scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; scalarprods[3] += hh[i-3] * hh[i+(ldh*3)]; scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)]; scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; scalarprods[6] += hh[i-4] * hh[i+(ldh*4)]; scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)]; scalarprods[10] += hh[i-5] * hh[i+(ldh*5)]; } // printf("s_1_2: %f\n", scalarprods[0]); // printf("s_1_3: %f\n", scalarprods[1]); // printf("s_2_3: %f\n", scalarprods[2]); // printf("s_1_4: %f\n", scalarprods[3]); // printf("s_2_4: %f\n", scalarprods[4]); // printf("s_3_4: %f\n", scalarprods[5]); // 
printf("s_1_5: %f\n", scalarprods[6]); // printf("s_2_5: %f\n", scalarprods[7]); // printf("s_3_5: %f\n", scalarprods[8]); // printf("s_4_5: %f\n", scalarprods[9]); // printf("s_1_6: %f\n", scalarprods[10]); // printf("s_2_6: %f\n", scalarprods[11]); // printf("s_3_6: %f\n", scalarprods[12]); // printf("s_4_6: %f\n", scalarprods[13]); // printf("s_5_6: %f\n", scalarprods[14]); // Production level kernel calls with padding for (i = 0; i < nq-2; i+=4) { hh_trafo_kernel_4_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); } if (nq == i) { return; } else { hh_trafo_kernel_2_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); } } #if 0 void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; int nq = *pldq; int ldq = *pldq; int ldh = *pldh; // calculating scalar products to compute // 6 householder vectors simultaneously double scalarprods[15]; // scalarprods[0] = s_1_2; // scalarprods[1] = s_1_3; // scalarprods[2] = s_2_3; // scalarprods[3] = s_1_4; // scalarprods[4] = s_2_4; // scalarprods[5] = s_3_4; // scalarprods[6] = s_1_5; // scalarprods[7] = s_2_5; // scalarprods[8] = s_3_5; // scalarprods[9] = s_4_5; // scalarprods[10] = s_1_6; // scalarprods[11] = s_2_6; // scalarprods[12] = s_3_6; // scalarprods[13] = s_4_6; // scalarprods[14] = s_5_6; scalarprods[0] = hh[(ldh+1)]; scalarprods[1] = hh[(ldh*2)+2]; scalarprods[2] = hh[(ldh*2)+1]; scalarprods[3] = hh[(ldh*3)+3]; scalarprods[4] = hh[(ldh*3)+2]; scalarprods[5] = hh[(ldh*3)+1]; scalarprods[6] = hh[(ldh*4)+4]; scalarprods[7] = hh[(ldh*4)+3]; scalarprods[8] = hh[(ldh*4)+2]; scalarprods[9] = hh[(ldh*4)+1]; scalarprods[10] = hh[(ldh*5)+5]; scalarprods[11] = hh[(ldh*5)+4]; scalarprods[12] = hh[(ldh*5)+3]; scalarprods[13] = hh[(ldh*5)+2]; scalarprods[14] = hh[(ldh*5)+1]; // calculate scalar product of first and fourth householder vector // loop counter = 2 scalarprods[0] += hh[1] * hh[(2+ldh)]; scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+1] * 
hh[2+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)]; // loop counter = 3 scalarprods[0] += hh[2] * hh[(3+ldh)]; scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)]; scalarprods[1] += hh[1] * hh[3+(ldh*2)]; scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)]; scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)]; scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)]; // loop counter = 4 scalarprods[0] += hh[3] * hh[(4+ldh)]; scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; scalarprods[1] += hh[2] * hh[4+(ldh*2)]; scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)]; scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)]; scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; scalarprods[3] += hh[1] * hh[4+(ldh*3)]; scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)]; scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; // loop counter = 5 scalarprods[0] += hh[4] * hh[(5+ldh)]; scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)]; scalarprods[1] += hh[3] * hh[5+(ldh*2)]; scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)]; scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)]; scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; scalarprods[3] += hh[2] * hh[5+(ldh*3)]; scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)]; scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; scalarprods[6] += hh[1] * hh[5+(ldh*4)]; scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)]; #pragma ivdep for (i = 6; i < nb; i++) { scalarprods[0] += hh[i-1] * hh[(i+ldh)]; scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; 
scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)]; scalarprods[1] += hh[i-2] * hh[i+(ldh*2)]; scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)]; scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; scalarprods[3] += hh[i-3] * hh[i+(ldh*3)]; scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)]; scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; scalarprods[6] += hh[i-4] * hh[i+(ldh*4)]; scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)]; scalarprods[10] += hh[i-5] * hh[i+(ldh*5)]; } // printf("s_1_2: %f\n", scalarprods[0]); // printf("s_1_3: %f\n", scalarprods[1]); // printf("s_2_3: %f\n", scalarprods[2]); // printf("s_1_4: %f\n", scalarprods[3]); // printf("s_2_4: %f\n", scalarprods[4]); // printf("s_3_4: %f\n", scalarprods[5]); // printf("s_1_5: %f\n", scalarprods[6]); // printf("s_2_5: %f\n", scalarprods[7]); // printf("s_3_5: %f\n", scalarprods[8]); // printf("s_4_5: %f\n", scalarprods[9]); // printf("s_1_6: %f\n", scalarprods[10]); // printf("s_2_6: %f\n", scalarprods[11]); // printf("s_3_6: %f\n", scalarprods[12]); // printf("s_4_6: %f\n", scalarprods[13]); // printf("s_5_6: %f\n", scalarprods[14]); // Production level kernel calls with padding #ifdef __AVX__ for (i = 0; i < nq; i+=8) { hh_trafo_kernel_8_AVX_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); } #else for (i = 0; i < nq; i+=4) { hh_trafo_kernel_4_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); } #endif } #endif /** * Unrolled kernel that computes * 4 rows of Q simultaneously, a * matrix vector product with two householder * vectors + a rank 1 update is performed */ __forceinline void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [4 x nb+3] * hh // hh contains four householder vectors ///////////////////////////////////////////////////// int i; __m128d a1_1 = 
_mm_load_pd(&q[ldq*5]); __m128d a2_1 = _mm_load_pd(&q[ldq*4]); __m128d a3_1 = _mm_load_pd(&q[ldq*3]); __m128d a4_1 = _mm_load_pd(&q[ldq*2]); __m128d a5_1 = _mm_load_pd(&q[ldq]); __m128d a6_1 = _mm_load_pd(&q[0]); __m128d h_6_5 = _mm_loaddup_pd(&hh[(ldh*5)+1]); __m128d h_6_4 = _mm_loaddup_pd(&hh[(ldh*5)+2]); __m128d h_6_3 = _mm_loaddup_pd(&hh[(ldh*5)+3]); __m128d h_6_2 = _mm_loaddup_pd(&hh[(ldh*5)+4]); __m128d h_6_1 = _mm_loaddup_pd(&hh[(ldh*5)+5]); register __m128d t1 = _mm_add_pd(a6_1, _mm_mul_pd(a5_1, h_6_5)); t1 = _mm_add_pd(t1, _mm_mul_pd(a4_1, h_6_4)); t1 = _mm_add_pd(t1, _mm_mul_pd(a3_1, h_6_3)); t1 = _mm_add_pd(t1, _mm_mul_pd(a2_1, h_6_2)); t1 = _mm_add_pd(t1, _mm_mul_pd(a1_1, h_6_1)); __m128d h_5_4 = _mm_loaddup_pd(&hh[(ldh*4)+1]); __m128d h_5_3 = _mm_loaddup_pd(&hh[(ldh*4)+2]); __m128d h_5_2 = _mm_loaddup_pd(&hh[(ldh*4)+3]); __m128d h_5_1 = _mm_loaddup_pd(&hh[(ldh*4)+4]); register __m128d v1 = _mm_add_pd(a5_1, _mm_mul_pd(a4_1, h_5_4)); v1 = _mm_add_pd(v1, _mm_mul_pd(a3_1, h_5_3)); v1 = _mm_add_pd(v1, _mm_mul_pd(a2_1, h_5_2)); v1 = _mm_add_pd(v1, _mm_mul_pd(a1_1, h_5_1)); __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); register __m128d x1 = a1_1; __m128d a1_2 = _mm_load_pd(&q[(ldq*5)+2]); __m128d a2_2 = _mm_load_pd(&q[(ldq*4)+2]); __m128d a3_2 = _mm_load_pd(&q[(ldq*3)+2]); __m128d a4_2 = _mm_load_pd(&q[(ldq*2)+2]); __m128d a5_2 = _mm_load_pd(&q[(ldq)+2]); __m128d a6_2 = _mm_load_pd(&q[2]); register 
__m128d t2 = _mm_add_pd(a6_2, _mm_mul_pd(a5_2, h_6_5)); t2 = _mm_add_pd(t2, _mm_mul_pd(a4_2, h_6_4)); t2 = _mm_add_pd(t2, _mm_mul_pd(a3_2, h_6_3)); t2 = _mm_add_pd(t2, _mm_mul_pd(a2_2, h_6_2)); t2 = _mm_add_pd(t2, _mm_mul_pd(a1_2, h_6_1)); register __m128d v2 = _mm_add_pd(a5_2, _mm_mul_pd(a4_2, h_5_4)); v2 = _mm_add_pd(v2, _mm_mul_pd(a3_2, h_5_3)); v2 = _mm_add_pd(v2, _mm_mul_pd(a2_2, h_5_2)); v2 = _mm_add_pd(v2, _mm_mul_pd(a1_2, h_5_1)); register __m128d w2 = _mm_add_pd(a4_2, _mm_mul_pd(a3_2, h_4_3)); w2 = _mm_add_pd(w2, _mm_mul_pd(a2_2, h_4_2)); w2 = _mm_add_pd(w2, _mm_mul_pd(a1_2, h_4_1)); register __m128d z2 = _mm_add_pd(a3_2, _mm_mul_pd(a2_2, h_3_2)); z2 = _mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1)); register __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1)); register __m128d x2 = a1_2; __m128d q1; __m128d q2; __m128d h1; __m128d h2; __m128d h3; __m128d h4; __m128d h5; __m128d h6; for(i = 6; i < nb; i++) { h1 = _mm_loaddup_pd(&hh[i-5]); q1 = _mm_load_pd(&q[i*ldq]); q2 = _mm_load_pd(&q[(i*ldq)+2]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); h2 = _mm_loaddup_pd(&hh[ldh+i-4]); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); v2 = _mm_add_pd(v2, _mm_mul_pd(q2,h5)); h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); t1 = _mm_add_pd(t1, _mm_mul_pd(q1,h6)); t2 = _mm_add_pd(t2, _mm_mul_pd(q2,h6)); } h1 = _mm_loaddup_pd(&hh[nb-5]); q1 = _mm_load_pd(&q[nb*ldq]); q2 = _mm_load_pd(&q[(nb*ldq)+2]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); h3 = 
_mm_loaddup_pd(&hh[(ldh*2)+nb-3]); z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5)); v2 = _mm_add_pd(v2, _mm_mul_pd(q2,h5)); h1 = _mm_loaddup_pd(&hh[nb-4]); q1 = _mm_load_pd(&q[(nb+1)*ldq]); q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); h1 = _mm_loaddup_pd(&hh[nb-3]); q1 = _mm_load_pd(&q[(nb+2)*ldq]); q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); h1 = _mm_loaddup_pd(&hh[nb-2]); q1 = _mm_load_pd(&q[(nb+3)*ldq]); q2 = _mm_load_pd(&q[((nb+3)*ldq)+2]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[(nb+4)*ldq]); q2 = _mm_load_pd(&q[((nb+4)*ldq)+2]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); ///////////////////////////////////////////////////// // Apply tau, correct wrong calculation using pre-calculated scalar products ///////////////////////////////////////////////////// __m128d tau1 = 
_mm_loaddup_pd(&hh[0]); x1 = _mm_mul_pd(x1, tau1); x2 = _mm_mul_pd(x2, tau1); __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); __m128d vs_1_2 = _mm_loaddup_pd(&scalarprods[0]); h2 = _mm_mul_pd(tau2, vs_1_2); y1 = _mm_sub_pd(_mm_mul_pd(y1,tau2), _mm_mul_pd(x1,h2)); y2 = _mm_sub_pd(_mm_mul_pd(y2,tau2), _mm_mul_pd(x2,h2)); __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); __m128d vs_1_3 = _mm_loaddup_pd(&scalarprods[1]); __m128d vs_2_3 = _mm_loaddup_pd(&scalarprods[2]); h2 = _mm_mul_pd(tau3, vs_1_3); h3 = _mm_mul_pd(tau3, vs_2_3); z1 = _mm_sub_pd(_mm_mul_pd(z1,tau3), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); z2 = _mm_sub_pd(_mm_mul_pd(z2,tau3), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))); __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); __m128d vs_1_4 = _mm_loaddup_pd(&scalarprods[3]); __m128d vs_2_4 = _mm_loaddup_pd(&scalarprods[4]); h2 = _mm_mul_pd(tau4, vs_1_4); h3 = _mm_mul_pd(tau4, vs_2_4); __m128d vs_3_4 = _mm_loaddup_pd(&scalarprods[5]); h4 = _mm_mul_pd(tau4, vs_3_4); w1 = _mm_sub_pd(_mm_mul_pd(w1,tau4), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); w2 = _mm_sub_pd(_mm_mul_pd(w2,tau4), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); __m128d tau5 = _mm_loaddup_pd(&hh[ldh*4]); __m128d vs_1_5 = _mm_loaddup_pd(&scalarprods[6]); __m128d vs_2_5 = _mm_loaddup_pd(&scalarprods[7]); h2 = _mm_mul_pd(tau5, vs_1_5); h3 = _mm_mul_pd(tau5, vs_2_5); __m128d vs_3_5 = _mm_loaddup_pd(&scalarprods[8]); __m128d vs_4_5 = _mm_loaddup_pd(&scalarprods[9]); h4 = _mm_mul_pd(tau5, vs_3_5); h5 = _mm_mul_pd(tau5, vs_4_5); v1 = _mm_sub_pd(_mm_mul_pd(v1,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); v2 = _mm_sub_pd(_mm_mul_pd(v2,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2,h5), _mm_mul_pd(z2,h4)), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); __m128d tau6 = _mm_loaddup_pd(&hh[ldh*5]); __m128d vs_1_6 = _mm_loaddup_pd(&scalarprods[10]); __m128d vs_2_6 = 
_mm_loaddup_pd(&scalarprods[11]); h2 = _mm_mul_pd(tau6, vs_1_6); h3 = _mm_mul_pd(tau6, vs_2_6); __m128d vs_3_6 = _mm_loaddup_pd(&scalarprods[12]); __m128d vs_4_6 = _mm_loaddup_pd(&scalarprods[13]); __m128d vs_5_6 = _mm_loaddup_pd(&scalarprods[14]); h4 = _mm_mul_pd(tau6, vs_3_6); h5 = _mm_mul_pd(tau6, vs_4_6); h6 = _mm_mul_pd(tau6, vs_5_6); t1 = _mm_sub_pd(_mm_mul_pd(t1,tau6), _mm_add_pd( _mm_mul_pd(v1,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))))); t2 = _mm_sub_pd(_mm_mul_pd(t2,tau6), _mm_add_pd( _mm_mul_pd(v2,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2,h5), _mm_mul_pd(z2,h4)), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))))); ///////////////////////////////////////////////////// // Rank-1 update of Q [4 x nb+3] ///////////////////////////////////////////////////// q1 = _mm_load_pd(&q[0]); q2 = _mm_load_pd(&q[2]); q1 = _mm_sub_pd(q1, t1); q2 = _mm_sub_pd(q2, t2); _mm_store_pd(&q[0],q1); _mm_store_pd(&q[2],q2); h6 = _mm_loaddup_pd(&hh[(ldh*5)+1]); q1 = _mm_load_pd(&q[ldq]); q2 = _mm_load_pd(&q[(ldq+2)]); q1 = _mm_sub_pd(q1, v1); q2 = _mm_sub_pd(q2, v2); q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); _mm_store_pd(&q[ldq],q1); _mm_store_pd(&q[(ldq+2)],q2); h5 = _mm_loaddup_pd(&hh[(ldh*4)+1]); q1 = _mm_load_pd(&q[ldq*2]); q2 = _mm_load_pd(&q[(ldq*2)+2]); q1 = _mm_sub_pd(q1, w1); q2 = _mm_sub_pd(q2, w2); q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); h6 = _mm_loaddup_pd(&hh[(ldh*5)+2]); q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); _mm_store_pd(&q[ldq*2],q1); _mm_store_pd(&q[(ldq*2)+2],q2); h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); q1 = _mm_load_pd(&q[ldq*3]); q2 = _mm_load_pd(&q[(ldq*3)+2]); q1 = _mm_sub_pd(q1, z1); q2 = _mm_sub_pd(q2, z2); q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); h5 = _mm_loaddup_pd(&hh[(ldh*4)+2]); q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); q2 
= _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); h6 = _mm_loaddup_pd(&hh[(ldh*5)+3]); q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); _mm_store_pd(&q[ldq*3],q1); _mm_store_pd(&q[(ldq*3)+2],q2); h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); q1 = _mm_load_pd(&q[ldq*4]); q2 = _mm_load_pd(&q[(ldq*4)+2]); q1 = _mm_sub_pd(q1, y1); q2 = _mm_sub_pd(q2, y2); q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); h5 = _mm_loaddup_pd(&hh[(ldh*4)+3]); q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); h6 = _mm_loaddup_pd(&hh[(ldh*5)+4]); q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); _mm_store_pd(&q[ldq*4],q1); _mm_store_pd(&q[(ldq*4)+2],q2); h2 = _mm_loaddup_pd(&hh[(ldh)+1]); q1 = _mm_load_pd(&q[ldq*5]); q2 = _mm_load_pd(&q[(ldq*5)+2]); q1 = _mm_sub_pd(q1, x1); q2 = _mm_sub_pd(q2, x2); q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); h5 = _mm_loaddup_pd(&hh[(ldh*4)+4]); q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); h6 = _mm_loaddup_pd(&hh[(ldh*5)+5]); q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); _mm_store_pd(&q[ldq*5],q1); _mm_store_pd(&q[(ldq*5)+2],q2); for (i = 6; i < nb; i++) { q1 = _mm_load_pd(&q[i*ldq]); q2 = _mm_load_pd(&q[(i*ldq)+2]); h1 = _mm_loaddup_pd(&hh[i-5]); q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); h2 = _mm_loaddup_pd(&hh[ldh+i-4]); q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]); q1 = 
_mm_sub_pd(q1, _mm_mul_pd(z1, h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]); q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]); q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]); q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6)); q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6)); _mm_store_pd(&q[i*ldq],q1); _mm_store_pd(&q[(i*ldq)+2],q2); } h1 = _mm_loaddup_pd(&hh[nb-5]); q1 = _mm_load_pd(&q[nb*ldq]); q2 = _mm_load_pd(&q[(nb*ldq)+2]); q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); h2 = _mm_loaddup_pd(&hh[ldh+nb-4]); q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]); q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]); q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]); q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5)); q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5)); _mm_store_pd(&q[nb*ldq],q1); _mm_store_pd(&q[(nb*ldq)+2],q2); h1 = _mm_loaddup_pd(&hh[nb-4]); q1 = _mm_load_pd(&q[(nb+1)*ldq]); q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); h2 = _mm_loaddup_pd(&hh[ldh+nb-3]); q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]); q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]); q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); _mm_store_pd(&q[(nb+1)*ldq],q1); _mm_store_pd(&q[((nb+1)*ldq)+2],q2); h1 = _mm_loaddup_pd(&hh[nb-3]); q1 = _mm_load_pd(&q[(nb+2)*ldq]); q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = 
_mm_sub_pd(q2, _mm_mul_pd(x2, h1)); h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); _mm_store_pd(&q[(nb+2)*ldq],q1); _mm_store_pd(&q[((nb+2)*ldq)+2],q2); h1 = _mm_loaddup_pd(&hh[nb-2]); q1 = _mm_load_pd(&q[(nb+3)*ldq]); q2 = _mm_load_pd(&q[((nb+3)*ldq)+2]); q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); _mm_store_pd(&q[(nb+3)*ldq],q1); _mm_store_pd(&q[((nb+3)*ldq)+2],q2); h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[(nb+4)*ldq]); q2 = _mm_load_pd(&q[((nb+4)*ldq)+2]); q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); _mm_store_pd(&q[(nb+4)*ldq],q1); _mm_store_pd(&q[((nb+4)*ldq)+2],q2); }

/**
 * Unrolled kernel that updates 2 rows of Q (one __m128d per column) with
 * six Householder vectors at once: a matrix-vector product of Q with the
 * six vectors, a scaling/correction step, and an update of Q.
 *
 * q           - first of the 2 rows; column c is read/written at q[c*ldq],
 *               for c = 0 .. nb+4. Accessed with _mm_load_pd/_mm_store_pd,
 *               so every &q[c*ldq] must be 16-byte aligned.
 * hh          - six columns of stride ldh. hh[k*ldh] is used as the scaling
 *               factor (tau) of vector k+1; hh[k*ldh + j], j >= 1, are its
 *               remaining entries. Vector k+1 is applied shifted by 5-k
 *               columns of Q, with an implicit leading coefficient of 1.
 * nb          - length of each Householder vector; the loops start at 6, so
 *               the unrolled head presumably assumes nb >= 6 - TODO confirm.
 * ldq         - stride (in doubles) between consecutive columns of q.
 * ldh         - stride (in doubles) between consecutive columns of hh.
 * scalarprods - 15 pre-computed values, read as scalarprods[0..14].
 */
__forceinline void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
{
    /////////////////////////////////////////////////////
    // Matrix Vector Multiplication, Q [2 x nb+5] * hh
    // hh contains six householder vectors
    /////////////////////////////////////////////////////
    int i;

    // Columns 5..0 of Q; aK_1 is the column that meets the leading
    // (implicit 1) entry of Householder vector K.
    __m128d a1_1 = _mm_load_pd(&q[ldq*5]);
    __m128d a2_1 = _mm_load_pd(&q[ldq*4]);
    __m128d a3_1 = _mm_load_pd(&q[ldq*3]);
    __m128d a4_1 = _mm_load_pd(&q[ldq*2]);
    __m128d a5_1 = _mm_load_pd(&q[ldq]);
    __m128d a6_1 = _mm_load_pd(&q[0]);

    // t1: accumulator for vector 6 (column 5 of hh), columns 0..5 of Q.
    __m128d h_6_5 = _mm_loaddup_pd(&hh[(ldh*5)+1]);
    __m128d h_6_4 = _mm_loaddup_pd(&hh[(ldh*5)+2]);
    __m128d h_6_3 = _mm_loaddup_pd(&hh[(ldh*5)+3]);
    __m128d h_6_2 = _mm_loaddup_pd(&hh[(ldh*5)+4]);
    __m128d h_6_1 = _mm_loaddup_pd(&hh[(ldh*5)+5]);

    register __m128d t1 = _mm_add_pd(a6_1, _mm_mul_pd(a5_1, h_6_5));
    t1 = _mm_add_pd(t1, _mm_mul_pd(a4_1, h_6_4));
    t1 = _mm_add_pd(t1, _mm_mul_pd(a3_1, h_6_3));
    t1 = _mm_add_pd(t1, _mm_mul_pd(a2_1, h_6_2));
    t1 = _mm_add_pd(t1, _mm_mul_pd(a1_1, h_6_1));

    // v1: accumulator for vector 5, columns 1..5 of Q.
    __m128d h_5_4 = _mm_loaddup_pd(&hh[(ldh*4)+1]);
    __m128d h_5_3 = _mm_loaddup_pd(&hh[(ldh*4)+2]);
    __m128d h_5_2 = _mm_loaddup_pd(&hh[(ldh*4)+3]);
    __m128d h_5_1 = _mm_loaddup_pd(&hh[(ldh*4)+4]);

    register __m128d v1 = _mm_add_pd(a5_1, _mm_mul_pd(a4_1, h_5_4));
    v1 = _mm_add_pd(v1, _mm_mul_pd(a3_1, h_5_3));
    v1 = _mm_add_pd(v1, _mm_mul_pd(a2_1, h_5_2));
    v1 = _mm_add_pd(v1, _mm_mul_pd(a1_1, h_5_1));

    // w1: accumulator for vector 4, columns 2..5 of Q.
    __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]);
    __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]);
    __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]);

    register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3));
    w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2));
    w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1));

    // z1, y1, x1: accumulators for vectors 3, 2 and 1.
    __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]);
    __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]);
    __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]);

    register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2));
    z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1));
    register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1));

    register __m128d x1 = a1_1;

    __m128d q1;

    __m128d h1;
    __m128d h2;
    __m128d h3;
    __m128d h4;
    __m128d h5;
    __m128d h6;

    // Columns 6 .. nb-1: every vector contributes to its accumulator.
    for(i = 6; i < nb; i++)
    {
        h1 = _mm_loaddup_pd(&hh[i-5]);
        q1 = _mm_load_pd(&q[i*ldq]);

        x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));

        h2 = _mm_loaddup_pd(&hh[ldh+i-4]);

        y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));

        h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]);

        z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3));

        h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]);

        w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4));

        h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]);

        v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5));

        h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]);

        t1 = _mm_add_pd(t1, _mm_mul_pd(q1,h6));
    }

    // Columns nb .. nb+4: one vector fewer takes part in each step, because
    // the shifted vectors run out of entries one after another.
    h1 = _mm_loaddup_pd(&hh[nb-5]);
    q1 = _mm_load_pd(&q[nb*ldq]);

    x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));

    h2 = _mm_loaddup_pd(&hh[ldh+nb-4]);

    y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));

    h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]);

    z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3));

    h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]);

    w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4));

    h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]);

    v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5));

    h1 = _mm_loaddup_pd(&hh[nb-4]);
    q1 = _mm_load_pd(&q[(nb+1)*ldq]);

    x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));

    h2 = _mm_loaddup_pd(&hh[ldh+nb-3]);

    y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));

    h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]);

    z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3));

    h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]);

    w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4));

    h1 = _mm_loaddup_pd(&hh[nb-3]);
    q1 = _mm_load_pd(&q[(nb+2)*ldq]);

    x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));

    h2 = _mm_loaddup_pd(&hh[ldh+nb-2]);

    y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));

    h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]);

    z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3));

    h1 = _mm_loaddup_pd(&hh[nb-2]);
    q1 = _mm_load_pd(&q[(nb+3)*ldq]);

    x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));

    h2 = _mm_loaddup_pd(&hh[ldh+nb-1]);

    y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));

    h1 = _mm_loaddup_pd(&hh[nb-1]);
    q1 = _mm_load_pd(&q[(nb+4)*ldq]);

    x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));

    /////////////////////////////////////////////////////
    // Apply tau: scale each accumulator by its tau and subtract the
    // contributions of the already finished accumulators, weighted by
    // tau times the pre-calculated scalar products
    /////////////////////////////////////////////////////

    __m128d tau1 = _mm_loaddup_pd(&hh[0]);
    x1 = _mm_mul_pd(x1, tau1);

    __m128d tau2 = _mm_loaddup_pd(&hh[ldh]);
    __m128d vs_1_2 = _mm_loaddup_pd(&scalarprods[0]);
    h2 = _mm_mul_pd(tau2, vs_1_2);

    y1 = _mm_sub_pd(_mm_mul_pd(y1,tau2), _mm_mul_pd(x1,h2));

    __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]);
    __m128d vs_1_3 = _mm_loaddup_pd(&scalarprods[1]);
    __m128d vs_2_3 = _mm_loaddup_pd(&scalarprods[2]);
    h2 = _mm_mul_pd(tau3, vs_1_3);
    h3 = _mm_mul_pd(tau3, vs_2_3);

    z1 = _mm_sub_pd(_mm_mul_pd(z1,tau3), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)));

    __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]);
    __m128d vs_1_4 = _mm_loaddup_pd(&scalarprods[3]);
    __m128d vs_2_4 = _mm_loaddup_pd(&scalarprods[4]);
    h2 = _mm_mul_pd(tau4, vs_1_4);
    h3 = _mm_mul_pd(tau4, vs_2_4);
    __m128d vs_3_4 = _mm_loaddup_pd(&scalarprods[5]);
    h4 = _mm_mul_pd(tau4, vs_3_4);

    w1 = _mm_sub_pd(_mm_mul_pd(w1,tau4), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))));

    __m128d tau5 = _mm_loaddup_pd(&hh[ldh*4]);
    __m128d vs_1_5 = _mm_loaddup_pd(&scalarprods[6]);
    __m128d vs_2_5 = _mm_loaddup_pd(&scalarprods[7]);
    h2 = _mm_mul_pd(tau5, vs_1_5);
    h3 = _mm_mul_pd(tau5, vs_2_5);
    __m128d vs_3_5 = _mm_loaddup_pd(&scalarprods[8]);
    __m128d vs_4_5 = _mm_loaddup_pd(&scalarprods[9]);
    h4 = _mm_mul_pd(tau5, vs_3_5);
    h5 = _mm_mul_pd(tau5, vs_4_5);

    v1 = _mm_sub_pd(_mm_mul_pd(v1,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))));

    __m128d tau6 = _mm_loaddup_pd(&hh[ldh*5]);
    __m128d vs_1_6 = _mm_loaddup_pd(&scalarprods[10]);
    __m128d vs_2_6 = _mm_loaddup_pd(&scalarprods[11]);
    h2 = _mm_mul_pd(tau6, vs_1_6);
    h3 = _mm_mul_pd(tau6, vs_2_6);
    __m128d vs_3_6 = _mm_loaddup_pd(&scalarprods[12]);
    __m128d vs_4_6 = _mm_loaddup_pd(&scalarprods[13]);
    __m128d vs_5_6 = _mm_loaddup_pd(&scalarprods[14]);
    h4 = _mm_mul_pd(tau6, vs_3_6);
    h5 = _mm_mul_pd(tau6, vs_4_6);
    h6 = _mm_mul_pd(tau6, vs_5_6);

    t1 = _mm_sub_pd(_mm_mul_pd(t1,tau6), _mm_add_pd( _mm_mul_pd(v1,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))));

    /////////////////////////////////////////////////////
    // Update of Q [2 x nb+5]: subtract the six scaled accumulators,
    // each weighted by the matching Householder vector entry
    /////////////////////////////////////////////////////

    // Columns 0..5: the vectors enter one by one (t1 first, x1 last).
    q1 = _mm_load_pd(&q[0]);
    q1 = _mm_sub_pd(q1, t1);
    _mm_store_pd(&q[0],q1);

    h6 = _mm_loaddup_pd(&hh[(ldh*5)+1]);
    q1 = _mm_load_pd(&q[ldq]);
    q1 = _mm_sub_pd(q1, v1);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6));

    _mm_store_pd(&q[ldq],q1);

    h5 = _mm_loaddup_pd(&hh[(ldh*4)+1]);
    q1 = _mm_load_pd(&q[ldq*2]);
    q1 = _mm_sub_pd(q1, w1);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5));

    h6 = _mm_loaddup_pd(&hh[(ldh*5)+2]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6));

    _mm_store_pd(&q[ldq*2],q1);

    h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]);
    q1 = _mm_load_pd(&q[ldq*3]);
    q1 = _mm_sub_pd(q1, z1);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4));

    h5 = _mm_loaddup_pd(&hh[(ldh*4)+2]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5));

    h6 = _mm_loaddup_pd(&hh[(ldh*5)+3]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6));

    _mm_store_pd(&q[ldq*3],q1);

    h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]);
    q1 = _mm_load_pd(&q[ldq*4]);
    q1 = _mm_sub_pd(q1, y1);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3));

    h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4));

    h5 = _mm_loaddup_pd(&hh[(ldh*4)+3]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5));

    h6 = _mm_loaddup_pd(&hh[(ldh*5)+4]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6));

    _mm_store_pd(&q[ldq*4],q1);

    h2 = _mm_loaddup_pd(&hh[(ldh)+1]);
    q1 = _mm_load_pd(&q[ldq*5]);
    q1 = _mm_sub_pd(q1, x1);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2));

    h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3));

    h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4));

    h5 = _mm_loaddup_pd(&hh[(ldh*4)+4]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5));

    h6 = _mm_loaddup_pd(&hh[(ldh*5)+5]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6));

    _mm_store_pd(&q[ldq*5],q1);

    // Columns 6 .. nb-1: all six vectors take part.
    for (i = 6; i < nb; i++)
    {
        q1 = _mm_load_pd(&q[i*ldq]);
        h1 = _mm_loaddup_pd(&hh[i-5]);

        q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1));

        h2 = _mm_loaddup_pd(&hh[ldh+i-4]);

        q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2));

        h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]);

        q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3));

        h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]);

        q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4));

        h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]);

        q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5));

        h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]);

        q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6));

        _mm_store_pd(&q[i*ldq],q1);
    }

    // Columns nb .. nb+4: the vectors drop out one by one (t1 first).
    h1 = _mm_loaddup_pd(&hh[nb-5]);
    q1 = _mm_load_pd(&q[nb*ldq]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1));

    h2 = _mm_loaddup_pd(&hh[ldh+nb-4]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2));

    h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3));

    h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4));

    h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5));

    _mm_store_pd(&q[nb*ldq],q1);

    h1 = _mm_loaddup_pd(&hh[nb-4]);
    q1 = _mm_load_pd(&q[(nb+1)*ldq]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1));

    h2 = _mm_loaddup_pd(&hh[ldh+nb-3]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2));

    h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3));

    h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4));

    _mm_store_pd(&q[(nb+1)*ldq],q1);

    h1 = _mm_loaddup_pd(&hh[nb-3]);
    q1 = _mm_load_pd(&q[(nb+2)*ldq]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1));

    h2 = _mm_loaddup_pd(&hh[ldh+nb-2]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2));

    h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3));

    _mm_store_pd(&q[(nb+2)*ldq],q1);

    h1 = _mm_loaddup_pd(&hh[nb-2]);
    q1 = _mm_load_pd(&q[(nb+3)*ldq]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1));

    h2 = _mm_loaddup_pd(&hh[ldh+nb-1]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2));

    _mm_store_pd(&q[(nb+3)*ldq],q1);

    h1 = _mm_loaddup_pd(&hh[nb-1]);
    q1 = _mm_load_pd(&q[(nb+4)*ldq]);

    q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1));

    _mm_store_pd(&q[(nb+4)*ldq],q1);
}
elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.c0000644000312500001440000004616512717516040021427 00000000000000// This file is part of ELPA. // // The ELPA library was originally created by the ELPA consortium, // consisting of the following organizations: // // - Max Planck Computing and Data Facility (MPCDF), formerly known as // Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), // - Bergische Universität Wuppertal, Lehrstuhl für angewandte // Informatik, // - Technische Universität München, Lehrstuhl für Informatik mit // Schwerpunkt Wissenschaftliches Rechnen , // - Fritz-Haber-Institut, Berlin, Abt. Theorie, // - Max-Planck-Institut für Mathematik in den Naturwissenschaften, // Leipzig, Abt. 
Komplexe Strukturen in Biologie und Kognition, // and // - IBM Deutschland GmbH // // This particular source code file contains additions, changes and // enhancements authored by Intel Corporation which is not part of // the ELPA consortium. // // More information can be found here: // http://elpa.mpcdf.mpg.de/ // // ELPA is free software: you can redistribute it and/or modify // it under the terms of the version 3 of the license of the // GNU Lesser General Public License as published by the Free // Software Foundation. // // ELPA is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with ELPA. If not, see <http://www.gnu.org/licenses/> // // ELPA reflects a substantial effort on the part of the original // ELPA consortium, and we ask you to respect the spirit of the // license that we chose: i.e., please contribute any changes you // may have back to the original ELPA library distribution, and keep // any derivatives of ELPA under the same license that we chose for // the original distribution, the GNU Lesser General Public License. // // // -------------------------------------------------------------------------------------------------- // // This file contains the compute-intensive kernels for the Householder transformations. // It should be compiled with the highest possible optimization level. // // On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 // On Intel Sandy Bridge use -O3 -mavx // // Copyright of the original code rests with the authors inside the ELPA // consortium. The copyright of any additional modifications shall rest // with their original authors, but shall adhere to the licensing terms // distributed along with the original code in the file "COPYING". 
// // Author: Alexander Heinecke (alexander.heinecke@mytum.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // -------------------------------------------------------------------------------------------------- #include "config-f90.h" #include #include #define __forceinline __attribute__((always_inline)) #ifdef HAVE_SSE_INTRINSICS #undef __AVX__ #endif //Forward declaration static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq); /* !f>#ifdef HAVE_SSE_INTRINSICS !f> interface !f> subroutine single_hh_trafo_complex_sse_1hv(q, hh, pnb, pnq, pldq) bind(C, name="single_hh_trafo_complex_sse_1hv") !f> use, intrinsic :: iso_c_binding !f> integer(kind=c_int) :: pnb, pnq, pldq !f> complex(kind=c_double) :: q(*) !f> complex(kind=c_double) :: hh(pnb,2) !f> end subroutine !f> end interface !f>#endif */ void single_hh_trafo_complex_sse_1hv(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq) { int i; int nb = *pnb; int nq = *pldq; int ldq = *pldq; //int ldh = *pldh; for (i = 0; i < nq-4; i+=6) { hh_trafo_complex_kernel_6_SSE_1hv(&q[i], hh, nb, ldq); } if (nq-i > 2) { hh_trafo_complex_kernel_4_SSE_1hv(&q[i], hh, nb, ldq); } else if (nq-i > 0) { hh_trafo_complex_kernel_2_SSE_1hv(&q[i], hh, nb, ldq); } } static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; __m128d x1, x2, x3, x4, x5, x6; __m128d q1, q2, q3, q4, q5, q6; __m128d h1_real, h1_imag; __m128d tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; int i=0; __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); x1 = _mm_load_pd(&q_dbl[0]); x2 = 
_mm_load_pd(&q_dbl[2]); x3 = _mm_load_pd(&q_dbl[4]); x4 = _mm_load_pd(&q_dbl[6]); x5 = _mm_load_pd(&q_dbl[8]); x6 = _mm_load_pd(&q_dbl[10]); for (i = 1; i < nb; i++) { h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _mm_xor_pd(h1_imag, sign); #endif q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); q5 = _mm_load_pd(&q_dbl[(2*i*ldq)+8]); q6 = _mm_load_pd(&q_dbl[(2*i*ldq)+10]); tmp1 = _mm_mul_pd(h1_imag, q1); #ifdef __ELPA_USE_FMA__ x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h1_imag, q2); #ifdef __ELPA_USE_FMA__ x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif tmp3 = _mm_mul_pd(h1_imag, q3); #ifdef __ELPA_USE_FMA__ x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #else x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #endif tmp4 = _mm_mul_pd(h1_imag, q4); #ifdef __ELPA_USE_FMA__ x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); #else x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); #endif tmp5 = _mm_mul_pd(h1_imag, q5); #ifdef __ELPA_USE_FMA__ x5 = _mm_add_pd(x5, _mm_msubadd_pd(h1_real, q5, _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); #else x5 = _mm_add_pd(x5, _mm_addsub_pd( _mm_mul_pd(h1_real, q5), _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); #endif tmp6 = _mm_mul_pd(h1_imag, q6); #ifdef __ELPA_USE_FMA__ 
x6 = _mm_add_pd(x6, _mm_msubadd_pd(h1_real, q6, _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); #else x6 = _mm_add_pd(x6, _mm_addsub_pd( _mm_mul_pd(h1_real, q6), _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); #endif } h1_real = _mm_loaddup_pd(&hh_dbl[0]); h1_imag = _mm_loaddup_pd(&hh_dbl[1]); h1_real = _mm_xor_pd(h1_real, sign); h1_imag = _mm_xor_pd(h1_imag, sign); tmp1 = _mm_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #else x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #endif tmp2 = _mm_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); #else x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); #endif tmp3 = _mm_mul_pd(h1_imag, x3); #ifdef __ELPA_USE_FMA__ x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); #else x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); #endif tmp4 = _mm_mul_pd(h1_imag, x4); #ifdef __ELPA_USE_FMA__ x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); #else x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); #endif tmp5 = _mm_mul_pd(h1_imag, x5); #ifdef __ELPA_USE_FMA__ x5 = _mm_maddsub_pd(h1_real, x5, _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1))); #else x5 = _mm_addsub_pd( _mm_mul_pd(h1_real, x5), _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1))); #endif tmp6 = _mm_mul_pd(h1_imag, x6); #ifdef __ELPA_USE_FMA__ x6 = _mm_maddsub_pd(h1_real, x6, _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1))); #else x6 = _mm_addsub_pd( _mm_mul_pd(h1_real, x6), _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1))); #endif q1 = _mm_load_pd(&q_dbl[0]); q2 = _mm_load_pd(&q_dbl[2]); q3 = _mm_load_pd(&q_dbl[4]); q4 = _mm_load_pd(&q_dbl[6]); q5 = _mm_load_pd(&q_dbl[8]); q6 = 
_mm_load_pd(&q_dbl[10]); q1 = _mm_add_pd(q1, x1); q2 = _mm_add_pd(q2, x2); q3 = _mm_add_pd(q3, x3); q4 = _mm_add_pd(q4, x4); q5 = _mm_add_pd(q5, x5); q6 = _mm_add_pd(q6, x6); _mm_store_pd(&q_dbl[0], q1); _mm_store_pd(&q_dbl[2], q2); _mm_store_pd(&q_dbl[4], q3); _mm_store_pd(&q_dbl[6], q4); _mm_store_pd(&q_dbl[8], q5); _mm_store_pd(&q_dbl[10], q6); for (i = 1; i < nb; i++) { h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); q5 = _mm_load_pd(&q_dbl[(2*i*ldq)+8]); q6 = _mm_load_pd(&q_dbl[(2*i*ldq)+10]); tmp1 = _mm_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif tmp3 = _mm_mul_pd(h1_imag, x3); #ifdef __ELPA_USE_FMA__ q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #else q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #endif tmp4 = _mm_mul_pd(h1_imag, x4); #ifdef __ELPA_USE_FMA__ q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); #else q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); #endif tmp5 = _mm_mul_pd(h1_imag, x5); #ifdef __ELPA_USE_FMA__ q5 = _mm_add_pd(q5, _mm_maddsub_pd(h1_real, x5, _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); #else q5 = _mm_add_pd(q5, _mm_addsub_pd( 
_mm_mul_pd(h1_real, x5), _mm_shuffle_pd(tmp5, tmp5, _MM_SHUFFLE2(0,1)))); #endif tmp6 = _mm_mul_pd(h1_imag, x6); #ifdef __ELPA_USE_FMA__ q6 = _mm_add_pd(q6, _mm_maddsub_pd(h1_real, x6, _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); #else q6 = _mm_add_pd(q6, _mm_addsub_pd( _mm_mul_pd(h1_real, x6), _mm_shuffle_pd(tmp6, tmp6, _MM_SHUFFLE2(0,1)))); #endif _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); _mm_store_pd(&q_dbl[(2*i*ldq)+6], q4); _mm_store_pd(&q_dbl[(2*i*ldq)+8], q5); _mm_store_pd(&q_dbl[(2*i*ldq)+10], q6); } } static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; __m128d x1, x2, x3, x4; __m128d q1, q2, q3, q4; __m128d h1_real, h1_imag; __m128d tmp1, tmp2, tmp3, tmp4; int i=0; __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); x1 = _mm_load_pd(&q_dbl[0]); x2 = _mm_load_pd(&q_dbl[2]); x3 = _mm_load_pd(&q_dbl[4]); x4 = _mm_load_pd(&q_dbl[6]); for (i = 1; i < nb; i++) { h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _mm_xor_pd(h1_imag, sign); #endif q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); tmp1 = _mm_mul_pd(h1_imag, q1); #ifdef __ELPA_USE_FMA__ x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h1_imag, q2); #ifdef __ELPA_USE_FMA__ x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif tmp3 = 
_mm_mul_pd(h1_imag, q3); #ifdef __ELPA_USE_FMA__ x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #else x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #endif tmp4 = _mm_mul_pd(h1_imag, q4); #ifdef __ELPA_USE_FMA__ x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); #else x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); #endif } h1_real = _mm_loaddup_pd(&hh_dbl[0]); h1_imag = _mm_loaddup_pd(&hh_dbl[1]); h1_real = _mm_xor_pd(h1_real, sign); h1_imag = _mm_xor_pd(h1_imag, sign); tmp1 = _mm_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #else x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #endif tmp2 = _mm_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); #else x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); #endif tmp3 = _mm_mul_pd(h1_imag, x3); #ifdef __ELPA_USE_FMA__ x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); #else x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); #endif tmp4 = _mm_mul_pd(h1_imag, x4); #ifdef __ELPA_USE_FMA__ x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); #else x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))); #endif q1 = _mm_load_pd(&q_dbl[0]); q2 = _mm_load_pd(&q_dbl[2]); q3 = _mm_load_pd(&q_dbl[4]); q4 = _mm_load_pd(&q_dbl[6]); q1 = _mm_add_pd(q1, x1); q2 = _mm_add_pd(q2, x2); q3 = _mm_add_pd(q3, x3); q4 = _mm_add_pd(q4, x4); _mm_store_pd(&q_dbl[0], q1); _mm_store_pd(&q_dbl[2], q2); _mm_store_pd(&q_dbl[4], q3); _mm_store_pd(&q_dbl[6], q4); 
for (i = 1; i < nb; i++) { h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]); tmp1 = _mm_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif tmp3 = _mm_mul_pd(h1_imag, x3); #ifdef __ELPA_USE_FMA__ q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #else q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #endif tmp4 = _mm_mul_pd(h1_imag, x4); #ifdef __ELPA_USE_FMA__ q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); #else q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)))); #endif _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); _mm_store_pd(&q_dbl[(2*i*ldq)+6], q4); } } static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; __m128d x1, x2; __m128d q1, q2; __m128d h1_real, h1_imag; __m128d tmp1, tmp2; int i=0; __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); x1 = _mm_load_pd(&q_dbl[0]); x2 = _mm_load_pd(&q_dbl[2]); for (i = 1; i < nb; i++) { h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); 
h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _mm_xor_pd(h1_imag, sign); #endif q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); tmp1 = _mm_mul_pd(h1_imag, q1); #ifdef __ELPA_USE_FMA__ x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h1_imag, q2); #ifdef __ELPA_USE_FMA__ x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif } h1_real = _mm_loaddup_pd(&hh_dbl[0]); h1_imag = _mm_loaddup_pd(&hh_dbl[1]); h1_real = _mm_xor_pd(h1_real, sign); h1_imag = _mm_xor_pd(h1_imag, sign); tmp1 = _mm_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #else x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #endif tmp2 = _mm_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); #else x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); #endif q1 = _mm_load_pd(&q_dbl[0]); q2 = _mm_load_pd(&q_dbl[2]); q1 = _mm_add_pd(q1, x1); q2 = _mm_add_pd(q2, x2); _mm_store_pd(&q_dbl[0], q1); _mm_store_pd(&q_dbl[2], q2); for (i = 1; i < nb; i++) { h1_real = _mm_loaddup_pd(&hh_dbl[i*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]); q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); tmp1 = _mm_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, 
_MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); } } elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c0000644000312500001440000012715512717516040021567 00000000000000// This file is part of ELPA. // // The ELPA library was originally created by the ELPA consortium, // consisting of the following organizations: // // - Max Planck Computing and Data Facility (MPCDF), formerly known as // Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), // - Bergische Universität Wuppertal, Lehrstuhl für angewandte // Informatik, // - Technische Universität München, Lehrstuhl für Informatik mit // Schwerpunkt Wissenschaftliches Rechnen , // - Fritz-Haber-Institut, Berlin, Abt. Theorie, // - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, // Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, // and // - IBM Deutschland GmbH // // This particular source code file contains additions, changes and // enhancements authored by Intel Corporation which is not part of // the ELPA consortium. // // More information can be found here: // http://elpa.mpcdf.mpg.de/ // // ELPA is free software: you can redistribute it and/or modify // it under the terms of the version 3 of the license of the // GNU Lesser General Public License as published by the Free // Software Foundation. // // ELPA is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with ELPA. 
If not, see <http://www.gnu.org/licenses/> // // ELPA reflects a substantial effort on the part of the original // ELPA consortium, and we ask you to respect the spirit of the // license that we chose: i.e., please contribute any changes you // may have back to the original ELPA library distribution, and keep // any derivatives of ELPA under the same license that we chose for // the original distribution, the GNU Lesser General Public License. // // // -------------------------------------------------------------------------------------------------- // // This file contains the compute-intensive kernels for the Householder transformations. // It should be compiled with the highest possible optimization level. // // On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 // On Intel Sandy Bridge use -O3 -mavx // // Copyright of the original code rests with the authors inside the ELPA // consortium. The copyright of any additional modifications shall rest // with their original authors, but shall adhere to the licensing terms // distributed along with the original code in the file "COPYING". 
// // Author: Alexander Heinecke (alexander.heinecke@mytum.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // -------------------------------------------------------------------------------------------------- #include "config-f90.h" #include #define __forceinline __attribute__((always_inline)) static #ifdef HAVE_AVX2 #ifdef __FMA4__ #define __ELPA_USE_FMA__ #define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) #define _mm256_NFMA_pd(a,b,c) _mm256_nmacc_pd(a,b,c) #define _mm256_FMSUB_pd(a,b,c) _mm256_msub(a,b,c) #endif #ifdef __AVX2__ #define __ELPA_USE_FMA__ #define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c) #define _mm256_NFMA_pd(a,b,c) _mm256_fnmadd_pd(a,b,c) #define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c) #endif #endif //Forward declaration __forceinline void hh_trafo_kernel_4_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); __forceinline void hh_trafo_kernel_8_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); __forceinline void hh_trafo_kernel_12_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); /* !f>#ifdef HAVE_AVX !f> interface !f> subroutine quad_hh_trafo_real_avx_avx2_4hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="quad_hh_trafo_real_avx_avx2_4hv") !f> use, intrinsic :: iso_c_binding !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh !f> real(kind=c_double) :: q(*) !f> real(kind=c_double) :: hh(pnb,6) !f> end subroutine !f> end interface !f>#endif */ void quad_hh_trafo_real_avx_avx2_4hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); void quad_hh_trafo_real_avx_avx2_4hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; int nq = *pldq; int ldq = *pldq; int ldh = *pldh; // 
calculating scalar products to compute // 4 householder vectors simultaneously double s_1_2 = hh[(ldh)+1]; double s_1_3 = hh[(ldh*2)+2]; double s_2_3 = hh[(ldh*2)+1]; double s_1_4 = hh[(ldh*3)+3]; double s_2_4 = hh[(ldh*3)+2]; double s_3_4 = hh[(ldh*3)+1]; // calculate scalar product of first and fourth householder vector // loop counter = 2 s_1_2 += hh[2-1] * hh[(2+ldh)]; s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)]; s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)]; // loop counter = 3 s_1_2 += hh[3-1] * hh[(3+ldh)]; s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)]; s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)]; s_1_3 += hh[3-2] * hh[3+(ldh*2)]; s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)]; #pragma ivdep for (i = 4; i < nb; i++) { s_1_2 += hh[i-1] * hh[(i+ldh)]; s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)]; s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; s_1_3 += hh[i-2] * hh[i+(ldh*2)]; s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; s_1_4 += hh[i-3] * hh[i+(ldh*3)]; } // Production level kernel calls with padding #ifdef __AVX__ for (i = 0; i < nq-8; i+=12) { hh_trafo_kernel_12_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } if (nq == i) { return; } else { if (nq-i > 4) { hh_trafo_kernel_8_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } else { hh_trafo_kernel_4_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } } #else for (i = 0; i < nq-4; i+=6) { hh_trafo_kernel_6_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } if (nq == i) { return; } else { if (nq-i > 2) { hh_trafo_kernel_4_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } else { hh_trafo_kernel_2_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } } #endif } /** * Unrolled kernel that computes * 12 rows of Q simultaneously, a * matrix vector product with two householder * vectors + a rank 1 update is performed */ __forceinline void hh_trafo_kernel_12_AVX_4hv(double* q, double* hh, int nb, 
int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [12 x nb+3] * hh // hh contains four householder vectors ///////////////////////////////////////////////////// int i; __m256d a1_1 = _mm256_load_pd(&q[ldq*3]); __m256d a2_1 = _mm256_load_pd(&q[ldq*2]); __m256d a3_1 = _mm256_load_pd(&q[ldq]); __m256d a4_1 = _mm256_load_pd(&q[0]); __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); __m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); #ifdef __ELPA_USE_FMA__ register __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); register __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); register __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1); register __m256d x1 = a1_1; #else register __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2)); w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1)); register __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1)); register __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1)); register __m256d x1 = a1_1; #endif __m256d a1_2 = _mm256_load_pd(&q[(ldq*3)+4]); __m256d a2_2 = _mm256_load_pd(&q[(ldq*2)+4]); __m256d a3_2 = _mm256_load_pd(&q[ldq+4]); __m256d a4_2 = _mm256_load_pd(&q[0+4]); #ifdef __ELPA_USE_FMA__ register __m256d w2 = _mm256_FMA_pd(a3_2, h_4_3, a4_2); w2 = _mm256_FMA_pd(a2_2, h_4_2, w2); w2 = _mm256_FMA_pd(a1_2, h_4_1, w2); register __m256d z2 = _mm256_FMA_pd(a2_2, h_3_2, a3_2); z2 = _mm256_FMA_pd(a1_2, h_3_1, z2); register __m256d y2 = _mm256_FMA_pd(a1_2, h_2_1, 
a2_2); register __m256d x2 = a1_2; #else register __m256d w2 = _mm256_add_pd(a4_2, _mm256_mul_pd(a3_2, h_4_3)); w2 = _mm256_add_pd(w2, _mm256_mul_pd(a2_2, h_4_2)); w2 = _mm256_add_pd(w2, _mm256_mul_pd(a1_2, h_4_1)); register __m256d z2 = _mm256_add_pd(a3_2, _mm256_mul_pd(a2_2, h_3_2)); z2 = _mm256_add_pd(z2, _mm256_mul_pd(a1_2, h_3_1)); register __m256d y2 = _mm256_add_pd(a2_2, _mm256_mul_pd(a1_2, h_2_1)); register __m256d x2 = a1_2; #endif __m256d a1_3 = _mm256_load_pd(&q[(ldq*3)+8]); __m256d a2_3 = _mm256_load_pd(&q[(ldq*2)+8]); __m256d a3_3 = _mm256_load_pd(&q[ldq+8]); __m256d a4_3 = _mm256_load_pd(&q[0+8]); #ifdef __ELPA_USE_FMA__ register __m256d w3 = _mm256_FMA_pd(a3_3, h_4_3, a4_3); w3 = _mm256_FMA_pd(a2_3, h_4_2, w3); w3 = _mm256_FMA_pd(a1_3, h_4_1, w3); register __m256d z3 = _mm256_FMA_pd(a2_3, h_3_2, a3_3); z3 = _mm256_FMA_pd(a1_3, h_3_1, z3); register __m256d y3 = _mm256_FMA_pd(a1_3, h_2_1, a2_3); register __m256d x3 = a1_3; #else register __m256d w3 = _mm256_add_pd(a4_3, _mm256_mul_pd(a3_3, h_4_3)); w3 = _mm256_add_pd(w3, _mm256_mul_pd(a2_3, h_4_2)); w3 = _mm256_add_pd(w3, _mm256_mul_pd(a1_3, h_4_1)); register __m256d z3 = _mm256_add_pd(a3_3, _mm256_mul_pd(a2_3, h_3_2)); z3 = _mm256_add_pd(z3, _mm256_mul_pd(a1_3, h_3_1)); register __m256d y3 = _mm256_add_pd(a2_3, _mm256_mul_pd(a1_3, h_2_1)); register __m256d x3 = a1_3; #endif __m256d q1; __m256d q2; __m256d q3; __m256d h1; __m256d h2; __m256d h3; __m256d h4; for(i = 4; i < nb; i++) { h1 = _mm256_broadcast_sd(&hh[i-3]); q1 = _mm256_load_pd(&q[i*ldq]); q2 = _mm256_load_pd(&q[(i*ldq)+4]); q3 = _mm256_load_pd(&q[(i*ldq)+8]); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); x2 = _mm256_FMA_pd(q2, h1, x2); x3 = _mm256_FMA_pd(q3, h1, x3); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMA_pd(q1, h2, y1); y2 = _mm256_FMA_pd(q2, 
h2, y2); y3 = _mm256_FMA_pd(q3, h2, y3); #else y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); #ifdef __ELPA_USE_FMA__ z1 = _mm256_FMA_pd(q1, h3, z1); z2 = _mm256_FMA_pd(q2, h3, z2); z3 = _mm256_FMA_pd(q3, h3, z3); #else z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); z3 = _mm256_add_pd(z3, _mm256_mul_pd(q3,h3)); #endif h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); #ifdef __ELPA_USE_FMA__ w1 = _mm256_FMA_pd(q1, h4, w1); w2 = _mm256_FMA_pd(q2, h4, w2); w3 = _mm256_FMA_pd(q3, h4, w3); #else w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); w3 = _mm256_add_pd(w3, _mm256_mul_pd(q3,h4)); #endif } h1 = _mm256_broadcast_sd(&hh[nb-3]); q1 = _mm256_load_pd(&q[nb*ldq]); q2 = _mm256_load_pd(&q[(nb*ldq)+4]); q3 = _mm256_load_pd(&q[(nb*ldq)+8]); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); x2 = _mm256_FMA_pd(q2, h1, x2); x3 = _mm256_FMA_pd(q3, h1, x3); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); #ifdef __FMA4_ y1 = _mm256_FMA_pd(q1, h2, y1); y2 = _mm256_FMA_pd(q2, h2, y2); y3 = _mm256_FMA_pd(q3, h2, y3); #else y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); #ifdef __ELPA_USE_FMA__ z1 = _mm256_FMA_pd(q1, h3, z1); z2 = _mm256_FMA_pd(q2, h3, z2); z3 = _mm256_FMA_pd(q3, h3, z3); #else z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); z3 = _mm256_add_pd(z3, _mm256_mul_pd(q3,h3)); #endif h1 = _mm256_broadcast_sd(&hh[nb-2]); q1 = _mm256_load_pd(&q[(nb+1)*ldq]); q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); q3 = 
_mm256_load_pd(&q[((nb+1)*ldq)+8]); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); x2 = _mm256_FMA_pd(q2, h1, x2); x3 = _mm256_FMA_pd(q3, h1, x3); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); #endif h2 = _mm256_broadcast_sd(&hh[(ldh*1)+nb-1]); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMA_pd(q1, h2, y1); y2 = _mm256_FMA_pd(q2, h2, y2); y3 = _mm256_FMA_pd(q3, h2, y3); #else y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2)); #endif h1 = _mm256_broadcast_sd(&hh[nb-1]); q1 = _mm256_load_pd(&q[(nb+2)*ldq]); q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); q3 = _mm256_load_pd(&q[((nb+2)*ldq)+8]); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); x2 = _mm256_FMA_pd(q2, h1, x2); x3 = _mm256_FMA_pd(q3, h1, x3); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1)); #endif ///////////////////////////////////////////////////// // Rank-1 update of Q [12 x nb+3] ///////////////////////////////////////////////////// __m256d tau1 = _mm256_broadcast_sd(&hh[0]); h1 = tau1; x1 = _mm256_mul_pd(x1, h1); x2 = _mm256_mul_pd(x2, h1); x3 = _mm256_mul_pd(x3, h1); __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); __m256d vs_1_2 = _mm256_broadcast_sd(&s_1_2); h1 = tau2; h2 = _mm256_mul_pd(h1, vs_1_2); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMSUB_pd(y1, h1, _mm256_mul_pd(x1,h2)); y2 = _mm256_FMSUB_pd(y2, h1, _mm256_mul_pd(x2,h2)); y3 = _mm256_FMSUB_pd(y3, h1, _mm256_mul_pd(x3,h2)); #else y1 = _mm256_sub_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); y2 = _mm256_sub_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2)); y3 = _mm256_sub_pd(_mm256_mul_pd(y3,h1), _mm256_mul_pd(x3,h2)); #endif __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]); __m256d vs_1_3 = _mm256_broadcast_sd(&s_1_3); __m256d vs_2_3 = _mm256_broadcast_sd(&s_2_3); h1 = 
tau3; h2 = _mm256_mul_pd(h1, vs_1_3); h3 = _mm256_mul_pd(h1, vs_2_3); #ifdef __ELPA_USE_FMA__ z1 = _mm256_FMSUB_pd(z1, h1, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); z2 = _mm256_FMSUB_pd(z2, h1, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))); z3 = _mm256_FMSUB_pd(z3, h1, _mm256_FMA_pd(y3, h3, _mm256_mul_pd(x3,h2))); #else z1 = _mm256_sub_pd(_mm256_mul_pd(z1,h1), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))); z2 = _mm256_sub_pd(_mm256_mul_pd(z2,h1), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))); z3 = _mm256_sub_pd(_mm256_mul_pd(z3,h1), _mm256_add_pd(_mm256_mul_pd(y3,h3), _mm256_mul_pd(x3,h2))); #endif __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]); __m256d vs_1_4 = _mm256_broadcast_sd(&s_1_4); __m256d vs_2_4 = _mm256_broadcast_sd(&s_2_4); __m256d vs_3_4 = _mm256_broadcast_sd(&s_3_4); h1 = tau4; h2 = _mm256_mul_pd(h1, vs_1_4); h3 = _mm256_mul_pd(h1, vs_2_4); h4 = _mm256_mul_pd(h1, vs_3_4); #ifdef __ELPA_USE_FMA__ w1 = _mm256_FMSUB_pd(w1, h1, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); w2 = _mm256_FMSUB_pd(w2, h1, _mm256_FMA_pd(z2, h4, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)))); w3 = _mm256_FMSUB_pd(w3, h1, _mm256_FMA_pd(z3, h4, _mm256_FMA_pd(y3, h3, _mm256_mul_pd(x3,h2)))); #else w1 = _mm256_sub_pd(_mm256_mul_pd(w1,h1), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); w2 = _mm256_sub_pd(_mm256_mul_pd(w2,h1), _mm256_add_pd(_mm256_mul_pd(z2,h4), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)))); w3 = _mm256_sub_pd(_mm256_mul_pd(w3,h1), _mm256_add_pd(_mm256_mul_pd(z3,h4), _mm256_add_pd(_mm256_mul_pd(y3,h3), _mm256_mul_pd(x3,h2)))); #endif q1 = _mm256_load_pd(&q[0]); q2 = _mm256_load_pd(&q[4]); q3 = _mm256_load_pd(&q[8]); q1 = _mm256_sub_pd(q1, w1); q2 = _mm256_sub_pd(q2, w2); q3 = _mm256_sub_pd(q3, w3); _mm256_store_pd(&q[0],q1); _mm256_store_pd(&q[4],q2); _mm256_store_pd(&q[8],q3); h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); q1 = _mm256_load_pd(&q[ldq]); q2 = 
_mm256_load_pd(&q[ldq+4]); q3 = _mm256_load_pd(&q[ldq+8]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_sub_pd(q1, _mm256_FMA_pd(w1, h4, z1)); q2 = _mm256_sub_pd(q2, _mm256_FMA_pd(w2, h4, z2)); q3 = _mm256_sub_pd(q3, _mm256_FMA_pd(w3, h4, z3)); #else q1 = _mm256_sub_pd(q1, _mm256_add_pd(z1, _mm256_mul_pd(w1, h4))); q2 = _mm256_sub_pd(q2, _mm256_add_pd(z2, _mm256_mul_pd(w2, h4))); q3 = _mm256_sub_pd(q3, _mm256_add_pd(z3, _mm256_mul_pd(w3, h4))); #endif _mm256_store_pd(&q[ldq],q1); _mm256_store_pd(&q[ldq+4],q2); _mm256_store_pd(&q[ldq+8],q3); h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); q1 = _mm256_load_pd(&q[ldq*2]); q2 = _mm256_load_pd(&q[(ldq*2)+4]); q3 = _mm256_load_pd(&q[(ldq*2)+8]); q1 = _mm256_sub_pd(q1, y1); q2 = _mm256_sub_pd(q2, y2); q3 = _mm256_sub_pd(q3, y3); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(w1, h4, q1); q2 = _mm256_NFMA_pd(w2, h4, q2); q3 = _mm256_NFMA_pd(w3, h4, q3); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); q3 = _mm256_sub_pd(q3, _mm256_mul_pd(w3, h4)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(z1, h3, q1); q2 = _mm256_NFMA_pd(z2, h3, q2); q3 = _mm256_NFMA_pd(z3, h3, q3); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); q3 = _mm256_sub_pd(q3, _mm256_mul_pd(z3, h3)); #endif _mm256_store_pd(&q[ldq*2],q1); _mm256_store_pd(&q[(ldq*2)+4],q2); _mm256_store_pd(&q[(ldq*2)+8],q3); h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); q1 = _mm256_load_pd(&q[ldq*3]); q2 = _mm256_load_pd(&q[(ldq*3)+4]); q3 = _mm256_load_pd(&q[(ldq*3)+8]); q1 = _mm256_sub_pd(q1, x1); q2 = _mm256_sub_pd(q2, x2); q3 = _mm256_sub_pd(q3, x3); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(w1, h4, q1); q2 = _mm256_NFMA_pd(w2, h4, q2); q3 = _mm256_NFMA_pd(w3, h4, q3); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); q3 = _mm256_sub_pd(q3, _mm256_mul_pd(w3, h4)); #endif h2 = 
_mm256_broadcast_sd(&hh[ldh+1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(y1, h2, q1); q2 = _mm256_NFMA_pd(y2, h2, q2); q3 = _mm256_NFMA_pd(y3, h2, q3); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); q3 = _mm256_sub_pd(q3, _mm256_mul_pd(y3, h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(z1, h3, q1); q2 = _mm256_NFMA_pd(z2, h3, q2); q3 = _mm256_NFMA_pd(z3, h3, q3); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); q3 = _mm256_sub_pd(q3, _mm256_mul_pd(z3, h3)); #endif _mm256_store_pd(&q[ldq*3], q1); _mm256_store_pd(&q[(ldq*3)+4], q2); _mm256_store_pd(&q[(ldq*3)+8], q3); for (i = 4; i < nb; i++) { h1 = _mm256_broadcast_sd(&hh[i-3]); q1 = _mm256_load_pd(&q[i*ldq]); q2 = _mm256_load_pd(&q[(i*ldq)+4]); q3 = _mm256_load_pd(&q[(i*ldq)+8]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); q2 = _mm256_NFMA_pd(x2, h1, q2); q3 = _mm256_NFMA_pd(x3, h1, q3); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1,h1)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2,h1)); q3 = _mm256_sub_pd(q3, _mm256_mul_pd(x3,h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(y1, h2, q1); q2 = _mm256_NFMA_pd(y2, h2, q2); q3 = _mm256_NFMA_pd(y3, h2, q3); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1,h2)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2,h2)); q3 = _mm256_sub_pd(q3, _mm256_mul_pd(y3,h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(z1, h3, q1); q2 = _mm256_NFMA_pd(z2, h3, q2); q3 = _mm256_NFMA_pd(z3, h3, q3); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1,h3)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2,h3)); q3 = _mm256_sub_pd(q3, _mm256_mul_pd(z3,h3)); #endif h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(w1, h4, q1); q2 = _mm256_NFMA_pd(w2, h4, q2); q3 = _mm256_NFMA_pd(w3, h4, q3); #else q1 = _mm256_sub_pd(q1, 
_mm256_mul_pd(w1,h4)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2,h4)); q3 = _mm256_sub_pd(q3, _mm256_mul_pd(w3,h4)); #endif _mm256_store_pd(&q[i*ldq],q1); _mm256_store_pd(&q[(i*ldq)+4],q2); _mm256_store_pd(&q[(i*ldq)+8],q3); } h1 = _mm256_broadcast_sd(&hh[nb-3]); q1 = _mm256_load_pd(&q[nb*ldq]); q2 = _mm256_load_pd(&q[(nb*ldq)+4]); q3 = _mm256_load_pd(&q[(nb*ldq)+8]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); q2 = _mm256_NFMA_pd(x2, h1, q2); q3 = _mm256_NFMA_pd(x3, h1, q3); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1,h1)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2,h1)); q3 = _mm256_sub_pd(q3, _mm256_mul_pd(x3,h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(y1, h2, q1); q2 = _mm256_NFMA_pd(y2, h2, q2); q3 = _mm256_NFMA_pd(y3, h2, q3); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1,h2)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2,h2)); q3 = _mm256_sub_pd(q3, _mm256_mul_pd(y3,h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(z1, h3, q1); q2 = _mm256_NFMA_pd(z2, h3, q2); q3 = _mm256_NFMA_pd(z3, h3, q3); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1,h3)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2,h3)); q3 = _mm256_sub_pd(q3, _mm256_mul_pd(z3,h3)); #endif _mm256_store_pd(&q[nb*ldq],q1); _mm256_store_pd(&q[(nb*ldq)+4],q2); _mm256_store_pd(&q[(nb*ldq)+8],q3); h1 = _mm256_broadcast_sd(&hh[nb-2]); q1 = _mm256_load_pd(&q[(nb+1)*ldq]); q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); q3 = _mm256_load_pd(&q[((nb+1)*ldq)+8]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); q2 = _mm256_NFMA_pd(x2, h1, q2); q3 = _mm256_NFMA_pd(x3, h1, q3); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1,h1)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2,h1)); q3 = _mm256_sub_pd(q3, _mm256_mul_pd(x3,h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(y1, h2, q1); q2 = _mm256_NFMA_pd(y2, h2, q2); q3 = _mm256_NFMA_pd(y3, h2, q3); #else q1 = 
_mm256_sub_pd(q1, _mm256_mul_pd(y1,h2)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2,h2)); q3 = _mm256_sub_pd(q3, _mm256_mul_pd(y3,h2)); #endif _mm256_store_pd(&q[(nb+1)*ldq],q1); _mm256_store_pd(&q[((nb+1)*ldq)+4],q2); _mm256_store_pd(&q[((nb+1)*ldq)+8],q3); h1 = _mm256_broadcast_sd(&hh[nb-1]); q1 = _mm256_load_pd(&q[(nb+2)*ldq]); q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); q3 = _mm256_load_pd(&q[((nb+2)*ldq)+8]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); q2 = _mm256_NFMA_pd(x2, h1, q2); q3 = _mm256_NFMA_pd(x3, h1, q3); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1,h1)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2,h1)); q3 = _mm256_sub_pd(q3, _mm256_mul_pd(x3,h1)); #endif _mm256_store_pd(&q[(nb+2)*ldq],q1); _mm256_store_pd(&q[((nb+2)*ldq)+4],q2); _mm256_store_pd(&q[((nb+2)*ldq)+8],q3); } /** * Unrolled kernel that computes * 8 rows of Q simultaneously, a * matrix vector product with two householder * vectors + a rank 1 update is performed */ __forceinline void hh_trafo_kernel_8_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [4 x nb+3] * hh // hh contains four householder vectors ///////////////////////////////////////////////////// int i; __m256d a1_1 = _mm256_load_pd(&q[ldq*3]); __m256d a2_1 = _mm256_load_pd(&q[ldq*2]); __m256d a3_1 = _mm256_load_pd(&q[ldq]); __m256d a4_1 = _mm256_load_pd(&q[0]); __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); __m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); #ifdef __ELPA_USE_FMA__ __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); __m256d z1 = 
_mm256_FMA_pd(a2_1, h_3_2, a3_1);
	z1 = _mm256_FMA_pd(a1_1, h_3_1, z1);
	__m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1);
	__m256d x1 = a1_1;
#else
	// same seeding of w, z, y, x from columns 0..3 without fused multiply-add
	__m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3));
	w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2));
	w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1));
	__m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2));
	z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1));
	__m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1));
	__m256d x1 = a1_1;
#endif

	// rows 4..7 (second AVX lane, suffix 2)
	__m256d a1_2 = _mm256_load_pd(&q[(ldq*3)+4]);
	__m256d a2_2 = _mm256_load_pd(&q[(ldq*2)+4]);
	__m256d a3_2 = _mm256_load_pd(&q[ldq+4]);
	__m256d a4_2 = _mm256_load_pd(&q[0+4]);

#ifdef __ELPA_USE_FMA__
	__m256d w2 = _mm256_FMA_pd(a3_2, h_4_3, a4_2);
	w2 = _mm256_FMA_pd(a2_2, h_4_2, w2);
	w2 = _mm256_FMA_pd(a1_2, h_4_1, w2);
	__m256d z2 = _mm256_FMA_pd(a2_2, h_3_2, a3_2);
	z2 = _mm256_FMA_pd(a1_2, h_3_1, z2);
	__m256d y2 = _mm256_FMA_pd(a1_2, h_2_1, a2_2);
	__m256d x2 = a1_2;
#else
	__m256d w2 = _mm256_add_pd(a4_2, _mm256_mul_pd(a3_2, h_4_3));
	w2 = _mm256_add_pd(w2, _mm256_mul_pd(a2_2, h_4_2));
	w2 = _mm256_add_pd(w2, _mm256_mul_pd(a1_2, h_4_1));
	__m256d z2 = _mm256_add_pd(a3_2, _mm256_mul_pd(a2_2, h_3_2));
	z2 = _mm256_add_pd(z2, _mm256_mul_pd(a1_2, h_3_1));
	__m256d y2 = _mm256_add_pd(a2_2, _mm256_mul_pd(a1_2, h_2_1));
	__m256d x2 = a1_2;
#endif

	__m256d q1;
	__m256d q2;

	__m256d h1;
	__m256d h2;
	__m256d h3;
	__m256d h4;

	// columns 4..nb-1 contribute to all four accumulators
	for(i = 4; i < nb; i++)
	{
		h1 = _mm256_broadcast_sd(&hh[i-3]);
		h2 = _mm256_broadcast_sd(&hh[ldh+i-2]);
		h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]);
		h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]);

		q1 = _mm256_load_pd(&q[i*ldq]);
#ifdef __ELPA_USE_FMA__
		x1 = _mm256_FMA_pd(q1, h1, x1);
		y1 = _mm256_FMA_pd(q1, h2, y1);
		z1 = _mm256_FMA_pd(q1, h3, z1);
		w1 = _mm256_FMA_pd(q1, h4, w1);
#else
		x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1));
		y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2));
		z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3));
		w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4));
#endif

		q2 = _mm256_load_pd(&q[(i*ldq)+4]);
#ifdef __ELPA_USE_FMA__
		x2 = _mm256_FMA_pd(q2, h1, x2);
		y2 = _mm256_FMA_pd(q2, h2, y2);
		z2 = _mm256_FMA_pd(q2, h3, z2);
		w2 = _mm256_FMA_pd(q2, h4, w2);
#else
		x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
		y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2));
		z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3));
		w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4));
#endif
	}

	// column nb: contributes to x, y and z only
	h1 = _mm256_broadcast_sd(&hh[nb-3]);
	h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]);
	h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]);

	q1 = _mm256_load_pd(&q[nb*ldq]);
	q2 = _mm256_load_pd(&q[(nb*ldq)+4]);

#ifdef __ELPA_USE_FMA__
	x1 = _mm256_FMA_pd(q1, h1, x1);
	x2 = _mm256_FMA_pd(q2, h1, x2);
	y1 = _mm256_FMA_pd(q1, h2, y1);
	y2 = _mm256_FMA_pd(q2, h2, y2);
	z1 = _mm256_FMA_pd(q1, h3, z1);
	z2 = _mm256_FMA_pd(q2, h3, z2);
#else
	x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1));
	x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
	y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2));
	y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2));
	z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3));
	z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3));
#endif

	// column nb+1: contributes to x and y only
	h1 = _mm256_broadcast_sd(&hh[nb-2]);
	h2 = _mm256_broadcast_sd(&hh[(ldh*1)+nb-1]);

	q1 = _mm256_load_pd(&q[(nb+1)*ldq]);
	q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]);

#ifdef __ELPA_USE_FMA__
	x1 = _mm256_FMA_pd(q1, h1, x1);
	x2 = _mm256_FMA_pd(q2, h1, x2);
	y1 = _mm256_FMA_pd(q1, h2, y1);
	y2 = _mm256_FMA_pd(q2, h2, y2);
#else
	x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1));
	x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
	y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2));
	y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2));
#endif

	// column nb+2: contributes to x only
	h1 = _mm256_broadcast_sd(&hh[nb-1]);

	q1 = _mm256_load_pd(&q[(nb+2)*ldq]);
	q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]);

#ifdef __ELPA_USE_FMA__
	x1 = _mm256_FMA_pd(q1, h1, x1);
	x2 = _mm256_FMA_pd(q2, h1, x2);
#else
	x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1));
	x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
#endif

	/////////////////////////////////////////////////////
	// Rank-1 update of Q [8 x nb+3]
	/////////////////////////////////////////////////////

	// element 0 of each householder vector is used as its tau factor
	__m256d tau1 = _mm256_broadcast_sd(&hh[0]);
	__m256d tau2 = _mm256_broadcast_sd(&hh[ldh]);
	__m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]);
	__m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]);

	// scalar products passed in by the caller, broadcast to all lanes
	__m256d vs_1_2 = _mm256_broadcast_sd(&s_1_2);
	__m256d vs_1_3 = _mm256_broadcast_sd(&s_1_3);
	__m256d vs_2_3 = _mm256_broadcast_sd(&s_2_3);
	__m256d vs_1_4 = _mm256_broadcast_sd(&s_1_4);
	__m256d vs_2_4 = _mm256_broadcast_sd(&s_2_4);
	__m256d vs_3_4 = _mm256_broadcast_sd(&s_3_4);

	// x = x * tau1
	h1 = tau1;
	x1 = _mm256_mul_pd(x1, h1);
	x2 = _mm256_mul_pd(x2, h1);

	// y = y * tau2 - x * (tau2 * s_1_2)
	h1 = tau2;
	h2 = _mm256_mul_pd(h1, vs_1_2);
#ifdef __ELPA_USE_FMA__
	y1 = _mm256_FMSUB_pd(y1, h1, _mm256_mul_pd(x1,h2));
	y2 = _mm256_FMSUB_pd(y2, h1, _mm256_mul_pd(x2,h2));
#else
	y1 = _mm256_sub_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2));
	y2 = _mm256_sub_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2));
#endif

	// z = z * tau3 - (y * (tau3 * s_2_3) + x * (tau3 * s_1_3))
	h1 = tau3;
	h2 = _mm256_mul_pd(h1, vs_1_3);
	h3 = _mm256_mul_pd(h1, vs_2_3);
#ifdef __ELPA_USE_FMA__
	z1 = _mm256_FMSUB_pd(z1, h1, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)));
	z2 = _mm256_FMSUB_pd(z2, h1, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)));
#else
	z1 = _mm256_sub_pd(_mm256_mul_pd(z1,h1), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)));
	z2 = _mm256_sub_pd(_mm256_mul_pd(z2,h1), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)));
#endif

	// w = w * tau4 - (z * (tau4 * s_3_4) + y * (tau4 * s_2_4) + x * (tau4 * s_1_4))
	h1 = tau4;
	h2 = _mm256_mul_pd(h1, vs_1_4);
	h3 = _mm256_mul_pd(h1, vs_2_4);
	h4 = _mm256_mul_pd(h1, vs_3_4);
#ifdef __ELPA_USE_FMA__
	w1 = _mm256_FMSUB_pd(w1, h1, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))));
	w2 = _mm256_FMSUB_pd(w2, h1, _mm256_FMA_pd(z2, h4, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))));
#else
	w1 = _mm256_sub_pd(_mm256_mul_pd(w1,h1), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))));
	w2 = _mm256_sub_pd(_mm256_mul_pd(w2,h1), _mm256_add_pd(_mm256_mul_pd(z2,h4), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))));
#endif

	// column 0: q -= w
	q1 = _mm256_load_pd(&q[0]);
	q2 = _mm256_load_pd(&q[4]);
	q1 = _mm256_sub_pd(q1, w1);
	q2 = _mm256_sub_pd(q2, w2);
	_mm256_store_pd(&q[0],q1);
	_mm256_store_pd(&q[4],q2);

	// column 1: q -= z + w*h4
	h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]);
	q1 = _mm256_load_pd(&q[ldq]);
	q2 = _mm256_load_pd(&q[ldq+4]);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_sub_pd(q1, _mm256_FMA_pd(w1, h4, z1));
	q2 = _mm256_sub_pd(q2, _mm256_FMA_pd(w2, h4, z2));
#else
	q1 = _mm256_sub_pd(q1, _mm256_add_pd(z1, _mm256_mul_pd(w1, h4)));
	q2 = _mm256_sub_pd(q2, _mm256_add_pd(z2, _mm256_mul_pd(w2, h4)));
#endif
	_mm256_store_pd(&q[ldq],q1);
	_mm256_store_pd(&q[ldq+4],q2);

	// column 2: q -= y + z*h3 + w*h4
	h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]);
	h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]);
	q1 = _mm256_load_pd(&q[ldq*2]);
	q2 = _mm256_load_pd(&q[(ldq*2)+4]);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_sub_pd(q1, y1);
	q1 = _mm256_NFMA_pd(z1, h3, q1);
	q1 = _mm256_NFMA_pd(w1, h4, q1);
	q2 = _mm256_sub_pd(q2, y2);
	q2 = _mm256_NFMA_pd(z2, h3, q2);
	q2 = _mm256_NFMA_pd(w2, h4, q2);
#else
	q1 = _mm256_sub_pd(q1, _mm256_add_pd(y1, _mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(w1, h4))));
	q2 = _mm256_sub_pd(q2, _mm256_add_pd(y2, _mm256_add_pd(_mm256_mul_pd(z2, h3), _mm256_mul_pd(w2, h4))));
#endif
	_mm256_store_pd(&q[ldq*2],q1);
	_mm256_store_pd(&q[(ldq*2)+4],q2);

	// column 3: q -= x + y*h2 + z*h3 + w*h4
	h2 = _mm256_broadcast_sd(&hh[ldh+1]);
	h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]);
	h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]);
	q1 = _mm256_load_pd(&q[ldq*3]);
	q2 = _mm256_load_pd(&q[(ldq*3)+4]);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_sub_pd(q1, x1);
	q1 = _mm256_NFMA_pd(y1, h2, q1);
	q1 = _mm256_NFMA_pd(z1, h3, q1);
	q1 = _mm256_NFMA_pd(w1, h4, q1);
	q2 = _mm256_sub_pd(q2, x2);
	q2 = _mm256_NFMA_pd(y2, h2, q2);
	q2 = _mm256_NFMA_pd(z2, h3, q2);
	q2 = _mm256_NFMA_pd(w2, h4, q2);
#else
	q1 = _mm256_sub_pd(q1, _mm256_add_pd(x1, _mm256_add_pd(_mm256_mul_pd(y1, h2), _mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(w1, h4)))));
	q2 = _mm256_sub_pd(q2, _mm256_add_pd(x2, _mm256_add_pd(_mm256_mul_pd(y2, h2), _mm256_add_pd(_mm256_mul_pd(z2, h3), _mm256_mul_pd(w2, h4)))));
#endif
	_mm256_store_pd(&q[ldq*3], q1);
	_mm256_store_pd(&q[(ldq*3)+4], q2);

	// columns 4..nb-1: q -= x*h1 + y*h2 + z*h3 + w*h4
	for (i = 4; i < nb; i++)
	{
		h1 = _mm256_broadcast_sd(&hh[i-3]);
		h2 = _mm256_broadcast_sd(&hh[ldh+i-2]);
		h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]);
		h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]);

#ifdef __ELPA_USE_FMA__
		q1 = _mm256_load_pd(&q[i*ldq]);
		q2 = _mm256_load_pd(&q[(i*ldq)+4]);
		q1 = _mm256_NFMA_pd(x1, h1, q1);
		q1 = _mm256_NFMA_pd(y1, h2, q1);
		q1 = _mm256_NFMA_pd(z1, h3, q1);
		q1 = _mm256_NFMA_pd(w1, h4, q1);
		q2 = _mm256_NFMA_pd(x2, h1, q2);
		q2 = _mm256_NFMA_pd(y2, h2, q2);
		q2 = _mm256_NFMA_pd(z2, h3, q2);
		q2 = _mm256_NFMA_pd(w2, h4, q2);
		_mm256_store_pd(&q[i*ldq],q1);
		_mm256_store_pd(&q[(i*ldq)+4],q2);
#else
		q1 = _mm256_load_pd(&q[i*ldq]);
		q1 = _mm256_sub_pd(q1, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1, h4), _mm256_mul_pd(z1, h3)), _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2))));
		_mm256_store_pd(&q[i*ldq],q1);

		q2 = _mm256_load_pd(&q[(i*ldq)+4]);
		q2 = _mm256_sub_pd(q2, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w2, h4), _mm256_mul_pd(z2, h3)), _mm256_add_pd(_mm256_mul_pd(x2,h1), _mm256_mul_pd(y2, h2))));
		_mm256_store_pd(&q[(i*ldq)+4],q2);
#endif
	}

	// column nb: q -= x*h1 + y*h2 + z*h3
	h1 = _mm256_broadcast_sd(&hh[nb-3]);
	h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]);
	h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]);
	q1 = _mm256_load_pd(&q[nb*ldq]);
	q2 = _mm256_load_pd(&q[(nb*ldq)+4]);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_NFMA_pd(x1, h1, q1);
	q1 = _mm256_NFMA_pd(y1, h2, q1);
	q1 = _mm256_NFMA_pd(z1, h3, q1);
	q2 = _mm256_NFMA_pd(x2, h1, q2);
	q2 = _mm256_NFMA_pd(y2, h2, q2);
	q2 = _mm256_NFMA_pd(z2, h3, q2);
#else
	q1 = _mm256_sub_pd(q1, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(y1, h2)) , _mm256_mul_pd(x1, h1)));
	q2 = _mm256_sub_pd(q2, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(z2, h3), _mm256_mul_pd(y2, h2)) , _mm256_mul_pd(x2, h1)));
#endif
	_mm256_store_pd(&q[nb*ldq],q1);
	_mm256_store_pd(&q[(nb*ldq)+4],q2);

	// column nb+1: q -= x*h1 + y*h2
	h1 = _mm256_broadcast_sd(&hh[nb-2]);
	h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]);
	q1 = _mm256_load_pd(&q[(nb+1)*ldq]);
	q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_NFMA_pd(x1, h1, q1);
	q1 = _mm256_NFMA_pd(y1, h2, q1);
	q2 = _mm256_NFMA_pd(x2, h1, q2);
	q2 = _mm256_NFMA_pd(y2, h2, q2);
#else
	q1 = _mm256_sub_pd(q1, _mm256_add_pd( _mm256_mul_pd(y1, h2) , _mm256_mul_pd(x1, h1)));
	q2 = _mm256_sub_pd(q2, _mm256_add_pd( _mm256_mul_pd(y2, h2) , _mm256_mul_pd(x2, h1)));
#endif
	_mm256_store_pd(&q[(nb+1)*ldq],q1);
	_mm256_store_pd(&q[((nb+1)*ldq)+4],q2);

	// column nb+2: q -= x*h1
	h1 = _mm256_broadcast_sd(&hh[nb-1]);
	q1 = _mm256_load_pd(&q[(nb+2)*ldq]);
	q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_NFMA_pd(x1, h1, q1);
	q2 = _mm256_NFMA_pd(x2, h1, q2);
#else
	q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1));
	q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1));
#endif
	_mm256_store_pd(&q[(nb+2)*ldq],q1);
	_mm256_store_pd(&q[((nb+2)*ldq)+4],q2);
}

/**
 * Unrolled kernel that computes
 * 4 rows of Q simultaneously, a
 * matrix vector product with four householder
 * vectors + a rank 1 update is performed
 */
__forceinline void hh_trafo_kernel_4_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
{
	/////////////////////////////////////////////////////
	// Matrix Vector Multiplication, Q [4 x nb+3] * hh
	// hh contains four householder vectors
	/////////////////////////////////////////////////////
	int i;

	// rows 0..3 of columns 3, 2, 1, 0 (aligned loads)
	__m256d a1_1 = _mm256_load_pd(&q[ldq*3]);
	__m256d a2_1 = _mm256_load_pd(&q[ldq*2]);
	__m256d a3_1 = _mm256_load_pd(&q[ldq]);
	__m256d a4_1 = _mm256_load_pd(&q[0]);

	// h_k_j: element j of householder vector k, broadcast to all lanes
	__m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]);
	__m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]);
	__m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]);
	__m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]);
	__m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]);
	__m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]);

#ifdef __ELPA_USE_FMA__
	__m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1);
	w1 = _mm256_FMA_pd(a2_1,
h_4_2, w1); w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1); __m256d x1 = a1_1; #else __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2)); w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1)); __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1)); __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1)); __m256d x1 = a1_1; #endif __m256d q1; __m256d h1; __m256d h2; __m256d h3; __m256d h4; for(i = 4; i < nb; i++) { h1 = _mm256_broadcast_sd(&hh[i-3]); h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); q1 = _mm256_load_pd(&q[i*ldq]); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); y1 = _mm256_FMA_pd(q1, h2, y1); z1 = _mm256_FMA_pd(q1, h3, z1); w1 = _mm256_FMA_pd(q1, h4, w1); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); #endif } h1 = _mm256_broadcast_sd(&hh[nb-3]); h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); q1 = _mm256_load_pd(&q[nb*ldq]); #ifdef _FMA4__ x1 = _mm256_FMA_pd(q1, h1, x1); y1 = _mm256_FMA_pd(q1, h2, y1); z1 = _mm256_FMA_pd(q1, h3, z1); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); #endif h1 = _mm256_broadcast_sd(&hh[nb-2]); h2 = _mm256_broadcast_sd(&hh[(ldh*1)+nb-1]); q1 = _mm256_load_pd(&q[(nb+1)*ldq]); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); y1 = _mm256_FMA_pd(q1, h2, y1); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); #endif h1 = _mm256_broadcast_sd(&hh[nb-1]); q1 = 
_mm256_load_pd(&q[(nb+2)*ldq]); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); #endif ///////////////////////////////////////////////////// // Rank-1 update of Q [4 x nb+3] ///////////////////////////////////////////////////// __m256d tau1 = _mm256_broadcast_sd(&hh[0]); __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]); __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]); __m256d vs_1_2 = _mm256_broadcast_sd(&s_1_2); __m256d vs_1_3 = _mm256_broadcast_sd(&s_1_3); __m256d vs_2_3 = _mm256_broadcast_sd(&s_2_3); __m256d vs_1_4 = _mm256_broadcast_sd(&s_1_4); __m256d vs_2_4 = _mm256_broadcast_sd(&s_2_4); __m256d vs_3_4 = _mm256_broadcast_sd(&s_3_4); h1 = tau1; x1 = _mm256_mul_pd(x1, h1); h1 = tau2; h2 = _mm256_mul_pd(h1, vs_1_2); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMSUB_pd(y1, h1, _mm256_mul_pd(x1,h2)); #else y1 = _mm256_sub_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2)); #endif h1 = tau3; h2 = _mm256_mul_pd(h1, vs_1_3); h3 = _mm256_mul_pd(h1, vs_2_3); #ifdef __ELPA_USE_FMA__ z1 = _mm256_FMSUB_pd(z1, h1, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); #else z1 = _mm256_sub_pd(_mm256_mul_pd(z1,h1), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))); #endif h1 = tau4; h2 = _mm256_mul_pd(h1, vs_1_4); h3 = _mm256_mul_pd(h1, vs_2_4); h4 = _mm256_mul_pd(h1, vs_3_4); #ifdef __ELPA_USE_FMA__ w1 = _mm256_FMSUB_pd(w1, h1, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); #else w1 = _mm256_sub_pd(_mm256_mul_pd(w1,h1), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); #endif q1 = _mm256_load_pd(&q[0]); q1 = _mm256_sub_pd(q1, w1); _mm256_store_pd(&q[0],q1); h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); q1 = _mm256_load_pd(&q[ldq]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_sub_pd(q1, _mm256_FMA_pd(w1, h4, z1)); #else q1 = _mm256_sub_pd(q1, _mm256_add_pd(z1, _mm256_mul_pd(w1, h4))); #endif _mm256_store_pd(&q[ldq],q1); h3 = 
_mm256_broadcast_sd(&hh[(ldh*2)+1]); h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); q1 = _mm256_load_pd(&q[ldq*2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_sub_pd(q1, y1); q1 = _mm256_NFMA_pd(z1, h3, q1); q1 = _mm256_NFMA_pd(w1, h4, q1); #else q1 = _mm256_sub_pd(q1, _mm256_add_pd(y1, _mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(w1, h4)))); #endif _mm256_store_pd(&q[ldq*2],q1); h2 = _mm256_broadcast_sd(&hh[ldh+1]); h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); q1 = _mm256_load_pd(&q[ldq*3]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_sub_pd(q1, x1); q1 = _mm256_NFMA_pd(y1, h2, q1); q1 = _mm256_NFMA_pd(z1, h3, q1); q1 = _mm256_NFMA_pd(w1, h4, q1); #else q1 = _mm256_sub_pd(q1, _mm256_add_pd(x1, _mm256_add_pd(_mm256_mul_pd(y1, h2), _mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(w1, h4))))); #endif _mm256_store_pd(&q[ldq*3], q1); for (i = 4; i < nb; i++) { h1 = _mm256_broadcast_sd(&hh[i-3]); h2 = _mm256_broadcast_sd(&hh[ldh+i-2]); h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-1]); h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i]); q1 = _mm256_load_pd(&q[i*ldq]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); q1 = _mm256_NFMA_pd(y1, h2, q1); q1 = _mm256_NFMA_pd(z1, h3, q1); q1 = _mm256_NFMA_pd(w1, h4, q1); #else q1 = _mm256_sub_pd(q1, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1, h4), _mm256_mul_pd(z1, h3)), _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2)))); #endif _mm256_store_pd(&q[i*ldq],q1); } h1 = _mm256_broadcast_sd(&hh[nb-3]); h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); q1 = _mm256_load_pd(&q[nb*ldq]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); q1 = _mm256_NFMA_pd(y1, h2, q1); q1 = _mm256_NFMA_pd(z1, h3, q1); #else q1 = _mm256_sub_pd(q1, _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(z1, h3), _mm256_mul_pd(y1, h2)) , _mm256_mul_pd(x1, h1))); #endif _mm256_store_pd(&q[nb*ldq],q1); h1 = _mm256_broadcast_sd(&hh[nb-2]); h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); q1 = 
_mm256_load_pd(&q[(nb+1)*ldq]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); q1 = _mm256_NFMA_pd(y1, h2, q1); #else q1 = _mm256_sub_pd(q1, _mm256_add_pd( _mm256_mul_pd(y1, h2) , _mm256_mul_pd(x1, h1))); #endif _mm256_store_pd(&q[(nb+1)*ldq],q1); h1 = _mm256_broadcast_sd(&hh[nb-1]); q1 = _mm256_load_pd(&q[(nb+2)*ldq]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); #endif _mm256_store_pd(&q[(nb+2)*ldq],q1); } elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_bgq.f900000644000312500001440000005260112717516040020250 00000000000000! This file is part of ELPA. ! ! The ELPA library was originally created by the ELPA consortium, ! consisting of the following organizations: ! ! - Max Planck Computing and Data Facility (MPCDF), formerly known as ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte ! Informatik, ! - Technische Universität München, Lehrstuhl für Informatik mit ! Schwerpunkt Wissenschaftliches Rechnen , ! - Fritz-Haber-Institut, Berlin, Abt. Theorie, ! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, ! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, ! and ! - IBM Deutschland GmbH ! ! ! More information can be found here: ! http://elpa.mpcdf.mpg.de/ ! ! ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see ! ! ELPA reflects a substantial effort on the part of the original ! 
ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! --------------------------------------------------------------------------------------------------
!
! This file contains the compute intensive kernels for the Householder transformations.
!
! *** Special IBM BlueGene/Q version with QPX intrinsics in Fortran ***
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------
module real_bgq_kernel

  private
  public double_hh_trafo_bgq

contains

  ! Applies two Householder vectors (columns 1 and 2 of hh) to nq rows of q.
  !
  !   q   : matrix to transform in place; columns 1 .. nb+1 are touched
  !   hh  : hh(1,1) / hh(1,2) are the tau values, hh(2:nb,1) / hh(2:nb,2) the
  !         remaining vector entries
  !   nb  : length of the Householder vectors
  !   nq  : number of rows of q to transform
  !   ldq : leading dimension of q, must be divisible by 4 (the routine STOPs otherwise)
  !   ldh : leading dimension of hh
  !
  ! Rows are handled in chunks of 24; what is left (at most 20 rows, rounded up
  ! to a multiple of 4) is covered by a combination of the 16/8/4-row kernels.
  subroutine double_hh_trafo_bgq(q, hh, nb, nq, ldq, ldh)
    use precision
    implicit none

    integer(kind=ik), intent(in) :: nb, nq, ldq, ldh
    real(kind=rk), intent(inout) :: q(ldq,*)
    real(kind=rk), intent(in)    :: hh(ldh,*)

    real(kind=rk)                :: s
    integer(kind=ik)             :: i

    ! Safety only:
    if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'

    call alignx(32,q)

    ! Calculate dot product of the two Householder vectors
    s = hh(2,2)*1
    do i=3,nb
      s = s+hh(i,2)*hh(i-1,1)
    enddo

    do i=1,nq-20,24
      call hh_trafo_kernel_24_bgq(q(i ,1), hh, nb, ldq, ldh, s)
    enddo

    ! i now points at the first row that has not been processed yet
    if(nq-i+1 > 16) then
      call hh_trafo_kernel_16_bgq(q(i ,1), hh, nb, ldq, ldh, s)
      call hh_trafo_kernel_4_bgq(q(i+16,1), hh, nb, ldq, ldh, s)
    else if(nq-i+1 > 12) then
      call hh_trafo_kernel_8_bgq(q(i ,1), hh, nb, ldq, ldh, s)
      call hh_trafo_kernel_8_bgq(q(i+8,1), hh, nb, ldq, ldh, s)
    else if(nq-i+1 > 8) then
      call hh_trafo_kernel_8_bgq(q(i ,1), hh, nb, ldq, ldh, s)
      call hh_trafo_kernel_4_bgq(q(i+8,1), hh, nb, ldq, ldh, s)
    else if(nq-i+1 > 4) then
      call hh_trafo_kernel_8_bgq(q(i ,1), hh, nb, ldq, ldh, s)
    else if(nq-i+1 > 0) then
      call hh_trafo_kernel_4_bgq(q(i ,1), hh, nb, ldq, ldh, s)
    endif

  end subroutine double_hh_trafo_bgq

  ! --------------------------------------------------------------------------------------------------
  ! The following kernels perform the Householder transformation on Q for 24/16/8/4 rows.
  !
  ! All four kernels share the same three phases and differ only in the number
  ! of vector registers used per column:
  !   1. x = q(:,2) + sum_{i=3..nb} q(:,i)*hh(i-1,1) + q(:,nb+1)*hh(nb,1)
  !      y = q(:,1) + q(:,2)*hh(2,2) + sum_{i=3..nb} q(:,i)*hh(i,2)
  !   2. x = -hh(1,1)*x ;  y = -hh(1,2)*y - hh(1,2)*s*x
  !   3. q(:,1)    = q(:,1)    + y
  !      q(:,2)    = q(:,2)    + x + y*hh(2,2)
  !      q(:,i)    = q(:,i)    + x*hh(i-1,1) + y*hh(i,2)     (i = 3..nb)
  !      q(:,nb+1) = q(:,nb+1) + x*hh(nb,1)
  ! s is the dot product of the two Householder vectors, computed by the caller.
  !
  ! The kernels use only compiler vector intrinsics, so they deliberately do
  ! not include 'mpif.h'.
  ! --------------------------------------------------------------------------------------------------

  ! 24-row kernel: six vector registers per column (row offsets 1,5,9,13,17,21).
  subroutine hh_trafo_kernel_24_bgq(q, hh, nb, ldq, ldh, s)
    use precision
    implicit none

    integer(kind=ik), intent(in) :: nb, ldq, ldh
    real(kind=rk), intent(inout) :: q(ldq,*)
    real(kind=rk), intent(in)    :: hh(ldh,*), s

    VECTOR(REAL(8))::QPX_x1, QPX_x2, QPX_x3, QPX_x4, QPX_x5, QPX_x6
    VECTOR(REAL(8))::QPX_y1, QPX_y2, QPX_y3, QPX_y4, QPX_y5, QPX_y6
    VECTOR(REAL(8))::QPX_q1, QPX_q2, QPX_q3, QPX_q4, QPX_q5, QPX_q6
    VECTOR(REAL(8))::QPX_h1, QPX_h2, QPX_tau1, QPX_tau2, QPX_s
    integer(kind=ik) :: i

    call alignx(32,q)

    !--- multiply Householder vectors with matrix q ---

    QPX_x1 = VEC_LD(0,q(1,2))
    QPX_x2 = VEC_LD(0,q(5,2))
    QPX_x3 = VEC_LD(0,q(9,2))
    QPX_x4 = VEC_LD(0,q(13,2))
    QPX_x5 = VEC_LD(0,q(17,2))
    QPX_x6 = VEC_LD(0,q(21,2))

    QPX_h2 = VEC_SPLATS(hh(2,2))
    QPX_q1 = VEC_LD(0,q(1,1))
    QPX_q2 = VEC_LD(0,q(5,1))
    QPX_q3 = VEC_LD(0,q(9,1))
    QPX_q4 = VEC_LD(0,q(13,1))
    QPX_q5 = VEC_LD(0,q(17,1))
    QPX_q6 = VEC_LD(0,q(21,1))
    QPX_y1 = VEC_MADD(QPX_x1, QPX_h2, QPX_q1)
    QPX_y2 = VEC_MADD(QPX_x2, QPX_h2, QPX_q2)
    QPX_y3 = VEC_MADD(QPX_x3, QPX_h2, QPX_q3)
    QPX_y4 = VEC_MADD(QPX_x4, QPX_h2, QPX_q4)
    QPX_y5 = VEC_MADD(QPX_x5, QPX_h2, QPX_q5)
    QPX_y6 = VEC_MADD(QPX_x6, QPX_h2, QPX_q6)

    do i=3,nb,1

      QPX_q1 = VEC_LD(0,q(1,i))
      QPX_q2 = VEC_LD(0,q(5,i))
      QPX_q3 = VEC_LD(0,q(9,i))
      QPX_q4 = VEC_LD(0,q(13,i))
      QPX_q5 = VEC_LD(0,q(17,i))
      QPX_q6 = VEC_LD(0,q(21,i))
      QPX_h1 = VEC_SPLATS(hh(i-1,1))
      QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1)
      QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2)
      QPX_x3 = VEC_MADD(QPX_q3, QPX_h1, QPX_x3)
      QPX_x4 = VEC_MADD(QPX_q4, QPX_h1, QPX_x4)
      QPX_x5 = VEC_MADD(QPX_q5, QPX_h1, QPX_x5)
      QPX_x6 = VEC_MADD(QPX_q6, QPX_h1, QPX_x6)
      QPX_h2 = VEC_SPLATS(hh(i,2))
      QPX_y1 = VEC_MADD(QPX_q1, QPX_h2, QPX_y1)
      QPX_y2 = VEC_MADD(QPX_q2, QPX_h2, QPX_y2)
      QPX_y3 = VEC_MADD(QPX_q3, QPX_h2, QPX_y3)
      QPX_y4 = VEC_MADD(QPX_q4, QPX_h2, QPX_y4)
      QPX_y5 = VEC_MADD(QPX_q5, QPX_h2, QPX_y5)
      QPX_y6 = VEC_MADD(QPX_q6, QPX_h2, QPX_y6)

    enddo

    QPX_h1 = VEC_SPLATS(hh(nb,1))
    QPX_q1 = VEC_LD(0,q(1,nb+1))
    QPX_q2 = VEC_LD(0,q(5,nb+1))
    QPX_q3 = VEC_LD(0,q(9,nb+1))
    QPX_q4 = VEC_LD(0,q(13,nb+1))
    QPX_q5 = VEC_LD(0,q(17,nb+1))
    QPX_q6 = VEC_LD(0,q(21,nb+1))
    QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1)
    QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2)
    QPX_x3 = VEC_MADD(QPX_q3, QPX_h1, QPX_x3)
    QPX_x4 = VEC_MADD(QPX_q4, QPX_h1, QPX_x4)
    QPX_x5 = VEC_MADD(QPX_q5, QPX_h1, QPX_x5)
    QPX_x6 = VEC_MADD(QPX_q6, QPX_h1, QPX_x6)

    !--- multiply T matrix ---

    QPX_tau1 = VEC_SPLATS(-hh(1,1))
    QPX_x1 = VEC_MUL(QPX_x1, QPX_tau1)
    QPX_x2 = VEC_MUL(QPX_x2, QPX_tau1)
    QPX_x3 = VEC_MUL(QPX_x3, QPX_tau1)
    QPX_x4 = VEC_MUL(QPX_x4, QPX_tau1)
    QPX_x5 = VEC_MUL(QPX_x5, QPX_tau1)
    QPX_x6 = VEC_MUL(QPX_x6, QPX_tau1)
    QPX_tau2 = VEC_SPLATS(-hh(1,2))
    QPX_s = VEC_SPLATS(-hh(1,2)*s)
    QPX_y1 = VEC_MUL(QPX_y1, QPX_tau2)
    QPX_y2 = VEC_MUL(QPX_y2, QPX_tau2)
    QPX_y3 = VEC_MUL(QPX_y3, QPX_tau2)
    QPX_y4 = VEC_MUL(QPX_y4, QPX_tau2)
    QPX_y5 = VEC_MUL(QPX_y5, QPX_tau2)
    QPX_y6 = VEC_MUL(QPX_y6, QPX_tau2)
    QPX_y1 = VEC_MADD(QPX_x1, QPX_s, QPX_y1)
    QPX_y2 = VEC_MADD(QPX_x2, QPX_s, QPX_y2)
    QPX_y3 = VEC_MADD(QPX_x3, QPX_s, QPX_y3)
    QPX_y4 = VEC_MADD(QPX_x4, QPX_s, QPX_y4)
    QPX_y5 = VEC_MADD(QPX_x5, QPX_s, QPX_y5)
    QPX_y6 = VEC_MADD(QPX_x6, QPX_s, QPX_y6)

    !--- rank-2 update of q ---

    QPX_q1 = VEC_LD(0,q(1,1))
    QPX_q2 = VEC_LD(0,q(5,1))
    QPX_q3 = VEC_LD(0,q(9,1))
    QPX_q4 = VEC_LD(0,q(13,1))
    QPX_q5 = VEC_LD(0,q(17,1))
    QPX_q6 = VEC_LD(0,q(21,1))
    QPX_q1 = VEC_ADD(QPX_q1, QPX_y1)
    QPX_q2 = VEC_ADD(QPX_q2, QPX_y2)
    QPX_q3 = VEC_ADD(QPX_q3, QPX_y3)
    QPX_q4 = VEC_ADD(QPX_q4, QPX_y4)
    QPX_q5 = VEC_ADD(QPX_q5, QPX_y5)
    QPX_q6 = VEC_ADD(QPX_q6, QPX_y6)
    call VEC_ST(QPX_q1, 0, q(1,1))
    call VEC_ST(QPX_q2, 0, q(5,1))
    call VEC_ST(QPX_q3, 0, q(9,1))
    call VEC_ST(QPX_q4, 0, q(13,1))
    call VEC_ST(QPX_q5, 0, q(17,1))
    call VEC_ST(QPX_q6, 0, q(21,1))

    QPX_h2 = VEC_SPLATS(hh(2,2))
    QPX_q1 = VEC_LD(0,q(1,2))
    QPX_q2 = VEC_LD(0,q(5,2))
    QPX_q3 = VEC_LD(0,q(9,2))
    QPX_q4 = VEC_LD(0,q(13,2))
    QPX_q5 = VEC_LD(0,q(17,2))
    QPX_q6 = VEC_LD(0,q(21,2))
    QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1)
    QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2)
    QPX_q3 = VEC_MADD(QPX_y3, QPX_h2, QPX_q3)
    QPX_q4 = VEC_MADD(QPX_y4, QPX_h2, QPX_q4)
    QPX_q5 = VEC_MADD(QPX_y5, QPX_h2, QPX_q5)
    QPX_q6 = VEC_MADD(QPX_y6, QPX_h2, QPX_q6)
    QPX_q1 = VEC_ADD(QPX_q1, QPX_x1)
    QPX_q2 = VEC_ADD(QPX_q2, QPX_x2)
    QPX_q3 = VEC_ADD(QPX_q3, QPX_x3)
    QPX_q4 = VEC_ADD(QPX_q4, QPX_x4)
    QPX_q5 = VEC_ADD(QPX_q5, QPX_x5)
    QPX_q6 = VEC_ADD(QPX_q6, QPX_x6)
    call VEC_ST(QPX_q1, 0, q(1,2))
    call VEC_ST(QPX_q2, 0, q(5,2))
    call VEC_ST(QPX_q3, 0, q(9,2))
    call VEC_ST(QPX_q4, 0, q(13,2))
    call VEC_ST(QPX_q5, 0, q(17,2))
    call VEC_ST(QPX_q6, 0, q(21,2))

    do i=3,nb,1

      QPX_q1 = VEC_LD(0,q(1,i))
      QPX_q2 = VEC_LD(0,q(5,i))
      QPX_q3 = VEC_LD(0,q(9,i))
      QPX_q4 = VEC_LD(0,q(13,i))
      QPX_q5 = VEC_LD(0,q(17,i))
      QPX_q6 = VEC_LD(0,q(21,i))
      QPX_h1 = VEC_SPLATS(hh(i-1,1))
      QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1)
      QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2)
      QPX_q3 = VEC_MADD(QPX_x3, QPX_h1, QPX_q3)
      QPX_q4 = VEC_MADD(QPX_x4, QPX_h1, QPX_q4)
      QPX_q5 = VEC_MADD(QPX_x5, QPX_h1, QPX_q5)
      QPX_q6 = VEC_MADD(QPX_x6, QPX_h1, QPX_q6)
      QPX_h2 = VEC_SPLATS(hh(i,2))
      QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1)
      QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2)
      QPX_q3 = VEC_MADD(QPX_y3, QPX_h2, QPX_q3)
      QPX_q4 = VEC_MADD(QPX_y4, QPX_h2, QPX_q4)
      QPX_q5 = VEC_MADD(QPX_y5, QPX_h2, QPX_q5)
      QPX_q6 = VEC_MADD(QPX_y6, QPX_h2, QPX_q6)

      call VEC_ST(QPX_q1, 0, q(1,i))
      call VEC_ST(QPX_q2, 0, q(5,i))
      call VEC_ST(QPX_q3, 0, q(9,i))
      call VEC_ST(QPX_q4, 0, q(13,i))
      call VEC_ST(QPX_q5, 0, q(17,i))
      call VEC_ST(QPX_q6, 0, q(21,i))

    enddo

    QPX_h1 = VEC_SPLATS(hh(nb,1))
    QPX_q1 = VEC_LD(0,q(1,nb+1))
    QPX_q2 = VEC_LD(0,q(5,nb+1))
    QPX_q3 = VEC_LD(0,q(9,nb+1))
    QPX_q4 = VEC_LD(0,q(13,nb+1))
    QPX_q5 = VEC_LD(0,q(17,nb+1))
    QPX_q6 = VEC_LD(0,q(21,nb+1))
    QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1)
    QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2)
    QPX_q3 = VEC_MADD(QPX_x3, QPX_h1, QPX_q3)
    QPX_q4 = VEC_MADD(QPX_x4, QPX_h1, QPX_q4)
    QPX_q5 = VEC_MADD(QPX_x5, QPX_h1, QPX_q5)
    QPX_q6 = VEC_MADD(QPX_x6, QPX_h1, QPX_q6)

    call VEC_ST(QPX_q1, 0, q(1,nb+1))
    call VEC_ST(QPX_q2, 0, q(5,nb+1))
    call VEC_ST(QPX_q3, 0, q(9,nb+1))
    call VEC_ST(QPX_q4, 0, q(13,nb+1))
    call VEC_ST(QPX_q5, 0, q(17,nb+1))
    call VEC_ST(QPX_q6, 0, q(21,nb+1))

  end subroutine hh_trafo_kernel_24_bgq

  ! --------------------------------------------------------------------------------------------------

  ! 16-row kernel: four vector registers per column (row offsets 1,5,9,13).
  subroutine hh_trafo_kernel_16_bgq(q, hh, nb, ldq, ldh, s)
    use precision
    implicit none

    integer(kind=ik), intent(in) :: nb, ldq, ldh
    real(kind=rk), intent(inout) :: q(ldq,*)
    real(kind=rk), intent(in)    :: hh(ldh,*), s

    VECTOR(REAL(8))::QPX_x1, QPX_x2, QPX_x3, QPX_x4
    VECTOR(REAL(8))::QPX_y1, QPX_y2, QPX_y3, QPX_y4
    VECTOR(REAL(8))::QPX_q1, QPX_q2, QPX_q3, QPX_q4
    VECTOR(REAL(8))::QPX_h1, QPX_h2, QPX_tau1, QPX_tau2, QPX_s
    integer(kind=ik) :: i

    call alignx(32,q)

    !--- multiply Householder vectors with matrix q ---

    QPX_x1 = VEC_LD(0,q(1,2))
    QPX_x2 = VEC_LD(0,q(5,2))
    QPX_x3 = VEC_LD(0,q(9,2))
    QPX_x4 = VEC_LD(0,q(13,2))

    QPX_h2 = VEC_SPLATS(hh(2,2))
    QPX_q1 = VEC_LD(0,q(1,1))
    QPX_q2 = VEC_LD(0,q(5,1))
    QPX_q3 = VEC_LD(0,q(9,1))
    QPX_q4 = VEC_LD(0,q(13,1))
    QPX_y1 = VEC_MADD(QPX_x1, QPX_h2, QPX_q1)
    QPX_y2 = VEC_MADD(QPX_x2, QPX_h2, QPX_q2)
    QPX_y3 = VEC_MADD(QPX_x3, QPX_h2, QPX_q3)
    QPX_y4 = VEC_MADD(QPX_x4, QPX_h2, QPX_q4)

    do i=3,nb,1

      QPX_q1 = VEC_LD(0,q(1,i))
      QPX_q2 = VEC_LD(0,q(5,i))
      QPX_q3 = VEC_LD(0,q(9,i))
      QPX_q4 = VEC_LD(0,q(13,i))
      QPX_h1 = VEC_SPLATS(hh(i-1,1))
      QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1)
      QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2)
      QPX_x3 = VEC_MADD(QPX_q3, QPX_h1, QPX_x3)
      QPX_x4 = VEC_MADD(QPX_q4, QPX_h1, QPX_x4)
      QPX_h2 = VEC_SPLATS(hh(i,2))
      QPX_y1 = VEC_MADD(QPX_q1, QPX_h2, QPX_y1)
      QPX_y2 = VEC_MADD(QPX_q2, QPX_h2, QPX_y2)
      QPX_y3 = VEC_MADD(QPX_q3, QPX_h2, QPX_y3)
      QPX_y4 = VEC_MADD(QPX_q4, QPX_h2, QPX_y4)

    enddo

    QPX_h1 = VEC_SPLATS(hh(nb,1))
    QPX_q1 = VEC_LD(0,q(1,nb+1))
    QPX_q2 = VEC_LD(0,q(5,nb+1))
    QPX_q3 = VEC_LD(0,q(9,nb+1))
    QPX_q4 = VEC_LD(0,q(13,nb+1))
    QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1)
    QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2)
    QPX_x3 = VEC_MADD(QPX_q3, QPX_h1, QPX_x3)
    QPX_x4 = VEC_MADD(QPX_q4, QPX_h1, QPX_x4)

    !--- multiply T matrix ---

    QPX_tau1 = VEC_SPLATS(-hh(1,1))
    QPX_x1 = VEC_MUL(QPX_x1, QPX_tau1)
    QPX_x2 = VEC_MUL(QPX_x2, QPX_tau1)
    QPX_x3 = VEC_MUL(QPX_x3, QPX_tau1)
    QPX_x4 = VEC_MUL(QPX_x4, QPX_tau1)
    QPX_tau2 = VEC_SPLATS(-hh(1,2))
    QPX_s = VEC_SPLATS(-hh(1,2)*s)
    QPX_y1 = VEC_MUL(QPX_y1, QPX_tau2)
    QPX_y2 = VEC_MUL(QPX_y2, QPX_tau2)
    QPX_y3 = VEC_MUL(QPX_y3, QPX_tau2)
    QPX_y4 = VEC_MUL(QPX_y4, QPX_tau2)
    QPX_y1 = VEC_MADD(QPX_x1, QPX_s, QPX_y1)
    QPX_y2 = VEC_MADD(QPX_x2, QPX_s, QPX_y2)
    QPX_y3 = VEC_MADD(QPX_x3, QPX_s, QPX_y3)
    QPX_y4 = VEC_MADD(QPX_x4, QPX_s, QPX_y4)

    !--- rank-2 update of q ---

    QPX_q1 = VEC_LD(0,q(1,1))
    QPX_q2 = VEC_LD(0,q(5,1))
    QPX_q3 = VEC_LD(0,q(9,1))
    QPX_q4 = VEC_LD(0,q(13,1))
    QPX_q1 = VEC_ADD(QPX_q1, QPX_y1)
    QPX_q2 = VEC_ADD(QPX_q2, QPX_y2)
    QPX_q3 = VEC_ADD(QPX_q3, QPX_y3)
    QPX_q4 = VEC_ADD(QPX_q4, QPX_y4)
    call VEC_ST(QPX_q1, 0, q(1,1))
    call VEC_ST(QPX_q2, 0, q(5,1))
    call VEC_ST(QPX_q3, 0, q(9,1))
    call VEC_ST(QPX_q4, 0, q(13,1))

    QPX_h2 = VEC_SPLATS(hh(2,2))
    QPX_q1 = VEC_LD(0,q(1,2))
    QPX_q2 = VEC_LD(0,q(5,2))
    QPX_q3 = VEC_LD(0,q(9,2))
    QPX_q4 = VEC_LD(0,q(13,2))
    QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1)
    QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2)
    QPX_q3 = VEC_MADD(QPX_y3, QPX_h2, QPX_q3)
    QPX_q4 = VEC_MADD(QPX_y4, QPX_h2, QPX_q4)
    QPX_q1 = VEC_ADD(QPX_q1, QPX_x1)
    QPX_q2 = VEC_ADD(QPX_q2, QPX_x2)
    QPX_q3 = VEC_ADD(QPX_q3, QPX_x3)
    QPX_q4 = VEC_ADD(QPX_q4, QPX_x4)
    call VEC_ST(QPX_q1, 0, q(1,2))
    call VEC_ST(QPX_q2, 0, q(5,2))
    call VEC_ST(QPX_q3, 0, q(9,2))
    call VEC_ST(QPX_q4, 0, q(13,2))

    do i=3,nb,1

      QPX_q1 = VEC_LD(0,q(1,i))
      QPX_q2 = VEC_LD(0,q(5,i))
      QPX_q3 = VEC_LD(0,q(9,i))
      QPX_q4 = VEC_LD(0,q(13,i))
      QPX_h1 = VEC_SPLATS(hh(i-1,1))
      QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1)
      QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2)
      QPX_q3 = VEC_MADD(QPX_x3, QPX_h1, QPX_q3)
      QPX_q4 = VEC_MADD(QPX_x4, QPX_h1, QPX_q4)
      QPX_h2 = VEC_SPLATS(hh(i,2))
      QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1)
      QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2)
      QPX_q3 = VEC_MADD(QPX_y3, QPX_h2, QPX_q3)
      QPX_q4 = VEC_MADD(QPX_y4, QPX_h2, QPX_q4)

      call VEC_ST(QPX_q1, 0, q(1,i))
      call VEC_ST(QPX_q2, 0, q(5,i))
      call VEC_ST(QPX_q3, 0, q(9,i))
      call VEC_ST(QPX_q4, 0, q(13,i))

    enddo

    QPX_h1 = VEC_SPLATS(hh(nb,1))
    QPX_q1 = VEC_LD(0,q(1,nb+1))
    QPX_q2 = VEC_LD(0,q(5,nb+1))
    QPX_q3 = VEC_LD(0,q(9,nb+1))
    QPX_q4 = VEC_LD(0,q(13,nb+1))
    QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1)
    QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2)
    QPX_q3 = VEC_MADD(QPX_x3, QPX_h1, QPX_q3)
    QPX_q4 = VEC_MADD(QPX_x4, QPX_h1, QPX_q4)

    call VEC_ST(QPX_q1, 0, q(1,nb+1))
    call VEC_ST(QPX_q2, 0, q(5,nb+1))
    call VEC_ST(QPX_q3, 0, q(9,nb+1))
    call VEC_ST(QPX_q4, 0, q(13,nb+1))

  end subroutine hh_trafo_kernel_16_bgq

  ! --------------------------------------------------------------------------------------------------

  ! 8-row kernel: two vector registers per column (row offsets 1,5).
  subroutine hh_trafo_kernel_8_bgq(q, hh, nb, ldq, ldh, s)
    use precision
    implicit none

    integer(kind=ik), intent(in) :: nb, ldq, ldh
    real(kind=rk), intent(inout) :: q(ldq,*)
    real(kind=rk), intent(in)    :: hh(ldh,*), s
    integer(kind=ik)             :: i

    VECTOR(REAL(8))::QPX_x1, QPX_x2, QPX_y1, QPX_y2
    VECTOR(REAL(8))::QPX_q1, QPX_q2
    VECTOR(REAL(8))::QPX_h1, QPX_h2, QPX_tau1, QPX_tau2, QPX_s

    call alignx(32,q)

    !--- multiply Householder vectors with matrix q ---

    QPX_x1 = VEC_LD(0,q(1,2))
    QPX_x2 = VEC_LD(0,q(5,2))

    QPX_h2 = VEC_SPLATS(hh(2,2))
    QPX_q1 = VEC_LD(0,q(1,1))
    QPX_q2 = VEC_LD(0,q(5,1))
    QPX_y1 = VEC_MADD(QPX_x1, QPX_h2, QPX_q1)
    QPX_y2 = VEC_MADD(QPX_x2, QPX_h2, QPX_q2)

    do i=3,nb,1

      QPX_q1 = VEC_LD(0,q(1,i))
      QPX_q2 = VEC_LD(0,q(5,i))
      QPX_h1 = VEC_SPLATS(hh(i-1,1))
      QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1)
      QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2)
      QPX_h2 = VEC_SPLATS(hh(i,2))
      QPX_y1 = VEC_MADD(QPX_q1, QPX_h2, QPX_y1)
      QPX_y2 = VEC_MADD(QPX_q2, QPX_h2, QPX_y2)

    enddo

    QPX_h1 = VEC_SPLATS(hh(nb,1))
    QPX_q1 = VEC_LD(0,q(1,nb+1))
    QPX_q2 = VEC_LD(0,q(5,nb+1))
    QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1)
    QPX_x2 = VEC_MADD(QPX_q2, QPX_h1, QPX_x2)

    !--- multiply T matrix ---

    QPX_tau1 = VEC_SPLATS(-hh(1,1))
    QPX_x1 = VEC_MUL(QPX_x1, QPX_tau1)
    QPX_x2 = VEC_MUL(QPX_x2, QPX_tau1)
    QPX_tau2 = VEC_SPLATS(-hh(1,2))
    QPX_s = VEC_SPLATS(-hh(1,2)*s)
    QPX_y1 = VEC_MUL(QPX_y1, QPX_tau2)
    QPX_y2 = VEC_MUL(QPX_y2, QPX_tau2)
    QPX_y1 = VEC_MADD(QPX_x1, QPX_s, QPX_y1)
    QPX_y2 = VEC_MADD(QPX_x2, QPX_s, QPX_y2)

    !--- rank-2 update of q ---

    QPX_q1 = VEC_LD(0,q(1,1))
    QPX_q2 = VEC_LD(0,q(5,1))
    QPX_q1 = VEC_ADD(QPX_q1, QPX_y1)
    QPX_q2 = VEC_ADD(QPX_q2, QPX_y2)
    call VEC_ST(QPX_q1, 0, q(1,1))
    call VEC_ST(QPX_q2, 0, q(5,1))

    QPX_h2 = VEC_SPLATS(hh(2,2))
    QPX_q1 = VEC_LD(0,q(1,2))
    QPX_q2 = VEC_LD(0,q(5,2))
    QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1)
    QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2)
    QPX_q1 = VEC_ADD(QPX_q1, QPX_x1)
    QPX_q2 = VEC_ADD(QPX_q2, QPX_x2)
    call VEC_ST(QPX_q1, 0, q(1,2))
    call VEC_ST(QPX_q2, 0, q(5,2))

    do i=3,nb,1

      QPX_q1 = VEC_LD(0,q(1,i))
      QPX_q2 = VEC_LD(0,q(5,i))
      QPX_h1 = VEC_SPLATS(hh(i-1,1))
      QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1)
      QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2)
      QPX_h2 = VEC_SPLATS(hh(i,2))
      QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1)
      QPX_q2 = VEC_MADD(QPX_y2, QPX_h2, QPX_q2)

      call VEC_ST(QPX_q1, 0, q(1,i))
      call VEC_ST(QPX_q2, 0, q(5,i))

    enddo

    QPX_h1 = VEC_SPLATS(hh(nb,1))
    QPX_q1 = VEC_LD(0,q(1,nb+1))
    QPX_q2 = VEC_LD(0,q(5,nb+1))
    QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1)
    QPX_q2 = VEC_MADD(QPX_x2, QPX_h1, QPX_q2)

    call VEC_ST(QPX_q1, 0, q(1,nb+1))
    call VEC_ST(QPX_q2, 0, q(5,nb+1))

  end subroutine hh_trafo_kernel_8_bgq

  ! --------------------------------------------------------------------------------------------------

  ! 4-row kernel: one vector register per column.
  subroutine hh_trafo_kernel_4_bgq(q, hh, nb, ldq, ldh, s)
    use precision
    implicit none

    integer(kind=ik), intent(in) :: nb, ldq, ldh
    real(kind=rk), intent(inout) :: q(ldq,*)
    real(kind=rk), intent(in)    :: hh(ldh,*), s
    integer(kind=ik)             :: i

    VECTOR(REAL(8))::QPX_x1, QPX_y1
    VECTOR(REAL(8))::QPX_q1
    VECTOR(REAL(8))::QPX_h1, QPX_h2, QPX_tau1, QPX_tau2, QPX_s

    call alignx(32,q)

    !--- multiply Householder vectors with matrix q ---

    QPX_x1 = VEC_LD(0,q(1,2))

    QPX_h2 = VEC_SPLATS(hh(2,2))
    QPX_q1 = VEC_LD(0,q(1,1))
    QPX_y1 = VEC_MADD(QPX_x1, QPX_h2, QPX_q1)

    do i=3,nb,1

      QPX_q1 = VEC_LD(0,q(1,i))
      QPX_h1 = VEC_SPLATS(hh(i-1,1))
      QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1)
      QPX_h2 = VEC_SPLATS(hh(i,2))
      QPX_y1 = VEC_MADD(QPX_q1, QPX_h2, QPX_y1)

    enddo

    QPX_h1 = VEC_SPLATS(hh(nb,1))
    QPX_q1 = VEC_LD(0,q(1,nb+1))
    QPX_x1 = VEC_MADD(QPX_q1, QPX_h1, QPX_x1)

    !--- multiply T matrix ---

    QPX_tau1 = VEC_SPLATS(-hh(1,1))
    QPX_x1 = VEC_MUL(QPX_x1, QPX_tau1)
    QPX_tau2 = VEC_SPLATS(-hh(1,2))
    QPX_s = VEC_SPLATS(-hh(1,2)*s)
    QPX_y1 = VEC_MUL(QPX_y1, QPX_tau2)
    QPX_y1 = VEC_MADD(QPX_x1, QPX_s, QPX_y1)

    !--- rank-2 update of q ---

    QPX_q1 = VEC_LD(0,q(1,1))
    QPX_q1 = VEC_ADD(QPX_q1, QPX_y1)
    call VEC_ST(QPX_q1, 0, q(1,1))

    QPX_h2 = VEC_SPLATS(hh(2,2))
    QPX_q1 = VEC_LD(0,q(1,2))
    QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1)
    QPX_q1 = VEC_ADD(QPX_q1, QPX_x1)
    call VEC_ST(QPX_q1, 0, q(1,2))

    do i=3,nb,1

      QPX_q1 = VEC_LD(0,q(1,i))
      QPX_h1 = VEC_SPLATS(hh(i-1,1))
      QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1)
      QPX_h2 = VEC_SPLATS(hh(i,2))
      QPX_q1 = VEC_MADD(QPX_y1, QPX_h2, QPX_q1)

      call VEC_ST(QPX_q1, 0, q(1,i))

    enddo

    QPX_h1 = VEC_SPLATS(hh(nb,1))
    QPX_q1 = VEC_LD(0,q(1,nb+1))
    QPX_q1 = VEC_MADD(QPX_x1, QPX_h1, QPX_q1)

    call VEC_ST(QPX_q1, 0, q(1,nb+1))

  end subroutine hh_trafo_kernel_4_bgq

end module real_bgq_kernel
!
-------------------------------------------------------------------------------------------------- elpa-2016.05.001/src/elpa2_kernels/mod_single_hh_trafo_real.F900000644000312500001440000000310612717516040020617 00000000000000
module single_hh_trafo_real
  implicit none
#include "config-f90.h"

#ifdef WITH_OPENMP
  public single_hh_trafo_real_cpu_openmp
#else
  public single_hh_trafo_real_cpu
#endif

  contains

    ! Apply one real Householder reflector to the first nq rows of q:
    !
    !   q(1:nq,1:nb) <- q(1:nq,1:nb) - tau * (q(1:nq,1:nb) * h) * h**T
    !
    ! with tau = hh(1) and h = (1, hh(2), ..., hh(nb)).
    ! The routine is not performance critical, hence plain Fortran.
    ! Its name depends on WITH_OPENMP; the body is the same in both builds.
#ifdef WITH_OPENMP
    subroutine single_hh_trafo_real_cpu_openmp(q, hh, nb, nq, ldq)
#else
    subroutine single_hh_trafo_real_cpu(q, hh, nb, nq, ldq)
#endif
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      use precision

      implicit none

      integer(kind=ik), intent(in) :: nb, nq, ldq
      real(kind=rk), intent(inout) :: q(1:ldq, 1:nb)
      real(kind=rk), intent(in)    :: hh(1:nb)

      integer(kind=ik)             :: col
      real(kind=rk)                :: proj(nq)   ! q * h, later scaled by tau

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
      call timer%start("single_hh_trafo_real_cpu_openmp")
#else
      call timer%start("single_hh_trafo_real_cpu")
#endif
#endif

      ! proj = q * h; the first entry of h is an implicit 1
      proj = q(1:nq,1)
      do col = 2, nb
        proj = proj + q(1:nq,col) * hh(col)
      end do

      ! scale with tau
      proj = proj * hh(1)

      ! q = q - proj * h**T, first column again with the implicit 1
      q(1:nq,1) = q(1:nq,1) - proj
      do col = 2, nb
        q(1:nq,col) = q(1:nq,col) - proj * hh(col)
      end do

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
      call timer%stop("single_hh_trafo_real_cpu_openmp")
#else
      call timer%stop("single_hh_trafo_real_cpu")
#endif
#endif
    end subroutine

end module
elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s0000644000312500001440000005356212717516040020145 00000000000000# This file is part of ELPA.
# # The ELPA library was originally created by the ELPA consortium, # consisting of the following organizations: # # - Max Planck Computing and Data Facility (MPCDF), formerly known as # Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), # - Bergische Universität Wuppertal, Lehrstuhl für angewandte # Informatik, # - Technische Universität München, Lehrstuhl für Informatik mit # Schwerpunkt Wissenschaftliches Rechnen, # - Fritz-Haber-Institut, Berlin, Abt. Theorie, # - Max-Planck-Institut für Mathematik in den Naturwissenschaften, # Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, # and # - IBM Deutschland GmbH # # # More information can be found here: # http://elpa.mpcdf.mpg.de/ # # ELPA is free software: you can redistribute it and/or modify # it under the terms of the version 3 of the license of the # GNU Lesser General Public License as published by the Free # Software Foundation. # # ELPA is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with ELPA. If not, see <http://www.gnu.org/licenses/> # # ELPA reflects a substantial effort on the part of the original # ELPA consortium, and we ask you to respect the spirit of the # license that we chose: i.e., please contribute any changes you # may have back to the original ELPA library distribution, and keep # any derivatives of ELPA under the same license that we chose for # the original distribution, the GNU Lesser General Public License. # # -------------------------------------------------------------------------------------------------- # # This file contains the compute intensive kernels for the Householder transformations, # coded in x86_64 assembler and using SSE2/SSE3 instructions.
#
# It must be assembled with GNU assembler (just "as" on most Linux machines)
#
# Copyright of the original code rests with the authors inside the ELPA
# consortium. The copyright of any additional modifications shall rest
# with their original authors, but shall adhere to the licensing terms
# distributed along with the original code in the file "COPYING".
#
# --------------------------------------------------------------------------------------------------
	.globl double_hh_trafo
	.globl single_hh_trafo_complex
	.text

#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------

# hh_trafo_real nrows
#
# Expands to straight-line code that applies two Householder vectors to
# a block of rows of q. The base code handles two XMM registers per column
# (byte offsets 0 and 16); nrows>=8 adds offsets 32 and 48, nrows==12 adds
# offsets 64 and 80.
#
# Register use inside the macro:
#   %r10            running pointer into q (advanced by ldq bytes per column)
#   %r11            running pointer into hh (advanced by 8 bytes per step)
#   %xmm0 - %xmm5   x accumulators
#   %xmm6 - %xmm11  y accumulators
#   %xmm12, %xmm13  scratch
#   %xmm14, %xmm15  broadcast hh values / scaling factors
#
# The helper macros (mac_*) are defined, used and purged again inside this
# macro so that it can be expanded several times. The numeric labels 1: and
# 2: are assembler-local and are reused by both loops.

	.macro hh_trafo_real nrows

	# When this macro is called, the following registers are set and must not be changed
	# %rdi: Address of q
	# %rsi: Address of hh
	# %rdx: nb
	# %rcx: Remaining rows nq
	# %r8: ldq in bytes
	# %r9: ldh in bytes
	# %rax: address of hh at the end of the loops
	# The top of the stack must contain the dot product of the two Householder vectors

	movq %rdi, %r10 # Copy address of q
	movq %rsi, %r11 # Copy address of hh

	# x1 = q(1,2)
	# x2 = q(2,2)
	#
	# y1 = q(1,1) + q(1,2)*hh(2,2)
	# y2 = q(2,1) + q(2,2)*hh(2,2)

	movaps (%r10), %xmm6 # y1 = q(1,1)
	movaps 16(%r10), %xmm7 # y2 = q(2,1)
	.if \nrows>=8
	movaps 32(%r10), %xmm8
	movaps 48(%r10), %xmm9
	.if \nrows==12
	movaps 64(%r10), %xmm10
	movaps 80(%r10), %xmm11
	.endif
	.endif

	addq %r8, %r10 # %r10 => q(.,2)
	movddup 8(%r11,%r9), %xmm15 # hh(2,2)

	.macro mac_pre_loop1 qoff, X, Y
	movaps \qoff(%r10), \X # xn = q(n,2)
	movaps \X, %xmm12
	mulpd %xmm15, %xmm12
	addpd %xmm12, \Y # yn = yn + xn*h(2,2)
	.endm

	mac_pre_loop1 0, %xmm0, %xmm6
	mac_pre_loop1 16, %xmm1, %xmm7
	.if \nrows>=8
	mac_pre_loop1 32, %xmm2, %xmm8
	mac_pre_loop1 48, %xmm3, %xmm9
	.if \nrows==12
	mac_pre_loop1 64, %xmm4, %xmm10
	mac_pre_loop1 80, %xmm5, %xmm11
	.endif
	.endif
	.purgem mac_pre_loop1

	# do i=3,nb
	# h1 = hh(i-1,1)
	# h2 = hh(i,2)
	# x1 = x1 + q(1,i)*h1
	# y1 = y1 + q(1,i)*h2
	# x2 = x2 + q(2,i)*h1
	# y2 = y2 + q(2,i)*h2
	# ...
	# enddo

	addq $8, %r11
	.align 16
1:
	cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax
	jge 2f

	addq %r8, %r10 # %r10 => q(.,i)

	movddup (%r11), %xmm14 # hh(i-1,1)
	movddup 8(%r11,%r9), %xmm15 # hh(i,2)

	.macro mac_loop1 qoff, X, Y
	movaps \qoff(%r10), %xmm13 # q(.,i)
	movaps %xmm13, %xmm12
	mulpd %xmm14, %xmm13
	addpd %xmm13, \X # xn = xn + q(.,i)*h1
	mulpd %xmm15, %xmm12
	addpd %xmm12, \Y # yn = yn + q(.,i)*h2
	.endm

	mac_loop1 0, %xmm0, %xmm6
	mac_loop1 16, %xmm1, %xmm7
	.if \nrows>=8
	mac_loop1 32, %xmm2, %xmm8
	mac_loop1 48, %xmm3, %xmm9
	.if \nrows==12
	mac_loop1 64, %xmm4, %xmm10
	mac_loop1 80, %xmm5, %xmm11
	.endif
	.endif
	.purgem mac_loop1

	addq $8, %r11
	jmp 1b
2:

	# x1 = x1 + q(1,nb+1)*hh(nb,1)
	# x2 = x2 + q(2,nb+1)*hh(nb,1)

	addq %r8, %r10 # %r10 => q(.,nb+1)
	movddup (%r11), %xmm14

	.macro mac_post_loop1 qoff, X
	movaps \qoff(%r10), %xmm13 # q(.,nb+1)
	mulpd %xmm14, %xmm13
	addpd %xmm13, \X
	.endm

	mac_post_loop1 0, %xmm0
	mac_post_loop1 16, %xmm1
	.if \nrows>=8
	mac_post_loop1 32, %xmm2
	mac_post_loop1 48, %xmm3
	.if \nrows==12
	mac_post_loop1 64, %xmm4
	mac_post_loop1 80, %xmm5
	.endif
	.endif
	.purgem mac_post_loop1

	# tau1 = hh(1,1)
	# tau2 = hh(1,2)
	#
	# h1 = -tau1
	# x1 = x1*h1
	# x2 = x2*h1

	movq %rsi, %r11 # restore %r11 (hh(1,1))

	movddup (%r11), %xmm12 # hh(1,1)
	xorps %xmm14, %xmm14
	subpd %xmm12, %xmm14 # %xmm14 = -hh(1,1)

	mulpd %xmm14, %xmm0
	mulpd %xmm14, %xmm1
	.if \nrows>=8
	mulpd %xmm14, %xmm2
	mulpd %xmm14, %xmm3
	.if \nrows==12
	mulpd %xmm14, %xmm4
	mulpd %xmm14, %xmm5
	.endif
	.endif

	# h1 = -tau2
	# h2 = -tau2*s
	# y1 = y1*h1 + x1*h2
	# y2 = y2*h1 + x2*h2

	movddup (%r11,%r9), %xmm12 # hh(1,2)
	xorps %xmm15, %xmm15
	subpd %xmm12, %xmm15 # %xmm15 = -hh(1,2) = h1
	movaps %xmm15, %xmm14
	movddup (%rsp), %xmm12 # Get s from top of stack
	mulpd %xmm12, %xmm14 # %xmm14 = h2

	.macro mac_xform_y X, Y
	mulpd %xmm15, \Y # y1 = y1*h1
	movaps \X, %xmm12
	mulpd %xmm14, %xmm12
	addpd %xmm12, \Y
	.endm

	mac_xform_y %xmm0, %xmm6
	mac_xform_y %xmm1, %xmm7
	.if \nrows>=8
	mac_xform_y %xmm2, %xmm8
	mac_xform_y %xmm3, %xmm9
	.if \nrows==12
	mac_xform_y %xmm4, %xmm10
	mac_xform_y %xmm5, %xmm11
	.endif
	.endif
	.purgem mac_xform_y

	# q(1,1) = q(1,1) + y1
	# q(2,1) = q(2,1) + y2

	movq %rdi, %r10 # restore original Q

	.macro mac_pre_loop2_1 qoff, Y
	movaps \qoff(%r10), %xmm13 # q(.,1)
	addpd \Y, %xmm13
	movaps %xmm13, \qoff(%r10)
	.endm

	mac_pre_loop2_1 0, %xmm6
	mac_pre_loop2_1 16, %xmm7
	.if \nrows>=8
	mac_pre_loop2_1 32, %xmm8
	mac_pre_loop2_1 48, %xmm9
	.if \nrows==12
	mac_pre_loop2_1 64, %xmm10
	mac_pre_loop2_1 80, %xmm11
	.endif
	.endif
	.purgem mac_pre_loop2_1

	# q(1,2) = q(1,2) + x1 + y1*hh(2,2)
	# q(2,2) = q(2,2) + x2 + y2*hh(2,2)

	addq %r8, %r10 # %r10 => q(.,2)

	movddup 8(%r11,%r9), %xmm15 # hh(2,2)

	.macro mac_pre_loop2_2 qoff, X, Y
	movaps \X, %xmm13
	movaps \Y, %xmm12
	mulpd %xmm15, %xmm12
	addpd %xmm12, %xmm13
	addpd \qoff(%r10), %xmm13
	movaps %xmm13, \qoff(%r10)
	.endm

	mac_pre_loop2_2 0, %xmm0, %xmm6
	mac_pre_loop2_2 16, %xmm1, %xmm7
	.if \nrows>=8
	mac_pre_loop2_2 32, %xmm2, %xmm8
	mac_pre_loop2_2 48, %xmm3, %xmm9
	.if \nrows==12
	mac_pre_loop2_2 64, %xmm4, %xmm10
	mac_pre_loop2_2 80, %xmm5, %xmm11
	.endif
	.endif
	.purgem mac_pre_loop2_2

	# do i=3,nb
	# h1 = hh(i-1,1)
	# h2 = hh(i,2)
	# q(1,i) = q(1,i) + x1*h1 + y1*h2
	# q(2,i) = q(2,i) + x2*h1 + y2*h2
	# enddo

	addq $8, %r11
	.align 16
1:
	cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax
	jge 2f

	addq %r8, %r10 # %r10 => q(.,i)

	movddup (%r11), %xmm14 # hh(i-1,1)
	movddup 8(%r11,%r9), %xmm15 # hh(i,2)

	.macro mac_loop2 qoff, X, Y
	movaps \X, %xmm13
	mulpd %xmm14, %xmm13
	movaps \Y, %xmm12
	mulpd %xmm15, %xmm12
	addpd %xmm12, %xmm13
	addpd \qoff(%r10), %xmm13
	movaps %xmm13, \qoff(%r10)
	.endm

	mac_loop2 0, %xmm0, %xmm6
	mac_loop2 16, %xmm1, %xmm7
	.if \nrows>=8
	mac_loop2 32, %xmm2, %xmm8
	mac_loop2 48, %xmm3, %xmm9
	.if \nrows==12
	mac_loop2 64, %xmm4, %xmm10
	mac_loop2 80, %xmm5, %xmm11
	.endif
	.endif
	.purgem mac_loop2

	addq $8, %r11
	jmp 1b
2:

	# q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
	# q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)

	addq %r8, %r10 # %r10 => q(.,nb+1)
	movddup (%r11), %xmm14

	# Note: this helper overwrites the x register (\X); x is not read again.
	.macro mac_post_loop2 qoff, X
	movaps \qoff(%r10), %xmm13 # q(.,nb+1)
	mulpd %xmm14, \X
	addpd \X, %xmm13
	movaps %xmm13, \qoff(%r10)
	.endm

	mac_post_loop2 0, %xmm0
	mac_post_loop2 16, %xmm1
	.if \nrows>=8
	mac_post_loop2 32, %xmm2
	mac_post_loop2 48, %xmm3
	.if \nrows==12
	mac_post_loop2 64, %xmm4
	mac_post_loop2 80, %xmm5
	.endif
	.endif
	.purgem mac_post_loop2

	.endm

#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# FORTRAN Interface:
#
# subroutine double_hh_trafo(q, hh, nb, nq, ldq, ldh)
#
# integer, intent(in) :: nb, nq, ldq, ldh
# real*8, intent(inout) :: q(ldq,*)
# real*8, intent(in) :: hh(ldh,*)
#
# Parameter mapping to registers
# parameter 1: %rdi : q
# parameter 2: %rsi : hh
# parameter 3: %rdx : nb
# parameter 4: %rcx : nq
# parameter 5: %r8 : ldq
# parameter 6: %r9 : ldh
#
#-------------------------------------------------------------------------------
#!f>#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
#!f> interface
#!f> subroutine double_hh_trafo(q, hh, nb, nq, ldq, ldh) bind(C,name="double_hh_trafo")
#!f> use, intrinsic :: iso_c_binding
#!f> integer(kind=c_int) :: nb, nq, ldq, ldh
#!f> real(kind=c_double) :: q(*)
#!f> real(kind=c_double) :: hh(nb,6)
#!f> end subroutine
#!f> end interface
#!f>#endif
	.align 16,0x90
double_hh_trafo:

	# Get integer parameters into corresponding registers

	movslq (%rdx), %rdx # nb
	movslq (%rcx), %rcx # nq
	movslq (%r8), %r8 # ldq
	movslq (%r9), %r9 # ldh

	# Get ldq in bytes
	addq %r8, %r8
	addq %r8, %r8
	addq %r8, %r8 # 8*ldq, i.e. ldq in bytes

	# Get ldh in bytes
	addq %r9, %r9
	addq %r9, %r9
	addq %r9, %r9 # 8*ldh, i.e. ldh in bytes

	# set %rax to the address of hh at the end of the loops,
	# i.e. if %rdx >= %rax we must jump out of the loop.
# please note: %rax = 8*%rdx + %rsi - 8 movq %rdx, %rax addq %rax, %rax addq %rax, %rax addq %rax, %rax addq %rsi, %rax subq $8, %rax #----------------------------------------------------------- # Calculate the dot product of the two Householder vectors # decrement stack pointer to make space for s subq $8, %rsp # Fortran code: # s = hh(2,2)*1 # do i=3,nb # s = s+hh(i,2)*hh(i-1,1) # enddo movq %rsi, %r11 # Copy address of hh movsd 8(%r11,%r9), %xmm0 # hh(2,2) addq $8, %r11 1: cmpq %rax, %r11 jge 2f movsd (%r11), %xmm14 # hh(i-1,1) movsd 8(%r11,%r9), %xmm15 # hh(i,2) mulsd %xmm14, %xmm15 addsd %xmm15, %xmm0 addq $8, %r11 jmp 1b 2: movsd %xmm0, (%rsp) # put s on top of stack #----------------------------------------------------------- rloop_s: cmpq $8, %rcx # if %rcx <= 8 jump out of loop jle rloop_e hh_trafo_real 12 # transform 12 rows addq $96, %rdi # increment q start adress by 96 bytes (6 rows) subq $12, %rcx # decrement nq jmp rloop_s rloop_e: cmpq $4, %rcx # if %rcx <= 4 jump to test_2 jle test_4 hh_trafo_real 8 # transform 8 rows jmp return1 test_4: cmpq $0, %rcx # if %rcx <= 0 jump to return jle return1 hh_trafo_real 4 # transform 4 rows return1: addq $8, %rsp # reset stack pointer ret .align 16,0x90 #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- .macro hh_trafo_complex nrows # When this macro is called, the following registers are set and must not be changed # %rdi: Address of q # %rsi: Address of hh # %rdx: nb # %rcx: Remaining rows nq # %r8: ldq in bytes movq %rdi, %r10 # Copy address of q movq %rsi, %r11 # Copy address of hh # set %rax to the address of hh at the end of the loops, # i.e. if %rdx >= %rax we must jump out of the loop. # please note: %rax = 16*%rdx + %rsi movq %rdx, %rax addq %rax, %rax addq %rax, %rax addq %rax, %rax addq %rax, %rax addq %rsi, %rax # x1 = q(1,1); y1 = 0 # x2 = q(2,1); y2 = 0 # ... 
movaps (%r10), %xmm0 movaps 16(%r10), %xmm1 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 .if \nrows>=4 movaps 32(%r10), %xmm2 movaps 48(%r10), %xmm3 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 .if \nrows==6 movaps 64(%r10), %xmm4 movaps 80(%r10), %xmm5 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 .endif .endif # do i=2,nb # h1 = conjg(hh(i)) # x1 = x1 + q(1,i)*h1 # x2 = x2 + q(2,i)*h1 # ... # enddo addq $16, %r11 # %r11 => hh(2) .align 16 1: cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax jge 2f addq %r8, %r10 # %r10 => q(.,i) movddup (%r11), %xmm14 # real(hh(i)) movddup 8(%r11), %xmm15 # imag(hh(i)) .macro mac_loop1 qoff, X, Y movaps \qoff(%r10), %xmm13 # q(.,i) movaps %xmm13, %xmm12 mulpd %xmm14, %xmm13 # q(.,i)*real(hh(i)) addpd %xmm13, \X # x1 = x1 + q(.,i)*real(hh(i)) mulpd %xmm15, %xmm12 # q(.,i)*imag(hh(i)) addsubpd %xmm12, \Y # y1 = y1 -/+ q(.,i)*imag(hh(i)) .endm mac_loop1 0, %xmm0, %xmm6 mac_loop1 16, %xmm1, %xmm7 .if \nrows>=4 mac_loop1 32, %xmm2, %xmm8 mac_loop1 48, %xmm3, %xmm9 .if \nrows==6 mac_loop1 64, %xmm4, %xmm10 mac_loop1 80, %xmm5, %xmm11 .endif .endif .purgem mac_loop1 addq $16, %r11 # %r11 => hh(i+1) jmp 1b 2: # Now the content of the yn has to be swapped and added to xn .macro mac_post_loop_1 X, Y shufpd $1, \Y, \Y addpd \Y, \X .endm mac_post_loop_1 %xmm0, %xmm6 mac_post_loop_1 %xmm1, %xmm7 .if \nrows>=4 mac_post_loop_1 %xmm2, %xmm8 mac_post_loop_1 %xmm3, %xmm9 .if \nrows==6 mac_post_loop_1 %xmm4, %xmm10 mac_post_loop_1 %xmm5, %xmm11 .endif .endif .purgem mac_post_loop_1 # tau1 = hh(1) # # h1 = -tau1 # x1 = x1*h1; y1 = x1 with halfes exchanged # x2 = x2*h1; y2 = x2 with halfes exchanged # ... 
movq %rsi, %r11 # restore address of hh xorps %xmm14, %xmm14 movddup (%r11), %xmm12 # real(hh(1)) subpd %xmm12, %xmm14 #-real(hh(1)) xorps %xmm15, %xmm15 movddup 8(%r11), %xmm12 # imag(hh(1)) subpd %xmm12, %xmm15 #-imag(hh(1)) .macro mac_xform X, Y movaps \X, %xmm12 shufpd $1, \X, %xmm12 mulpd %xmm15, %xmm12 mulpd %xmm14, \X addsubpd %xmm12, \X movaps \X, \Y # copy to y shufpd $1, \X, \Y # exchange halfes .endm mac_xform %xmm0, %xmm6 mac_xform %xmm1, %xmm7 .if \nrows>=4 mac_xform %xmm2, %xmm8 mac_xform %xmm3, %xmm9 .if \nrows==6 mac_xform %xmm4, %xmm10 mac_xform %xmm5, %xmm11 .endif .endif .purgem mac_xform # q(1,1) = q(1,1) + x1 # q(2,1) = q(2,1) + x2 # ... movq %rdi, %r10 # restore address of q .macro mac_pre_loop2 qoff, X movaps \qoff(%r10), %xmm13 # q(.,1) addpd \X, %xmm13 movaps %xmm13, \qoff(%r10) .endm mac_pre_loop2 0, %xmm0 mac_pre_loop2 16, %xmm1 .if \nrows>=4 mac_pre_loop2 32, %xmm2 mac_pre_loop2 48, %xmm3 .if \nrows==6 mac_pre_loop2 64, %xmm4 mac_pre_loop2 80, %xmm5 .endif .endif .purgem mac_pre_loop2 # do i=2,nb # h1 = hh(i) # q(1,i) = q(1,i) + x1*h1 # q(2,i) = q(2,i) + x2*h1 # ... 
# enddo addq $16, %r11 .align 16 1: cmpq %rax, %r11 # Jump out of the loop if %r11 >= %rax jge 2f addq %r8, %r10 # %r10 => q(.,i) movddup (%r11), %xmm14 # real(hh(i)) movddup 8(%r11), %xmm15 # imag(hh(i)) .macro mac_loop2 qoff, X, Y movaps \X, %xmm13 mulpd %xmm14, %xmm13 movaps \Y, %xmm12 mulpd %xmm15, %xmm12 addsubpd %xmm12, %xmm13 addpd \qoff(%r10), %xmm13 movaps %xmm13, \qoff(%r10) .endm mac_loop2 0, %xmm0, %xmm6 mac_loop2 16, %xmm1, %xmm7 .if \nrows>=4 mac_loop2 32, %xmm2, %xmm8 mac_loop2 48, %xmm3, %xmm9 .if \nrows==6 mac_loop2 64, %xmm4, %xmm10 mac_loop2 80, %xmm5, %xmm11 .endif .endif .purgem mac_loop2 addq $16, %r11 jmp 1b 2: .endm #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- # FORTRAN Interface: # # subroutine single_hh_trafo_complex(q, hh, nb, nq, ldq) # # integer, intent(in) :: nb, nq, ldq # complex*16, intent(inout) :: q(ldq,*) # complex*16, intent(in) :: hh(*) # # Parameter mapping to registers # parameter 1: %rdi : q # parameter 2: %rsi : hh # parameter 3: %rdx : nb # parameter 4: %rcx : nq # parameter 5: %r8 : ldq # #------------------------------------------------------------------------------- #!f>#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL #!f> interface #!f> subroutine single_hh_trafo_complex(q, hh, nb, nq, ldq) bind(C,name="single_hh_trafo_complex") #!f> use, intrinsic :: iso_c_binding #!f> integer(kind=c_int) :: nb, nq, ldq #!f> complex(kind=c_double) :: q(*) #!f> complex(kind=c_double) :: hh(nb,2) #!f> end subroutine #!f> end interface #!f>#endif .align 16,0x90 single_hh_trafo_complex: # Get integer parameters into corresponding registers movslq (%rdx), %rdx # nb movslq (%rcx), %rcx # nq movslq (%r8), %r8 # ldq # Get ldq in bytes addq %r8, %r8 addq %r8, %r8 addq %r8, %r8 addq %r8, %r8 # 16*ldq, i.e. 
ldq in bytes cloop_s: cmpq $4, %rcx # if %rcx <= 4 jump out of loop jle cloop_e hh_trafo_complex 6 # transform 6 rows addq $96, %rdi # increment q start adress by 96 bytes (6 rows) subq $6, %rcx # decrement nq jmp cloop_s cloop_e: cmpq $2, %rcx # if %rcx <= 2 jump to test_2 jle test_2 hh_trafo_complex 4 # transform 4 rows jmp return2 test_2: cmpq $0, %rcx # if %rcx <= 0 jump to return jle return2 hh_trafo_complex 2 # transform 2 rows return2: ret .align 16,0x90 #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- # Declare that we do not need an executable stack here .section .note.GNU-stack,"",@progbits elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c0000644000312500001440000007516412717516040020707 00000000000000// This file is part of ELPA. // // The ELPA library was originally created by the ELPA consortium, // consisting of the following organizations: // // - Max Planck Computing and Data Facility (MPCDF), formerly known as // Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), // - Bergische Universität Wuppertal, Lehrstuhl für angewandte // Informatik, // - Technische Universität München, Lehrstuhl für Informatik mit // Schwerpunkt Wissenschaftliches Rechnen , // - Fritz-Haber-Institut, Berlin, Abt. Theorie, // - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, // Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, // and // - IBM Deutschland GmbH // // This particular source code file contains additions, changes and // enhancements authored by Intel Corporation which is not part of // the ELPA consortium. 
// // More information can be found here: // http://elpa.mpcdf.mpg.de/ // // ELPA is free software: you can redistribute it and/or modify // it under the terms of the version 3 of the license of the // GNU Lesser General Public License as published by the Free // Software Foundation. // // ELPA is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with ELPA. If not, see <http://www.gnu.org/licenses/> // // ELPA reflects a substantial effort on the part of the original // ELPA consortium, and we ask you to respect the spirit of the // license that we chose: i.e., please contribute any changes you // may have back to the original ELPA library distribution, and keep // any derivatives of ELPA under the same license that we chose for // the original distribution, the GNU Lesser General Public License. // // // -------------------------------------------------------------------------------------------------- // // This file contains the compute intensive kernels for the Householder transformations. // It should be compiled with the highest possible optimization level. // // On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 // On Intel Sandy Bridge use -O3 -mavx // // Copyright of the original code rests with the authors inside the ELPA // consortium. The copyright of any additional modifications shall rest // with their original authors, but shall adhere to the licensing terms // distributed along with the original code in the file "COPYING".
// // Author: Alexander Heinecke (alexander.heinecke@mytum.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // -------------------------------------------------------------------------------------------------- #include "config-f90.h" #include #define __forceinline __attribute__((always_inline)) static #ifdef HAVE_SSE_INTRINSICS #undef __AVX__ #endif //Forward declaration __forceinline void hh_trafo_kernel_2_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); /* !f>#ifdef HAVE_SSE_INTRINSICS !f> interface !f> subroutine quad_hh_trafo_real_sse_4hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="quad_hh_trafo_real_sse_4hv") !f> use, intrinsic :: iso_c_binding !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh !f> real(kind=c_double) :: q(*) !f> real(kind=c_double) :: hh(pnb,6) !f> end subroutine !f> end interface !f>#endif */ void quad_hh_trafo_real_sse_4hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); void quad_hh_trafo_real_sse_4hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; int nq = *pldq; int ldq = *pldq; int ldh = *pldh; // calculating scalar products to compute // 4 householder vectors simultaneously double s_1_2 = hh[(ldh)+1]; double s_1_3 = hh[(ldh*2)+2]; double s_2_3 = hh[(ldh*2)+1]; double s_1_4 = hh[(ldh*3)+3]; double s_2_4 = hh[(ldh*3)+2]; double s_3_4 = hh[(ldh*3)+1]; // calculate scalar product of first and fourth householder vector // loop counter = 2 s_1_2 += hh[2-1] * hh[(2+ldh)]; s_2_3 += hh[(ldh)+2-1] * 
hh[2+(ldh*2)]; s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)]; // loop counter = 3 s_1_2 += hh[3-1] * hh[(3+ldh)]; s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)]; s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)]; s_1_3 += hh[3-2] * hh[3+(ldh*2)]; s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)]; #pragma ivdep for (i = 4; i < nb; i++) { s_1_2 += hh[i-1] * hh[(i+ldh)]; s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)]; s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; s_1_3 += hh[i-2] * hh[i+(ldh*2)]; s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; s_1_4 += hh[i-3] * hh[i+(ldh*3)]; } // printf("s_1_2: %f\n", s_1_2); // printf("s_1_3: %f\n", s_1_3); // printf("s_2_3: %f\n", s_2_3); // printf("s_1_4: %f\n", s_1_4); // printf("s_2_4: %f\n", s_2_4); // printf("s_3_4: %f\n", s_3_4); // Production level kernel calls with padding for (i = 0; i < nq-4; i+=6) { hh_trafo_kernel_6_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } if (nq == i) { return; } else { if (nq-i > 2) { hh_trafo_kernel_4_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } else { hh_trafo_kernel_2_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } } } #if 0 void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; int nq = *pldq; int ldq = *pldq; int ldh = *pldh; // calculating scalar products to compute // 4 householder vectors simultaneously double s_1_2 = hh[(ldh)+1]; double s_1_3 = hh[(ldh*2)+2]; double s_2_3 = hh[(ldh*2)+1]; double s_1_4 = hh[(ldh*3)+3]; double s_2_4 = hh[(ldh*3)+2]; double s_3_4 = hh[(ldh*3)+1]; // calculate scalar product of first and fourth householder vector // loop counter = 2 s_1_2 += hh[2-1] * hh[(2+ldh)]; s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)]; s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)]; // loop counter = 3 s_1_2 += hh[3-1] * hh[(3+ldh)]; s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)]; s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)]; s_1_3 += hh[3-2] * hh[3+(ldh*2)]; s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)]; 
#pragma ivdep for (i = 4; i < nb; i++) { s_1_2 += hh[i-1] * hh[(i+ldh)]; s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)]; s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; s_1_3 += hh[i-2] * hh[i+(ldh*2)]; s_2_4 += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; s_1_4 += hh[i-3] * hh[i+(ldh*3)]; } // Production level kernel calls with padding #ifdef __AVX__ for (i = 0; i < nq; i+=12) { hh_trafo_kernel_12_AVX_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } #else for (i = 0; i < nq; i+=6) { hh_trafo_kernel_6_SSE_4hv(&q[i], hh, nb, ldq, ldh, s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4); } #endif } #endif /** * Unrolled kernel that computes * 6 rows of Q simultaneously, a * matrix vector product with two householder * vectors + a rank 1 update is performed */ __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [6 x nb+3] * hh // hh contains four householder vectors ///////////////////////////////////////////////////// int i; __m128d a1_1 = _mm_load_pd(&q[ldq*3]); __m128d a2_1 = _mm_load_pd(&q[ldq*2]); __m128d a3_1 = _mm_load_pd(&q[ldq]); __m128d a4_1 = _mm_load_pd(&q[0]); __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); __m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); register __m128d x1 = a1_1; __m128d a1_2 = _mm_load_pd(&q[(ldq*3)+2]); __m128d a2_2 = 
_mm_load_pd(&q[(ldq*2)+2]); __m128d a3_2 = _mm_load_pd(&q[ldq+2]); __m128d a4_2 = _mm_load_pd(&q[0+2]); register __m128d w2 = _mm_add_pd(a4_2, _mm_mul_pd(a3_2, h_4_3)); w2 = _mm_add_pd(w2, _mm_mul_pd(a2_2, h_4_2)); w2 = _mm_add_pd(w2, _mm_mul_pd(a1_2, h_4_1)); register __m128d z2 = _mm_add_pd(a3_2, _mm_mul_pd(a2_2, h_3_2)); z2 = _mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1)); register __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1)); register __m128d x2 = a1_2; __m128d a1_3 = _mm_load_pd(&q[(ldq*3)+4]); __m128d a2_3 = _mm_load_pd(&q[(ldq*2)+4]); __m128d a3_3 = _mm_load_pd(&q[ldq+4]); __m128d a4_3 = _mm_load_pd(&q[0+4]); register __m128d w3 = _mm_add_pd(a4_3, _mm_mul_pd(a3_3, h_4_3)); w3 = _mm_add_pd(w3, _mm_mul_pd(a2_3, h_4_2)); w3 = _mm_add_pd(w3, _mm_mul_pd(a1_3, h_4_1)); register __m128d z3 = _mm_add_pd(a3_3, _mm_mul_pd(a2_3, h_3_2)); z3 = _mm_add_pd(z3, _mm_mul_pd(a1_3, h_3_1)); register __m128d y3 = _mm_add_pd(a2_3, _mm_mul_pd(a1_3, h_2_1)); register __m128d x3 = a1_3; __m128d q1; __m128d q2; __m128d q3; __m128d h1; __m128d h2; __m128d h3; __m128d h4; for(i = 4; i < nb; i++) { h1 = _mm_loaddup_pd(&hh[i-3]); q1 = _mm_load_pd(&q[i*ldq]); q2 = _mm_load_pd(&q[(i*ldq)+2]); q3 = _mm_load_pd(&q[(i*ldq)+4]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); h2 = _mm_loaddup_pd(&hh[ldh+i-2]); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); z3 = _mm_add_pd(z3, _mm_mul_pd(q3,h3)); h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); w3 = _mm_add_pd(w3, _mm_mul_pd(q3,h4)); } h1 = _mm_loaddup_pd(&hh[nb-3]); q1 = _mm_load_pd(&q[nb*ldq]); q2 = _mm_load_pd(&q[(nb*ldq)+2]); q3 = _mm_load_pd(&q[(nb*ldq)+4]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = 
_mm_add_pd(x2, _mm_mul_pd(q2,h1)); x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); z3 = _mm_add_pd(z3, _mm_mul_pd(q3,h3)); h1 = _mm_loaddup_pd(&hh[nb-2]); q1 = _mm_load_pd(&q[(nb+1)*ldq]); q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); q3 = _mm_load_pd(&q[((nb+1)*ldq)+4]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); h2 = _mm_loaddup_pd(&hh[(ldh*1)+nb-1]); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[(nb+2)*ldq]); q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); q3 = _mm_load_pd(&q[((nb+2)*ldq)+4]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); ///////////////////////////////////////////////////// // Rank-1 update of Q [6 x nb+3] ///////////////////////////////////////////////////// __m128d tau1 = _mm_loaddup_pd(&hh[0]); h1 = tau1; x1 = _mm_mul_pd(x1, h1); x2 = _mm_mul_pd(x2, h1); x3 = _mm_mul_pd(x3, h1); __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); __m128d vs_1_2 = _mm_loaddup_pd(&s_1_2); h1 = tau2; h2 = _mm_mul_pd(h1, vs_1_2); y1 = _mm_sub_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); y2 = _mm_sub_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); y3 = _mm_sub_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2)); __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); __m128d vs_1_3 = _mm_loaddup_pd(&s_1_3); __m128d vs_2_3 = _mm_loaddup_pd(&s_2_3); h1 = tau3; h2 = _mm_mul_pd(h1, vs_1_3); h3 = _mm_mul_pd(h1, vs_2_3); z1 = _mm_sub_pd(_mm_mul_pd(z1,h1), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); z2 = _mm_sub_pd(_mm_mul_pd(z2,h1), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))); z3 = 
_mm_sub_pd(_mm_mul_pd(z3,h1), _mm_add_pd(_mm_mul_pd(y3,h3), _mm_mul_pd(x3,h2))); __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); __m128d vs_1_4 = _mm_loaddup_pd(&s_1_4); __m128d vs_2_4 = _mm_loaddup_pd(&s_2_4); __m128d vs_3_4 = _mm_loaddup_pd(&s_3_4); h1 = tau4; h2 = _mm_mul_pd(h1, vs_1_4); h3 = _mm_mul_pd(h1, vs_2_4); h4 = _mm_mul_pd(h1, vs_3_4); w1 = _mm_sub_pd(_mm_mul_pd(w1,h1), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); w2 = _mm_sub_pd(_mm_mul_pd(w2,h1), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); w3 = _mm_sub_pd(_mm_mul_pd(w3,h1), _mm_add_pd(_mm_mul_pd(z3,h4), _mm_add_pd(_mm_mul_pd(y3,h3), _mm_mul_pd(x3,h2)))); q1 = _mm_load_pd(&q[0]); q2 = _mm_load_pd(&q[2]); q3 = _mm_load_pd(&q[4]); q1 = _mm_sub_pd(q1, w1); q2 = _mm_sub_pd(q2, w2); q3 = _mm_sub_pd(q3, w3); _mm_store_pd(&q[0],q1); _mm_store_pd(&q[2],q2); _mm_store_pd(&q[4],q3); h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); q1 = _mm_load_pd(&q[ldq]); q2 = _mm_load_pd(&q[ldq+2]); q3 = _mm_load_pd(&q[ldq+4]); q1 = _mm_sub_pd(q1, _mm_add_pd(z1, _mm_mul_pd(w1, h4))); q2 = _mm_sub_pd(q2, _mm_add_pd(z2, _mm_mul_pd(w2, h4))); q3 = _mm_sub_pd(q3, _mm_add_pd(z3, _mm_mul_pd(w3, h4))); _mm_store_pd(&q[ldq],q1); _mm_store_pd(&q[ldq+2],q2); _mm_store_pd(&q[ldq+4],q3); h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); q1 = _mm_load_pd(&q[ldq*2]); q2 = _mm_load_pd(&q[(ldq*2)+2]); q3 = _mm_load_pd(&q[(ldq*2)+4]); q1 = _mm_sub_pd(q1, y1); q2 = _mm_sub_pd(q2, y2); q3 = _mm_sub_pd(q3, y3); q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); q3 = _mm_sub_pd(q3, _mm_mul_pd(w3, h4)); h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); q3 = _mm_sub_pd(q3, _mm_mul_pd(z3, h3)); _mm_store_pd(&q[ldq*2],q1); _mm_store_pd(&q[(ldq*2)+2],q2); _mm_store_pd(&q[(ldq*2)+4],q3); h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); q1 = _mm_load_pd(&q[ldq*3]); q2 = _mm_load_pd(&q[(ldq*3)+2]); q3 = 
_mm_load_pd(&q[(ldq*3)+4]); q1 = _mm_sub_pd(q1, x1); q2 = _mm_sub_pd(q2, x2); q3 = _mm_sub_pd(q3, x3); q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4)); q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4)); q3 = _mm_sub_pd(q3, _mm_mul_pd(w3, h4)); h2 = _mm_loaddup_pd(&hh[ldh+1]); q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); q3 = _mm_sub_pd(q3, _mm_mul_pd(y3, h2)); h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3)); q3 = _mm_sub_pd(q3, _mm_mul_pd(z3, h3)); _mm_store_pd(&q[ldq*3], q1); _mm_store_pd(&q[(ldq*3)+2], q2); _mm_store_pd(&q[(ldq*3)+4], q3); for (i = 4; i < nb; i++) { h1 = _mm_loaddup_pd(&hh[i-3]); q1 = _mm_load_pd(&q[i*ldq]); q2 = _mm_load_pd(&q[(i*ldq)+2]); q3 = _mm_load_pd(&q[(i*ldq)+4]); q1 = _mm_sub_pd(q1, _mm_mul_pd(x1,h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2,h1)); q3 = _mm_sub_pd(q3, _mm_mul_pd(x3,h1)); h2 = _mm_loaddup_pd(&hh[ldh+i-2]); q1 = _mm_sub_pd(q1, _mm_mul_pd(y1,h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2,h2)); q3 = _mm_sub_pd(q3, _mm_mul_pd(y3,h2)); h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); q1 = _mm_sub_pd(q1, _mm_mul_pd(z1,h3)); q2 = _mm_sub_pd(q2, _mm_mul_pd(z2,h3)); q3 = _mm_sub_pd(q3, _mm_mul_pd(z3,h3)); h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); q1 = _mm_sub_pd(q1, _mm_mul_pd(w1,h4)); q2 = _mm_sub_pd(q2, _mm_mul_pd(w2,h4)); q3 = _mm_sub_pd(q3, _mm_mul_pd(w3,h4)); _mm_store_pd(&q[i*ldq],q1); _mm_store_pd(&q[(i*ldq)+2],q2); _mm_store_pd(&q[(i*ldq)+4],q3); } h1 = _mm_loaddup_pd(&hh[nb-3]); q1 = _mm_load_pd(&q[nb*ldq]); q2 = _mm_load_pd(&q[(nb*ldq)+2]); q3 = _mm_load_pd(&q[(nb*ldq)+4]); q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); q3 = _mm_sub_pd(q3, _mm_mul_pd(x3, h1)); h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); q3 = _mm_sub_pd(q3, _mm_mul_pd(y3, h2)); h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3)); q2 = _mm_sub_pd(q2, 
_mm_mul_pd(z2, h3)); q3 = _mm_sub_pd(q3, _mm_mul_pd(z3, h3)); _mm_store_pd(&q[nb*ldq],q1); _mm_store_pd(&q[(nb*ldq)+2],q2); _mm_store_pd(&q[(nb*ldq)+4],q3); h1 = _mm_loaddup_pd(&hh[nb-2]); q1 = _mm_load_pd(&q[(nb+1)*ldq]); q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); q3 = _mm_load_pd(&q[((nb+1)*ldq)+4]); q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); q3 = _mm_sub_pd(q3, _mm_mul_pd(x3, h1)); h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2)); q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2)); q3 = _mm_sub_pd(q3, _mm_mul_pd(y3, h2)); _mm_store_pd(&q[(nb+1)*ldq],q1); _mm_store_pd(&q[((nb+1)*ldq)+2],q2); _mm_store_pd(&q[((nb+1)*ldq)+4],q3); h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[(nb+2)*ldq]); q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); q3 = _mm_load_pd(&q[((nb+2)*ldq)+4]); q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); q3 = _mm_sub_pd(q3, _mm_mul_pd(x3, h1)); _mm_store_pd(&q[(nb+2)*ldq],q1); _mm_store_pd(&q[((nb+2)*ldq)+2],q2); _mm_store_pd(&q[((nb+2)*ldq)+4],q3); } /** * Unrolled kernel that computes * 4 rows of Q simultaneously, a * matrix vector product with two householder * vectors + a rank 1 update is performed */ __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [4 x nb+3] * hh // hh contains four householder vectors ///////////////////////////////////////////////////// int i; __m128d a1_1 = _mm_load_pd(&q[ldq*3]); __m128d a2_1 = _mm_load_pd(&q[ldq*2]); __m128d a3_1 = _mm_load_pd(&q[ldq]); __m128d a4_1 = _mm_load_pd(&q[0]); __m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]); __m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]); __m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]); __m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]); __m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]); __m128d 
h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]); __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3)); w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2)); w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1)); __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2)); z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1)); __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1)); __m128d x1 = a1_1; __m128d a1_2 = _mm_load_pd(&q[(ldq*3)+2]); __m128d a2_2 = _mm_load_pd(&q[(ldq*2)+2]); __m128d a3_2 = _mm_load_pd(&q[ldq+2]); __m128d a4_2 = _mm_load_pd(&q[0+2]); __m128d w2 = _mm_add_pd(a4_2, _mm_mul_pd(a3_2, h_4_3)); w2 = _mm_add_pd(w2, _mm_mul_pd(a2_2, h_4_2)); w2 = _mm_add_pd(w2, _mm_mul_pd(a1_2, h_4_1)); __m128d z2 = _mm_add_pd(a3_2, _mm_mul_pd(a2_2, h_3_2)); z2 = _mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1)); __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1)); __m128d x2 = a1_2; __m128d q1; __m128d q2; __m128d h1; __m128d h2; __m128d h3; __m128d h4; for(i = 4; i < nb; i++) { h1 = _mm_loaddup_pd(&hh[i-3]); h2 = _mm_loaddup_pd(&hh[ldh+i-2]); h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); q1 = _mm_load_pd(&q[i*ldq]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4)); q2 = _mm_load_pd(&q[(i*ldq)+2]); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4)); } h1 = _mm_loaddup_pd(&hh[nb-3]); h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); q1 = _mm_load_pd(&q[nb*ldq]); q2 = _mm_load_pd(&q[(nb*ldq)+2]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3)); z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3)); h1 = _mm_loaddup_pd(&hh[nb-2]); h2 = _mm_loaddup_pd(&hh[(ldh*1)+nb-1]); q1 = _mm_load_pd(&q[(nb+1)*ldq]); q2 = 
_mm_load_pd(&q[((nb+1)*ldq)+2]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[(nb+2)*ldq]); q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); ///////////////////////////////////////////////////// // Rank-1 update of Q [4 x nb+3] ///////////////////////////////////////////////////// __m128d tau1 = _mm_loaddup_pd(&hh[0]); __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); __m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]); __m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]); __m128d vs_1_2 = _mm_loaddup_pd(&s_1_2); __m128d vs_1_3 = _mm_loaddup_pd(&s_1_3); __m128d vs_2_3 = _mm_loaddup_pd(&s_2_3); __m128d vs_1_4 = _mm_loaddup_pd(&s_1_4); __m128d vs_2_4 = _mm_loaddup_pd(&s_2_4); __m128d vs_3_4 = _mm_loaddup_pd(&s_3_4); h1 = tau1; x1 = _mm_mul_pd(x1, h1); x2 = _mm_mul_pd(x2, h1); h1 = tau2; h2 = _mm_mul_pd(h1, vs_1_2); y1 = _mm_sub_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); y2 = _mm_sub_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); h1 = tau3; h2 = _mm_mul_pd(h1, vs_1_3); h3 = _mm_mul_pd(h1, vs_2_3); z1 = _mm_sub_pd(_mm_mul_pd(z1,h1), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))); z2 = _mm_sub_pd(_mm_mul_pd(z2,h1), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))); h1 = tau4; h2 = _mm_mul_pd(h1, vs_1_4); h3 = _mm_mul_pd(h1, vs_2_4); h4 = _mm_mul_pd(h1, vs_3_4); w1 = _mm_sub_pd(_mm_mul_pd(w1,h1), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))); w2 = _mm_sub_pd(_mm_mul_pd(w2,h1), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))); q1 = _mm_load_pd(&q[0]); q2 = _mm_load_pd(&q[2]); q1 = _mm_sub_pd(q1, w1); q2 = _mm_sub_pd(q2, w2); _mm_store_pd(&q[0],q1); _mm_store_pd(&q[2],q2); h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]); q1 = _mm_load_pd(&q[ldq]); q2 = _mm_load_pd(&q[ldq+2]); q1 = _mm_sub_pd(q1, _mm_add_pd(z1, 
_mm_mul_pd(w1, h4))); q2 = _mm_sub_pd(q2, _mm_add_pd(z2, _mm_mul_pd(w2, h4))); _mm_store_pd(&q[ldq],q1); _mm_store_pd(&q[ldq+2],q2); h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]); h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]); q1 = _mm_load_pd(&q[ldq*2]); q2 = _mm_load_pd(&q[(ldq*2)+2]); q1 = _mm_sub_pd(q1, _mm_add_pd(y1, _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4)))); q2 = _mm_sub_pd(q2, _mm_add_pd(y2, _mm_add_pd(_mm_mul_pd(z2, h3), _mm_mul_pd(w2, h4)))); _mm_store_pd(&q[ldq*2],q1); _mm_store_pd(&q[(ldq*2)+2],q2); h2 = _mm_loaddup_pd(&hh[ldh+1]); h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]); h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]); q1 = _mm_load_pd(&q[ldq*3]); q2 = _mm_load_pd(&q[(ldq*3)+2]); q1 = _mm_sub_pd(q1, _mm_add_pd(x1, _mm_add_pd(_mm_mul_pd(y1, h2), _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4))))); q2 = _mm_sub_pd(q2, _mm_add_pd(x2, _mm_add_pd(_mm_mul_pd(y2, h2), _mm_add_pd(_mm_mul_pd(z2, h3), _mm_mul_pd(w2, h4))))); _mm_store_pd(&q[ldq*3], q1); _mm_store_pd(&q[(ldq*3)+2], q2); for (i = 4; i < nb; i++) { h1 = _mm_loaddup_pd(&hh[i-3]); h2 = _mm_loaddup_pd(&hh[ldh+i-2]); h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]); h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]); q1 = _mm_load_pd(&q[i*ldq]); q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1, h4), _mm_mul_pd(z1, h3)), _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2)))); _mm_store_pd(&q[i*ldq],q1); q2 = _mm_load_pd(&q[(i*ldq)+2]); q2 = _mm_sub_pd(q2, _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2, h4), _mm_mul_pd(z2, h3)), _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2)))); _mm_store_pd(&q[(i*ldq)+2],q2); } h1 = _mm_loaddup_pd(&hh[nb-3]); h2 = _mm_loaddup_pd(&hh[ldh+nb-2]); h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]); q1 = _mm_load_pd(&q[nb*ldq]); q2 = _mm_load_pd(&q[(nb*ldq)+2]); q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(y1, h2)) , _mm_mul_pd(x1, h1))); q2 = _mm_sub_pd(q2, _mm_add_pd(_mm_add_pd(_mm_mul_pd(z2, h3), _mm_mul_pd(y2, h2)) , _mm_mul_pd(x2, h1))); _mm_store_pd(&q[nb*ldq],q1); _mm_store_pd(&q[(nb*ldq)+2],q2); h1 = 
_mm_loaddup_pd(&hh[nb-2]); h2 = _mm_loaddup_pd(&hh[ldh+nb-1]); q1 = _mm_load_pd(&q[(nb+1)*ldq]); q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]); q1 = _mm_sub_pd(q1, _mm_add_pd( _mm_mul_pd(y1, h2) , _mm_mul_pd(x1, h1))); q2 = _mm_sub_pd(q2, _mm_add_pd( _mm_mul_pd(y2, h2) , _mm_mul_pd(x2, h1))); _mm_store_pd(&q[(nb+1)*ldq],q1); _mm_store_pd(&q[((nb+1)*ldq)+2],q2); h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[(nb+2)*ldq]); q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]); q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1)); q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1)); _mm_store_pd(&q[(nb+2)*ldq],q1); _mm_store_pd(&q[((nb+2)*ldq)+2],q2); }

/**
 * Unrolled kernel that computes
 * 2 rows of Q simultaneously, a
 * matrix vector product with four householder
 * vectors + a rank 1 update is performed
 *
 * Layout as used by the code below:
 *  - q:   two adjacent doubles are read and written at each of the nb+3
 *         positions q[k*ldq], k = 0 .. nb+2. Aligned load/store intrinsics
 *         (_mm_load_pd / _mm_store_pd) are used, so every &q[k*ldq] has to be
 *         16-byte aligned.
 *  - hh:  column c (c = 0..3) starts at hh[c*ldh]. hh[c*ldh] itself is used
 *         as the scaling factor tau(c+1); the remaining entries of a column
 *         are used as Householder vector coefficients.
 *  - nb:  loop bound; positions 0..3 and nb..nb+2 are always touched, so the
 *         code presumably expects nb >= 4 (TODO confirm against the caller).
 *  - ldq, ldh: strides (in doubles) of q and hh.
 *  - s_i_j: six scalars supplied by the caller that couple the four
 *         accumulators; presumably the scalar products of Householder
 *         vectors i and j -- they are not computed here.
 *
 * q is updated in place; nothing is returned.
 */
__forceinline void hh_trafo_kernel_2_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4)
{
	/////////////////////////////////////////////////////
	// Matrix Vector Multiplication, Q [2 x nb+3] * hh
	// hh contains four householder vectors
	/////////////////////////////////////////////////////
	int i;

	// First four positions of q. Note the reversed numbering:
	// a1_1 is position 3, a4_1 is position 0.
	__m128d a1_1 = _mm_load_pd(&q[ldq*3]);
	__m128d a2_1 = _mm_load_pd(&q[ldq*2]);
	__m128d a3_1 = _mm_load_pd(&q[ldq]);
	__m128d a4_1 = _mm_load_pd(&q[0]);

	// Leading coefficients of columns 1..3 of hh, each broadcast to both lanes.
	__m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]);
	__m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]);
	__m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]);
	__m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]);
	__m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]);
	__m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]);

	// Start values of the four accumulators. The leading q term of each
	// accumulator is added without a coefficient (implicit factor 1).
	// w1: column 3 of hh, starts at position 0
	__m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3));
	w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2));
	w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1));
	// z1: column 2 of hh, starts at position 1
	__m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2));
	z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1));
	// y1: column 1 of hh, starts at position 2
	__m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1));
	// x1: column 0 of hh, starts at position 3
	__m128d x1 = a1_1;

	__m128d q1;

	__m128d h1;
	__m128d h2;
	__m128d h3;
	__m128d h4;

	// Positions 4 .. nb-1 contribute to all four accumulators. Column c is
	// read with an offset of (3 - c) relative to i.
	for(i = 4; i < nb; i++)
	{
		h1 = _mm_loaddup_pd(&hh[i-3]);
		h2 = _mm_loaddup_pd(&hh[ldh+i-2]);
		h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]);
		h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]);

		q1 = _mm_load_pd(&q[i*ldq]);

		x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
		y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));
		z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3));
		w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4));
	}

	// Position nb: contributes to x1, y1 and z1 only.
	h1 = _mm_loaddup_pd(&hh[nb-3]);
	h2 = _mm_loaddup_pd(&hh[ldh+nb-2]);
	h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]);

	q1 = _mm_load_pd(&q[nb*ldq]);

	x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
	y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));
	z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3));

	// Position nb+1: contributes to x1 and y1 only.
	h1 = _mm_loaddup_pd(&hh[nb-2]);
	h2 = _mm_loaddup_pd(&hh[(ldh*1)+nb-1]);

	q1 = _mm_load_pd(&q[(nb+1)*ldq]);

	x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
	y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));

	// Position nb+2: contributes to x1 only.
	h1 = _mm_loaddup_pd(&hh[nb-1]);

	q1 = _mm_load_pd(&q[(nb+2)*ldq]);

	x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));

	/////////////////////////////////////////////////////
	// Rank-1 update of Q [2 x nb+3]
	/////////////////////////////////////////////////////

	// First entry of every hh column is its scaling factor.
	__m128d tau1 = _mm_loaddup_pd(&hh[0]);
	__m128d tau2 = _mm_loaddup_pd(&hh[ldh]);
	__m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]);
	__m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]);

	// Broadcast the caller-supplied coupling scalars to both lanes.
	__m128d vs_1_2 = _mm_loaddup_pd(&s_1_2);
	__m128d vs_1_3 = _mm_loaddup_pd(&s_1_3);
	__m128d vs_2_3 = _mm_loaddup_pd(&s_2_3);
	__m128d vs_1_4 = _mm_loaddup_pd(&s_1_4);
	__m128d vs_2_4 = _mm_loaddup_pd(&s_2_4);
	__m128d vs_3_4 = _mm_loaddup_pd(&s_3_4);

	// x1 = tau1 * x1
	h1 = tau1;
	x1 = _mm_mul_pd(x1, h1);

	// y1 = tau2 * y1 - (tau2 * s_1_2) * x1
	h1 = tau2;
	h2 = _mm_mul_pd(h1, vs_1_2);
	y1 = _mm_sub_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2));

	// z1 = tau3 * z1 - ((tau3 * s_2_3) * y1 + (tau3 * s_1_3) * x1)
	h1 = tau3;
	h2 = _mm_mul_pd(h1, vs_1_3);
	h3 = _mm_mul_pd(h1, vs_2_3);
	z1 = _mm_sub_pd(_mm_mul_pd(z1,h1), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)));

	// w1 = tau4 * w1 - ((tau4 * s_3_4) * z1 + (tau4 * s_2_4) * y1 + (tau4 * s_1_4) * x1)
	h1 = tau4;
	h2 = _mm_mul_pd(h1, vs_1_4);
	h3 = _mm_mul_pd(h1, vs_2_4);
	h4 = _mm_mul_pd(h1, vs_3_4);
	w1 = _mm_sub_pd(_mm_mul_pd(w1,h1), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))));

	// Write-back. Positions 0..3 mirror the start-up phase above: the
	// leading accumulator of each position is subtracted without a coefficient.
	q1 = _mm_load_pd(&q[0]);
	q1 = _mm_sub_pd(q1, w1);
	_mm_store_pd(&q[0],q1);

	h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]);
	q1 = _mm_load_pd(&q[ldq]);

	q1 = _mm_sub_pd(q1, _mm_add_pd(z1, _mm_mul_pd(w1, h4)));

	_mm_store_pd(&q[ldq],q1);

	h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]);
	h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]);
	q1 = _mm_load_pd(&q[ldq*2]);

	q1 = _mm_sub_pd(q1, _mm_add_pd(y1, _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4))));

	_mm_store_pd(&q[ldq*2],q1);

	h2 = _mm_loaddup_pd(&hh[ldh+1]);
	h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]);
	h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]);
	q1 = _mm_load_pd(&q[ldq*3]);

	q1 = _mm_sub_pd(q1, _mm_add_pd(x1, _mm_add_pd(_mm_mul_pd(y1, h2), _mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(w1, h4)))));

	_mm_store_pd(&q[ldq*3], q1);

	// Positions 4 .. nb-1 receive contributions from all four accumulators.
	for (i = 4; i < nb; i++)
	{
		h1 = _mm_loaddup_pd(&hh[i-3]);
		h2 = _mm_loaddup_pd(&hh[ldh+i-2]);
		h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-1]);
		h4 = _mm_loaddup_pd(&hh[(ldh*3)+i]);

		q1 = _mm_load_pd(&q[i*ldq]);

		q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1, h4), _mm_mul_pd(z1, h3)), _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))));

		_mm_store_pd(&q[i*ldq],q1);
	}

	// Position nb: x1, y1 and z1 only.
	h1 = _mm_loaddup_pd(&hh[nb-3]);
	h2 = _mm_loaddup_pd(&hh[ldh+nb-2]);
	h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]);
	q1 = _mm_load_pd(&q[nb*ldq]);

	q1 = _mm_sub_pd(q1, _mm_add_pd(_mm_add_pd(_mm_mul_pd(z1, h3), _mm_mul_pd(y1, h2)) , _mm_mul_pd(x1, h1)));

	_mm_store_pd(&q[nb*ldq],q1);

	// Position nb+1: x1 and y1 only.
	h1 = _mm_loaddup_pd(&hh[nb-2]);
	h2 = _mm_loaddup_pd(&hh[ldh+nb-1]);
	q1 = _mm_load_pd(&q[(nb+1)*ldq]);

	q1 = _mm_sub_pd(q1, _mm_add_pd( _mm_mul_pd(y1, h2) , _mm_mul_pd(x1, h1)));

	_mm_store_pd(&q[(nb+1)*ldq],q1);

	// Position nb+2: x1 only.
	h1 = _mm_loaddup_pd(&hh[nb-1]);
	q1 = _mm_load_pd(&q[(nb+2)*ldq]);

	q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1));

	_mm_store_pd(&q[(nb+2)*ldq],q1);
}
elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.c0000644000312500001440000014200612717516040021417 00000000000000// This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
// // // -------------------------------------------------------------------------------------------------- // // This file contains the compute intensive kernels for the Householder transformations. // It should be compiled with the highest possible optimization level. // // On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 // On Intel Sandy Bridge use -O3 -mavx // // Copyright of the original code rests with the authors inside the ELPA // consortium. The copyright of any additional modifications shall rest // with their original authors, but shall adhere to the licensing terms // distributed along with the original code in the file "COPYING". // // Author: Alexander Heinecke (alexander.heinecke@mytum.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // -------------------------------------------------------------------------------------------------- #include "config-f90.h" #include #include #define __forceinline __attribute__((always_inline)) #ifdef HAVE_SSE_INTRINSICS #undef __AVX__ #endif //Forward declaration static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s); static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s); static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s); static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s); /* !f>#ifdef HAVE_SSE_INTRINSICS !f> interface !f> subroutine double_hh_trafo_complex_sse_2hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="double_hh_trafo_complex_sse_2hv") !f> use, intrinsic :: iso_c_binding !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh !f> complex(kind=c_double) :: q(*) !f> complex(kind=c_double) :: 
hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/

/**
 * Driver: applies two complex Householder vectors to q.
 *
 * All scalar arguments are passed by pointer (Fortran calling convention,
 * see the interface block above).
 *
 *  - q:    complex matrix data, updated in place; position k of a stripe is
 *          at q[k*ldq].
 *  - hh:   two columns of Householder data; column 0 starts at hh[0],
 *          column 1 at hh[ldh].
 *  - pnb:  number of coefficients per Householder vector (nb).
 *  - pnq:  NOTE(review): never read. The stripe width nq is taken from
 *          *pldq instead, so the whole leading dimension is processed in
 *          groups of four. Presumably intentional (padded/aligned stripes);
 *          confirm with the caller before changing.
 *  - pldq, pldh: leading dimensions of q and hh.
 */
void double_hh_trafo_complex_sse_2hv(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
	int i;
	int nb = *pnb;
	int nq = *pldq;
	int ldq = *pldq;
	int ldh = *pldh;

	// s = conj(hh[ldh+1]) + sum_{i=2}^{nb-1} hh[i-1] * conj(hh[ldh+i]):
	// a scalar product of the two Householder columns, shifted by one entry
	// against each other; the first term uses an implicit factor 1.0.
	double complex s = conj(hh[(ldh)+1])*1.0;
	for (i = 2; i < nb; i++)
	{
		s += hh[i-1] * conj(hh[(i+ldh)]);
	}

	// Only the 4-wide kernel is active; the 3/2/1-wide variant below is
	// compiled out.
#if 1
	for (i = 0; i < nq; i+=4)
	{
		hh_trafo_complex_kernel_4_SSE_2hv(&q[i], hh, nb, ldq, ldh, s);
	}
#else
	for (i = 0; i < nq-2; i+=3)
	{
		hh_trafo_complex_kernel_3_SSE_2hv(&q[i], hh, nb, ldq, ldh, s);
	}
	if (nq-i > 1)
	{
		hh_trafo_complex_kernel_2_SSE_2hv(&q[i], hh, nb, ldq, ldh, s);
	}
	else if (nq-i > 0)
	{
		hh_trafo_complex_kernel_1_SSE_2hv(&q[i], hh, nb, ldq, ldh, s);
	}
#endif
}

/**
 * Kernel working on 4 consecutive complex values (8 doubles, four __m128d
 * registers) at each of the nb+1 positions q[k*ldq], k = 0 .. nb.
 *
 * With h1 = column 0 of hh and h2 = column 1 of hh it computes
 *   x = q[1] + sum_{i=2}^{nb-1} conj(h1[i-1]) * q[i] + conj(h1[nb-1]) * q[nb]
 *   y = q[0] + conj(h2[1]) * q[1] + sum_{i=2}^{nb-1} conj(h2[i]) * q[i]
 * then scales
 *   x = -h1[0] * x
 *   y = -h2[0] * y + (-h2[0] * s) * x
 * and updates q in place:
 *   q[0]  += y
 *   q[1]  += x + h2[1] * y
 *   q[i]  += h1[i-1] * x + h2[i] * y      (i = 2 .. nb-1)
 *   q[nb] += h1[nb-1] * x
 * (q[k] is shorthand for the 4 complex values at q[k*ldq].)
 *
 * Complex products are formed from a broadcast real part, a broadcast
 * imaginary part and a lane swap (_mm_shuffle_pd), combined with
 * _mm_addsub_pd. With __ELPA_USE_FMA__ defined the fused
 * _mm_msubadd_pd/_mm_maddsub_pd forms are used instead; _mm_msubadd_pd
 * presumably supplies the conjugation itself, which is why the explicit
 * sign flip of the imaginary part is compiled out in that case.
 *
 * q must be 16-byte aligned at every accessed position (aligned
 * _mm_load_pd/_mm_store_pd). s is passed by value and is overwritten
 * locally only.
 */
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
{
	// View the complex arrays as interleaved (real, imag) doubles.
	double* q_dbl = (double*)q;
	double* hh_dbl = (double*)hh;
	double* s_dbl = (double*)(&s);

	__m128d x1, x2, x3, x4;
	__m128d y1, y2, y3, y4;
	__m128d q1, q2, q3, q4;
	__m128d h1_real, h1_imag, h2_real, h2_imag;
	__m128d tmp1, tmp2, tmp3, tmp4;
	int i=0;

	// Sign-bit mask for both lanes; XOR with it negates a register.
	__m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);

	// x starts with position 1 of q.
	x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]);
	x2 = _mm_load_pd(&q_dbl[(2*ldq)+2]);
	x3 = _mm_load_pd(&q_dbl[(2*ldq)+4]);
	x4 = _mm_load_pd(&q_dbl[(2*ldq)+6]);

	h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
	h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
#ifndef __ELPA_USE_FMA__
	// conjugate
	h2_imag = _mm_xor_pd(h2_imag, sign);
#endif

	// y starts with position 0 of q ...
	y1 = _mm_load_pd(&q_dbl[0]);
	y2 = _mm_load_pd(&q_dbl[2]);
	y3 = _mm_load_pd(&q_dbl[4]);
	y4 = _mm_load_pd(&q_dbl[6]);

	// ... plus conj(h2[1]) * q[1]
	tmp1 = _mm_mul_pd(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#else
	y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#endif
	tmp2 = _mm_mul_pd(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
	y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#else
	y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#endif
	tmp3 = _mm_mul_pd(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
	y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
	y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
	tmp4 = _mm_mul_pd(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
	y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
	y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

	// Accumulate positions 2 .. nb-1 into x (with h1[i-1]) and y (with h2[i]).
	for (i = 2; i < nb; i++)
	{
		q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]);
		q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]);
		q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]);

		h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]);
		h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
		// conjugate
		h1_imag = _mm_xor_pd(h1_imag, sign);
#endif

		tmp1 = _mm_mul_pd(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
		x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#else
		x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#endif
		tmp2 = _mm_mul_pd(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
		x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#else
		x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#endif
		tmp3 = _mm_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
		x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
		x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
		tmp4 = _mm_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
		x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
		x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

		h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
		h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
#ifndef __ELPA_USE_FMA__
		// conjugate
		h2_imag = _mm_xor_pd(h2_imag, sign);
#endif

		tmp1 = _mm_mul_pd(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
		y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#else
		y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#endif
		tmp2 = _mm_mul_pd(h2_imag, q2);
#ifdef __ELPA_USE_FMA__
		y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#else
		y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#endif
		tmp3 = _mm_mul_pd(h2_imag, q3);
#ifdef __ELPA_USE_FMA__
		y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
		y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
		tmp4 = _mm_mul_pd(h2_imag, q4);
#ifdef __ELPA_USE_FMA__
		y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
		y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif
	}

	// Position nb contributes to x only (with h1[nb-1]).
	h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
	h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
	// conjugate
	h1_imag = _mm_xor_pd(h1_imag, sign);
#endif

	q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]);
	q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]);
	q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]);
	q4 = _mm_load_pd(&q_dbl[(2*nb*ldq)+6]);

	tmp1 = _mm_mul_pd(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#else
	x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#endif
	tmp2 = _mm_mul_pd(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#else
	x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#endif
	tmp3 = _mm_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
	x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
	x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
	tmp4 = _mm_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
	x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
	x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

	// x = -h1[0] * x  (both parts of h1[0] are negated via the sign mask)
	h1_real = _mm_loaddup_pd(&hh_dbl[0]);
	h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
	h1_real = _mm_xor_pd(h1_real, sign);
	h1_imag = _mm_xor_pd(h1_imag, sign);

	tmp1 = _mm_mul_pd(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)));
#else
	x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)));
#endif
	tmp2 = _mm_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)));
#else
	x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)));
#endif
	tmp3 = _mm_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
	x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#else
	x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#endif
	tmp4 = _mm_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
	x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#else
	x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#endif

	// Both h1_* and h2_* are loaded with -h2[0] (first entry of column 1):
	// h1_* scales y, h2_* is used to scale s and is then replaced by the
	// scaled s.
	h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
	h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
	h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
	h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);

	h1_real = _mm_xor_pd(h1_real, sign);
	h1_imag = _mm_xor_pd(h1_imag, sign);
	h2_real = _mm_xor_pd(h2_real, sign);
	h2_imag = _mm_xor_pd(h2_imag, sign);

	// s = -h2[0] * s, computed in registers and written back to the local
	// copy of s so that its real and imaginary parts can be broadcast.
	// Unaligned load/store because s is a plain stack variable.
	tmp2 = _mm_loadu_pd(s_dbl);
	tmp1 = _mm_mul_pd(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
	tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)));
#else
	tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)));
#endif
	_mm_storeu_pd(s_dbl, tmp2);
	h2_real = _mm_loaddup_pd(&s_dbl[0]);
	h2_imag = _mm_loaddup_pd(&s_dbl[1]);

	// y = -h2[0] * y
	tmp1 = _mm_mul_pd(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)));
#else
	y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)));
#endif
	tmp2 = _mm_mul_pd(h1_imag, y2);
#ifdef __ELPA_USE_FMA__
	y2 = _mm_maddsub_pd(h1_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)));
#else
	y2 = _mm_addsub_pd( _mm_mul_pd(h1_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)));
#endif
	tmp3 = _mm_mul_pd(h1_imag, y3);
#ifdef __ELPA_USE_FMA__
	y3 = _mm_maddsub_pd(h1_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#else
	y3 = _mm_addsub_pd( _mm_mul_pd(h1_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#endif
	tmp4 = _mm_mul_pd(h1_imag, y4);
#ifdef __ELPA_USE_FMA__
	y4 = _mm_maddsub_pd(h1_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#else
	y4 = _mm_addsub_pd( _mm_mul_pd(h1_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#endif

	// y += s * x  (s already scaled by -h2[0], x already scaled by -h1[0])
	tmp1 = _mm_mul_pd(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#else
	y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#endif
	tmp2 = _mm_mul_pd(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
	y2 = _mm_add_pd(y2, _mm_maddsub_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#else
	y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#endif
	tmp3 = _mm_mul_pd(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
	y3 = _mm_add_pd(y3, _mm_maddsub_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
	y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
	tmp4 = _mm_mul_pd(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
	y4 = _mm_add_pd(y4, _mm_maddsub_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
	y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

	// Write-back, position 0: q[0] += y
	q1 = _mm_load_pd(&q_dbl[0]);
	q2 = _mm_load_pd(&q_dbl[2]);
	q3 = _mm_load_pd(&q_dbl[4]);
	q4 = _mm_load_pd(&q_dbl[6]);

	q1 = _mm_add_pd(q1, y1);
	q2 = _mm_add_pd(q2, y2);
	q3 = _mm_add_pd(q3, y3);
	q4 = _mm_add_pd(q4, y4);

	_mm_store_pd(&q_dbl[0], q1);
	_mm_store_pd(&q_dbl[2], q2);
	_mm_store_pd(&q_dbl[4], q3);
	_mm_store_pd(&q_dbl[6], q4);

	// Position 1: q[1] += x + h2[1] * y  (no conjugation in the update phase)
	h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
	h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);

	q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]);
	q2 = _mm_load_pd(&q_dbl[(ldq*2)+2]);
	q3 = _mm_load_pd(&q_dbl[(ldq*2)+4]);
	q4 = _mm_load_pd(&q_dbl[(ldq*2)+6]);

	q1 = _mm_add_pd(q1, x1);
	q2 = _mm_add_pd(q2, x2);
	q3 = _mm_add_pd(q3, x3);
	q4 = _mm_add_pd(q4, x4);

	tmp1 = _mm_mul_pd(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
	q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#else
	q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#endif
	tmp2 = _mm_mul_pd(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
	q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#else
	q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#endif
	tmp3 = _mm_mul_pd(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
	q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
	q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
	tmp4 = _mm_mul_pd(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
	q4 = _mm_add_pd(q4, _mm_maddsub_pd(h2_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
	q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h2_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

	_mm_store_pd(&q_dbl[(ldq*2)+0], q1);
	_mm_store_pd(&q_dbl[(ldq*2)+2], q2);
	_mm_store_pd(&q_dbl[(ldq*2)+4], q3);
	_mm_store_pd(&q_dbl[(ldq*2)+6], q4);

	// Positions 2 .. nb-1: q[i] += h1[i-1] * x + h2[i] * y
	for (i = 2; i < nb; i++)
	{
		q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]);
		q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]);
		q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]);

		h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]);
		h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]);

		tmp1 = _mm_mul_pd(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
		q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#else
		q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#endif
		tmp2 = _mm_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#else
		q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#endif
		tmp3 = _mm_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
		q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
		q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
		tmp4 = _mm_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
		q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
		q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

		h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
		h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);

		tmp1 = _mm_mul_pd(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
		q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#else
		q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#endif
		tmp2 = _mm_mul_pd(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#else
		q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#endif
		tmp3 = _mm_mul_pd(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
		q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
		q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
		tmp4 = _mm_mul_pd(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
		q4 = _mm_add_pd(q4, _mm_maddsub_pd(h2_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
		q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h2_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

		_mm_store_pd(&q_dbl[(2*i*ldq)+0], q1);
		_mm_store_pd(&q_dbl[(2*i*ldq)+2], q2);
		_mm_store_pd(&q_dbl[(2*i*ldq)+4], q3);
		_mm_store_pd(&q_dbl[(2*i*ldq)+6], q4);
	}

	// Position nb: q[nb] += h1[nb-1] * x
	h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
	h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);

	q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]);
	q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]);
	q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]);
	q4 = _mm_load_pd(&q_dbl[(2*nb*ldq)+6]);

	tmp1 = _mm_mul_pd(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
	q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#else
	q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))));
#endif
	tmp2 = _mm_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#else
	q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))));
#endif
	tmp3 = _mm_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
	q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
	q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
	tmp4 = _mm_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
	q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
	q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

	_mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1);
	_mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2);
	_mm_store_pd(&q_dbl[(2*nb*ldq)+4], q3);
	_mm_store_pd(&q_dbl[(2*nb*ldq)+6], q4);
}

static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; double* s_dbl = (double*)(&s); __m128d x1, x2, x3; __m128d y1, y2, y3; __m128d q1, q2, q3; __m128d h1_real, h1_imag, h2_real, h2_imag; __m128d tmp1, tmp2, tmp3; int i=0; __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); x2 = _mm_load_pd(&q_dbl[(2*ldq)+2]); x3 = _mm_load_pd(&q_dbl[(2*ldq)+4]); h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h2_imag = _mm_xor_pd(h2_imag, sign); #endif y1 = _mm_load_pd(&q_dbl[0]); y2 = _mm_load_pd(&q_dbl[2]); y3 = _mm_load_pd(&q_dbl[4]); tmp1 = _mm_mul_pd(h2_imag, x1); #ifdef __ELPA_USE_FMA__ y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1,
tmp1, _MM_SHUFFLE2(0,1)))); #else y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h2_imag, x2); #ifdef __ELPA_USE_FMA__ y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif tmp3 = _mm_mul_pd(h2_imag, x3); #ifdef __ELPA_USE_FMA__ y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #else y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #endif for (i = 2; i < nb; i++) { q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _mm_xor_pd(h1_imag, sign); #endif tmp1 = _mm_mul_pd(h1_imag, q1); #ifdef __ELPA_USE_FMA__ x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h1_imag, q2); #ifdef __ELPA_USE_FMA__ x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif tmp3 = _mm_mul_pd(h1_imag, q3); #ifdef __ELPA_USE_FMA__ x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #else x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #endif h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h2_imag = 
_mm_xor_pd(h2_imag, sign); #endif tmp1 = _mm_mul_pd(h2_imag, q1); #ifdef __ELPA_USE_FMA__ y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h2_imag, q2); #ifdef __ELPA_USE_FMA__ y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif tmp3 = _mm_mul_pd(h2_imag, q3); #ifdef __ELPA_USE_FMA__ y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #else y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #endif } h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _mm_xor_pd(h1_imag, sign); #endif q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]); tmp1 = _mm_mul_pd(h1_imag, q1); #ifdef __ELPA_USE_FMA__ x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h1_imag, q2); #ifdef __ELPA_USE_FMA__ x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif tmp3 = _mm_mul_pd(h1_imag, q3); #ifdef __ELPA_USE_FMA__ x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #else x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #endif h1_real = 
_mm_loaddup_pd(&hh_dbl[0]); h1_imag = _mm_loaddup_pd(&hh_dbl[1]); h1_real = _mm_xor_pd(h1_real, sign); h1_imag = _mm_xor_pd(h1_imag, sign); tmp1 = _mm_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #else x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #endif tmp2 = _mm_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); #else x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); #endif tmp3 = _mm_mul_pd(h1_imag, x3); #ifdef __ELPA_USE_FMA__ x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); #else x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); #endif h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); h1_real = _mm_xor_pd(h1_real, sign); h1_imag = _mm_xor_pd(h1_imag, sign); h2_real = _mm_xor_pd(h2_real, sign); h2_imag = _mm_xor_pd(h2_imag, sign); tmp2 = _mm_loadu_pd(s_dbl); tmp1 = _mm_mul_pd(h2_imag, tmp2); #ifdef __ELPA_USE_FMA__ tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #else tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #endif _mm_storeu_pd(s_dbl, tmp2); h2_real = _mm_loaddup_pd(&s_dbl[0]); h2_imag = _mm_loaddup_pd(&s_dbl[1]); tmp1 = _mm_mul_pd(h1_imag, y1); #ifdef __ELPA_USE_FMA__ y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #else y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #endif tmp2 = _mm_mul_pd(h1_imag, y2); #ifdef __ELPA_USE_FMA__ y2 = _mm_maddsub_pd(h1_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); #else y2 = _mm_addsub_pd( 
_mm_mul_pd(h1_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); #endif tmp3 = _mm_mul_pd(h1_imag, y3); #ifdef __ELPA_USE_FMA__ y3 = _mm_maddsub_pd(h1_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); #else y3 = _mm_addsub_pd( _mm_mul_pd(h1_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))); #endif tmp1 = _mm_mul_pd(h2_imag, x1); #ifdef __ELPA_USE_FMA__ y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h2_imag, x2); #ifdef __ELPA_USE_FMA__ y2 = _mm_add_pd(y2, _mm_maddsub_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif tmp3 = _mm_mul_pd(h2_imag, x3); #ifdef __ELPA_USE_FMA__ y3 = _mm_add_pd(y3, _mm_maddsub_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #else y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #endif q1 = _mm_load_pd(&q_dbl[0]); q2 = _mm_load_pd(&q_dbl[2]); q3 = _mm_load_pd(&q_dbl[4]); q1 = _mm_add_pd(q1, y1); q2 = _mm_add_pd(q2, y2); q3 = _mm_add_pd(q3, y3); _mm_store_pd(&q_dbl[0], q1); _mm_store_pd(&q_dbl[2], q2); _mm_store_pd(&q_dbl[4], q3); h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]); q2 = _mm_load_pd(&q_dbl[(ldq*2)+2]); q3 = _mm_load_pd(&q_dbl[(ldq*2)+4]); q1 = _mm_add_pd(q1, x1); q2 = _mm_add_pd(q2, x2); q3 = _mm_add_pd(q3, x3); tmp1 = _mm_mul_pd(h2_imag, y1); #ifdef __ELPA_USE_FMA__ q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h2_imag, y2); #ifdef 
__ELPA_USE_FMA__ q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif tmp3 = _mm_mul_pd(h2_imag, y3); #ifdef __ELPA_USE_FMA__ q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #else q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #endif _mm_store_pd(&q_dbl[(ldq*2)+0], q1); _mm_store_pd(&q_dbl[(ldq*2)+2], q2); _mm_store_pd(&q_dbl[(ldq*2)+4], q3); for (i = 2; i < nb; i++) { q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]); h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); tmp1 = _mm_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif tmp3 = _mm_mul_pd(h1_imag, x3); #ifdef __ELPA_USE_FMA__ q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #else q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #endif h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); tmp1 = _mm_mul_pd(h2_imag, y1); #ifdef __ELPA_USE_FMA__ q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), 
_mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h2_imag, y2); #ifdef __ELPA_USE_FMA__ q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif tmp3 = _mm_mul_pd(h2_imag, y3); #ifdef __ELPA_USE_FMA__ q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #else q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #endif _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); _mm_store_pd(&q_dbl[(2*i*ldq)+4], q3); } h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]); tmp1 = _mm_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif tmp3 = _mm_mul_pd(h1_imag, x3); #ifdef __ELPA_USE_FMA__ q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #else q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)))); #endif _mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1); _mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2); _mm_store_pd(&q_dbl[(2*nb*ldq)+4], q3); } static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(double complex* q, double complex* hh, int nb, int 
ldq, int ldh, double complex s) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; double* s_dbl = (double*)(&s); __m128d x1, x2; __m128d y1, y2; __m128d q1, q2; __m128d h1_real, h1_imag, h2_real, h2_imag; __m128d tmp1, tmp2; int i=0; __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); x2 = _mm_load_pd(&q_dbl[(2*ldq)+2]); h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h2_imag = _mm_xor_pd(h2_imag, sign); #endif y1 = _mm_load_pd(&q_dbl[0]); y2 = _mm_load_pd(&q_dbl[2]); tmp1 = _mm_mul_pd(h2_imag, x1); #ifdef __ELPA_USE_FMA__ y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h2_imag, x2); #ifdef __ELPA_USE_FMA__ y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif for (i = 2; i < nb; i++) { q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _mm_xor_pd(h1_imag, sign); #endif tmp1 = _mm_mul_pd(h1_imag, q1); #ifdef __ELPA_USE_FMA__ x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h1_imag, q2); #ifdef __ELPA_USE_FMA__ x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, 
_MM_SHUFFLE2(0,1)))); #endif h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h2_imag = _mm_xor_pd(h2_imag, sign); #endif tmp1 = _mm_mul_pd(h2_imag, q1); #ifdef __ELPA_USE_FMA__ y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h2_imag, q2); #ifdef __ELPA_USE_FMA__ y2 = _mm_add_pd(y2, _mm_msubadd_pd(h2_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif } h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _mm_xor_pd(h1_imag, sign); #endif q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); tmp1 = _mm_mul_pd(h1_imag, q1); #ifdef __ELPA_USE_FMA__ x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h1_imag, q2); #ifdef __ELPA_USE_FMA__ x2 = _mm_add_pd(x2, _mm_msubadd_pd(h1_real, q2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else x2 = _mm_add_pd(x2, _mm_addsub_pd( _mm_mul_pd(h1_real, q2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif h1_real = _mm_loaddup_pd(&hh_dbl[0]); h1_imag = _mm_loaddup_pd(&hh_dbl[1]); h1_real = _mm_xor_pd(h1_real, sign); h1_imag = _mm_xor_pd(h1_imag, sign); tmp1 = _mm_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #else x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #endif tmp2 = _mm_mul_pd(h1_imag, x2); #ifdef 
__ELPA_USE_FMA__ x2 = _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); #else x2 = _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); #endif h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); h1_real = _mm_xor_pd(h1_real, sign); h1_imag = _mm_xor_pd(h1_imag, sign); h2_real = _mm_xor_pd(h2_real, sign); h2_imag = _mm_xor_pd(h2_imag, sign); tmp2 = _mm_loadu_pd(s_dbl); tmp1 = _mm_mul_pd(h2_imag, tmp2); #ifdef __ELPA_USE_FMA__ tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #else tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #endif _mm_storeu_pd(s_dbl, tmp2); h2_real = _mm_loaddup_pd(&s_dbl[0]); h2_imag = _mm_loaddup_pd(&s_dbl[1]); tmp1 = _mm_mul_pd(h1_imag, y1); #ifdef __ELPA_USE_FMA__ y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #else y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #endif tmp2 = _mm_mul_pd(h1_imag, y2); #ifdef __ELPA_USE_FMA__ y2 = _mm_maddsub_pd(h1_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); #else y2 = _mm_addsub_pd( _mm_mul_pd(h1_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1))); #endif tmp1 = _mm_mul_pd(h2_imag, x1); #ifdef __ELPA_USE_FMA__ y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h2_imag, x2); #ifdef __ELPA_USE_FMA__ y2 = _mm_add_pd(y2, _mm_maddsub_pd(h2_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else y2 = _mm_add_pd(y2, _mm_addsub_pd( _mm_mul_pd(h2_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif q1 = _mm_load_pd(&q_dbl[0]); q2 = 
_mm_load_pd(&q_dbl[2]); q1 = _mm_add_pd(q1, y1); q2 = _mm_add_pd(q2, y2); _mm_store_pd(&q_dbl[0], q1); _mm_store_pd(&q_dbl[2], q2); h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]); q2 = _mm_load_pd(&q_dbl[(ldq*2)+2]); q1 = _mm_add_pd(q1, x1); q2 = _mm_add_pd(q2, x2); tmp1 = _mm_mul_pd(h2_imag, y1); #ifdef __ELPA_USE_FMA__ q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h2_imag, y2); #ifdef __ELPA_USE_FMA__ q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif _mm_store_pd(&q_dbl[(ldq*2)+0], q1); _mm_store_pd(&q_dbl[(ldq*2)+2], q2); for (i = 2; i < nb; i++) { q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); q2 = _mm_load_pd(&q_dbl[(2*i*ldq)+2]); h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); tmp1 = _mm_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); tmp1 = _mm_mul_pd(h2_imag, y1); #ifdef __ELPA_USE_FMA__ q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else q1 = _mm_add_pd(q1, 
_mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h2_imag, y2); #ifdef __ELPA_USE_FMA__ q2 = _mm_add_pd(q2, _mm_maddsub_pd(h2_real, y2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h2_real, y2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); } h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); q2 = _mm_load_pd(&q_dbl[(2*nb*ldq)+2]); tmp1 = _mm_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif tmp2 = _mm_mul_pd(h1_imag, x2); #ifdef __ELPA_USE_FMA__ q2 = _mm_add_pd(q2, _mm_maddsub_pd(h1_real, x2, _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #else q2 = _mm_add_pd(q2, _mm_addsub_pd( _mm_mul_pd(h1_real, x2), _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0,1)))); #endif _mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1); _mm_store_pd(&q_dbl[(2*nb*ldq)+2], q2); } static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s) { double* q_dbl = (double*)q; double* hh_dbl = (double*)hh; double* s_dbl = (double*)(&s); __m128d x1; __m128d y1; __m128d q1; __m128d h1_real, h1_imag, h2_real, h2_imag; __m128d tmp1; int i=0; __m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000); x1 = _mm_load_pd(&q_dbl[(2*ldq)+0]); h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h2_imag = _mm_xor_pd(h2_imag, sign); #endif y1 = _mm_load_pd(&q_dbl[0]); tmp1 = _mm_mul_pd(h2_imag, x1); #ifdef __ELPA_USE_FMA__ y1 = _mm_add_pd(y1, 
_mm_msubadd_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif for (i = 2; i < nb; i++) { q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _mm_xor_pd(h1_imag, sign); #endif tmp1 = _mm_mul_pd(h1_imag, q1); #ifdef __ELPA_USE_FMA__ x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h2_imag = _mm_xor_pd(h2_imag, sign); #endif tmp1 = _mm_mul_pd(h2_imag, q1); #ifdef __ELPA_USE_FMA__ y1 = _mm_add_pd(y1, _mm_msubadd_pd(h2_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif } h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); #ifndef __ELPA_USE_FMA__ // conjugate h1_imag = _mm_xor_pd(h1_imag, sign); #endif q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); tmp1 = _mm_mul_pd(h1_imag, q1); #ifdef __ELPA_USE_FMA__ x1 = _mm_add_pd(x1, _mm_msubadd_pd(h1_real, q1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else x1 = _mm_add_pd(x1, _mm_addsub_pd( _mm_mul_pd(h1_real, q1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif h1_real = _mm_loaddup_pd(&hh_dbl[0]); h1_imag = _mm_loaddup_pd(&hh_dbl[1]); h1_real = _mm_xor_pd(h1_real, sign); h1_imag = _mm_xor_pd(h1_imag, sign); tmp1 = _mm_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ x1 = _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #else x1 = _mm_addsub_pd( _mm_mul_pd(h1_real, x1), 
_mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #endif h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]); h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]); h1_real = _mm_xor_pd(h1_real, sign); h1_imag = _mm_xor_pd(h1_imag, sign); h2_real = _mm_xor_pd(h2_real, sign); h2_imag = _mm_xor_pd(h2_imag, sign); __m128d tmp2 = _mm_loadu_pd(s_dbl); tmp1 = _mm_mul_pd(h2_imag, tmp2); #ifdef __ELPA_USE_FMA__ tmp2 = _mm_maddsub_pd(h2_real, tmp2, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #else tmp2 = _mm_addsub_pd( _mm_mul_pd(h2_real, tmp2), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #endif _mm_storeu_pd(s_dbl, tmp2); h2_real = _mm_loaddup_pd(&s_dbl[0]); h2_imag = _mm_loaddup_pd(&s_dbl[1]); tmp1 = _mm_mul_pd(h1_imag, y1); #ifdef __ELPA_USE_FMA__ y1 = _mm_maddsub_pd(h1_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #else y1 = _mm_addsub_pd( _mm_mul_pd(h1_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1))); #endif tmp1 = _mm_mul_pd(h2_imag, x1); #ifdef __ELPA_USE_FMA__ y1 = _mm_add_pd(y1, _mm_maddsub_pd(h2_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else y1 = _mm_add_pd(y1, _mm_addsub_pd( _mm_mul_pd(h2_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif q1 = _mm_load_pd(&q_dbl[0]); q1 = _mm_add_pd(q1, y1); _mm_store_pd(&q_dbl[0], q1); h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]); h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]); q1 = _mm_load_pd(&q_dbl[(ldq*2)+0]); q1 = _mm_add_pd(q1, x1); tmp1 = _mm_mul_pd(h2_imag, y1); #ifdef __ELPA_USE_FMA__ q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif _mm_store_pd(&q_dbl[(ldq*2)+0], q1); for (i = 2; i < nb; i++) { q1 = _mm_load_pd(&q_dbl[(2*i*ldq)+0]); h1_real = _mm_loaddup_pd(&hh_dbl[(i-1)*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[((i-1)*2)+1]); 
tmp1 = _mm_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]); h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]); tmp1 = _mm_mul_pd(h2_imag, y1); #ifdef __ELPA_USE_FMA__ q1 = _mm_add_pd(q1, _mm_maddsub_pd(h2_real, y1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h2_real, y1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif _mm_store_pd(&q_dbl[(2*i*ldq)+0], q1); } h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]); h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]); q1 = _mm_load_pd(&q_dbl[(2*nb*ldq)+0]); tmp1 = _mm_mul_pd(h1_imag, x1); #ifdef __ELPA_USE_FMA__ q1 = _mm_add_pd(q1, _mm_maddsub_pd(h1_real, x1, _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #else q1 = _mm_add_pd(q1, _mm_addsub_pd( _mm_mul_pd(h1_real, x1), _mm_shuffle_pd(tmp1, tmp1, _MM_SHUFFLE2(0,1)))); #endif _mm_store_pd(&q_dbl[(2*nb*ldq)+0], q1); } elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real.F900000644000312500001440000004506012717516040017360 00000000000000! This file is part of ELPA. ! ! The ELPA library was originally created by the ELPA consortium, ! consisting of the following organizations: ! ! - Max Planck Computing and Data Facility (MPCDF), formerly known as ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte ! Informatik, ! - Technische Universität München, Lehrstuhl für Informatik mit ! Schwerpunkt Wissenschaftliches Rechnen , ! - Fritz-Haber-Institut, Berlin, Abt. Theorie, ! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, ! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, ! and ! - IBM Deutschland GmbH ! ! ! More information can be found here: ! http://elpa.mpcdf.mpg.de/ ! ! 
ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! ! -------------------------------------------------------------------------------------------------- ! ! This file contains the compute intensive kernels for the Householder transformations. ! It should be compiled with the highest possible optimization level. ! ! On Intel use -O3 -xSSE4.2 (or the SSE level fitting to your CPU) ! ! Copyright of the original code rests with the authors inside the ELPA ! consortium. The copyright of any additional modifications shall rest ! with their original authors, but shall adhere to the licensing terms ! distributed along with the original code in the file "COPYING". ! ! 
-------------------------------------------------------------------------------------------------- #include "config-f90.h" #ifdef DESPERATELY_WANT_ASSUMED_SIZE #define PACK_REAL_TO_COMPLEX #else #undef PACK_REAL_TO_COMPLEX #endif #ifndef DESPERATELY_WANT_ASSUMED_SIZE module real_generic_kernel private public double_hh_trafo_generic contains #endif subroutine double_hh_trafo_generic(q, hh, nb, nq, ldq, ldh) use precision #ifdef HAVE_DETAILED_TIMINGS use timings #endif use iso_c_binding implicit none integer(kind=ik), intent(in) :: nb, nq, ldq, ldh #ifdef DESPERATELY_WANT_ASSUMED_SIZE real(kind=rk), intent(inout) :: q(ldq,*) real(kind=rk), intent(in) :: hh(ldh,*) #else real(kind=rk), intent(inout) :: q(1:ldq,1:nb+1) real(kind=rk), intent(in) :: hh(1:ldh,1:6) #endif real(kind=rk) :: s integer(kind=ik) :: i ! equivalence(q(1,1),q_complex(1,1)) ! Safety only: #ifdef HAVE_DETAILED_TIMINGS call timer%start("kernel generic: double_hh_trafo_generic") #endif if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!' ! Calculate dot product of the two Householder vectors s = hh(2,2)*1 do i=3,nb s = s+hh(i,2)*hh(i-1,1) enddo ! Do the Householder transformations #ifndef DESPERATELY_WANT_ASSUMED_SIZE ! ! assign real data to compplex pointer ! call c_f_pointer(c_loc(q), q_complex, [size(q,dim=1)/2,size(q,dim=2)]) #endif ! Always a multiple of 4 Q-rows is transformed, even if nq is smaller do i=1,nq-8,12 #ifdef DESPERATELY_WANT_ASSUMED_SIZE call hh_trafo_kernel_12_generic(q(i,1),hh, nb, ldq, ldh, s) #else call hh_trafo_kernel_12_generic(q(i:ldq,1:nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) #endif enddo ! i > nq-8 now, i.e. 
at most 8 rows remain if(nq-i+1 > 4) then #ifdef DESPERATELY_WANT_ASSUMED_SIZE print *,"calling 8" call hh_trafo_kernel_8_generic(q(i,1),hh, nb, ldq, ldh, s) #else call hh_trafo_kernel_8_generic(q(i:ldq,1:nb+1), hh(1:ldh,1:2), nb, ldq, ldh, s) #endif else if(nq-i+1 > 0) then #ifdef DESPERATELY_WANT_ASSUMED_SIZE print *,"calling 4" call hh_trafo_kernel_4_generic(q(i,1),hh, nb, ldq, ldh, s) #else call hh_trafo_kernel_4_generic(q(i:ldq,1:+nb+1),hh(1:ldh,1:2), nb, ldq, ldh, s) #endif endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("kernel generic: double_hh_trafo_generic") #endif end subroutine double_hh_trafo_generic ! -------------------------------------------------------------------------------------------------- ! The following kernels perform the Householder transformation on Q for 12/8/4 rows. ! Please note that Q is declared complex*16 here. ! This is a hint for compilers that packed arithmetic can be used for Q ! (relevant for Intel SSE and BlueGene double hummer CPUs). ! -------------------------------------------------------------------------------------------------- subroutine hh_trafo_kernel_12_generic(q, hh, nb, ldq, ldh, s) use precision #ifdef HAVE_DETAILED_TIMINGS use timings #endif implicit none integer(kind=ik), intent(in) :: nb, ldq, ldh #ifdef DESPERATELY_WANT_ASSUMED_SIZE complex(kind=ck), intent(inout) :: q(ldq/2,*) real(kind=rk), intent(in) :: hh(ldh,*) #else real(kind=rk), intent(inout) :: q(:,:) real(kind=rk), intent(in) :: hh(ldh,2) #endif real(kind=rk), intent(in) :: s #ifdef PACK_REAL_TO_COMPLEX complex(kind=ck) :: x1, x2, x3, x4, x5, x6, y1, y2, y3, y4, y5, y6 #else real(kind=rk) :: x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, & y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12 #endif real(kind=rk) :: h1, h2, tau1, tau2 integer(kind=ik) :: i #ifdef HAVE_DETAILED_TIMINGS call timer%start("kernel generic: hh_trafo_kernel_12_generic") #endif x1 = q(1,2) x2 = q(2,2) x3 = q(3,2) x4 = q(4,2) x5 = q(5,2) x6 = q(6,2) #ifndef 
PACK_REAL_TO_COMPLEX x7 = q(7,2) x8 = q(8,2) x9 = q(9,2) x10 = q(10,2) x11 = q(11,2) x12 = q(12,2) #endif y1 = q(1 ,1) + q(1, 2)*hh(2,2) y2 = q(2 ,1) + q(2, 2)*hh(2,2) y3 = q(3 ,1) + q(3, 2)*hh(2,2) y4 = q(4 ,1) + q(4, 2)*hh(2,2) y5 = q(5 ,1) + q(5, 2)*hh(2,2) y6 = q(6 ,1) + q(6, 2)*hh(2,2) #ifndef PACK_REAL_TO_COMPLEX y7 = q(7 ,1) + q(7, 2)*hh(2,2) y8 = q(8 ,1) + q(8, 2)*hh(2,2) y9 = q(9 ,1) + q(9, 2)*hh(2,2) y10 = q(10,1) + q(10,2)*hh(2,2) y11 = q(11,1) + q(11,2)*hh(2,2) y12 = q(12,1) + q(12,2)*hh(2,2) #endif !DEC$ VECTOR ALIGNED do i=3,nb h1 = hh(i-1,1) h2 = hh(i,2) x1 = x1 + q(1, i)*h1 y1 = y1 + q(1, i)*h2 x2 = x2 + q(2, i)*h1 y2 = y2 + q(2, i)*h2 x3 = x3 + q(3, i)*h1 y3 = y3 + q(3, i)*h2 x4 = x4 + q(4, i)*h1 y4 = y4 + q(4, i)*h2 x5 = x5 + q(5, i)*h1 y5 = y5 + q(5, i)*h2 x6 = x6 + q(6, i)*h1 y6 = y6 + q(6, i)*h2 #ifndef PACK_REAL_TO_COMPLEX x7 = x7 + q(7, i)*h1 y7 = y7 + q(7, i)*h2 x8 = x8 + q(8, i)*h1 y8 = y8 + q(8, i)*h2 x9 = x9 + q(9, i)*h1 y9 = y9 + q(9, i)*h2 x10 = x10 + q(10,i)*h1 y10 = y10 + q(10,i)*h2 x11 = x11 + q(11,i)*h1 y11 = y11 + q(11,i)*h2 x12 = x12 + q(12,i)*h1 y12 = y12 + q(12,i)*h2 #endif enddo x1 = x1 + q(1,nb+1)*hh(nb,1) x2 = x2 + q(2,nb+1)*hh(nb,1) x3 = x3 + q(3,nb+1)*hh(nb,1) x4 = x4 + q(4,nb+1)*hh(nb,1) x5 = x5 + q(5,nb+1)*hh(nb,1) x6 = x6 + q(6,nb+1)*hh(nb,1) #ifndef PACK_REAL_TO_COMPLEX x7 = x7 + q(7, nb+1)*hh(nb,1) x8 = x8 + q(8, nb+1)*hh(nb,1) x9 = x9 + q(9, nb+1)*hh(nb,1) x10 = x10 + q(10,nb+1)*hh(nb,1) x11 = x11 + q(11,nb+1)*hh(nb,1) x12 = x12 + q(12,nb+1)*hh(nb,1) #endif tau1 = hh(1,1) tau2 = hh(1,2) h1 = -tau1 x1 = x1 *h1 x2 = x2 *h1 x3 = x3 *h1 x4 = x4 *h1 x5 = x5 *h1 x6 = x6 *h1 #ifndef PACK_REAL_TO_COMPLEX x7 = x7 *h1 x8 = x8 *h1 x9 = x9 *h1 x10 = x10*h1 x11 = x11*h1 x12 = x12*h1 #endif h1 = -tau2 h2 = -tau2*s y1 = y1 *h1 + x1 *h2 y2 = y2 *h1 + x2 *h2 y3 = y3 *h1 + x3 *h2 y4 = y4 *h1 + x4 *h2 y5 = y5 *h1 + x5 *h2 y6 = y6 *h1 + x6 *h2 #ifndef PACK_REAL_TO_COMPLEX y7 = y7 *h1 + x7 *h2 y8 = y8 *h1 + x8 *h2 y9 = y9 *h1 + x9 *h2 y10 
= y10*h1 + x10*h2 y11 = y11*h1 + x11*h2 y12 = y12*h1 + x12*h2 #endif q(1,1) = q(1, 1) + y1 q(2,1) = q(2, 1) + y2 q(3,1) = q(3, 1) + y3 q(4,1) = q(4, 1) + y4 q(5,1) = q(5, 1) + y5 q(6,1) = q(6, 1) + y6 #ifndef PACK_REAL_TO_COMPLEX q(7 ,1) = q(7, 1) + y7 q(8 ,1) = q(8, 1) + y8 q(9 ,1) = q(9, 1) + y9 q(10,1) = q(10,1) + y10 q(11,1) = q(11,1) + y11 q(12,1) = q(12,1) + y12 #endif q(1, 2) = q(1, 2) + x1 + y1 *hh(2,2) q(2, 2) = q(2, 2) + x2 + y2 *hh(2,2) q(3, 2) = q(3, 2) + x3 + y3 *hh(2,2) q(4, 2) = q(4, 2) + x4 + y4 *hh(2,2) q(5, 2) = q(5, 2) + x5 + y5 *hh(2,2) q(6, 2) = q(6, 2) + x6 + y6 *hh(2,2) #ifndef PACK_REAL_TO_COMPLEX q(7, 2) = q(7, 2) + x7 + y7 *hh(2,2) q(8, 2) = q(8, 2) + x8 + y8 *hh(2,2) q(9, 2) = q(9, 2) + x9 + y9 *hh(2,2) q(10,2) = q(10,2) + x10 + y10*hh(2,2) q(11,2) = q(11,2) + x11 + y11*hh(2,2) q(12,2) = q(12,2) + x12 + y12*hh(2,2) #endif !DEC$ VECTOR ALIGNED do i=3,nb h1 = hh(i-1,1) h2 = hh(i,2) q(1, i) = q(1,i) + x1 *h1 + y1 *h2 q(2, i) = q(2,i) + x2 *h1 + y2 *h2 q(3, i) = q(3,i) + x3 *h1 + y3 *h2 q(4, i) = q(4,i) + x4 *h1 + y4 *h2 q(5, i) = q(5,i) + x5 *h1 + y5 *h2 q(6, i) = q(6,i) + x6 *h1 + y6 *h2 #ifndef PACK_REAL_TO_COMPLEX q(7, i) = q(7, i) + x7 *h1 + y7 *h2 q(8, i) = q(8, i) + x8 *h1 + y8 *h2 q(9, i) = q(9, i) + x9 *h1 + y9 *h2 q(10,i) = q(10,i) + x10*h1 + y10*h2 q(11,i) = q(11,i) + x11*h1 + y11*h2 q(12,i) = q(12,i) + x12*h1 + y12*h2 #endif enddo q(1, nb+1) = q(1, nb+1) + x1 *hh(nb,1) q(2, nb+1) = q(2, nb+1) + x2 *hh(nb,1) q(3, nb+1) = q(3, nb+1) + x3 *hh(nb,1) q(4, nb+1) = q(4, nb+1) + x4 *hh(nb,1) q(5, nb+1) = q(5, nb+1) + x5 *hh(nb,1) q(6, nb+1) = q(6, nb+1) + x6 *hh(nb,1) #ifndef PACK_REAL_TO_COMPLEX q(7, nb+1) = q(7, nb+1) + x7 *hh(nb,1) q(8, nb+1) = q(8, nb+1) + x8 *hh(nb,1) q(9, nb+1) = q(9, nb+1) + x9 *hh(nb,1) q(10,nb+1) = q(10,nb+1) + x10*hh(nb,1) q(11,nb+1) = q(11,nb+1) + x11*hh(nb,1) q(12,nb+1) = q(12,nb+1) + x12*hh(nb,1) #endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("kernel generic: hh_trafo_kernel_12_generic") #endif end 
subroutine hh_trafo_kernel_12_generic

  ! --------------------------------------------------------------------------------------------------

  ! hh_trafo_kernel_8_generic
  !
  ! Updates the leading 8 rows of q (4 rows when PACK_REAL_TO_COMPLEX is defined
  ! and the work variables are complex) over columns 1 .. nb+1, using the
  ! coefficients stored in the first two columns of hh.
  !
  ! For every processed row r the routine
  !   1. accumulates
  !        x_r = q(r,2) + sum_{i=3..nb} q(r,i)*hh(i-1,1) + q(r,nb+1)*hh(nb,1)
  !        y_r = q(r,1) + q(r,2)*hh(2,2) + sum_{i=3..nb} q(r,i)*hh(i,2)
  !   2. scales them with tau1 = hh(1,1) and tau2 = hh(1,2)
  !        x_r = -tau1*x_r
  !        y_r = -tau2*y_r - tau2*s*x_r        (x_r is the already scaled value)
  !   3. adds the result back into q
  !        q(r,1)    = q(r,1)    + y_r
  !        q(r,2)    = q(r,2)    + x_r + y_r*hh(2,2)
  !        q(r,i)    = q(r,i)    + x_r*hh(i-1,1) + y_r*hh(i,2)    for i = 3 .. nb
  !        q(r,nb+1) = q(r,nb+1) + x_r*hh(nb,1)
  !
  ! Arguments
  !   q   (inout) matrix block, updated in place
  !   hh  (in)    coefficient array; rows 1 .. nb of columns 1 and 2 are read
  !   nb  (in)    upper bound of the column loops; column nb+1 of q is touched as well
  !   ldq (in)    only used to declare q in the DESPERATELY_WANT_ASSUMED_SIZE variant
  !   ldh (in)    leading dimension used to declare hh
  !   s   (in)    scalar that couples x into y in step 2
  !               NOTE(review): presumably the scalar product of the two Householder
  !               vectors computed by the caller -- confirm at the call site.
  subroutine hh_trafo_kernel_8_generic(q, hh, nb, ldq, ldh, s)

    use precision
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    implicit none

    integer(kind=ik), intent(in)    :: nb, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
    complex(kind=ck), intent(inout) :: q(ldq/2,*)
    real(kind=rk), intent(in)       :: hh(ldh,*)
#else
    real(kind=rk), intent(inout)    :: q(:,:)
    real(kind=rk), intent(in)       :: hh(ldh,2)
#endif
    real(kind=rk), intent(in)       :: s

    ! One x/y accumulator pair per processed row.
#ifdef PACK_REAL_TO_COMPLEX
    complex(kind=ck)                :: x1, x2, x3, x4, y1, y2, y3, y4
#else
    real(kind=rk)                   :: x1, x2, x3, x4, x5, x6, x7, x8, &
                                       y1, y2, y3, y4, y5, y6, y7, y8
#endif
    real(kind=rk)                   :: h1, h2, tau1, tau2
    integer(kind=ik)                :: i

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("kernel generic: hh_trafo_kernel_8_generic")
#endif

    ! Step 1: start the accumulators from columns 2 (x) and 1 + 2 (y).
    x1 = q(1,2)
    x2 = q(2,2)
    x3 = q(3,2)
    x4 = q(4,2)
#ifndef PACK_REAL_TO_COMPLEX
    x5 = q(5,2)
    x6 = q(6,2)
    x7 = q(7,2)
    x8 = q(8,2)
#endif

    y1 = q(1,1) + q(1,2)*hh(2,2)
    y2 = q(2,1) + q(2,2)*hh(2,2)
    y3 = q(3,1) + q(3,2)*hh(2,2)
    y4 = q(4,1) + q(4,2)*hh(2,2)
#ifndef PACK_REAL_TO_COMPLEX
    y5 = q(5,1) + q(5,2)*hh(2,2)
    y6 = q(6,1) + q(6,2)*hh(2,2)
    y7 = q(7,1) + q(7,2)*hh(2,2)
    y8 = q(8,1) + q(8,2)*hh(2,2)
#endif

    ! Columns 3 .. nb contribute to both accumulators; note that column 1 of hh
    ! is read one row behind column 2 (hh(i-1,1) vs. hh(i,2)).
    !DEC$ VECTOR ALIGNED
    do i=3,nb
      h1 = hh(i-1,1)
      h2 = hh(i,2)
      x1 = x1 + q(1,i)*h1
      y1 = y1 + q(1,i)*h2
      x2 = x2 + q(2,i)*h1
      y2 = y2 + q(2,i)*h2
      x3 = x3 + q(3,i)*h1
      y3 = y3 + q(3,i)*h2
      x4 = x4 + q(4,i)*h1
      y4 = y4 + q(4,i)*h2
#ifndef PACK_REAL_TO_COMPLEX
      x5 = x5 + q(5,i)*h1
      y5 = y5 + q(5,i)*h2
      x6 = x6 + q(6,i)*h1
      y6 = y6 + q(6,i)*h2
      x7 = x7 + q(7,i)*h1
      y7 = y7 + q(7,i)*h2
      x8 = x8 + q(8,i)*h1
      y8 = y8 + q(8,i)*h2
#endif
    enddo

    ! Column nb+1 only contributes to x.
    x1 = x1 + q(1,nb+1)*hh(nb,1)
    x2 = x2 + q(2,nb+1)*hh(nb,1)
    x3 = x3 + q(3,nb+1)*hh(nb,1)
    x4 = x4 + q(4,nb+1)*hh(nb,1)
#ifndef PACK_REAL_TO_COMPLEX
    x5 = x5 + q(5,nb+1)*hh(nb,1)
    x6 = x6 + q(6,nb+1)*hh(nb,1)
    x7 = x7 + q(7,nb+1)*hh(nb,1)
    x8 = x8 + q(8,nb+1)*hh(nb,1)
#endif

    ! Step 2: scale the accumulators.
    tau1 = hh(1,1)
    tau2 = hh(1,2)

    h1 = -tau1
    x1 = x1*h1
    x2 = x2*h1
    x3 = x3*h1
    x4 = x4*h1
#ifndef PACK_REAL_TO_COMPLEX
    x5 = x5*h1
    x6 = x6*h1
    x7 = x7*h1
    x8 = x8*h1
#endif

    ! h1/h2 are reused: from here on h1 = -tau2 and h2 = -tau2*s.
    h1 = -tau2
    h2 = -tau2*s
    y1 = y1*h1 + x1*h2
    y2 = y2*h1 + x2*h2
    y3 = y3*h1 + x3*h2
    y4 = y4*h1 + x4*h2
#ifndef PACK_REAL_TO_COMPLEX
    y5 = y5*h1 + x5*h2
    y6 = y6*h1 + x6*h2
    y7 = y7*h1 + x7*h2
    y8 = y8*h1 + x8*h2
#endif

    ! Step 3: add the result back into q, column by column.
    q(1,1) = q(1,1) + y1
    q(2,1) = q(2,1) + y2
    q(3,1) = q(3,1) + y3
    q(4,1) = q(4,1) + y4
#ifndef PACK_REAL_TO_COMPLEX
    q(5,1) = q(5,1) + y5
    q(6,1) = q(6,1) + y6
    q(7,1) = q(7,1) + y7
    q(8,1) = q(8,1) + y8
#endif

    q(1,2) = q(1,2) + x1 + y1*hh(2,2)
    q(2,2) = q(2,2) + x2 + y2*hh(2,2)
    q(3,2) = q(3,2) + x3 + y3*hh(2,2)
    q(4,2) = q(4,2) + x4 + y4*hh(2,2)
#ifndef PACK_REAL_TO_COMPLEX
    q(5,2) = q(5,2) + x5 + y5*hh(2,2)
    q(6,2) = q(6,2) + x6 + y6*hh(2,2)
    q(7,2) = q(7,2) + x7 + y7*hh(2,2)
    q(8,2) = q(8,2) + x8 + y8*hh(2,2)
#endif

    !DEC$ VECTOR ALIGNED
    do i=3,nb
      h1 = hh(i-1,1)
      h2 = hh(i,2)
      q(1,i) = q(1,i) + x1*h1 + y1*h2
      q(2,i) = q(2,i) + x2*h1 + y2*h2
      q(3,i) = q(3,i) + x3*h1 + y3*h2
      q(4,i) = q(4,i) + x4*h1 + y4*h2
#ifndef PACK_REAL_TO_COMPLEX
      q(5,i) = q(5,i) + x5*h1 + y5*h2
      q(6,i) = q(6,i) + x6*h1 + y6*h2
      q(7,i) = q(7,i) + x7*h1 + y7*h2
      q(8,i) = q(8,i) + x8*h1 + y8*h2
#endif
    enddo

    q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
    q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)
    q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1)
    q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1)
#ifndef PACK_REAL_TO_COMPLEX
    q(5,nb+1) = q(5,nb+1) + x5*hh(nb,1)
    q(6,nb+1) = q(6,nb+1) + x6*hh(nb,1)
    q(7,nb+1) = q(7,nb+1) + x7*hh(nb,1)
    q(8,nb+1) = q(8,nb+1) + x8*hh(nb,1)
#endif

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("kernel generic: hh_trafo_kernel_8_generic")
#endif

  end subroutine hh_trafo_kernel_8_generic

  !
--------------------------------------------------------------------------------------------------

  ! hh_trafo_kernel_4_generic
  !
  ! Updates the leading 4 rows of q (2 rows when PACK_REAL_TO_COMPLEX is defined
  ! and the work variables are complex) over columns 1 .. nb+1, using the
  ! coefficients stored in the first two columns of hh.
  !
  ! For every processed row r the routine
  !   1. accumulates
  !        x_r = q(r,2) + sum_{i=3..nb} q(r,i)*hh(i-1,1) + q(r,nb+1)*hh(nb,1)
  !        y_r = q(r,1) + q(r,2)*hh(2,2) + sum_{i=3..nb} q(r,i)*hh(i,2)
  !   2. scales them with tau1 = hh(1,1) and tau2 = hh(1,2)
  !        x_r = -tau1*x_r
  !        y_r = -tau2*y_r - tau2*s*x_r        (x_r is the already scaled value)
  !   3. adds the result back into q
  !        q(r,1)    = q(r,1)    + y_r
  !        q(r,2)    = q(r,2)    + x_r + y_r*hh(2,2)
  !        q(r,i)    = q(r,i)    + x_r*hh(i-1,1) + y_r*hh(i,2)    for i = 3 .. nb
  !        q(r,nb+1) = q(r,nb+1) + x_r*hh(nb,1)
  !
  ! Arguments
  !   q   (inout) matrix block, updated in place
  !   hh  (in)    coefficient array; rows 1 .. nb of columns 1 and 2 are read
  !   nb  (in)    upper bound of the column loops; column nb+1 of q is touched as well
  !   ldq (in)    only used to declare q in the DESPERATELY_WANT_ASSUMED_SIZE variant
  !   ldh (in)    leading dimension used to declare hh
  !   s   (in)    scalar that couples x into y in step 2
  !               NOTE(review): presumably the scalar product of the two Householder
  !               vectors computed by the caller -- confirm at the call site.
  subroutine hh_trafo_kernel_4_generic(q, hh, nb, ldq, ldh, s)

    use precision
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    implicit none

    integer(kind=ik), intent(in)    :: nb, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
    complex(kind=ck), intent(inout) :: q(ldq/2,*)
    real(kind=rk), intent(in)       :: hh(ldh,*)
#else
    real(kind=rk), intent(inout)    :: q(:,:) !q(1:ldq/2,1:nb+1)
    real(kind=rk), intent(in)       :: hh(ldh,2)
#endif
    real(kind=rk), intent(in)       :: s

    ! One x/y accumulator pair per processed row.
#ifdef PACK_REAL_TO_COMPLEX
    complex(kind=ck)                :: x1, x2, y1, y2
#else
    real(kind=rk)                   :: x1, x2, x3, x4, y1, y2, y3, y4
#endif
    real(kind=rk)                   :: h1, h2, tau1, tau2
    integer(kind=ik)                :: i

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("kernel generic: hh_trafo_kernel_4_generic")
#endif

    ! Step 1: start the accumulators from columns 2 (x) and 1 + 2 (y).
    x1 = q(1,2)
    x2 = q(2,2)
#ifndef PACK_REAL_TO_COMPLEX
    x3 = q(3,2)
    x4 = q(4,2)
#endif

    y1 = q(1,1) + q(1,2)*hh(2,2)
    y2 = q(2,1) + q(2,2)*hh(2,2)
#ifndef PACK_REAL_TO_COMPLEX
    y3 = q(3,1) + q(3,2)*hh(2,2)
    y4 = q(4,1) + q(4,2)*hh(2,2)
#endif

    ! Columns 3 .. nb contribute to both accumulators; column 1 of hh is read
    ! one row behind column 2 (hh(i-1,1) vs. hh(i,2)).
    !DEC$ VECTOR ALIGNED
    do i=3,nb
      h1 = hh(i-1,1)
      h2 = hh(i,2)
      x1 = x1 + q(1,i)*h1
      y1 = y1 + q(1,i)*h2
      x2 = x2 + q(2,i)*h1
      y2 = y2 + q(2,i)*h2
#ifndef PACK_REAL_TO_COMPLEX
      x3 = x3 + q(3,i)*h1
      y3 = y3 + q(3,i)*h2
      x4 = x4 + q(4,i)*h1
      y4 = y4 + q(4,i)*h2
#endif
    enddo

    ! Column nb+1 only contributes to x.
    x1 = x1 + q(1,nb+1)*hh(nb,1)
    x2 = x2 + q(2,nb+1)*hh(nb,1)
#ifndef PACK_REAL_TO_COMPLEX
    x3 = x3 + q(3,nb+1)*hh(nb,1)
    x4 = x4 + q(4,nb+1)*hh(nb,1)
#endif

    ! Step 2: scale the accumulators.
    tau1 = hh(1,1)
    tau2 = hh(1,2)

    h1 = -tau1
    x1 = x1*h1
    x2 = x2*h1
#ifndef PACK_REAL_TO_COMPLEX
    x3 = x3*h1
    x4 = x4*h1
#endif

    ! h1/h2 are reused: from here on h1 = -tau2 and h2 = -tau2*s.
    h1 = -tau2
    h2 = -tau2*s
    y1 = y1*h1 + x1*h2
    y2 = y2*h1 + x2*h2
#ifndef PACK_REAL_TO_COMPLEX
    y3 = y3*h1 + x3*h2
    y4 = y4*h1 + x4*h2
#endif

    ! Step 3: add the result back into q, column by column.
    q(1,1) = q(1,1) + y1
    q(2,1) = q(2,1) + y2
#ifndef PACK_REAL_TO_COMPLEX
    q(3,1) = q(3,1) + y3
    q(4,1) = q(4,1) + y4
#endif

    q(1,2) = q(1,2) + x1 + y1*hh(2,2)
    q(2,2) = q(2,2) + x2 + y2*hh(2,2)
#ifndef PACK_REAL_TO_COMPLEX
    q(3,2) = q(3,2) + x3 + y3*hh(2,2)
    q(4,2) = q(4,2) + x4 + y4*hh(2,2)
#endif

    !DEC$ VECTOR ALIGNED
    do i=3,nb
      h1 = hh(i-1,1)
      h2 = hh(i,2)
      q(1,i) = q(1,i) + x1*h1 + y1*h2
      q(2,i) = q(2,i) + x2*h1 + y2*h2
#ifndef PACK_REAL_TO_COMPLEX
      q(3,i) = q(3,i) + x3*h1 + y3*h2
      q(4,i) = q(4,i) + x4*h1 + y4*h2
#endif
    enddo

    q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
    q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)
#ifndef PACK_REAL_TO_COMPLEX
    q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1)
    q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1)
#endif

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("kernel generic: hh_trafo_kernel_4_generic")
#endif

  end subroutine hh_trafo_kernel_4_generic

! The kernels above are only wrapped in a module when explicit-shape/assumed-shape
! interfaces are used, i.e. when DESPERATELY_WANT_ASSUMED_SIZE is not defined.
#ifndef DESPERATELY_WANT_ASSUMED_SIZE
end module real_generic_kernel
#endif
! --------------------------------------------------------------------------------------------------
elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.c0000644000312500001440000004740712717516040022311 00000000000000// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
// // More information can be found here: // http://elpa.mpcdf.mpg.de/ // // ELPA is free software: you can redistribute it and/or modify // it under the terms of the version 3 of the license of the // GNU Lesser General Public License as published by the Free // Software Foundation. // // ELPA is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with ELPA. If not, see <http://www.gnu.org/licenses/> // // ELPA reflects a substantial effort on the part of the original // ELPA consortium, and we ask you to respect the spirit of the // license that we chose: i.e., please contribute any changes you // may have back to the original ELPA library distribution, and keep // any derivatives of ELPA under the same license that we chose for // the original distribution, the GNU Lesser General Public License. // // // -------------------------------------------------------------------------------------------------- // // This file contains the compute intensive kernels for the Householder transformations. // It should be compiled with the highest possible optimization level. // // On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 // On Intel Sandy Bridge use -O3 -mavx // // Copyright of the original code rests with the authors inside the ELPA // consortium. The copyright of any additional modifications shall rest // with their original authors, but shall adhere to the licensing terms // distributed along with the original code in the file "COPYING".
// // Author: Alexander Heinecke (alexander.heinecke@mytum.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // -------------------------------------------------------------------------------------------------- #include "config-f90.h" #include #include #define __forceinline __attribute__((always_inline)) #ifdef HAVE_AVX2 #ifdef __FMA4__ #define __ELPA_USE_FMA__ #define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c) #define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c) #endif #ifdef __AVX2__ #define __ELPA_USE_FMA__ #define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c) #define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c) #endif #endif //Forward declaration static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq); /* !f>#ifdef HAVE_AVX !f> interface !f> subroutine single_hh_trafo_complex_avx_avx2_1hv(q, hh, pnb, pnq, pldq) bind(C, name="single_hh_trafo_complex_avx_avx2_1hv") !f> use, intrinsic :: iso_c_binding !f> integer(kind=c_int) :: pnb, pnq, pldq !f> complex(kind=c_double) :: q(*) !f> complex(kind=c_double) :: hh(pnb,2) !f> end subroutine !f> end interface !f>#endif */ void single_hh_trafo_complex_avx_avx2_1hv(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq) { int i; int nb = *pnb; int nq = *pldq; int ldq = *pldq; //int ldh = *pldh; for (i = 0; i < nq-8; i+=12) { hh_trafo_complex_kernel_12_AVX_1hv(&q[i], hh, nb, ldq); } if (nq-i > 4) { hh_trafo_complex_kernel_8_AVX_1hv(&q[i], hh, nb, ldq); } else if (nq-i > 0) { hh_trafo_complex_kernel_4_AVX_1hv(&q[i], hh, nb, ldq); } } static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq) 
{
    // View both complex arrays as interleaved (real, imag) doubles; one
    // __m256d register therefore holds two complex numbers.
    double* q_dbl = (double*)q;
    double* hh_dbl = (double*)hh;

    __m256d x1, x2, x3, x4, x5, x6;
    __m256d q1, q2, q3, q4, q5, q6;
    __m256d h1_real, h1_imag;
    __m256d tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    int i=0;

    // Sign-bit mask for all four lanes; XOR with it negates a register.
    __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);

    // x := column 0 of the 12-row block (6 registers x 2 complex numbers).
    // _mm256_load_pd is the aligned load, so q must be 32-byte aligned.
    x1 = _mm256_load_pd(&q_dbl[0]);
    x2 = _mm256_load_pd(&q_dbl[4]);
    x3 = _mm256_load_pd(&q_dbl[8]);
    x4 = _mm256_load_pd(&q_dbl[12]);
    x5 = _mm256_load_pd(&q_dbl[16]);
    x6 = _mm256_load_pd(&q_dbl[20]);

    // x += conj(hh[i]) * q(:,i) for columns i = 1 .. nb-1.
    // Complex product idiom: tmp = h_imag * q, shuffle 0x5 swaps real/imag in
    // each pair, and addsub / FMSUBADD merges it with h_real * q.
    for (i = 1; i < nb; i++)
    {
        h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]);
        h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__
        // conjugate
        // (the FMA build keeps h1_imag as is and gets the conjugate product
        // from FMSUBADD instead)
        h1_imag = _mm256_xor_pd(h1_imag, sign);
#endif

        // Column i starts 2*i*ldq doubles after column 0.
        q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
        q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
        q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
        q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);
        q5 = _mm256_load_pd(&q_dbl[(2*i*ldq)+16]);
        q6 = _mm256_load_pd(&q_dbl[(2*i*ldq)+20]);

        tmp1 = _mm256_mul_pd(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
        x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
        x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
        tmp2 = _mm256_mul_pd(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
        x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
        x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
        tmp3 = _mm256_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
        x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
        x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
        tmp4 = _mm256_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
        x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
        x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
        tmp5 = _mm256_mul_pd(h1_imag, q5);
#ifdef __ELPA_USE_FMA__
        x5 = _mm256_add_pd(x5, _mm256_FMSUBADD_pd(h1_real, q5, _mm256_shuffle_pd(tmp5, tmp5, 0x5)));
#else
        x5 = _mm256_add_pd(x5, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q5), _mm256_shuffle_pd(tmp5, tmp5, 0x5)));
#endif
        tmp6 = _mm256_mul_pd(h1_imag, q6);
#ifdef __ELPA_USE_FMA__
        x6 = _mm256_add_pd(x6, _mm256_FMSUBADD_pd(h1_real, q6, _mm256_shuffle_pd(tmp6, tmp6, 0x5)));
#else
        x6 = _mm256_add_pd(x6, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q6), _mm256_shuffle_pd(tmp6, tmp6, 0x5)));
#endif
    }

    // x := -hh[0] * x  (both parts of hh[0] are negated, then a plain complex
    // product is formed).
    h1_real = _mm256_broadcast_sd(&hh_dbl[0]);
    h1_imag = _mm256_broadcast_sd(&hh_dbl[1]);
    h1_real = _mm256_xor_pd(h1_real, sign);
    h1_imag = _mm256_xor_pd(h1_imag, sign);

    tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
    x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#else
    x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#endif
    tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
    x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#else
    x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#endif
    tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
    x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#else
    x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#endif
    tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
    x4 = _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#else
    x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#endif
    tmp5 = _mm256_mul_pd(h1_imag, x5);
#ifdef __ELPA_USE_FMA__
    x5 = _mm256_FMADDSUB_pd(h1_real, x5, _mm256_shuffle_pd(tmp5, tmp5, 0x5));
#else
    x5 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x5), _mm256_shuffle_pd(tmp5, tmp5, 0x5));
#endif
    tmp6 = _mm256_mul_pd(h1_imag, x6);
#ifdef __ELPA_USE_FMA__
    x6 = _mm256_FMADDSUB_pd(h1_real, x6, _mm256_shuffle_pd(tmp6, tmp6, 0x5));
#else
    x6 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x6), _mm256_shuffle_pd(tmp6, tmp6, 0x5));
#endif

    // q(:,0) += x
    q1 = _mm256_load_pd(&q_dbl[0]);
    q2 = _mm256_load_pd(&q_dbl[4]);
    q3 = _mm256_load_pd(&q_dbl[8]);
    q4 = _mm256_load_pd(&q_dbl[12]);
    q5 = _mm256_load_pd(&q_dbl[16]);
    q6 = _mm256_load_pd(&q_dbl[20]);

    q1 = _mm256_add_pd(q1, x1);
    q2 = _mm256_add_pd(q2, x2);
    q3 = _mm256_add_pd(q3, x3);
    q4 = _mm256_add_pd(q4, x4);
    q5 = _mm256_add_pd(q5, x5);
    q6 = _mm256_add_pd(q6, x6);

    _mm256_store_pd(&q_dbl[0], q1);
    _mm256_store_pd(&q_dbl[4], q2);
    _mm256_store_pd(&q_dbl[8], q3);
    _mm256_store_pd(&q_dbl[12], q4);
    _mm256_store_pd(&q_dbl[16], q5);
    _mm256_store_pd(&q_dbl[20], q6);

    // q(:,i) += hh[i] * x for columns i = 1 .. nb-1 (no conjugation here).
    for (i = 1; i < nb; i++)
    {
        h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]);
        h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]);

        q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
        q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
        q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
        q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);
        q5 = _mm256_load_pd(&q_dbl[(2*i*ldq)+16]);
        q6 = _mm256_load_pd(&q_dbl[(2*i*ldq)+20]);

        tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
        q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
        q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
        tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
        q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
        q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
        tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
        q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
        q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
        tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
        q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
        q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
        tmp5 = _mm256_mul_pd(h1_imag, x5);
#ifdef __ELPA_USE_FMA__
        q5 = _mm256_add_pd(q5, _mm256_FMADDSUB_pd(h1_real, x5, _mm256_shuffle_pd(tmp5, tmp5, 0x5)));
#else
        q5 = _mm256_add_pd(q5, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x5), _mm256_shuffle_pd(tmp5, tmp5, 0x5)));
#endif
        tmp6 = _mm256_mul_pd(h1_imag, x6);
#ifdef __ELPA_USE_FMA__
        q6 = _mm256_add_pd(q6, _mm256_FMADDSUB_pd(h1_real, x6, _mm256_shuffle_pd(tmp6, tmp6, 0x5)));
#else
        q6 = _mm256_add_pd(q6, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x6), _mm256_shuffle_pd(tmp6, tmp6, 0x5)));
#endif

        _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1);
        _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
        _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3);
        _mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4);
        _mm256_store_pd(&q_dbl[(2*i*ldq)+16], q5);
        _mm256_store_pd(&q_dbl[(2*i*ldq)+20], q6);
    }
}

/*
 * 8-row kernel: same computation on 8 consecutive complex rows of q
 * (4 registers x 2 complex numbers).
 */
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq)
{
    // Interleaved (real, imag) double views of the complex arrays.
    double* q_dbl = (double*)q;
    double* hh_dbl = (double*)hh;

    __m256d x1, x2, x3, x4;
    __m256d q1, q2, q3, q4;
    __m256d h1_real, h1_imag;
    __m256d tmp1, tmp2, tmp3, tmp4;
    int i=0;

    // Sign-bit mask for all four lanes; XOR with it negates a register.
    __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);

    // x := column 0 of the 8-row block.
    x1 = _mm256_load_pd(&q_dbl[0]);
    x2 = _mm256_load_pd(&q_dbl[4]);
    x3 = _mm256_load_pd(&q_dbl[8]);
    x4 = _mm256_load_pd(&q_dbl[12]);

    // x += conj(hh[i]) * q(:,i) for columns i = 1 .. nb-1.
    for (i = 1; i < nb; i++)
    {
        h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]);
        h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__
        // conjugate
        h1_imag = _mm256_xor_pd(h1_imag, sign);
#endif

        q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
        q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
        q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
        q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);

        tmp1 = _mm256_mul_pd(h1_imag, q1);
#ifdef
__ELPA_USE_FMA__
        // Accumulate x += conj(hh[i]) * q(:,i): tmp holds h_imag * q, the
        // shuffle (0x5) swaps real/imag in each pair, and addsub / FMSUBADD
        // merges it with h_real * q.
        x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
        x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
        tmp2 = _mm256_mul_pd(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
        x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
        x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
        tmp3 = _mm256_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
        x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
        x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
        tmp4 = _mm256_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
        x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
        x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
    }

    // x := -hh[0] * x  (both parts of hh[0] are negated, then a plain complex
    // product is formed).
    h1_real = _mm256_broadcast_sd(&hh_dbl[0]);
    h1_imag = _mm256_broadcast_sd(&hh_dbl[1]);
    h1_real = _mm256_xor_pd(h1_real, sign);
    h1_imag = _mm256_xor_pd(h1_imag, sign);

    tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
    x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#else
    x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#endif
    tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
    x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#else
    x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#endif
    tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
    x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#else
    x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#endif
    tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
    x4 = _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#else
    x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#endif

    // q(:,0) += x
    q1 = _mm256_load_pd(&q_dbl[0]);
    q2 = _mm256_load_pd(&q_dbl[4]);
    q3 = _mm256_load_pd(&q_dbl[8]);
    q4 = _mm256_load_pd(&q_dbl[12]);

    q1 = _mm256_add_pd(q1, x1);
    q2 = _mm256_add_pd(q2, x2);
    q3 = _mm256_add_pd(q3, x3);
    q4 = _mm256_add_pd(q4, x4);

    _mm256_store_pd(&q_dbl[0], q1);
    _mm256_store_pd(&q_dbl[4], q2);
    _mm256_store_pd(&q_dbl[8], q3);
    _mm256_store_pd(&q_dbl[12], q4);

    // q(:,i) += hh[i] * x for columns i = 1 .. nb-1 (no conjugation here).
    for (i = 1; i < nb; i++)
    {
        h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]);
        h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]);

        // Column i starts 2*i*ldq doubles after column 0.
        q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
        q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
        q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
        q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);

        tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
        q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
        q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
        tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
        q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
        q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
        tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
        q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
        q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
        tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
        q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
        q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

        _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1);
        _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
        _mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3);
        _mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4);
    }
}

/*
 * 4-row kernel: same computation on 4 consecutive complex rows of q
 * (2 registers x 2 complex numbers).
 */
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq)
{
    // Interleaved (real, imag) double views of the complex arrays.
    double* q_dbl = (double*)q;
    double* hh_dbl = (double*)hh;

    __m256d x1, x2;
    __m256d q1, q2;
    __m256d h1_real, h1_imag;
    __m256d tmp1, tmp2;
    int i=0;

    // Sign-bit mask for all four lanes; XOR with it negates a register.
    __m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);

    // x := column 0 of the 4-row block.
    x1 = _mm256_load_pd(&q_dbl[0]);
    x2 = _mm256_load_pd(&q_dbl[4]);

    // x += conj(hh[i]) * q(:,i) for columns i = 1 .. nb-1.
    for (i = 1; i < nb; i++)
    {
        h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]);
        h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__
        // conjugate
        h1_imag = _mm256_xor_pd(h1_imag, sign);
#endif

        q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
        q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);

        tmp1 = _mm256_mul_pd(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
        x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
        x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
        tmp2 = _mm256_mul_pd(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
        x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
        x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
    }

    // x := -hh[0] * x
    h1_real = _mm256_broadcast_sd(&hh_dbl[0]);
    h1_imag = _mm256_broadcast_sd(&hh_dbl[1]);
    h1_real = _mm256_xor_pd(h1_real, sign);
    h1_imag = _mm256_xor_pd(h1_imag, sign);

    tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
    x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#else
    x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#endif
    tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
    x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#else
    x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2,
0x5));
#endif

    // q(:,0) += x  (x now holds -hh[0] times the accumulated sum).
    q1 = _mm256_load_pd(&q_dbl[0]);
    q2 = _mm256_load_pd(&q_dbl[4]);

    q1 = _mm256_add_pd(q1, x1);
    q2 = _mm256_add_pd(q2, x2);

    _mm256_store_pd(&q_dbl[0], q1);
    _mm256_store_pd(&q_dbl[4], q2);

    // q(:,i) += hh[i] * x for columns i = 1 .. nb-1.  tmp holds h_imag * x,
    // the shuffle (0x5) swaps real/imag in each pair, and addsub / FMADDSUB
    // merges it with h_real * x into a plain complex product.
    for (i = 1; i < nb; i++)
    {
        h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]);
        h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]);

        // Column i starts 2*i*ldq doubles after column 0.
        q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
        q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);

        tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
        q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
        q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
        tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
        q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
        q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif

        _mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1);
        _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
    }
}
elpa-2016.05.001/src/elpa2_kernels/mod_fortran_interfaces.F900000644000312500001440000000020412717402663020340 00000000000000#include "config-f90.h"
! Module whose whole body is the included interface header; it declares
! nothing else.
module kernel_interfaces
  implicit none
#include "elpa/elpa_generated_fortran_interfaces.h"
end module
elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c0000644000312500001440000004465712717516040020710 00000000000000// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt.
Theorie, // - Max-Planck-Institut für Mathematik in den Naturwissenschaften, // Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, // and // - IBM Deutschland GmbH // // This particular source code file contains additions, changes and // enhancements authored by Intel Corporation which is not part of // the ELPA consortium. // // More information can be found here: // http://elpa.mpcdf.mpg.de/ // // ELPA is free software: you can redistribute it and/or modify // it under the terms of the version 3 of the license of the // GNU Lesser General Public License as published by the Free // Software Foundation. // // ELPA is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with ELPA. If not, see <http://www.gnu.org/licenses/> // // ELPA reflects a substantial effort on the part of the original // ELPA consortium, and we ask you to respect the spirit of the // license that we chose: i.e., please contribute any changes you // may have back to the original ELPA library distribution, and keep // any derivatives of ELPA under the same license that we chose for // the original distribution, the GNU Lesser General Public License. // // // -------------------------------------------------------------------------------------------------- // // This file contains the compute intensive kernels for the Householder transformations. // It should be compiled with the highest possible optimization level. // // On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 // On Intel Sandy Bridge use -O3 -mavx // // Copyright of the original code rests with the authors inside the ELPA // consortium.
The copyright of any additional modifications shall rest // with their original authors, but shall adhere to the licensing terms // distributed along with the original code in the file "COPYING". // // Author: Alexander Heinecke (alexander.heinecke@mytum.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // -------------------------------------------------------------------------------------------------- #include "config-f90.h" #include #define __forceinline __attribute__((always_inline)) static #ifdef HAVE_SSE_INTRINSICS #undef __AVX__ #endif //Forward declaration __forceinline void hh_trafo_kernel_4_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); __forceinline void hh_trafo_kernel_12_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); /* !f>#ifdef HAVE_SSE_INTRINSICS !f> interface !f> subroutine double_hh_trafo_real_sse_2hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="double_hh_trafo_real_sse_2hv") !f> use, intrinsic :: iso_c_binding !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh !f> real(kind=c_double) :: q(*) !f> real(kind=c_double) :: hh(pnb,6) !f> end subroutine !f> end interface !f>#endif */ void double_hh_trafo_real_sse_2hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); void double_hh_trafo_real_sse_2hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; int nq = *pldq; int ldq = *pldq; int ldh = *pldh; // calculating scalar product to compute // 2 householder vectors simultaneously double s = hh[(ldh)+1]*1.0; #pragma ivdep for (i = 2; i < nb; i++) { s += hh[i-1] * hh[(i+ldh)]; } // Production level kernel calls with padding for (i = 0; i < nq-8; i+=12) { hh_trafo_kernel_12_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); } if (nq == i) { return; } else { if (nq-i > 4) { hh_trafo_kernel_8_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); } 
else if (nq-i > 0) { hh_trafo_kernel_4_SSE_2hv(&q[i], hh, nb, ldq, ldh, s); } } } /** * Unrolled kernel that computes * 12 rows of Q simultaneously, a * matrix vector product with two householder * vectors + a rank 2 update is performed */ __forceinline void hh_trafo_kernel_12_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [12 x nb+1] * hh // hh contains two householder vectors, with offset 1 ///////////////////////////////////////////////////// int i; // Needed bit mask for floating point sign flip __m64 smallsign = _mm_set_pi32(0x80000000, 0x00000000); __m128d sign = (__m128d)_mm_set1_epi64(smallsign); __m128d x1 = _mm_load_pd(&q[ldq]); __m128d x2 = _mm_load_pd(&q[ldq+2]); __m128d x3 = _mm_load_pd(&q[ldq+4]); __m128d x4 = _mm_load_pd(&q[ldq+6]); __m128d x5 = _mm_load_pd(&q[ldq+8]); __m128d x6 = _mm_load_pd(&q[ldq+10]); __m128d h1 = _mm_loaddup_pd(&hh[ldh+1]); __m128d h2; __m128d q1 = _mm_load_pd(q); __m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); __m128d q2 = _mm_load_pd(&q[2]); __m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); __m128d q3 = _mm_load_pd(&q[4]); __m128d y3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); __m128d q4 = _mm_load_pd(&q[6]); __m128d y4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); __m128d q5 = _mm_load_pd(&q[8]); __m128d y5 = _mm_add_pd(q5, _mm_mul_pd(x5, h1)); __m128d q6 = _mm_load_pd(&q[10]); __m128d y6 = _mm_add_pd(q6, _mm_mul_pd(x6, h1)); for(i = 2; i < nb; i++) { h1 = _mm_loaddup_pd(&hh[i-1]); h2 = _mm_loaddup_pd(&hh[ldh+i]); q1 = _mm_load_pd(&q[i*ldq]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); q2 = _mm_load_pd(&q[(i*ldq)+2]); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); q3 = _mm_load_pd(&q[(i*ldq)+4]); x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); q4 = _mm_load_pd(&q[(i*ldq)+6]); x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); y4 = 
_mm_add_pd(y4, _mm_mul_pd(q4,h2)); q5 = _mm_load_pd(&q[(i*ldq)+8]); x5 = _mm_add_pd(x5, _mm_mul_pd(q5,h1)); y5 = _mm_add_pd(y5, _mm_mul_pd(q5,h2)); q6 = _mm_load_pd(&q[(i*ldq)+10]); x6 = _mm_add_pd(x6, _mm_mul_pd(q6,h1)); y6 = _mm_add_pd(y6, _mm_mul_pd(q6,h2)); } h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[nb*ldq]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); q2 = _mm_load_pd(&q[(nb*ldq)+2]); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); q3 = _mm_load_pd(&q[(nb*ldq)+4]); x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); q4 = _mm_load_pd(&q[(nb*ldq)+6]); x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); q5 = _mm_load_pd(&q[(nb*ldq)+8]); x5 = _mm_add_pd(x5, _mm_mul_pd(q5,h1)); q6 = _mm_load_pd(&q[(nb*ldq)+10]); x6 = _mm_add_pd(x6, _mm_mul_pd(q6,h1)); ///////////////////////////////////////////////////// // Rank-2 update of Q [12 x nb+1] ///////////////////////////////////////////////////// __m128d tau1 = _mm_loaddup_pd(hh); __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); __m128d vs = _mm_loaddup_pd(&s); h1 = _mm_xor_pd(tau1, sign); x1 = _mm_mul_pd(x1, h1); x2 = _mm_mul_pd(x2, h1); x3 = _mm_mul_pd(x3, h1); x4 = _mm_mul_pd(x4, h1); x5 = _mm_mul_pd(x5, h1); x6 = _mm_mul_pd(x6, h1); h1 = _mm_xor_pd(tau2, sign); h2 = _mm_mul_pd(h1, vs); y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); y3 = _mm_add_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2)); y4 = _mm_add_pd(_mm_mul_pd(y4,h1), _mm_mul_pd(x4,h2)); y5 = _mm_add_pd(_mm_mul_pd(y5,h1), _mm_mul_pd(x5,h2)); y6 = _mm_add_pd(_mm_mul_pd(y6,h1), _mm_mul_pd(x6,h2)); q1 = _mm_load_pd(q); q1 = _mm_add_pd(q1, y1); _mm_store_pd(q,q1); q2 = _mm_load_pd(&q[2]); q2 = _mm_add_pd(q2, y2); _mm_store_pd(&q[2],q2); q3 = _mm_load_pd(&q[4]); q3 = _mm_add_pd(q3, y3); _mm_store_pd(&q[4],q3); q4 = _mm_load_pd(&q[6]); q4 = _mm_add_pd(q4, y4); _mm_store_pd(&q[6],q4); q5 = _mm_load_pd(&q[8]); q5 = _mm_add_pd(q5, y5); _mm_store_pd(&q[8],q5); q6 = _mm_load_pd(&q[10]); q6 = _mm_add_pd(q6, y6); _mm_store_pd(&q[10],q6); h2 = 
_mm_loaddup_pd(&hh[ldh+1]); q1 = _mm_load_pd(&q[ldq]); q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2))); _mm_store_pd(&q[ldq],q1); q2 = _mm_load_pd(&q[ldq+2]); q2 = _mm_add_pd(q2, _mm_add_pd(x2, _mm_mul_pd(y2, h2))); _mm_store_pd(&q[ldq+2],q2); q3 = _mm_load_pd(&q[ldq+4]); q3 = _mm_add_pd(q3, _mm_add_pd(x3, _mm_mul_pd(y3, h2))); _mm_store_pd(&q[ldq+4],q3); q4 = _mm_load_pd(&q[ldq+6]); q4 = _mm_add_pd(q4, _mm_add_pd(x4, _mm_mul_pd(y4, h2))); _mm_store_pd(&q[ldq+6],q4); q5 = _mm_load_pd(&q[ldq+8]); q5 = _mm_add_pd(q5, _mm_add_pd(x5, _mm_mul_pd(y5, h2))); _mm_store_pd(&q[ldq+8],q5); q6 = _mm_load_pd(&q[ldq+10]); q6 = _mm_add_pd(q6, _mm_add_pd(x6, _mm_mul_pd(y6, h2))); _mm_store_pd(&q[ldq+10],q6); for (i = 2; i < nb; i++) { h1 = _mm_loaddup_pd(&hh[i-1]); h2 = _mm_loaddup_pd(&hh[ldh+i]); q1 = _mm_load_pd(&q[i*ldq]); q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))); _mm_store_pd(&q[i*ldq],q1); q2 = _mm_load_pd(&q[(i*ldq)+2]); q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2))); _mm_store_pd(&q[(i*ldq)+2],q2); q3 = _mm_load_pd(&q[(i*ldq)+4]); q3 = _mm_add_pd(q3, _mm_add_pd(_mm_mul_pd(x3,h1), _mm_mul_pd(y3, h2))); _mm_store_pd(&q[(i*ldq)+4],q3); q4 = _mm_load_pd(&q[(i*ldq)+6]); q4 = _mm_add_pd(q4, _mm_add_pd(_mm_mul_pd(x4,h1), _mm_mul_pd(y4, h2))); _mm_store_pd(&q[(i*ldq)+6],q4); q5 = _mm_load_pd(&q[(i*ldq)+8]); q5 = _mm_add_pd(q5, _mm_add_pd(_mm_mul_pd(x5,h1), _mm_mul_pd(y5, h2))); _mm_store_pd(&q[(i*ldq)+8],q5); q6 = _mm_load_pd(&q[(i*ldq)+10]); q6 = _mm_add_pd(q6, _mm_add_pd(_mm_mul_pd(x6,h1), _mm_mul_pd(y6, h2))); _mm_store_pd(&q[(i*ldq)+10],q6); } h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[nb*ldq]); q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); _mm_store_pd(&q[nb*ldq],q1); q2 = _mm_load_pd(&q[(nb*ldq)+2]); q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); _mm_store_pd(&q[(nb*ldq)+2],q2); q3 = _mm_load_pd(&q[(nb*ldq)+4]); q3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); _mm_store_pd(&q[(nb*ldq)+4],q3); q4 = 
_mm_load_pd(&q[(nb*ldq)+6]); q4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); _mm_store_pd(&q[(nb*ldq)+6],q4); q5 = _mm_load_pd(&q[(nb*ldq)+8]); q5 = _mm_add_pd(q5, _mm_mul_pd(x5, h1)); _mm_store_pd(&q[(nb*ldq)+8],q5); q6 = _mm_load_pd(&q[(nb*ldq)+10]); q6 = _mm_add_pd(q6, _mm_mul_pd(x6, h1)); _mm_store_pd(&q[(nb*ldq)+10],q6); } /** * Unrolled kernel that computes * 8 rows of Q simultaneously, a * matrix vector product with two householder * vectors + a rank 2 update is performed */ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [8 x nb+1] * hh // hh contains two householder vectors, with offset 1 ///////////////////////////////////////////////////// int i; // Needed bit mask for floating point sign flip __m64 smallsign = _mm_set_pi32(0x80000000, 0x00000000); __m128d sign = (__m128d)_mm_set1_epi64(smallsign); __m128d x1 = _mm_load_pd(&q[ldq]); __m128d x2 = _mm_load_pd(&q[ldq+2]); __m128d x3 = _mm_load_pd(&q[ldq+4]); __m128d x4 = _mm_load_pd(&q[ldq+6]); __m128d h1 = _mm_loaddup_pd(&hh[ldh+1]); __m128d h2; __m128d q1 = _mm_load_pd(q); __m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); __m128d q2 = _mm_load_pd(&q[2]); __m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); __m128d q3 = _mm_load_pd(&q[4]); __m128d y3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); __m128d q4 = _mm_load_pd(&q[6]); __m128d y4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); for(i = 2; i < nb; i++) { h1 = _mm_loaddup_pd(&hh[i-1]); h2 = _mm_loaddup_pd(&hh[ldh+i]); q1 = _mm_load_pd(&q[i*ldq]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2)); q2 = _mm_load_pd(&q[(i*ldq)+2]); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); q3 = _mm_load_pd(&q[(i*ldq)+4]); x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); y3 = _mm_add_pd(y3, _mm_mul_pd(q3,h2)); q4 = _mm_load_pd(&q[(i*ldq)+6]); x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); y4 = 
_mm_add_pd(y4, _mm_mul_pd(q4,h2)); } h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[nb*ldq]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); q2 = _mm_load_pd(&q[(nb*ldq)+2]); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); q3 = _mm_load_pd(&q[(nb*ldq)+4]); x3 = _mm_add_pd(x3, _mm_mul_pd(q3,h1)); q4 = _mm_load_pd(&q[(nb*ldq)+6]); x4 = _mm_add_pd(x4, _mm_mul_pd(q4,h1)); ///////////////////////////////////////////////////// // Rank-2 update of Q [8 x nb+1] ///////////////////////////////////////////////////// __m128d tau1 = _mm_loaddup_pd(hh); __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); __m128d vs = _mm_loaddup_pd(&s); h1 = _mm_xor_pd(tau1, sign); x1 = _mm_mul_pd(x1, h1); x2 = _mm_mul_pd(x2, h1); x3 = _mm_mul_pd(x3, h1); x4 = _mm_mul_pd(x4, h1); h1 = _mm_xor_pd(tau2, sign); h2 = _mm_mul_pd(h1, vs); y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); y3 = _mm_add_pd(_mm_mul_pd(y3,h1), _mm_mul_pd(x3,h2)); y4 = _mm_add_pd(_mm_mul_pd(y4,h1), _mm_mul_pd(x4,h2)); q1 = _mm_load_pd(q); q1 = _mm_add_pd(q1, y1); _mm_store_pd(q,q1); q2 = _mm_load_pd(&q[2]); q2 = _mm_add_pd(q2, y2); _mm_store_pd(&q[2],q2); q3 = _mm_load_pd(&q[4]); q3 = _mm_add_pd(q3, y3); _mm_store_pd(&q[4],q3); q4 = _mm_load_pd(&q[6]); q4 = _mm_add_pd(q4, y4); _mm_store_pd(&q[6],q4); h2 = _mm_loaddup_pd(&hh[ldh+1]); q1 = _mm_load_pd(&q[ldq]); q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2))); _mm_store_pd(&q[ldq],q1); q2 = _mm_load_pd(&q[ldq+2]); q2 = _mm_add_pd(q2, _mm_add_pd(x2, _mm_mul_pd(y2, h2))); _mm_store_pd(&q[ldq+2],q2); q3 = _mm_load_pd(&q[ldq+4]); q3 = _mm_add_pd(q3, _mm_add_pd(x3, _mm_mul_pd(y3, h2))); _mm_store_pd(&q[ldq+4],q3); q4 = _mm_load_pd(&q[ldq+6]); q4 = _mm_add_pd(q4, _mm_add_pd(x4, _mm_mul_pd(y4, h2))); _mm_store_pd(&q[ldq+6],q4); for (i = 2; i < nb; i++) { h1 = _mm_loaddup_pd(&hh[i-1]); h2 = _mm_loaddup_pd(&hh[ldh+i]); q1 = _mm_load_pd(&q[i*ldq]); q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))); 
_mm_store_pd(&q[i*ldq],q1); q2 = _mm_load_pd(&q[(i*ldq)+2]); q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2))); _mm_store_pd(&q[(i*ldq)+2],q2); q3 = _mm_load_pd(&q[(i*ldq)+4]); q3 = _mm_add_pd(q3, _mm_add_pd(_mm_mul_pd(x3,h1), _mm_mul_pd(y3, h2))); _mm_store_pd(&q[(i*ldq)+4],q3); q4 = _mm_load_pd(&q[(i*ldq)+6]); q4 = _mm_add_pd(q4, _mm_add_pd(_mm_mul_pd(x4,h1), _mm_mul_pd(y4, h2))); _mm_store_pd(&q[(i*ldq)+6],q4); } h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[nb*ldq]); q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); _mm_store_pd(&q[nb*ldq],q1); q2 = _mm_load_pd(&q[(nb*ldq)+2]); q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); _mm_store_pd(&q[(nb*ldq)+2],q2); q3 = _mm_load_pd(&q[(nb*ldq)+4]); q3 = _mm_add_pd(q3, _mm_mul_pd(x3, h1)); _mm_store_pd(&q[(nb*ldq)+4],q3); q4 = _mm_load_pd(&q[(nb*ldq)+6]); q4 = _mm_add_pd(q4, _mm_mul_pd(x4, h1)); _mm_store_pd(&q[(nb*ldq)+6],q4); } /** * Unrolled kernel that computes * 4 rows of Q simultaneously, a * matrix vector product with two householder * vectors + a rank 2 update is performed */ __forceinline void hh_trafo_kernel_4_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [4 x nb+1] * hh // hh contains two householder vectors, with offset 1 ///////////////////////////////////////////////////// int i; // Needed bit mask for floating point sign flip __m64 smallsign = _mm_set_pi32(0x80000000, 0x00000000); __m128d sign = (__m128d)_mm_set1_epi64(smallsign); __m128d x1 = _mm_load_pd(&q[ldq]); __m128d x2 = _mm_load_pd(&q[ldq+2]); __m128d h1 = _mm_loaddup_pd(&hh[ldh+1]); __m128d h2; __m128d q1 = _mm_load_pd(q); __m128d y1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); __m128d q2 = _mm_load_pd(&q[2]); __m128d y2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); for(i = 2; i < nb; i++) { h1 = _mm_loaddup_pd(&hh[i-1]); h2 = _mm_loaddup_pd(&hh[ldh+i]); q1 = _mm_load_pd(&q[i*ldq]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); y1 = 
_mm_add_pd(y1, _mm_mul_pd(q1,h2)); q2 = _mm_load_pd(&q[(i*ldq)+2]); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2)); } h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[nb*ldq]); x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1)); q2 = _mm_load_pd(&q[(nb*ldq)+2]); x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1)); ///////////////////////////////////////////////////// // Rank-2 update of Q [12 x nb+1] ///////////////////////////////////////////////////// __m128d tau1 = _mm_loaddup_pd(hh); __m128d tau2 = _mm_loaddup_pd(&hh[ldh]); __m128d vs = _mm_loaddup_pd(&s); h1 = _mm_xor_pd(tau1, sign); x1 = _mm_mul_pd(x1, h1); x2 = _mm_mul_pd(x2, h1); h1 = _mm_xor_pd(tau2, sign); h2 = _mm_mul_pd(h1, vs); y1 = _mm_add_pd(_mm_mul_pd(y1,h1), _mm_mul_pd(x1,h2)); y2 = _mm_add_pd(_mm_mul_pd(y2,h1), _mm_mul_pd(x2,h2)); q1 = _mm_load_pd(q); q1 = _mm_add_pd(q1, y1); _mm_store_pd(q,q1); q2 = _mm_load_pd(&q[2]); q2 = _mm_add_pd(q2, y2); _mm_store_pd(&q[2],q2); h2 = _mm_loaddup_pd(&hh[ldh+1]); q1 = _mm_load_pd(&q[ldq]); q1 = _mm_add_pd(q1, _mm_add_pd(x1, _mm_mul_pd(y1, h2))); _mm_store_pd(&q[ldq],q1); q2 = _mm_load_pd(&q[ldq+2]); q2 = _mm_add_pd(q2, _mm_add_pd(x2, _mm_mul_pd(y2, h2))); _mm_store_pd(&q[ldq+2],q2); for (i = 2; i < nb; i++) { h1 = _mm_loaddup_pd(&hh[i-1]); h2 = _mm_loaddup_pd(&hh[ldh+i]); q1 = _mm_load_pd(&q[i*ldq]); q1 = _mm_add_pd(q1, _mm_add_pd(_mm_mul_pd(x1,h1), _mm_mul_pd(y1, h2))); _mm_store_pd(&q[i*ldq],q1); q2 = _mm_load_pd(&q[(i*ldq)+2]); q2 = _mm_add_pd(q2, _mm_add_pd(_mm_mul_pd(x2,h1), _mm_mul_pd(y2, h2))); _mm_store_pd(&q[(i*ldq)+2],q2); } h1 = _mm_loaddup_pd(&hh[nb-1]); q1 = _mm_load_pd(&q[nb*ldq]); q1 = _mm_add_pd(q1, _mm_mul_pd(x1, h1)); _mm_store_pd(&q[nb*ldq],q1); q2 = _mm_load_pd(&q[(nb*ldq)+2]); q2 = _mm_add_pd(q2, _mm_mul_pd(x2, h1)); _mm_store_pd(&q[(nb*ldq)+2],q2); } elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_complex_simple.F900000644000312500001440000001361312717516040021454 00000000000000! This file is part of ELPA. ! ! 
! The ELPA library was originally created by the ELPA consortium, ! consisting of the following organizations: ! ! - Max Planck Computing and Data Facility (MPCDF), formerly known as ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte ! Informatik, ! - Technische Universität München, Lehrstuhl für Informatik mit ! Schwerpunkt Wissenschaftliches Rechnen, ! - Fritz-Haber-Institut, Berlin, Abt. Theorie, ! - Max-Planck-Institut für Mathematik in den Naturwissenschaften, ! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, ! and ! - IBM Deutschland GmbH ! ! ! More information can be found here: ! http://elpa.mpcdf.mpg.de/ ! ! ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see <http://www.gnu.org/licenses/> ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! ! -------------------------------------------------------------------------------------------------- ! ! This file contains the compute intensive kernels for the Householder transformations. ! ! This is the small and simple version (no hand unrolling of loops etc.) but for some ! 
! compilers this performs better than a sophisticated version with transformed and unrolled loops.
!
! It should be compiled with the highest possible optimization level.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------

#include "config-f90.h"

! Plain (non-unrolled) complex Householder kernels.
! Only single_hh_trafo_complex_generic_simple is exported; the module default
! is private, so double_hh_trafo_complex_generic_simple is not accessible
! from outside this module.
module complex_generic_simple_kernel

  private
  public single_hh_trafo_complex_generic_simple

contains

  ! Applies one Householder vector to the first nq rows of q.
  !   q   : matrix, leading dimension ldq; columns 1..nb are updated in place
  !   hh  : hh(1) is the factor tau, hh(2:nb) are the vector components
  !   nb  : length of the Householder vector
  !   nq  : number of rows of q that are transformed
  !   ldq : leading dimension of q
  subroutine single_hh_trafo_complex_generic_simple(q, hh, nb, nq, ldq)
    use precision
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    implicit none

    integer(kind=ik), intent(in)    :: nb, nq, ldq
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
    complex(kind=ck), intent(inout) :: q(ldq,*)
    complex(kind=ck), intent(in)    :: hh(*)
#else
    complex(kind=ck), intent(inout) :: q(1:ldq,1:nb)
    complex(kind=ck), intent(in)    :: hh(1:nb)
#endif

    integer(kind=ik)                :: col
    complex(kind=ck)                :: tau, proj(nq)

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("kernel complex generic simple: single_hh_trafo_complex_generic_simple")
#endif

    ! proj = q(:,1) + sum_{col>=2} q(:,col)*conjg(hh(col))
    proj = q(1:nq,1)
    do col = 2, nb
      proj = proj + q(1:nq,col)*conjg(hh(col))
    end do

    ! scale by -tau, then add the rank-1 correction to every column
    tau  = hh(1)
    proj = proj*(-tau)

    q(1:nq,1) = q(1:nq,1) + proj
    do col = 2, nb
      q(1:nq,col) = q(1:nq,col) + proj*hh(col)
    end do

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("kernel complex generic simple: single_hh_trafo_complex_generic_simple")
#endif
  end subroutine single_hh_trafo_complex_generic_simple

  ! --------------------------------------------------------------------------------------------------

  ! Applies two Householder vectors at once to the first nq rows of q.
  !   q   : matrix, leading dimension ldq; columns 1..nb+1 are updated in place
  !   hh  : hh(1,1), hh(1,2) are the two tau factors; hh(2:nb,1) and hh(2:nb,2)
  !         are the vector components (the first vector acts one column later)
  !   nb  : length of the Householder vectors
  !   nq  : number of rows of q that are transformed
  !   ldq : leading dimension of q
  !   ldh : leading dimension of hh
  subroutine double_hh_trafo_complex_generic_simple(q, hh, nb, nq, ldq, ldh)
    use precision
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    implicit none

    integer(kind=ik), intent(in)    :: nb, nq, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
    complex(kind=ck), intent(inout) :: q(ldq,*)
    complex(kind=ck), intent(in)    :: hh(ldh,*)
#else
    complex(kind=ck), intent(inout) :: q(1:ldq,1:nb+1)
    complex(kind=ck), intent(in)    :: hh(1:ldh,1:2)
#endif

    complex(kind=ck)                :: vdot, c1, c2, tau_a, tau_b, xv(nq), yv(nq)
    integer(kind=ik)                :: col

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("kernel complex generic simple: double_hh_trafo_complex_generic_simple")
#endif

    ! Dot product of the two Householder vectors
    vdot = conjg(hh(2,2))*1
    do col = 3, nb
      vdot = vdot+(conjg(hh(col,2))*hh(col-1,1))
    end do

    ! Projections of the rows of q onto both vectors
    xv = q(1:nq,2)
    yv = q(1:nq,1) + q(1:nq,2)*conjg(hh(2,2))
    do col = 3, nb
      c1 = conjg(hh(col-1,1))
      c2 = conjg(hh(col,2))
      xv = xv + q(1:nq,col)*c1
      yv = yv + q(1:nq,col)*c2
    end do
    xv = xv + q(1:nq,nb+1)*conjg(hh(nb,1))

    tau_a = hh(1,1)
    tau_b = hh(1,2)

    ! xv <- -tau_a*xv ;  yv <- -tau_b*yv - tau_b*vdot*xv
    c1 = -tau_a
    xv = xv*c1
    c1 = -tau_b
    c2 = -tau_b*vdot
    yv = yv*c1 + xv*c2

    ! Rank-2 update of q
    q(1:nq,1) = q(1:nq,1) + yv
    q(1:nq,2) = q(1:nq,2) + xv + yv*hh(2,2)
    do col = 3, nb
      c1 = hh(col-1,1)
      c2 = hh(col,2)
      q(1:nq,col) = q(1:nq,col) + xv*c1 + yv*c2
    end do
    q(1:nq,nb+1) = q(1:nq,nb+1) + xv*hh(nb,1)

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("kernel complex generic simple: double_hh_trafo_complex_generic_simple")
#endif
  end subroutine double_hh_trafo_complex_generic_simple

end module complex_generic_simple_kernel
! -------------------------------------------------------------------------------------------------- elpa-2016.05.001/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c0000644000312500001440000016162312717516040021567 00000000000000// This file is part of ELPA. 
// // The ELPA library was originally created by the ELPA consortium, // consisting of the following organizations: // // - Max Planck Computing and Data Facility (MPCDF), formerly known as // Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), // - Bergische Universität Wuppertal, Lehrstuhl für angewandte // Informatik, // - Technische Universität München, Lehrstuhl für Informatik mit // Schwerpunkt Wissenschaftliches Rechnen, // - Fritz-Haber-Institut, Berlin, Abt. Theorie, // - Max-Planck-Institut für Mathematik in den Naturwissenschaften, // Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, // and // - IBM Deutschland GmbH // // This particular source code file contains additions, changes and // enhancements authored by Intel Corporation which is not part of // the ELPA consortium. // // More information can be found here: // http://elpa.mpcdf.mpg.de/ // // ELPA is free software: you can redistribute it and/or modify // it under the terms of the version 3 of the license of the // GNU Lesser General Public License as published by the Free // Software Foundation. // // ELPA is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with ELPA. If not, see <http://www.gnu.org/licenses/> // // ELPA reflects a substantial effort on the part of the original // ELPA consortium, and we ask you to respect the spirit of the // license that we chose: i.e., please contribute any changes you // may have back to the original ELPA library distribution, and keep // any derivatives of ELPA under the same license that we chose for // the original distribution, the GNU Lesser General Public License. 
// // // -------------------------------------------------------------------------------------------------- // // This file contains the compute intensive kernels for the Householder transformations. // It should be compiled with the highest possible optimization level. // // On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3 // On Intel Sandy Bridge use -O3 -mavx // // Copyright of the original code rests with the authors inside the ELPA // consortium. The copyright of any additional modifications shall rest // with their original authors, but shall adhere to the licensing terms // distributed along with the original code in the file "COPYING". // // Author: Alexander Heinecke (alexander.heinecke@mytum.de) // Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de) // -------------------------------------------------------------------------------------------------- #include "config-f90.h" #include #define __forceinline __attribute__((always_inline)) static #ifdef HAVE_AVX2 #ifdef __FMA4__ #define __ELPA_USE_FMA__ #define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c) #define _mm256_NFMA_pd(a,b,c) _mm256_nmacc_pd(a,b,c) #define _mm256_FMSUB_pd(a,b,c) _mm256_msub(a,b,c) #endif #ifdef __AVX2__ #define __ELPA_USE_FMA__ #define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c) #define _mm256_NFMA_pd(a,b,c) _mm256_fnmadd_pd(a,b,c) #define _mm256_FMSUB_pd(a,b,c) _mm256_fmsub_pd(a,b,c) #endif #endif //Forward declaration static void hh_trafo_kernel_4_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); static void hh_trafo_kernel_8_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); /* !f>#ifdef HAVE_AVX !f> interface !f> subroutine hexa_hh_trafo_real_avx_avx2_6hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="hexa_hh_trafo_real_avx_avx2_6hv") !f> use, intrinsic :: iso_c_binding !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh !f> real(kind=c_double) :: q(*) !f> real(kind=c_double) 
:: hh(pnb,6) !f> end subroutine !f> end interface !f>#endif */ void hexa_hh_trafo_real_avx_avx2_6hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); void hexa_hh_trafo_real_avx_avx2_6hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; int nq = *pldq; int ldq = *pldq; int ldh = *pldh; // calculating scalar products to compute // 6 householder vectors simultaneously double scalarprods[15]; // scalarprods[0] = s_1_2; // scalarprods[1] = s_1_3; // scalarprods[2] = s_2_3; // scalarprods[3] = s_1_4; // scalarprods[4] = s_2_4; // scalarprods[5] = s_3_4; // scalarprods[6] = s_1_5; // scalarprods[7] = s_2_5; // scalarprods[8] = s_3_5; // scalarprods[9] = s_4_5; // scalarprods[10] = s_1_6; // scalarprods[11] = s_2_6; // scalarprods[12] = s_3_6; // scalarprods[13] = s_4_6; // scalarprods[14] = s_5_6; scalarprods[0] = hh[(ldh+1)]; scalarprods[1] = hh[(ldh*2)+2]; scalarprods[2] = hh[(ldh*2)+1]; scalarprods[3] = hh[(ldh*3)+3]; scalarprods[4] = hh[(ldh*3)+2]; scalarprods[5] = hh[(ldh*3)+1]; scalarprods[6] = hh[(ldh*4)+4]; scalarprods[7] = hh[(ldh*4)+3]; scalarprods[8] = hh[(ldh*4)+2]; scalarprods[9] = hh[(ldh*4)+1]; scalarprods[10] = hh[(ldh*5)+5]; scalarprods[11] = hh[(ldh*5)+4]; scalarprods[12] = hh[(ldh*5)+3]; scalarprods[13] = hh[(ldh*5)+2]; scalarprods[14] = hh[(ldh*5)+1]; // calculate scalar product of first and fourth householder vector // loop counter = 2 scalarprods[0] += hh[1] * hh[(2+ldh)]; scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)]; // loop counter = 3 scalarprods[0] += hh[2] * hh[(3+ldh)]; scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)]; scalarprods[1] += hh[1] * hh[3+(ldh*2)]; scalarprods[4] += hh[(ldh*1)+1] * 
hh[3+(ldh*3)]; scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)]; scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)]; // loop counter = 4 scalarprods[0] += hh[3] * hh[(4+ldh)]; scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; scalarprods[1] += hh[2] * hh[4+(ldh*2)]; scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)]; scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)]; scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; scalarprods[3] += hh[1] * hh[4+(ldh*3)]; scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)]; scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; // loop counter = 5 scalarprods[0] += hh[4] * hh[(5+ldh)]; scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)]; scalarprods[1] += hh[3] * hh[5+(ldh*2)]; scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)]; scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)]; scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; scalarprods[3] += hh[2] * hh[5+(ldh*3)]; scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)]; scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; scalarprods[6] += hh[1] * hh[5+(ldh*4)]; scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)]; #pragma ivdep for (i = 6; i < nb; i++) { scalarprods[0] += hh[i-1] * hh[(i+ldh)]; scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)]; scalarprods[1] += hh[i-2] * hh[i+(ldh*2)]; scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)]; scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; scalarprods[3] += hh[i-3] * hh[i+(ldh*3)]; scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)]; scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; 
scalarprods[6] += hh[i-4] * hh[i+(ldh*4)]; scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)]; scalarprods[10] += hh[i-5] * hh[i+(ldh*5)]; } // printf("s_1_2: %f\n", scalarprods[0]); // printf("s_1_3: %f\n", scalarprods[1]); // printf("s_2_3: %f\n", scalarprods[2]); // printf("s_1_4: %f\n", scalarprods[3]); // printf("s_2_4: %f\n", scalarprods[4]); // printf("s_3_4: %f\n", scalarprods[5]); // printf("s_1_5: %f\n", scalarprods[6]); // printf("s_2_5: %f\n", scalarprods[7]); // printf("s_3_5: %f\n", scalarprods[8]); // printf("s_4_5: %f\n", scalarprods[9]); // printf("s_1_6: %f\n", scalarprods[10]); // printf("s_2_6: %f\n", scalarprods[11]); // printf("s_3_6: %f\n", scalarprods[12]); // printf("s_4_6: %f\n", scalarprods[13]); // printf("s_5_6: %f\n", scalarprods[14]); // Production level kernel calls with padding #ifdef __AVX__ for (i = 0; i < nq-4; i+=8) { hh_trafo_kernel_8_AVX_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); } if (nq == i) { return; } else { hh_trafo_kernel_4_AVX_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); } #else for (i = 0; i < nq-2; i+=4) { hh_trafo_kernel_4_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); } if (nq == i) { return; } else { hh_trafo_kernel_2_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); } #endif } #if 0 void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) { int i; int nb = *pnb; int nq = *pldq; int ldq = *pldq; int ldh = *pldh; // calculating scalar products to compute // 6 householder vectors simultaneously double scalarprods[15]; // scalarprods[0] = s_1_2; // scalarprods[1] = s_1_3; // scalarprods[2] = s_2_3; // scalarprods[3] = s_1_4; // scalarprods[4] = s_2_4; // scalarprods[5] = s_3_4; // scalarprods[6] = s_1_5; // scalarprods[7] = s_2_5; // scalarprods[8] = s_3_5; // scalarprods[9] = s_4_5; // scalarprods[10] = s_1_6; // scalarprods[11] = s_2_6; // scalarprods[12] = s_3_6; // scalarprods[13] = s_4_6; // scalarprods[14] = s_5_6; scalarprods[0] = hh[(ldh+1)]; scalarprods[1] = hh[(ldh*2)+2]; 
scalarprods[2] = hh[(ldh*2)+1]; scalarprods[3] = hh[(ldh*3)+3]; scalarprods[4] = hh[(ldh*3)+2]; scalarprods[5] = hh[(ldh*3)+1]; scalarprods[6] = hh[(ldh*4)+4]; scalarprods[7] = hh[(ldh*4)+3]; scalarprods[8] = hh[(ldh*4)+2]; scalarprods[9] = hh[(ldh*4)+1]; scalarprods[10] = hh[(ldh*5)+5]; scalarprods[11] = hh[(ldh*5)+4]; scalarprods[12] = hh[(ldh*5)+3]; scalarprods[13] = hh[(ldh*5)+2]; scalarprods[14] = hh[(ldh*5)+1]; // calculate scalar product of first and fourth householder vector // loop counter = 2 scalarprods[0] += hh[1] * hh[(2+ldh)]; scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)]; // loop counter = 3 scalarprods[0] += hh[2] * hh[(3+ldh)]; scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)]; scalarprods[1] += hh[1] * hh[3+(ldh*2)]; scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)]; scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)]; scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)]; // loop counter = 4 scalarprods[0] += hh[3] * hh[(4+ldh)]; scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; scalarprods[1] += hh[2] * hh[4+(ldh*2)]; scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)]; scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)]; scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; scalarprods[3] += hh[1] * hh[4+(ldh*3)]; scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)]; scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; // loop counter = 5 scalarprods[0] += hh[4] * hh[(5+ldh)]; scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)]; scalarprods[14] += 
hh[(ldh*4)+4] * hh[5+(ldh*5)]; scalarprods[1] += hh[3] * hh[5+(ldh*2)]; scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)]; scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)]; scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; scalarprods[3] += hh[2] * hh[5+(ldh*3)]; scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)]; scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; scalarprods[6] += hh[1] * hh[5+(ldh*4)]; scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)]; #pragma ivdep for (i = 6; i < nb; i++) { scalarprods[0] += hh[i-1] * hh[(i+ldh)]; scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)]; scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)]; scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)]; scalarprods[1] += hh[i-2] * hh[i+(ldh*2)]; scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)]; scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; scalarprods[3] += hh[i-3] * hh[i+(ldh*3)]; scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)]; scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; scalarprods[6] += hh[i-4] * hh[i+(ldh*4)]; scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)]; scalarprods[10] += hh[i-5] * hh[i+(ldh*5)]; } // printf("s_1_2: %f\n", scalarprods[0]); // printf("s_1_3: %f\n", scalarprods[1]); // printf("s_2_3: %f\n", scalarprods[2]); // printf("s_1_4: %f\n", scalarprods[3]); // printf("s_2_4: %f\n", scalarprods[4]); // printf("s_3_4: %f\n", scalarprods[5]); // printf("s_1_5: %f\n", scalarprods[6]); // printf("s_2_5: %f\n", scalarprods[7]); // printf("s_3_5: %f\n", scalarprods[8]); // printf("s_4_5: %f\n", scalarprods[9]); // printf("s_1_6: %f\n", scalarprods[10]); // printf("s_2_6: %f\n", scalarprods[11]); // printf("s_3_6: %f\n", scalarprods[12]); // printf("s_4_6: %f\n", scalarprods[13]); // printf("s_5_6: %f\n", scalarprods[14]); // Production level kernel calls with padding #ifdef __AVX__ for (i = 0; i < nq; i+=8) { hh_trafo_kernel_8_AVX_6hv(&q[i], hh, nb, ldq, ldh, 
scalarprods); } #else for (i = 0; i < nq; i+=4) { hh_trafo_kernel_4_SSE_6hv(&q[i], hh, nb, ldq, ldh, scalarprods); } #endif } #endif /** * Unrolled AVX kernel that applies six householder vectors to * 8 rows of Q simultaneously: for each of the six vectors a * matrix vector product with the 8 x (nb+5) slab of Q is formed, * scaled by its tau and corrected with the precomputed scalar * products, and the result is then subtracted from Q in place. * * q: first of the 8 rows; column c starts at q[c*ldq] and columns * 0 .. nb+4 are read and overwritten. Only aligned 256-bit * loads/stores are used, so every address q + c*ldq (and +4) * must be 32-byte aligned. * hh: the six vectors; entries hh[k*ldh + 1] .. hh[k*ldh + nb-1] * of vector k (k = 0 .. 5) are read, and hh[k*ldh] itself is * used as the scaling factor tau(k+1). Not modified. * nb: number of entries per vector; the two main loops run over * columns 6 .. nb-1. * ldq: distance in doubles between consecutive columns of q. * ldh: distance in doubles between consecutive vectors in hh. * scalarprods: 15 precomputed values (indices 0 .. 14), read as the * vs_i_j correction factors. Not modified. */ __forceinline void hh_trafo_kernel_8_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [8 x nb+5] * hh // hh contains six householder vectors ///////////////////////////////////////////////////// int i; /* a1_1 .. a6_1: columns 5 down to 0 of q, rows 0-3 (the *_2 variables further down cover rows 4-7) */ __m256d a1_1 = _mm256_load_pd(&q[ldq*5]); __m256d a2_1 = _mm256_load_pd(&q[ldq*4]); __m256d a3_1 = _mm256_load_pd(&q[ldq*3]); __m256d a4_1 = _mm256_load_pd(&q[ldq*2]); __m256d a5_1 = _mm256_load_pd(&q[ldq]); __m256d a6_1 = _mm256_load_pd(&q[0]); __m256d h_6_5 = _mm256_broadcast_sd(&hh[(ldh*5)+1]); __m256d h_6_4 = _mm256_broadcast_sd(&hh[(ldh*5)+2]); __m256d h_6_3 = _mm256_broadcast_sd(&hh[(ldh*5)+3]); __m256d h_6_2 = _mm256_broadcast_sd(&hh[(ldh*5)+4]); __m256d h_6_1 = _mm256_broadcast_sd(&hh[(ldh*5)+5]); #ifdef __ELPA_USE_FMA__ register __m256d t1 = _mm256_FMA_pd(a5_1, h_6_5, a6_1); t1 = _mm256_FMA_pd(a4_1, h_6_4, t1); t1 = _mm256_FMA_pd(a3_1, h_6_3, t1); t1 = _mm256_FMA_pd(a2_1, h_6_2, t1); t1 = _mm256_FMA_pd(a1_1, h_6_1, t1); #else register __m256d t1 = _mm256_add_pd(a6_1, _mm256_mul_pd(a5_1, h_6_5)); t1 = _mm256_add_pd(t1, _mm256_mul_pd(a4_1, h_6_4)); t1 = _mm256_add_pd(t1, _mm256_mul_pd(a3_1, h_6_3)); t1 = _mm256_add_pd(t1, _mm256_mul_pd(a2_1, h_6_2)); t1 = _mm256_add_pd(t1, _mm256_mul_pd(a1_1, h_6_1)); #endif __m256d h_5_4 = _mm256_broadcast_sd(&hh[(ldh*4)+1]); __m256d h_5_3 = _mm256_broadcast_sd(&hh[(ldh*4)+2]); __m256d h_5_2 = _mm256_broadcast_sd(&hh[(ldh*4)+3]); __m256d h_5_1 = _mm256_broadcast_sd(&hh[(ldh*4)+4]); #ifdef __ELPA_USE_FMA__ register __m256d v1 = _mm256_FMA_pd(a4_1, h_5_4, a5_1); v1 = _mm256_FMA_pd(a3_1, h_5_3, v1); v1 = _mm256_FMA_pd(a2_1, 
h_5_2, v1); v1 = _mm256_FMA_pd(a1_1, h_5_1, v1); #else register __m256d v1 = _mm256_add_pd(a5_1, _mm256_mul_pd(a4_1, h_5_4)); v1 = _mm256_add_pd(v1, _mm256_mul_pd(a3_1, h_5_3)); v1 = _mm256_add_pd(v1, _mm256_mul_pd(a2_1, h_5_2)); v1 = _mm256_add_pd(v1, _mm256_mul_pd(a1_1, h_5_1)); #endif __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); __m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); #ifdef __ELPA_USE_FMA__ register __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); #else register __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2)); w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1)); #endif __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); #ifdef __ELPA_USE_FMA__ register __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); register __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1); #else register __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1)); register __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1)); #endif register __m256d x1 = a1_1; __m256d a1_2 = _mm256_load_pd(&q[(ldq*5)+4]); __m256d a2_2 = _mm256_load_pd(&q[(ldq*4)+4]); __m256d a3_2 = _mm256_load_pd(&q[(ldq*3)+4]); __m256d a4_2 = _mm256_load_pd(&q[(ldq*2)+4]); __m256d a5_2 = _mm256_load_pd(&q[(ldq)+4]); __m256d a6_2 = _mm256_load_pd(&q[4]); #ifdef __ELPA_USE_FMA__ register __m256d t2 = _mm256_FMA_pd(a5_2, h_6_5, a6_2); t2 = _mm256_FMA_pd(a4_2, h_6_4, t2); t2 = _mm256_FMA_pd(a3_2, h_6_3, t2); t2 = _mm256_FMA_pd(a2_2, h_6_2, t2); t2 = _mm256_FMA_pd(a1_2, h_6_1, t2); register __m256d v2 = _mm256_FMA_pd(a4_2, h_5_4, a5_2); v2 = _mm256_FMA_pd(a3_2, h_5_3, v2); v2 = _mm256_FMA_pd(a2_2, h_5_2, v2); v2 = 
_mm256_FMA_pd(a1_2, h_5_1, v2); register __m256d w2 = _mm256_FMA_pd(a3_2, h_4_3, a4_2); w2 = _mm256_FMA_pd(a2_2, h_4_2, w2); w2 = _mm256_FMA_pd(a1_2, h_4_1, w2); register __m256d z2 = _mm256_FMA_pd(a2_2, h_3_2, a3_2); z2 = _mm256_FMA_pd(a1_2, h_3_1, z2); register __m256d y2 = _mm256_FMA_pd(a1_2, h_2_1, a2_2); #else register __m256d t2 = _mm256_add_pd(a6_2, _mm256_mul_pd(a5_2, h_6_5)); t2 = _mm256_add_pd(t2, _mm256_mul_pd(a4_2, h_6_4)); t2 = _mm256_add_pd(t2, _mm256_mul_pd(a3_2, h_6_3)); t2 = _mm256_add_pd(t2, _mm256_mul_pd(a2_2, h_6_2)); t2 = _mm256_add_pd(t2, _mm256_mul_pd(a1_2, h_6_1)); register __m256d v2 = _mm256_add_pd(a5_2, _mm256_mul_pd(a4_2, h_5_4)); v2 = _mm256_add_pd(v2, _mm256_mul_pd(a3_2, h_5_3)); v2 = _mm256_add_pd(v2, _mm256_mul_pd(a2_2, h_5_2)); v2 = _mm256_add_pd(v2, _mm256_mul_pd(a1_2, h_5_1)); register __m256d w2 = _mm256_add_pd(a4_2, _mm256_mul_pd(a3_2, h_4_3)); w2 = _mm256_add_pd(w2, _mm256_mul_pd(a2_2, h_4_2)); w2 = _mm256_add_pd(w2, _mm256_mul_pd(a1_2, h_4_1)); register __m256d z2 = _mm256_add_pd(a3_2, _mm256_mul_pd(a2_2, h_3_2)); z2 = _mm256_add_pd(z2, _mm256_mul_pd(a1_2, h_3_1)); register __m256d y2 = _mm256_add_pd(a2_2, _mm256_mul_pd(a1_2, h_2_1)); #endif register __m256d x2 = a1_2; __m256d q1; __m256d q2; __m256d h1; __m256d h2; __m256d h3; __m256d h4; __m256d h5; __m256d h6; for(i = 6; i < nb; i++) { h1 = _mm256_broadcast_sd(&hh[i-5]); q1 = _mm256_load_pd(&q[i*ldq]); q2 = _mm256_load_pd(&q[(i*ldq)+4]); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); x2 = _mm256_FMA_pd(q2, h1, x2); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+i-4]); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMA_pd(q1, h2, y1); y2 = _mm256_FMA_pd(q2, h2, y2); #else y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-3]); #ifdef __ELPA_USE_FMA__ z1 = _mm256_FMA_pd(q1, h3, z1); z2 = 
_mm256_FMA_pd(q2, h3, z2); #else z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); #endif h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i-2]); #ifdef __ELPA_USE_FMA__ w1 = _mm256_FMA_pd(q1, h4, w1); w2 = _mm256_FMA_pd(q2, h4, w2); #else w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); #endif h5 = _mm256_broadcast_sd(&hh[(ldh*4)+i-1]); #ifdef __ELPA_USE_FMA__ v1 = _mm256_FMA_pd(q1, h5, v1); v2 = _mm256_FMA_pd(q2, h5, v2); #else v1 = _mm256_add_pd(v1, _mm256_mul_pd(q1,h5)); v2 = _mm256_add_pd(v2, _mm256_mul_pd(q2,h5)); #endif h6 = _mm256_broadcast_sd(&hh[(ldh*5)+i]); #ifdef __ELPA_USE_FMA__ t1 = _mm256_FMA_pd(q1, h6, t1); t2 = _mm256_FMA_pd(q2, h6, t2); #else t1 = _mm256_add_pd(t1, _mm256_mul_pd(q1,h6)); t2 = _mm256_add_pd(t2, _mm256_mul_pd(q2,h6)); #endif } /* trailing columns nb .. nb+4: each successive column contributes to one accumulator fewer (v, then w, z, y drop out; only x sees column nb+4) */ h1 = _mm256_broadcast_sd(&hh[nb-5]); q1 = _mm256_load_pd(&q[nb*ldq]); q2 = _mm256_load_pd(&q[(nb*ldq)+4]); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); x2 = _mm256_FMA_pd(q2, h1, x2); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-4]); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMA_pd(q1, h2, y1); y2 = _mm256_FMA_pd(q2, h2, y2); #else y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-3]); #ifdef __ELPA_USE_FMA__ z1 = _mm256_FMA_pd(q1, h3, z1); z2 = _mm256_FMA_pd(q2, h3, z2); #else z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); #endif h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-2]); #ifdef __ELPA_USE_FMA__ w1 = _mm256_FMA_pd(q1, h4, w1); w2 = _mm256_FMA_pd(q2, h4, w2); #else w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); #endif h5 = _mm256_broadcast_sd(&hh[(ldh*4)+nb-1]); #ifdef __ELPA_USE_FMA__ v1 = _mm256_FMA_pd(q1, h5, v1); v2 = _mm256_FMA_pd(q2, h5, v2); #else v1 = 
_mm256_add_pd(v1, _mm256_mul_pd(q1,h5)); v2 = _mm256_add_pd(v2, _mm256_mul_pd(q2,h5)); #endif h1 = _mm256_broadcast_sd(&hh[nb-4]); q1 = _mm256_load_pd(&q[(nb+1)*ldq]); q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); x2 = _mm256_FMA_pd(q2, h1, x2); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-3]); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMA_pd(q1, h2, y1); y2 = _mm256_FMA_pd(q2, h2, y2); #else y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-2]); #ifdef __ELPA_USE_FMA__ z1 = _mm256_FMA_pd(q1, h3, z1); z2 = _mm256_FMA_pd(q2, h3, z2); #else z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); #endif h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-1]); #ifdef __ELPA_USE_FMA__ w1 = _mm256_FMA_pd(q1, h4, w1); w2 = _mm256_FMA_pd(q2, h4, w2); #else w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); w2 = _mm256_add_pd(w2, _mm256_mul_pd(q2,h4)); #endif h1 = _mm256_broadcast_sd(&hh[nb-3]); q1 = _mm256_load_pd(&q[(nb+2)*ldq]); q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); x2 = _mm256_FMA_pd(q2, h1, x2); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMA_pd(q1, h2, y1); y2 = _mm256_FMA_pd(q2, h2, y2); #else y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); #ifdef __ELPA_USE_FMA__ z1 = _mm256_FMA_pd(q1, h3, z1); z2 = _mm256_FMA_pd(q2, h3, z2); #else z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); z2 = _mm256_add_pd(z2, _mm256_mul_pd(q2,h3)); #endif h1 = _mm256_broadcast_sd(&hh[nb-2]); q1 = _mm256_load_pd(&q[(nb+3)*ldq]); q2 = 
_mm256_load_pd(&q[((nb+3)*ldq)+4]); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); x2 = _mm256_FMA_pd(q2, h1, x2); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMA_pd(q1, h2, y1); y2 = _mm256_FMA_pd(q2, h2, y2); #else y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2)); #endif h1 = _mm256_broadcast_sd(&hh[nb-1]); q1 = _mm256_load_pd(&q[(nb+4)*ldq]); q2 = _mm256_load_pd(&q[((nb+4)*ldq)+4]); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); x2 = _mm256_FMA_pd(q2, h1, x2); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1)); #endif ///////////////////////////////////////////////////// // Apply tau, correct wrong calculation using pre-calculated scalar products ///////////////////////////////////////////////////// __m256d tau1 = _mm256_broadcast_sd(&hh[0]); x1 = _mm256_mul_pd(x1, tau1); x2 = _mm256_mul_pd(x2, tau1); __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); __m256d vs_1_2 = _mm256_broadcast_sd(&scalarprods[0]); h2 = _mm256_mul_pd(tau2, vs_1_2); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMSUB_pd(y1, tau2, _mm256_mul_pd(x1,h2)); y2 = _mm256_FMSUB_pd(y2, tau2, _mm256_mul_pd(x2,h2)); #else y1 = _mm256_sub_pd(_mm256_mul_pd(y1,tau2), _mm256_mul_pd(x1,h2)); y2 = _mm256_sub_pd(_mm256_mul_pd(y2,tau2), _mm256_mul_pd(x2,h2)); #endif __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]); __m256d vs_1_3 = _mm256_broadcast_sd(&scalarprods[1]); __m256d vs_2_3 = _mm256_broadcast_sd(&scalarprods[2]); h2 = _mm256_mul_pd(tau3, vs_1_3); h3 = _mm256_mul_pd(tau3, vs_2_3); #ifdef __ELPA_USE_FMA__ z1 = _mm256_FMSUB_pd(z1, tau3, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); z2 = _mm256_FMSUB_pd(z2, tau3, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))); #else z1 = _mm256_sub_pd(_mm256_mul_pd(z1,tau3), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))); z2 = 
_mm256_sub_pd(_mm256_mul_pd(z2,tau3), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))); #endif __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]); __m256d vs_1_4 = _mm256_broadcast_sd(&scalarprods[3]); __m256d vs_2_4 = _mm256_broadcast_sd(&scalarprods[4]); h2 = _mm256_mul_pd(tau4, vs_1_4); h3 = _mm256_mul_pd(tau4, vs_2_4); __m256d vs_3_4 = _mm256_broadcast_sd(&scalarprods[5]); h4 = _mm256_mul_pd(tau4, vs_3_4); #ifdef __ELPA_USE_FMA__ w1 = _mm256_FMSUB_pd(w1, tau4, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); w2 = _mm256_FMSUB_pd(w2, tau4, _mm256_FMA_pd(z2, h4, _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)))); #else w1 = _mm256_sub_pd(_mm256_mul_pd(w1,tau4), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); w2 = _mm256_sub_pd(_mm256_mul_pd(w2,tau4), _mm256_add_pd(_mm256_mul_pd(z2,h4), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)))); #endif __m256d tau5 = _mm256_broadcast_sd(&hh[ldh*4]); __m256d vs_1_5 = _mm256_broadcast_sd(&scalarprods[6]); __m256d vs_2_5 = _mm256_broadcast_sd(&scalarprods[7]); h2 = _mm256_mul_pd(tau5, vs_1_5); h3 = _mm256_mul_pd(tau5, vs_2_5); __m256d vs_3_5 = _mm256_broadcast_sd(&scalarprods[8]); __m256d vs_4_5 = _mm256_broadcast_sd(&scalarprods[9]); h4 = _mm256_mul_pd(tau5, vs_3_5); h5 = _mm256_mul_pd(tau5, vs_4_5); #ifdef __ELPA_USE_FMA__ v1 = _mm256_FMSUB_pd(v1, tau5, _mm256_add_pd(_mm256_FMA_pd(w1, h5, _mm256_mul_pd(z1,h4)), _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); v2 = _mm256_FMSUB_pd(v2, tau5, _mm256_add_pd(_mm256_FMA_pd(w2, h5, _mm256_mul_pd(z2,h4)), _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2)))); #else v1 = _mm256_sub_pd(_mm256_mul_pd(v1,tau5), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1,h5), _mm256_mul_pd(z1,h4)), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); v2 = _mm256_sub_pd(_mm256_mul_pd(v2,tau5), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w2,h5), _mm256_mul_pd(z2,h4)), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2)))); 
#endif __m256d tau6 = _mm256_broadcast_sd(&hh[ldh*5]); __m256d vs_1_6 = _mm256_broadcast_sd(&scalarprods[10]); __m256d vs_2_6 = _mm256_broadcast_sd(&scalarprods[11]); h2 = _mm256_mul_pd(tau6, vs_1_6); h3 = _mm256_mul_pd(tau6, vs_2_6); __m256d vs_3_6 = _mm256_broadcast_sd(&scalarprods[12]); __m256d vs_4_6 = _mm256_broadcast_sd(&scalarprods[13]); __m256d vs_5_6 = _mm256_broadcast_sd(&scalarprods[14]); h4 = _mm256_mul_pd(tau6, vs_3_6); h5 = _mm256_mul_pd(tau6, vs_4_6); h6 = _mm256_mul_pd(tau6, vs_5_6); #ifdef __ELPA_USE_FMA__ t1 = _mm256_FMSUB_pd(t1, tau6, _mm256_FMA_pd(v1, h6, _mm256_add_pd(_mm256_FMA_pd(w1, h5, _mm256_mul_pd(z1,h4)), _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))))); t2 = _mm256_FMSUB_pd(t2, tau6, _mm256_FMA_pd(v2, h6, _mm256_add_pd(_mm256_FMA_pd(w2, h5, _mm256_mul_pd(z2,h4)), _mm256_FMA_pd(y2, h3, _mm256_mul_pd(x2,h2))))); #else t1 = _mm256_sub_pd(_mm256_mul_pd(t1,tau6), _mm256_add_pd( _mm256_mul_pd(v1,h6), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1,h5), _mm256_mul_pd(z1,h4)), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))))); t2 = _mm256_sub_pd(_mm256_mul_pd(t2,tau6), _mm256_add_pd( _mm256_mul_pd(v2,h6), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w2,h5), _mm256_mul_pd(z2,h4)), _mm256_add_pd(_mm256_mul_pd(y2,h3), _mm256_mul_pd(x2,h2))))); #endif ///////////////////////////////////////////////////// // Update of Q [8 x nb+5] with the six vectors x, y, z, w, v, t ///////////////////////////////////////////////////// q1 = _mm256_load_pd(&q[0]); q2 = _mm256_load_pd(&q[4]); q1 = _mm256_sub_pd(q1, t1); q2 = _mm256_sub_pd(q2, t2); _mm256_store_pd(&q[0],q1); _mm256_store_pd(&q[4],q2); h6 = _mm256_broadcast_sd(&hh[(ldh*5)+1]); q1 = _mm256_load_pd(&q[ldq]); q2 = _mm256_load_pd(&q[(ldq+4)]); q1 = _mm256_sub_pd(q1, v1); q2 = _mm256_sub_pd(q2, v2); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(t1, h6, q1); q2 = _mm256_NFMA_pd(t2, h6, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); #endif _mm256_store_pd(&q[ldq],q1); 
_mm256_store_pd(&q[(ldq+4)],q2); h5 = _mm256_broadcast_sd(&hh[(ldh*4)+1]); q1 = _mm256_load_pd(&q[ldq*2]); q2 = _mm256_load_pd(&q[(ldq*2)+4]); q1 = _mm256_sub_pd(q1, w1); q2 = _mm256_sub_pd(q2, w2); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(v1, h5, q1); q2 = _mm256_NFMA_pd(v2, h5, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); #endif h6 = _mm256_broadcast_sd(&hh[(ldh*5)+2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(t1, h6, q1); q2 = _mm256_NFMA_pd(t2, h6, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); #endif _mm256_store_pd(&q[ldq*2],q1); _mm256_store_pd(&q[(ldq*2)+4],q2); h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); q1 = _mm256_load_pd(&q[ldq*3]); q2 = _mm256_load_pd(&q[(ldq*3)+4]); q1 = _mm256_sub_pd(q1, z1); q2 = _mm256_sub_pd(q2, z2); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(w1, h4, q1); q2 = _mm256_NFMA_pd(w2, h4, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); #endif h5 = _mm256_broadcast_sd(&hh[(ldh*4)+2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(v1, h5, q1); q2 = _mm256_NFMA_pd(v2, h5, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); #endif h6 = _mm256_broadcast_sd(&hh[(ldh*5)+3]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(t1, h6, q1); q2 = _mm256_NFMA_pd(t2, h6, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); #endif _mm256_store_pd(&q[ldq*3],q1); _mm256_store_pd(&q[(ldq*3)+4],q2); h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); q1 = _mm256_load_pd(&q[ldq*4]); q2 = _mm256_load_pd(&q[(ldq*4)+4]); q1 = _mm256_sub_pd(q1, y1); q2 = _mm256_sub_pd(q2, y2); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(z1, h3, q1); q2 = _mm256_NFMA_pd(z2, h3, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); #endif h4 = 
_mm256_broadcast_sd(&hh[(ldh*3)+2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(w1, h4, q1); q2 = _mm256_NFMA_pd(w2, h4, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); #endif h5 = _mm256_broadcast_sd(&hh[(ldh*4)+3]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(v1, h5, q1); q2 = _mm256_NFMA_pd(v2, h5, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); #endif h6 = _mm256_broadcast_sd(&hh[(ldh*5)+4]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(t1, h6, q1); q2 = _mm256_NFMA_pd(t2, h6, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); #endif _mm256_store_pd(&q[ldq*4],q1); _mm256_store_pd(&q[(ldq*4)+4],q2); h2 = _mm256_broadcast_sd(&hh[(ldh)+1]); q1 = _mm256_load_pd(&q[ldq*5]); q2 = _mm256_load_pd(&q[(ldq*5)+4]); q1 = _mm256_sub_pd(q1, x1); q2 = _mm256_sub_pd(q2, x2); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(y1, h2, q1); q2 = _mm256_NFMA_pd(y2, h2, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(z1, h3, q1); q2 = _mm256_NFMA_pd(z2, h3, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); #endif h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(w1, h4, q1); q2 = _mm256_NFMA_pd(w2, h4, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); #endif h5 = _mm256_broadcast_sd(&hh[(ldh*4)+4]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(v1, h5, q1); q2 = _mm256_NFMA_pd(v2, h5, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); #endif h6 = _mm256_broadcast_sd(&hh[(ldh*5)+5]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(t1, h6, q1); q2 = _mm256_NFMA_pd(t2, h6, q2); #else q1 = 
_mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); #endif _mm256_store_pd(&q[ldq*5],q1); _mm256_store_pd(&q[(ldq*5)+4],q2); for (i = 6; i < nb; i++) { q1 = _mm256_load_pd(&q[i*ldq]); q2 = _mm256_load_pd(&q[(i*ldq)+4]); h1 = _mm256_broadcast_sd(&hh[i-5]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); q2 = _mm256_NFMA_pd(x2, h1, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+i-4]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(y1, h2, q1); q2 = _mm256_NFMA_pd(y2, h2, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-3]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(z1, h3, q1); q2 = _mm256_NFMA_pd(z2, h3, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); #endif h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i-2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(w1, h4, q1); q2 = _mm256_NFMA_pd(w2, h4, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); #endif h5 = _mm256_broadcast_sd(&hh[(ldh*4)+i-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(v1, h5, q1); q2 = _mm256_NFMA_pd(v2, h5, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); #endif h6 = _mm256_broadcast_sd(&hh[(ldh*5)+i]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(t1, h6, q1); q2 = _mm256_NFMA_pd(t2, h6, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(t2, h6)); #endif _mm256_store_pd(&q[i*ldq],q1); _mm256_store_pd(&q[(i*ldq)+4],q2); } /* trailing columns nb .. nb+4: mirrors the accumulation phase, each successive column is updated with one vector fewer (only x reaches column nb+4) */ h1 = _mm256_broadcast_sd(&hh[nb-5]); q1 = _mm256_load_pd(&q[nb*ldq]); q2 = _mm256_load_pd(&q[(nb*ldq)+4]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); q2 = _mm256_NFMA_pd(x2, h1, q2); #else q1 = _mm256_sub_pd(q1, 
_mm256_mul_pd(x1, h1)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-4]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(y1, h2, q1); q2 = _mm256_NFMA_pd(y2, h2, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-3]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(z1, h3, q1); q2 = _mm256_NFMA_pd(z2, h3, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); #endif h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(w1, h4, q1); q2 = _mm256_NFMA_pd(w2, h4, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); #endif h5 = _mm256_broadcast_sd(&hh[(ldh*4)+nb-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(v1, h5, q1); q2 = _mm256_NFMA_pd(v2, h5, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(v2, h5)); #endif _mm256_store_pd(&q[nb*ldq],q1); _mm256_store_pd(&q[(nb*ldq)+4],q2); h1 = _mm256_broadcast_sd(&hh[nb-4]); q1 = _mm256_load_pd(&q[(nb+1)*ldq]); q2 = _mm256_load_pd(&q[((nb+1)*ldq)+4]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); q2 = _mm256_NFMA_pd(x2, h1, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-3]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(y1, h2, q1); q2 = _mm256_NFMA_pd(y2, h2, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(z1, h3, q1); q2 = _mm256_NFMA_pd(z2, h3, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); #endif h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(w1, h4, 
q1); q2 = _mm256_NFMA_pd(w2, h4, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(w2, h4)); #endif _mm256_store_pd(&q[(nb+1)*ldq],q1); _mm256_store_pd(&q[((nb+1)*ldq)+4],q2); h1 = _mm256_broadcast_sd(&hh[nb-3]); q1 = _mm256_load_pd(&q[(nb+2)*ldq]); q2 = _mm256_load_pd(&q[((nb+2)*ldq)+4]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); q2 = _mm256_NFMA_pd(x2, h1, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(y1, h2, q1); q2 = _mm256_NFMA_pd(y2, h2, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(z1, h3, q1); q2 = _mm256_NFMA_pd(z2, h3, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(z2, h3)); #endif _mm256_store_pd(&q[(nb+2)*ldq],q1); _mm256_store_pd(&q[((nb+2)*ldq)+4],q2); h1 = _mm256_broadcast_sd(&hh[nb-2]); q1 = _mm256_load_pd(&q[(nb+3)*ldq]); q2 = _mm256_load_pd(&q[((nb+3)*ldq)+4]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); q2 = _mm256_NFMA_pd(x2, h1, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(x2, h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(y1, h2, q1); q2 = _mm256_NFMA_pd(y2, h2, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); q2 = _mm256_sub_pd(q2, _mm256_mul_pd(y2, h2)); #endif _mm256_store_pd(&q[(nb+3)*ldq],q1); _mm256_store_pd(&q[((nb+3)*ldq)+4],q2); h1 = _mm256_broadcast_sd(&hh[nb-1]); q1 = _mm256_load_pd(&q[(nb+4)*ldq]); q2 = _mm256_load_pd(&q[((nb+4)*ldq)+4]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); q2 = _mm256_NFMA_pd(x2, h1, q2); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); q2 = _mm256_sub_pd(q2, 
_mm256_mul_pd(x2, h1)); #endif _mm256_store_pd(&q[(nb+4)*ldq],q1); _mm256_store_pd(&q[((nb+4)*ldq)+4],q2); } /** * Unrolled kernel that computes * 4 rows of Q simultaneously, a * matrix vector product with two householder * vectors + a rank 1 update is performed */ __forceinline void hh_trafo_kernel_4_AVX_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods) { ///////////////////////////////////////////////////// // Matrix Vector Multiplication, Q [8 x nb+3] * hh // hh contains four householder vectors ///////////////////////////////////////////////////// int i; __m256d a1_1 = _mm256_load_pd(&q[ldq*5]); __m256d a2_1 = _mm256_load_pd(&q[ldq*4]); __m256d a3_1 = _mm256_load_pd(&q[ldq*3]); __m256d a4_1 = _mm256_load_pd(&q[ldq*2]); __m256d a5_1 = _mm256_load_pd(&q[ldq]); __m256d a6_1 = _mm256_load_pd(&q[0]); __m256d h_6_5 = _mm256_broadcast_sd(&hh[(ldh*5)+1]); __m256d h_6_4 = _mm256_broadcast_sd(&hh[(ldh*5)+2]); __m256d h_6_3 = _mm256_broadcast_sd(&hh[(ldh*5)+3]); __m256d h_6_2 = _mm256_broadcast_sd(&hh[(ldh*5)+4]); __m256d h_6_1 = _mm256_broadcast_sd(&hh[(ldh*5)+5]); #ifdef __ELPA_USE_FMA__ register __m256d t1 = _mm256_FMA_pd(a5_1, h_6_5, a6_1); t1 = _mm256_FMA_pd(a4_1, h_6_4, t1); t1 = _mm256_FMA_pd(a3_1, h_6_3, t1); t1 = _mm256_FMA_pd(a2_1, h_6_2, t1); t1 = _mm256_FMA_pd(a1_1, h_6_1, t1); #else register __m256d t1 = _mm256_add_pd(a6_1, _mm256_mul_pd(a5_1, h_6_5)); t1 = _mm256_add_pd(t1, _mm256_mul_pd(a4_1, h_6_4)); t1 = _mm256_add_pd(t1, _mm256_mul_pd(a3_1, h_6_3)); t1 = _mm256_add_pd(t1, _mm256_mul_pd(a2_1, h_6_2)); t1 = _mm256_add_pd(t1, _mm256_mul_pd(a1_1, h_6_1)); #endif __m256d h_5_4 = _mm256_broadcast_sd(&hh[(ldh*4)+1]); __m256d h_5_3 = _mm256_broadcast_sd(&hh[(ldh*4)+2]); __m256d h_5_2 = _mm256_broadcast_sd(&hh[(ldh*4)+3]); __m256d h_5_1 = _mm256_broadcast_sd(&hh[(ldh*4)+4]); #ifdef __ELPA_USE_FMA__ register __m256d v1 = _mm256_FMA_pd(a4_1, h_5_4, a5_1); v1 = _mm256_FMA_pd(a3_1, h_5_3, v1); v1 = _mm256_FMA_pd(a2_1, h_5_2, v1); v1 = 
_mm256_FMA_pd(a1_1, h_5_1, v1); #else register __m256d v1 = _mm256_add_pd(a5_1, _mm256_mul_pd(a4_1, h_5_4)); v1 = _mm256_add_pd(v1, _mm256_mul_pd(a3_1, h_5_3)); v1 = _mm256_add_pd(v1, _mm256_mul_pd(a2_1, h_5_2)); v1 = _mm256_add_pd(v1, _mm256_mul_pd(a1_1, h_5_1)); #endif __m256d h_4_3 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); __m256d h_4_2 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); __m256d h_4_1 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); #ifdef __ELPA_USE_FMA__ register __m256d w1 = _mm256_FMA_pd(a3_1, h_4_3, a4_1); w1 = _mm256_FMA_pd(a2_1, h_4_2, w1); w1 = _mm256_FMA_pd(a1_1, h_4_1, w1); #else register __m256d w1 = _mm256_add_pd(a4_1, _mm256_mul_pd(a3_1, h_4_3)); w1 = _mm256_add_pd(w1, _mm256_mul_pd(a2_1, h_4_2)); w1 = _mm256_add_pd(w1, _mm256_mul_pd(a1_1, h_4_1)); #endif __m256d h_2_1 = _mm256_broadcast_sd(&hh[ldh+1]); __m256d h_3_2 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); __m256d h_3_1 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); #ifdef __ELPA_USE_FMA__ register __m256d z1 = _mm256_FMA_pd(a2_1, h_3_2, a3_1); z1 = _mm256_FMA_pd(a1_1, h_3_1, z1); register __m256d y1 = _mm256_FMA_pd(a1_1, h_2_1, a2_1); #else register __m256d z1 = _mm256_add_pd(a3_1, _mm256_mul_pd(a2_1, h_3_2)); z1 = _mm256_add_pd(z1, _mm256_mul_pd(a1_1, h_3_1)); register __m256d y1 = _mm256_add_pd(a2_1, _mm256_mul_pd(a1_1, h_2_1)); #endif register __m256d x1 = a1_1; __m256d q1; __m256d h1; __m256d h2; __m256d h3; __m256d h4; __m256d h5; __m256d h6; for(i = 6; i < nb; i++) { h1 = _mm256_broadcast_sd(&hh[i-5]); q1 = _mm256_load_pd(&q[i*ldq]); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+i-4]); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMA_pd(q1, h2, y1); #else y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-3]); #ifdef __ELPA_USE_FMA__ z1 = _mm256_FMA_pd(q1, h3, z1); #else z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); #endif h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i-2]); #ifdef 
__ELPA_USE_FMA__ w1 = _mm256_FMA_pd(q1, h4, w1); #else w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); #endif h5 = _mm256_broadcast_sd(&hh[(ldh*4)+i-1]); #ifdef __ELPA_USE_FMA__ v1 = _mm256_FMA_pd(q1, h5, v1); #else v1 = _mm256_add_pd(v1, _mm256_mul_pd(q1,h5)); #endif h6 = _mm256_broadcast_sd(&hh[(ldh*5)+i]); #ifdef __ELPA_USE_FMA__ t1 = _mm256_FMA_pd(q1, h6, t1); #else t1 = _mm256_add_pd(t1, _mm256_mul_pd(q1,h6)); #endif } h1 = _mm256_broadcast_sd(&hh[nb-5]); q1 = _mm256_load_pd(&q[nb*ldq]); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-4]); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMA_pd(q1, h2, y1); #else y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-3]); #ifdef __ELPA_USE_FMA__ z1 = _mm256_FMA_pd(q1, h3, z1); #else z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); #endif h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-2]); #ifdef __ELPA_USE_FMA__ w1 = _mm256_FMA_pd(q1, h4, w1); #else w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); #endif h5 = _mm256_broadcast_sd(&hh[(ldh*4)+nb-1]); #ifdef __ELPA_USE_FMA__ v1 = _mm256_FMA_pd(q1, h5, v1); #else v1 = _mm256_add_pd(v1, _mm256_mul_pd(q1,h5)); #endif h1 = _mm256_broadcast_sd(&hh[nb-4]); q1 = _mm256_load_pd(&q[(nb+1)*ldq]); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-3]); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMA_pd(q1, h2, y1); #else y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-2]); #ifdef __ELPA_USE_FMA__ z1 = _mm256_FMA_pd(q1, h3, z1); #else z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); #endif h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-1]); #ifdef __ELPA_USE_FMA__ w1 = _mm256_FMA_pd(q1, h4, w1); #else w1 = _mm256_add_pd(w1, _mm256_mul_pd(q1,h4)); #endif h1 = _mm256_broadcast_sd(&hh[nb-3]); q1 = _mm256_load_pd(&q[(nb+2)*ldq]); #ifdef 
__ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMA_pd(q1, h2, y1); #else y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); #ifdef __ELPA_USE_FMA__ z1 = _mm256_FMA_pd(q1, h3, z1); #else z1 = _mm256_add_pd(z1, _mm256_mul_pd(q1,h3)); #endif h1 = _mm256_broadcast_sd(&hh[nb-2]); q1 = _mm256_load_pd(&q[(nb+3)*ldq]); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMA_pd(q1, h2, y1); #else y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2)); #endif h1 = _mm256_broadcast_sd(&hh[nb-1]); q1 = _mm256_load_pd(&q[(nb+4)*ldq]); #ifdef __ELPA_USE_FMA__ x1 = _mm256_FMA_pd(q1, h1, x1); #else x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1)); #endif ///////////////////////////////////////////////////// // Apply tau, correct wrong calculation using pre-calculated scalar products ///////////////////////////////////////////////////// __m256d tau1 = _mm256_broadcast_sd(&hh[0]); x1 = _mm256_mul_pd(x1, tau1); __m256d tau2 = _mm256_broadcast_sd(&hh[ldh]); __m256d vs_1_2 = _mm256_broadcast_sd(&scalarprods[0]); h2 = _mm256_mul_pd(tau2, vs_1_2); #ifdef __ELPA_USE_FMA__ y1 = _mm256_FMSUB_pd(y1, tau2, _mm256_mul_pd(x1,h2)); #else y1 = _mm256_sub_pd(_mm256_mul_pd(y1,tau2), _mm256_mul_pd(x1,h2)); #endif __m256d tau3 = _mm256_broadcast_sd(&hh[ldh*2]); __m256d vs_1_3 = _mm256_broadcast_sd(&scalarprods[1]); __m256d vs_2_3 = _mm256_broadcast_sd(&scalarprods[2]); h2 = _mm256_mul_pd(tau3, vs_1_3); h3 = _mm256_mul_pd(tau3, vs_2_3); #ifdef __ELPA_USE_FMA__ z1 = _mm256_FMSUB_pd(z1, tau3, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))); #else z1 = _mm256_sub_pd(_mm256_mul_pd(z1,tau3), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))); #endif __m256d tau4 = _mm256_broadcast_sd(&hh[ldh*3]); 
__m256d vs_1_4 = _mm256_broadcast_sd(&scalarprods[3]); __m256d vs_2_4 = _mm256_broadcast_sd(&scalarprods[4]); h2 = _mm256_mul_pd(tau4, vs_1_4); h3 = _mm256_mul_pd(tau4, vs_2_4); __m256d vs_3_4 = _mm256_broadcast_sd(&scalarprods[5]); h4 = _mm256_mul_pd(tau4, vs_3_4); #ifdef __ELPA_USE_FMA__ w1 = _mm256_FMSUB_pd(w1, tau4, _mm256_FMA_pd(z1, h4, _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); #else w1 = _mm256_sub_pd(_mm256_mul_pd(w1,tau4), _mm256_add_pd(_mm256_mul_pd(z1,h4), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); #endif __m256d tau5 = _mm256_broadcast_sd(&hh[ldh*4]); __m256d vs_1_5 = _mm256_broadcast_sd(&scalarprods[6]); __m256d vs_2_5 = _mm256_broadcast_sd(&scalarprods[7]); h2 = _mm256_mul_pd(tau5, vs_1_5); h3 = _mm256_mul_pd(tau5, vs_2_5); __m256d vs_3_5 = _mm256_broadcast_sd(&scalarprods[8]); __m256d vs_4_5 = _mm256_broadcast_sd(&scalarprods[9]); h4 = _mm256_mul_pd(tau5, vs_3_5); h5 = _mm256_mul_pd(tau5, vs_4_5); #ifdef __ELPA_USE_FMA__ v1 = _mm256_FMSUB_pd(v1, tau5, _mm256_add_pd(_mm256_FMA_pd(w1, h5, _mm256_mul_pd(z1,h4)), _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2)))); #else v1 = _mm256_sub_pd(_mm256_mul_pd(v1,tau5), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1,h5), _mm256_mul_pd(z1,h4)), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2)))); #endif __m256d tau6 = _mm256_broadcast_sd(&hh[ldh*5]); __m256d vs_1_6 = _mm256_broadcast_sd(&scalarprods[10]); __m256d vs_2_6 = _mm256_broadcast_sd(&scalarprods[11]); h2 = _mm256_mul_pd(tau6, vs_1_6); h3 = _mm256_mul_pd(tau6, vs_2_6); __m256d vs_3_6 = _mm256_broadcast_sd(&scalarprods[12]); __m256d vs_4_6 = _mm256_broadcast_sd(&scalarprods[13]); __m256d vs_5_6 = _mm256_broadcast_sd(&scalarprods[14]); h4 = _mm256_mul_pd(tau6, vs_3_6); h5 = _mm256_mul_pd(tau6, vs_4_6); h6 = _mm256_mul_pd(tau6, vs_5_6); #ifdef __ELPA_USE_FMA__ t1 = _mm256_FMSUB_pd(t1, tau6, _mm256_FMA_pd(v1, h6, _mm256_add_pd(_mm256_FMA_pd(w1, h5, _mm256_mul_pd(z1,h4)), _mm256_FMA_pd(y1, h3, _mm256_mul_pd(x1,h2))))); #else t1 = 
_mm256_sub_pd(_mm256_mul_pd(t1,tau6), _mm256_add_pd( _mm256_mul_pd(v1,h6), _mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(w1,h5), _mm256_mul_pd(z1,h4)), _mm256_add_pd(_mm256_mul_pd(y1,h3), _mm256_mul_pd(x1,h2))))); #endif ///////////////////////////////////////////////////// // Rank-1 update of Q [4 x nb+3] ///////////////////////////////////////////////////// q1 = _mm256_load_pd(&q[0]); q1 = _mm256_sub_pd(q1, t1); _mm256_store_pd(&q[0],q1); h6 = _mm256_broadcast_sd(&hh[(ldh*5)+1]); q1 = _mm256_load_pd(&q[ldq]); q1 = _mm256_sub_pd(q1, v1); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(t1, h6, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); #endif _mm256_store_pd(&q[ldq],q1); h5 = _mm256_broadcast_sd(&hh[(ldh*4)+1]); q1 = _mm256_load_pd(&q[ldq*2]); q1 = _mm256_sub_pd(q1, w1); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(v1, h5, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); #endif h6 = _mm256_broadcast_sd(&hh[(ldh*5)+2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(t1, h6, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); #endif _mm256_store_pd(&q[ldq*2],q1); h4 = _mm256_broadcast_sd(&hh[(ldh*3)+1]); q1 = _mm256_load_pd(&q[ldq*3]); q1 = _mm256_sub_pd(q1, z1); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(w1, h4, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); #endif h5 = _mm256_broadcast_sd(&hh[(ldh*4)+2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(v1, h5, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); #endif h6 = _mm256_broadcast_sd(&hh[(ldh*5)+3]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(t1, h6, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); #endif _mm256_store_pd(&q[ldq*3],q1); h3 = _mm256_broadcast_sd(&hh[(ldh*2)+1]); q1 = _mm256_load_pd(&q[ldq*4]); q1 = _mm256_sub_pd(q1, y1); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(z1, h3, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); #endif h4 = _mm256_broadcast_sd(&hh[(ldh*3)+2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(w1, h4, q1); #else q1 = 
_mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); #endif h5 = _mm256_broadcast_sd(&hh[(ldh*4)+3]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(v1, h5, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); #endif h6 = _mm256_broadcast_sd(&hh[(ldh*5)+4]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(t1, h6, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); #endif _mm256_store_pd(&q[ldq*4],q1); h2 = _mm256_broadcast_sd(&hh[(ldh)+1]); q1 = _mm256_load_pd(&q[ldq*5]); q1 = _mm256_sub_pd(q1, x1); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(y1, h2, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(z1, h3, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); #endif h4 = _mm256_broadcast_sd(&hh[(ldh*3)+3]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(w1, h4, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); #endif h5 = _mm256_broadcast_sd(&hh[(ldh*4)+4]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(v1, h5, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); #endif h6 = _mm256_broadcast_sd(&hh[(ldh*5)+5]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(t1, h6, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); #endif _mm256_store_pd(&q[ldq*5],q1); for (i = 6; i < nb; i++) { q1 = _mm256_load_pd(&q[i*ldq]); h1 = _mm256_broadcast_sd(&hh[i-5]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+i-4]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(y1, h2, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+i-3]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(z1, h3, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); #endif h4 = _mm256_broadcast_sd(&hh[(ldh*3)+i-2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(w1, h4, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); #endif h5 = 
_mm256_broadcast_sd(&hh[(ldh*4)+i-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(v1, h5, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); #endif h6 = _mm256_broadcast_sd(&hh[(ldh*5)+i]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(t1, h6, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(t1, h6)); #endif _mm256_store_pd(&q[i*ldq],q1); } h1 = _mm256_broadcast_sd(&hh[nb-5]); q1 = _mm256_load_pd(&q[nb*ldq]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-4]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(y1, h2, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-3]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(z1, h3, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); #endif h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(w1, h4, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); #endif h5 = _mm256_broadcast_sd(&hh[(ldh*4)+nb-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(v1, h5, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(v1, h5)); #endif _mm256_store_pd(&q[nb*ldq],q1); h1 = _mm256_broadcast_sd(&hh[nb-4]); q1 = _mm256_load_pd(&q[(nb+1)*ldq]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-3]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(y1, h2, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(z1, h3, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); #endif h4 = _mm256_broadcast_sd(&hh[(ldh*3)+nb-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(w1, h4, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(w1, h4)); #endif _mm256_store_pd(&q[(nb+1)*ldq],q1); h1 = _mm256_broadcast_sd(&hh[nb-3]); q1 = _mm256_load_pd(&q[(nb+2)*ldq]); 
#ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-2]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(y1, h2, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); #endif h3 = _mm256_broadcast_sd(&hh[(ldh*2)+nb-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(z1, h3, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(z1, h3)); #endif _mm256_store_pd(&q[(nb+2)*ldq],q1); h1 = _mm256_broadcast_sd(&hh[nb-2]); q1 = _mm256_load_pd(&q[(nb+3)*ldq]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); #endif h2 = _mm256_broadcast_sd(&hh[ldh+nb-1]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(y1, h2, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(y1, h2)); #endif _mm256_store_pd(&q[(nb+3)*ldq],q1); h1 = _mm256_broadcast_sd(&hh[nb-1]); q1 = _mm256_load_pd(&q[(nb+4)*ldq]); #ifdef __ELPA_USE_FMA__ q1 = _mm256_NFMA_pd(x1, h1, q1); #else q1 = _mm256_sub_pd(q1, _mm256_mul_pd(x1, h1)); #endif _mm256_store_pd(&q[(nb+4)*ldq],q1); } elpa-2016.05.001/src/mod_compute_hh_trafo_real.F900000644000312500001440000006241212717516040016271 00000000000000! This file is part of ELPA. ! ! The ELPA library was originally created by the ELPA consortium, ! consisting of the following organizations: ! ! - Max Planck Computing and Data Facility (MPCDF), formerly known as ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte ! Informatik, ! - Technische Universität München, Lehrstuhl für Informatik mit ! Schwerpunkt Wissenschaftliches Rechnen , ! - Fritz-Haber-Institut, Berlin, Abt. Theorie, ! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, ! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, ! and ! - IBM Deutschland GmbH ! ! ! More information can be found here: ! http://elpa.mpcdf.mpg.de/ ! ! 
ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! Author: Andreas Marek, MPCDF module compute_hh_trafo_real #include "config-f90.h" use elpa_mpi implicit none #ifdef WITH_OPENMP public compute_hh_trafo_real_cpu_openmp #else public compute_hh_trafo_real_cpu #endif contains #ifdef WITH_OPENMP subroutine compute_hh_trafo_real_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & off, ncols, istripe, & my_thread, thread_width, THIS_REAL_ELPA_KERNEL) #else subroutine compute_hh_trafo_real_cpu (a, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & off, ncols, istripe, last_stripe_width, & THIS_REAL_ELPA_KERNEL) #endif use precision use elpa2_utilities use single_hh_trafo_real #if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL) use real_generic_simple_kernel, only : double_hh_trafo_generic_simple #endif #if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(DESPERATELY_WANT_ASSUMED_SIZE)) use real_generic_kernel, only : double_hh_trafo_generic #endif 
#if defined(WITH_REAL_BGP_KERNEL) use real_bgp_kernel, only : double_hh_trafo_bgp #endif #if defined(WITH_REAL_BGQ_KERNEL) use real_bgq_kernel, only : double_hh_trafo_bgq #endif #ifdef HAVE_DETAILED_TIMINGS use timings #endif #if defined(HAVE_AVX) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY) use kernel_interfaces #endif implicit none real(kind=rk), intent(inout) :: kernel_time integer(kind=lik) :: kernel_flops integer(kind=ik), intent(in) :: nbw, max_blk_size real(kind=rk) :: bcast_buffer(nbw,max_blk_size) integer(kind=ik), intent(in) :: a_off integer(kind=ik), intent(in) :: stripe_width,a_dim2,stripe_count #ifndef WITH_OPENMP integer(kind=ik), intent(in) :: last_stripe_width real(kind=rk) :: a(stripe_width,a_dim2,stripe_count) #else integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width real(kind=rk) :: a(stripe_width,a_dim2,stripe_count,max_threads) #endif integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL ! Private variables in OMP regions (my_thread) should better be in the argument list! integer(kind=ik) :: off, ncols, istripe #ifdef WITH_OPENMP integer(kind=ik) :: my_thread, noff #endif integer(kind=ik) :: j, nl, jj, jjj real(kind=rk) :: w(nbw,6), ttt #ifdef HAVE_DETAILED_TIMINGS #ifdef WITH_OPENMP call timer%start("compute_hh_trafo_real_cpu_openmp") #else call timer%start("compute_hh_trafo_real_cpu") #endif #endif ttt = mpi_wtime() #ifndef WITH_OPENMP nl = merge(stripe_width, last_stripe_width, istripe. #ifdef HAVE_CONFIG_H #include "config-f90.h" #endif !> \mainpage Ftimings !> !> An almost pure-fortran attempt to play with tree structures, which evolved !> into the timing library used e.g. by the VERTEX supernova code. !> !> All you need to know is contained in the \ref ftimings::timer_t derived type. module ftimings use ftimings_type use ftimings_value use, intrinsic :: iso_fortran_env, only : error_unit, output_unit implicit none save private ! this is mainly needed for Doxygen, they are ! 
by implicitly reachable as type-bound procedures
  ! of timer_t, however Doxygen does not document them
  ! if they are not also public
  public timer_start, timer_stop, timer_free, timer_print, &
      timer_enable, timer_disable, timer_is_enabled, &
      timer_in_entries, timer_get, timer_since, timer_sort, &
      timer_set_print_options, &
      timer_measure_flops, &
      timer_measure_allocated_memory, &
      timer_measure_virtual_memory, &
      timer_measure_max_allocated_memory, &
      timer_measure_memory_bandwidth

  ! Names of the "(own)" and "(below threshold)" entries that the
  ! documentation of timer_print refers to
  character(len=name_length), private, parameter :: own = "(own)"
  character(len=name_length), private, parameter :: below = "(below threshold)"

  !> Type for a timer instance.
  !>
  !> Typical usage:
  !> \code{.f90}
  !>   type(timer_t) :: timer
  !>
  !>   call timer%enable()
  !>
  !>   call timer%start("section")
  !>     ...
  !>     call timer%start("subsection")
  !>       ...
  !>     call timer%stop("subsection")
  !>     ...
  !>   call timer%stop("section")
  !>
  !>   call timer%print()
  !> \endcode
  !>
  !> Every first call to timer%start() at a certain point in the graph
  !> allocates a small amount of memory. If the timer is no longer needed,
  !> all that memory can be freed again with
  !>
  !> \code{.f90}
  !>   call timer%free()
  !> \endcode
  type, public :: timer_t
    logical, private :: active = .false. !< If set to .false., most operations return immediately without any action
    logical, private :: record_allocated_memory = .false. !< If set to .true., record also the current resident set size
    logical, private :: record_virtual_memory = .false. !< If set to .true., record also the virtual memory
    logical, private :: record_max_allocated_memory = .false. !< If set to .true., record also the max resident set size ("high water mark")
    logical, private :: record_flop_counts = .false. !< If set to .true., record also FLOP counts via PAPI calls
    logical, private :: record_memory_bandwidth = .false. !< If set to .true., record also the memory bandwidth via PAPI calls

    ! Output selection, changed through timer_set_print_options()
    logical, private :: print_allocated_memory = .false.
    logical, private :: print_max_allocated_memory = .false.
    logical, private :: print_virtual_memory = .false.
    logical, private :: print_flop_count = .false.
    logical, private :: print_flop_rate = .false.
    logical, private :: print_ldst = .false.
    logical, private :: print_memory_bandwidth = .false.
    logical, private :: print_ai = .false.
    integer, private :: bytes_per_ldst = 8 !< Bytes assumed per load or store when calculating the AI

    type(node_t), private, pointer :: root => NULL() !< Start of graph
    type(node_t), private, pointer :: current_node => NULL() !< Current position in the graph
    contains
      procedure, pass :: start => timer_start
      procedure, pass :: stop => timer_stop
      procedure, pass :: free => timer_free
      procedure, pass :: print => timer_print
      procedure, pass :: enable => timer_enable
      procedure, pass :: disable => timer_disable
      procedure, pass :: is_enabled => timer_is_enabled
      procedure, pass :: measure_flops => timer_measure_flops
      procedure, pass :: measure_allocated_memory => timer_measure_allocated_memory
      procedure, pass :: measure_virtual_memory => timer_measure_virtual_memory
      procedure, pass :: measure_max_allocated_memory => timer_measure_max_allocated_memory
      procedure, pass :: measure_memory_bandwidth => timer_measure_memory_bandwidth
      procedure, pass :: set_print_options => timer_set_print_options
      procedure, pass :: in_entries => timer_in_entries
      procedure, pass :: get => timer_get
      procedure, pass :: since => timer_since
      procedure, pass :: sort => timer_sort
  end type

  ! Private type node_t, representing a graph node
  !
  type :: node_t
    character(len=name_length) :: name ! Descriptive name, used when printing the timings
    integer :: count = 0 ! Number of node_stop calls
    type(value_t) :: value ! The actual counter data, see ftimings_value.F90
    logical :: is_running = .false. ! .true. if still running
    ! Links to the neighbouring nodes of the graph; timer_start() nullifies
    ! all four of them on a freshly allocated root node
    type(node_t), pointer :: firstChild => NULL()
    type(node_t), pointer :: lastChild => NULL()
    type(node_t), pointer :: parent => NULL()
    type(node_t), pointer :: nextSibling => NULL()
    ! Timer instance this node belongs to (timer_start() sets it for the root node)
    class(timer_t), pointer :: timer
    contains
      procedure, pass :: now => node_now
      procedure, pass :: start => node_start
      procedure, pass :: stop => node_stop
      procedure, pass :: get_value => node_get_value
      procedure, pass :: new_child => node_new_child
      procedure, pass :: get_child => node_get_child
      procedure, pass :: sum_of_children => node_sum_of_children
      procedure, pass :: sum_of_children_with_name => node_sum_of_children_with_name
      procedure, pass :: sum_of_children_below => node_sum_of_children_below
      procedure, pass :: print => node_print
      procedure, pass :: print_graph => node_print_graph
      procedure, pass :: sort_children => node_sort_children
  end type

  ! Explicit interfaces of the C routines (bind(C)) this module calls
  interface
    function microseconds_since_epoch() result(us) bind(C, name="ftimings_microseconds_since_epoch")
      use, intrinsic :: iso_c_binding
      implicit none
      integer(kind=C_INT64_T) :: us
    end function
  end interface

  ! The PAPI related routines are only declared if PAPI support was configured
#ifdef HAVE_LIBPAPI
  interface
    function flop_init() result(ret) bind(C, name="ftimings_flop_init")
      use, intrinsic :: iso_c_binding
      implicit none
      integer(kind=C_INT) :: ret
    end function
  end interface

  interface
    function loads_stores_init() result(ret) bind(C, name="ftimings_loads_stores_init")
      use, intrinsic :: iso_c_binding
      implicit none
      integer(kind=C_INT) :: ret
    end function
  end interface

  interface
    subroutine papi_counters(flops, ldst) bind(C, name="ftimings_papi_counters")
      use, intrinsic :: iso_c_binding
      implicit none
      integer(kind=C_LONG_LONG), intent(out) :: flops, ldst
    end subroutine
  end interface
#endif

  interface
    function resident_set_size() result(rsssize) bind(C, name="ftimings_resident_set_size")
      use, intrinsic :: iso_c_binding
      implicit none
      integer(kind=C_LONG) :: rsssize
    end function
  end interface

  interface
    function virtual_memory() result(virtualmem) bind(C, name="ftimings_virtual_memory")
      use, intrinsic ::
iso_c_binding
      implicit none
      integer(kind=C_LONG) :: virtualmem
    end function
  end interface

  interface
    function max_resident_set_size() result(maxrsssize) bind(C, name="ftimings_highwater_mark")
      use, intrinsic :: iso_c_binding
      implicit none
      integer(kind=C_LONG) :: maxrsssize
    end function
  end interface

  contains

  !> Activate the timer, without this, most methods are no-ops.
  !>
  subroutine timer_enable(self)
    class(timer_t), intent(inout), target :: self

    self%active = .true.
  end subroutine

  !> Call with enabled = .true. to also record amount of newly allocated memory.
  !> By default, memory usage is not recorded. Call with .false. to deactivate again.
  !>
  !> This opens /proc/self/statm, parses it, and closes it again and is thus
  !> quite costly, use when appropriate.
  !>
  subroutine timer_measure_allocated_memory(self, enabled)
    class(timer_t), intent(inout) :: self
    logical, intent(in) :: enabled

    self%record_allocated_memory = enabled
  end subroutine

  !> Call with enabled = .true. to also record amount of newly created virtual memory.
  !> By default, memory usage is not recorded. Call with .false. to deactivate again.
  !>
  !> This opens /proc/self/statm, parses it, and closes it again and is thus
  !> quite costly, use when appropriate.
  !>
  subroutine timer_measure_virtual_memory(self, enabled)
    class(timer_t), intent(inout) :: self
    logical, intent(in) :: enabled

    self%record_virtual_memory = enabled
  end subroutine

  !> Call with enabled = .true. to also record the increase of the max.
  !> resident memory
  !> By default, memory usage is not recorded. Call with .false. to deactivate again.
  !>
  !> This opens /proc/self/status, parses it, and closes it again and is thus
  !> quite costly, use when appropriate.
  !>
  subroutine timer_measure_max_allocated_memory(self, enabled)
    class(timer_t), intent(inout) :: self
    logical, intent(in) :: enabled

    self%record_max_allocated_memory = enabled
  end subroutine

  !> Call with enabled = .true. to also record the memory bandwidth with PAPI
  !> By default, this is not recorded. Call with .false. to deactivate again.
  !>
  !> If PAPI support was not compiled in, or loads_stores_init() does not
  !> return 1, a message is written to unit 0 and the recording stays disabled.
  !>
  subroutine timer_measure_memory_bandwidth(self, enabled)
    class(timer_t), intent(inout) :: self
    logical, intent(in) :: enabled

    if (enabled) then
#ifdef HAVE_LIBPAPI
      if (loads_stores_init() == 1) then
        self%record_memory_bandwidth = .true.
      else
        write(0,'(a)') "ftimings: Could not initialize PAPI, disabling memory bandwidth counter"
        self%record_memory_bandwidth = .false.
      endif
#else
      write(0,'(a)') "ftimings: not compiled with PAPI support, disabling memory bandwidth counter"
      self%record_memory_bandwidth = .false.
#endif
    else
      ! explicitly set to .false. by caller
      self%record_memory_bandwidth = .false.
    endif
  end subroutine

  !> Call with enabled = .true. to also record FLOP counts via PAPI calls.
  !> By default no FLOPS are recorded. Call with .false. to deactivate again.
  !>
  !> If PAPI support was not compiled in, or flop_init() does not return 1,
  !> a message is written to unit 0 and the recording stays disabled.
  !>
  subroutine timer_measure_flops(self, enabled)
    class(timer_t), intent(inout) :: self
    logical, intent(in) :: enabled

    if (enabled) then
#ifdef HAVE_LIBPAPI
      if (flop_init() == 1) then
        self%record_flop_counts = .true.
      else
        write(0,'(a)') "ftimings: Could not initialize PAPI, disabling FLOP counter"
        self%record_flop_counts = .false.
      endif
#else
      write(0,'(a)') "ftimings: not compiled with PAPI support, disabling FLOP counter"
      self%record_flop_counts = .false.
#endif
    else
      ! Explicitly set to .false. by caller
      self%record_flop_counts = .false.
    endif
  end subroutine

  !> Deactivate the timer
  !>
  subroutine timer_disable(self)
    class(timer_t), intent(inout), target :: self

    self%active = .false.
end subroutine

  !> Return whether the timer is currently enabled, that is whether
  !> timer_enable() was called without a later timer_disable()
  !>
  function timer_is_enabled(self) result(is)
    class(timer_t), intent(inout), target :: self
    logical :: is

    is = self%active
  end function

  !> Control what to print on following %print calls
  !>
  !> Only the options that are actually passed are changed. Switching on the
  !> output of a quantity whose recording is disabled writes a warning to
  !> unit 0, the option is set nevertheless.
  !>
  !> \param print_allocated_memory       Amount of newly allocated,
  !>                                     resident memory
  !> \param print_virtual_memory         Amount of newly created virtual
  !>                                     memory
  !> \param print_max_allocated_memory   Amount of new increase of max.
  !>                                     resident memory ("high water mark")
  !> \param print_flop_count             Number of floating point operations
  !> \param print_flop_rate              Rate of floating point operations per second
  !> \param print_ldst                   Number of loads+stores
  !> \param print_memory_bandwidth       Rate of loads+stores per second
  !> \param print_ai                     Arithmetic intensity, that is number of
  !>                                     floating point operations per
  !>                                     number of load and store
  !>                                     operations (currently untested)
  !> \param bytes_per_ldst               For calculating the AI, assume this number
  !>                                     of bytes per load or store (default: 8)
  subroutine timer_set_print_options(self, &
        print_allocated_memory, &
        print_virtual_memory, &
        print_max_allocated_memory, &
        print_flop_count, &
        print_flop_rate, &
        print_ldst, &
        print_memory_bandwidth, &
        print_ai, &
        bytes_per_ldst)
    class(timer_t), intent(inout) :: self
    logical, intent(in), optional :: &
        print_allocated_memory, &
        print_virtual_memory, &
        print_max_allocated_memory, &
        print_flop_count, &
        print_flop_rate, &
        print_ldst, &
        print_memory_bandwidth, &
        print_ai
    integer, intent(in), optional :: bytes_per_ldst

    if (present(print_allocated_memory)) then
      self%print_allocated_memory = print_allocated_memory
      if ((.not. self%record_allocated_memory) .and. self%print_allocated_memory) then
        write(0,'(a)') "ftimings: Warning: RSS size recording was disabled, expect zeros!"
      endif
    endif

    if (present(print_virtual_memory)) then
      self%print_virtual_memory = print_virtual_memory
      if ((.not. self%record_virtual_memory) .and. self%print_virtual_memory) then
        write(0,'(a)') "ftimings: Warning: Virtual memory recording was disabled, expect zeros!"
      endif
    endif

    if (present(print_max_allocated_memory)) then
      self%print_max_allocated_memory = print_max_allocated_memory
      if ((.not. self%record_max_allocated_memory) .and. self%print_max_allocated_memory) then
        write(0,'(a)') "ftimings: Warning: HWM recording was disabled, expect zeros!"
      endif
    endif

    if (present(print_flop_count)) then
      self%print_flop_count = print_flop_count
      if ((.not. self%record_flop_counts) .and. self%print_flop_count) then
        write(0,'(a)') "ftimings: Warning: FLOP counter was disabled, expect zeros!"
      endif
    endif

    if (present(print_flop_rate)) then
      self%print_flop_rate = print_flop_rate
      if ((.not. self%record_flop_counts) .and. self%print_flop_rate) then
        write(0,'(a)') "ftimings: Warning: FLOP counter was disabled, expect zeros!"
      endif
    endif

    if (present(print_ldst)) then
      self%print_ldst = print_ldst
      if ((.not. self%record_memory_bandwidth) .and. self%print_ldst) then
        write(0,'(a)') "ftimings: Warning: Load+Store counters were disabled, expect zeros!"
      endif
    endif

    if (present(print_memory_bandwidth)) then
      self%print_memory_bandwidth = print_memory_bandwidth
      if ((.not. self%record_memory_bandwidth) .and. self%print_memory_bandwidth) then
        write(0,'(a)') "ftimings: Warning: Load+Store counters were disabled, expect zeros for memory bandwidth!"
      endif
    endif

    if (present(print_ai)) then
      self%print_ai = print_ai
      ! As for all other options, only warn if the output is really switched
      ! on; print_ai = .false. must not produce a warning
      if (self%print_ai .and. .not. (self%record_memory_bandwidth .and. self%record_flop_counts)) then
        write(0,'(a)') "ftimings: Warning: Memory bandwidth or FLOP counters were disabled, expect invalid values for AI"
      endif
    endif

    if (present(bytes_per_ldst)) then
      self%bytes_per_ldst = bytes_per_ldst
    endif
  end subroutine

  !> Start a timing section
  !>
  !> \param name    A descriptive name
  !> \param replace If .true. (default .false.), replace any entries at the
  !>                current position with the same name.
If .false., add the !> time to a possibly existing entry !> !> Care must be taken to balance any invocations of %start() and %stop(), e.g. !> the following is valid !> !> \code{.f90} !> call timer%start("A") !> call timer%start("B") !> call timer%stop("B") !> call timer%stop("A") !> \endcode !> !> while the following is not !> !> \code{.f90} !> call timer%start("A") !> call timer%start("B") !> call timer%stop("A") !> call timer%stop("B") !> \endcode !> subroutine timer_start(self, name, replace) class(timer_t), intent(inout), target :: self character(len=*), intent(in) :: name logical, intent(in), optional :: replace type(node_t), pointer :: node !$ integer :: omp_get_thread_num, omp_get_num_threads, omp_get_level, omp_get_ancestor_thread_num !$ integer :: i if (.not. self%active) then return endif ! Deal with nested parallelization !$ do i = 0, omp_get_level() !$ if (omp_get_ancestor_thread_num(i) > 0) then !$ return !$ endif !$ end do !$omp master if (.not. associated(self%current_node)) then ! First call to timer_start() allocate(self%root) self%root%name = "[Root]" self%root%timer => self call self%root%start() nullify(self%root%firstChild) nullify(self%root%lastChild) nullify(self%root%parent) nullify(self%root%nextSibling) self%current_node => self%root endif if (string_eq(self%current_node%name, name)) then !$omp critical write(error_unit,*) "Recursion error! Printing tree so far.." write(error_unit,*) "Got %start(""" // trim(name) // """), while %start(""" // trim(name) // """) was still active" !$ write(*,*) "omp_get_thread_num() = ", omp_get_thread_num() !$ write(*,*) "omp_get_num_threads() = ", omp_get_num_threads() !$ write(*,*) "omp_get_level() = ", omp_get_level() !$ do i = 0, omp_get_level() !$ write(*,*) "omp_get_ancestor_thread_num(", i, ") = ", omp_get_ancestor_thread_num(i) !$ end do call self%root%print_graph(0) !$omp end critical stop "timer_start() while same timer was active" endif node => self%current_node%get_child(name) if (.not. 
associated(node)) then
      node => self%current_node%new_child(name)
    else if (present(replace)) then
      if (replace) then
        ! Start over with an empty entry, dropping all of its sub-entries
        node%value = null_value
        node%count = 0
        if (associated(node%firstChild)) then
          call deallocate_node(node%firstChild)
          nullify(node%firstChild)
          nullify(node%lastChild)
        endif
      endif
    endif

    call node%start()

    self%current_node => node

    !$omp end master

  end subroutine

  !> End a timing segment, \sa timer_start
  !>
  !> \param name The exact same name as was used for %start().
  !>             If not provided, close the currently active region.
  !>             If given, warns if it does not match the last %start()
  !>             call on stderr and disables the current timer instance.
  !>
  !> The timer instance is also freed and disabled if there is no active
  !> region at all, or if the region to be closed has no parent node.
  !>
  subroutine timer_stop(self, name)
    class(timer_t), intent(inout), target :: self
    character(len=*), intent(in), optional :: name
    logical :: error
    !$ integer :: omp_get_level, omp_get_ancestor_thread_num
    !$ integer :: i

    if (.not. self%active) then
      return
    endif

    ! Deal with nested parallelization
    !$ do i = 0, omp_get_level()
    !$   if (omp_get_ancestor_thread_num(i) > 0) then
    !$     return
    !$   endif
    !$ end do

    !$omp master
    error = .false.

    if (.not. associated(self%current_node)) then
      write(error_unit,'(a)') "Called timer_stop() without first calling any timer_start(), disabling timings"
      call self%free()
      self%active = .false.
      error = .true.
    else if (present(name)) then
      if (.not. string_eq(self%current_node%name, name)) then
        write(error_unit,'(a)') "Expected %stop(""" // trim(self%current_node%name) // """),&
            & but got %stop(""" // trim(name) // """), disabling timings"
        call self%free()
        self%active = .false.
        error = .true.
      endif
    endif

    if (.not. error) then
      call self%current_node%stop()

      ! climb up to parent
      if (associated(self%current_node%parent)) then
        self%current_node => self%current_node%parent
      else
        write(error_unit,'(a)') "Error: No valid parent node found for node '" // trim(self%current_node%name) // "'"
        ! timer_free() nullifies self%current_node, so it must not be
        ! dereferenced any more after this point
        call self%free()
        self%active = .false.
      endif
    endif
    !$omp end master

  end subroutine

  !> Deallocate all objects associated with (but not including) self
  !>
  subroutine timer_free(self)
    class(timer_t), intent(inout), target :: self

    if (associated(self%root)) then
      call deallocate_node(self%root)
    endif
    nullify(self%root)
    nullify(self%current_node)
  end subroutine

  !> Print a timing graph
  !>
  !> \param name1      If given, first descend one level to the node with name name1
  !> \param name2      If given, also descend another level to the node with name2 there
  !> \param name3      etc.
  !> \param name4      etc.
  !> \param threshold  If given, subsume any entries with a value of threshold
  !>                   seconds in a single node "(below threshold)"
  !> \param is_sorted  Assume a sorted graph for inserting "(own)" and "(below threshold)"
  !> \param unit       The unit number on which to print, default stdout
  !>
  subroutine timer_print(self, name1, name2, name3, name4, threshold, is_sorted, unit)
    class(timer_t), intent(in), target :: self
    character(len=*), intent(in), optional :: name1, name2, name3, name4
    real(kind=rk), intent(in), optional :: threshold
    logical, intent(in), optional :: is_sorted
    integer, intent(in), optional :: unit

    integer :: unit_act

    type(node_t), pointer :: node
    character(len=64) :: format_spec

    ! I hate fortran's string handling
    character(len=name_length), parameter :: group = "Group"
    character(len=12), parameter :: seconds = " [s]"
    character(len=12), parameter :: fract = " fraction"
    character(len=12), parameter :: ram = " alloc. RAM"
    character(len=12), parameter :: vmem = " alloc. VM"
    character(len=12), parameter :: hwm = " alloc. HWM"
    character(len=12), parameter :: flop_rate = " Mflop/s"
    character(len=12), parameter :: flop_count = " Mflop"
    character(len=12), parameter :: ldst = "loads+stores"
    character(len=12), parameter :: bandwidth = " mem bandw."
    character(len=12), parameter :: ai = "arithm. Int."
    character(len=12), parameter :: dash = "============"

    if (.not.
self%active) then
      return
    endif

    if (present(unit)) then
      unit_act = unit
    else
      unit_act = output_unit
    endif

    node => self%root
    if (present(name1)) then
      node => node%get_child(name1)
      if (.not. associated(node)) then
        write(error_unit,'(a)') "Could not descend to """ // trim(name1) // """"
        return
      endif
    end if
    if (present(name2)) then
      node => node%get_child(name2)
      if (.not. associated(node)) then
        write(error_unit,'(a)') "Could not descend to """ // trim(name2) // """"
        return
      endif
    end if
    if (present(name3)) then
      node => node%get_child(name3)
      if (.not. associated(node)) then
        write(error_unit,'(a)') "Could not descend to """ // trim(name3) // """"
        return
      endif
    end if
    if (present(name4)) then
      node => node%get_child(name4)
      if (.not. associated(node)) then
        write(error_unit,'(a)') "Could not descend to """ // trim(name4) // """"
        return
      endif
    end if

    ! I really do hate it ..
    ! First header line: one column title per selected quantity
    write(format_spec,'("("" /= "",a",i0,",2x,a12,1x,a12)")') name_length
    write(unit_act, format_spec, advance='no') adjustl(group), seconds, fract

    if (self%print_allocated_memory) then
      write(unit_act,'(1x,a12)',advance='no') ram
    endif

    if (self%print_virtual_memory) then
      write(unit_act,'(1x,a12)',advance='no') vmem
    endif

    if (self%print_max_allocated_memory) then
      write(unit_act,'(1x,a12)',advance='no') hwm
    endif

    if (self%print_flop_count) then
      write(unit_act,'(1x,a12)',advance='no') flop_count
    endif
    if (self%print_flop_rate) then
      write(unit_act,'(1x,a12)',advance='no') flop_rate
    endif
    if (self%print_ldst) then
      write(unit_act,'(1x,a12)',advance='no') ldst
    endif
    if (self%print_memory_bandwidth) then
      write(unit_act,'(1x,a12)',advance='no') bandwidth
    endif
    if (self%print_ai) then
      write(unit_act,'(1x,a12)',advance='no') ai
    endif

    write(unit_act,'(a)') ""

    ! Second header line: a ruler below every column title
    write(format_spec,'("("" | "",a",i0,",1x,2(1x,a12))")') name_length
    write(unit_act, format_spec, advance='no') "", dash, dash

    if (self%print_allocated_memory) then
      write(unit_act,'(1x,a12)',advance='no') dash
    endif

    if (self%print_virtual_memory) then
      write(unit_act,'(1x,a12)',advance='no') dash
    endif

    if (self%print_max_allocated_memory) then
      write(unit_act,'(1x,a12)',advance='no') dash
    endif

    if (self%print_flop_count) then
      write(unit_act,'(1x,a12)',advance='no') dash
    endif
    if (self%print_flop_rate) then
      write(unit_act,'(1x,a12)',advance='no') dash
    endif
    if (self%print_ldst) then
      write(unit_act,'(1x,a12)',advance='no') dash
    endif
    if (self%print_memory_bandwidth) then
      write(unit_act,'(1x,a12)',advance='no') dash
    endif
    if (self%print_ai) then
      write(unit_act,'(1x,a12)',advance='no') dash
    endif

    write(unit_act,'(a)') ""

    call node%print_graph(0, threshold, is_sorted, unit=unit)

  end subroutine

  !> Return the sum of all entries with a certain name below
  !> a given node. Specify the name with the last argument, the
  !> path to the starting point with the first few parameters
  !>
  !> \param name1, .., namei-1  The path to the starting node
  !> \param namei               The name of all sub-entries below this
  !>                            node which should be summed together
  !>
  !> For example timer%in_entries("foo", "bar", "parallel") returns
  !> the sum of all entries named "parallel" below the foo->bar node
  !>
  !> The names have to be passed without gaps, e.g. name3 requires name2.
  !> The result is 0 if the timer is not active, if no section was started
  !> yet, or if the path cannot be followed (a message is then written to
  !> stderr).
  !>
  function timer_in_entries(self, name1, name2, name3, name4) result(s)
    use, intrinsic :: iso_fortran_env, only : error_unit
    class(timer_t), intent(in), target :: self
    character(len=*), intent(in) :: name1
    character(len=*), intent(in), optional :: name2, name3, name4
    real(kind=rk) :: s
    type(node_t), pointer :: node ! the starting node
    type(value_t) :: val
    character(len=name_length) :: name ! the name of the sections

    s = 0._rk

    if (.not. self%active) then
      return
    endif

    ! Every step below uses the previous name as path component, so an
    ! absent optional argument in the middle must never be referenced
    if ((present(name3) .and. .not. present(name2)) .or. &
        (present(name4) .and. .not. present(name3))) then
      write(error_unit,'(a)') "timer_in_entries(): names must be passed without gaps"
      return
    endif

    node => self%root

    ! root is only allocated by the first timer_start() call
    if (.not. associated(node)) then
      return
    endif

    name = name1

    if (present(name2)) then
      node => node%get_child(name1)
      if (.not. associated(node)) then
        write(error_unit,'(a)') "Could not descend to """ // trim(name1) // """"
        return
      endif
      name = name2
    end if
    if (present(name3)) then
      node => node%get_child(name2)
      if (.not. associated(node)) then
        write(error_unit,'(a)') "Could not descend to """ // trim(name2) // """"
        return
      endif
      name = name3
    end if
    if (present(name4)) then
      node => node%get_child(name3)
      if (.not. associated(node)) then
        write(error_unit,'(a)') "Could not descend to """ // trim(name3) // """"
        return
      endif
      name = name4
    end if

    val = node%sum_of_children_with_name(name)
    ! val%micros is converted to seconds
    s = real(val%micros, kind=rk) * 1e-6_rk
  end function

  !> Access a specific, already stopped entry of the graph by specifying the
  !> names of the nodes along the graph from the root node
  !>
  !> The result is only meaningful if the entry was never appended by
  !> additional %start() calls.
  !>
  function timer_get(self, name1, name2, name3, name4, name5, name6) result(s)
    class(timer_t), intent(in), target :: self
    ! this is clunky, but what can you do..
    character(len=*), intent(in), optional :: name1, name2, name3, name4, name5, name6
    real(kind=rk) :: s
    type(node_t), pointer :: node

    s = 0._rk

    if (.not. self%active) then
      return
    endif

    node => self%root
    if (present(name1)) then
      node => node%get_child(name1)
      if (.not. associated(node)) then
        write(error_unit,'(a)') "Could not descend to """ // trim(name1) // """"
        return
      endif
    end if
    if (present(name2)) then
      node => node%get_child(name2)
      if (.not. associated(node)) then
        write(error_unit,'(a)') "Could not descend to """ // trim(name2) // """"
        return
      endif
    end if
    if (present(name3)) then
      node => node%get_child(name3)
      if (.not. associated(node)) then
        write(error_unit,'(a)') "Could not descend to """ // trim(name3) // """"
        return
      endif
    end if
    if (present(name4)) then
      node => node%get_child(name4)
      if (.not. associated(node)) then
        write(error_unit,'(a)') "Could not descend to """ // trim(name4) // """"
        return
      endif
    end if
    if (present(name5)) then
      node => node%get_child(name5)
      if (.not. associated(node)) then
        write(error_unit,'(a)') "Could not descend to """ // trim(name5) // """"
        return
      endif
    end if
    if (present(name6)) then
      node => node%get_child(name6)
      if (.not.
associated(node)) then write(error_unit,'(a)') "Could not descend to """ // trim(name6) // """" return endif end if if (node%is_running) then write(error_unit,'(a)') "Timer """ // trim(node%name) // """ not yet stopped" return endif s = real(node%value%micros, kind=rk) * 1e-6_rk end function !> Access a specific, not yet stopped entry of the graph by specifying the !> names of the nodes along the graph from the root node and return the !> seconds that have passed since the entry was created. !> !> The result is only meaningfull if the entry was never appended by !> additional %start() calls. !> function timer_since(self, name1, name2, name3, name4) result(s) class(timer_t), intent(in), target :: self character(len=*), intent(in), optional :: name1, name2, name3, name4 real(kind=rk) :: s type(value_t) :: val type(node_t), pointer :: node s = 0._rk node => self%root if (present(name1)) then node => node%get_child(name1) if (.not. associated(node)) then write(error_unit,'(a)') "Could not descend to """ // trim(name1) // """" return endif end if if (present(name2)) then node => node%get_child(name2) if (.not. associated(node)) then write(error_unit,'(a)') "Could not descend to """ // trim(name2) // """" return endif end if if (present(name3)) then node => node%get_child(name3) if (.not. associated(node)) then write(error_unit,'(a)') "Could not descend to """ // trim(name3) // """" return endif end if if (present(name4)) then node => node%get_child(name4) if (.not. associated(node)) then write(error_unit,'(a)') "Could not descend to """ // trim(name4) // """" return endif end if if (node%is_running .neqv. .true.) then write(error_unit,'(a)') "Timer """ // trim(node%name) // """ already stopped" return endif val = node%value + node%now() s = real(val%micros, kind=rk) * 1e-6_rk end function !> Sort the graph on each level. !> Warning: This irrevocable destroys the old ordering. 
!>
  subroutine timer_sort(self)
    class(timer_t), intent(inout), target :: self
    type(node_t), pointer :: node

    ! The second argument only receives the tail of the sorted list; it is
    ! overwritten right below.
    call sort_nodes(self%root, node)

    node => self%root
    do while (associated(node))
      call node%sort_children()
      node => node%nextSibling
    enddo
  end subroutine



  ! Now methods of node_t:


  ! This is the function that actually returns the current timestamp and all other counters
  ! Only the counters whose record_* switch is set on the owning timer are
  ! filled in, the others keep the default value of value_t.
  function node_now(self) result(val)
    use, intrinsic :: iso_c_binding
    class(node_t), intent(in) :: self
    type(value_t) :: val

    ! current time
    val%micros = microseconds_since_epoch()

    if (self%timer%record_allocated_memory) then
      val%rsssize = resident_set_size()
    endif

    if (self%timer%record_virtual_memory) then
      val%virtualmem = virtual_memory()
    endif

    if (self%timer%record_max_allocated_memory) then
      val%maxrsssize = max_resident_set_size()
    endif

#ifdef HAVE_LIBPAPI
    if (self%timer%record_flop_counts .or. self%timer%record_memory_bandwidth) then
      call papi_counters(val%flop_count, val%ldst)
    endif
#endif
  end function


  ! Start (or resume) the measurement of this node.
  ! The current counters are subtracted here and added again in node_stop(),
  ! so that value accumulates the differences of all start/stop pairs.
  subroutine node_start(self)
    class(node_t), intent(inout) :: self

    ! take the time
    self%value = self%value - self%now()
    self%is_running = .true.
  end subroutine

  ! Stop the measurement of this node and count one more invocation.
  subroutine node_stop(self)
    class(node_t), intent(inout) :: self

    self%count = self%count + 1

    ! take the time
    self%value = self%value + self%now()
    self%is_running = .false.
  end subroutine

  ! Return the accumulated value of this node; for a running node this
  ! includes the time up to the moment of the call.
  function node_get_value(self) result(val)
    class(node_t), intent(in) :: self
    type(value_t) :: val
    val = self%value
    if (self%is_running) then
      ! we have not finished, give time up to NOW
      val = val + self%now()
    endif
  end function

  ! Append a new, empty child with the given name to this node and
  ! return a pointer to it.
  function node_new_child(self, name) result(new)
    class(node_t), intent(inout), target :: self
    character(len=*), intent(in) :: name
    type(node_t), pointer :: new

    if (.not. associated(self%lastChild)) then
      ! first child of this node
      allocate(self%lastChild)
      new => self%lastChild
      self%firstChild => new
    else
      allocate(self%lastChild%nextSibling)
      new => self%lastChild%nextSibling
      self%lastChild => new
    endif

    ! parent is a type(node_t) pointer, self is polymorphic: resolve the
    ! dynamic type before pointing to it
    select type (self)
      type is (node_t)
        new%parent => self
      class default
        stop "node_new_child(): This should not happen"
    end select

    new%name = name
    new%count = 0
    new%timer => self%timer

    nullify(new%firstChild)
    nullify(new%lastChild)
    nullify(new%nextSibling)
  end function

  ! Compare a stored node name with a user supplied name; the user supplied
  ! name is cut to at most name_length characters before the comparison.
  function string_eq(str1, str2) result(eq)
    character(len=name_length), intent(in) :: str1
    character(len=*), intent(in) :: str2
    logical :: eq
    eq = trim(str1) .eq. str2(1:min(len(trim(str2)), name_length))
  end function

  ! Return a pointer to the first direct child with the given name,
  ! or a nullified pointer if there is none.
  function node_get_child(self, name) result(child)
    class(node_t), intent(in) :: self
    character(len=*), intent(in) :: name
    type(node_t), pointer :: child

    child => self%firstChild
    do while (associated(child))
      if (string_eq(child%name, name)) then
        return
      endif
      child => child%nextSibling
    enddo
    nullify(child)
  end function

  ! Deallocate a node together with all its children and all the siblings
  ! that follow it; entry is nullified on return.
  recursive subroutine deallocate_node(entry)
    type(node_t), intent(inout), pointer :: entry
    type(node_t), pointer :: nextSibling

    if (associated(entry%firstChild)) then
      call deallocate_node(entry%firstChild)
    endif
    ! remember the sibling before entry itself goes away
    nextSibling => entry%nextSibling
    deallocate(entry)
    nullify(entry)
    if (associated(nextSibling)) then
      call deallocate_node(nextSibling)
    endif
  end subroutine

  ! Sum of the values of all direct children of this node.
  function node_sum_of_children(self) result(sum_time)
    class(node_t), intent(in) :: self
    type(node_t), pointer :: cur_entry
    type(value_t) :: sum_time

    cur_entry => self%firstChild
    do while (associated(cur_entry))
      sum_time = sum_time + cur_entry%get_value()
      cur_entry => cur_entry%nextSibling
    enddo
  end function

  ! Sum of the values of all entries with the given name anywhere below this
  ! node. The search does not descend into an entry that matches.
  recursive function node_sum_of_children_with_name(self, name) result(sum_time)
    class(node_t), intent(in) :: self
    character(len=*), intent(in) :: name
    type(node_t), pointer :: cur_entry
    type(value_t) :: sum_time

    cur_entry => self%firstChild
    do while (associated(cur_entry))
      if (string_eq(cur_entry%name, name)) then
        ! Use get_value() like the other summation routines: the raw value
        ! of a still running entry has its start time subtracted and is
        ! not a duration on its own.
        sum_time = sum_time + cur_entry%get_value()
      else
        sum_time = sum_time + cur_entry%sum_of_children_with_name(name)
      endif
      cur_entry => cur_entry%nextSibling
    enddo
  end function

  ! Sum of the values of all direct children that took less than threshold
  ! seconds. Without a threshold the default (zero) value is returned.
  function node_sum_of_children_below(self, threshold) result(sum_time)
    class(node_t), intent(in) :: self
    real(kind=rk), intent(in), optional :: threshold

    type(node_t), pointer :: cur_entry
    type(value_t) :: sum_time, cur_value

    if (.not. present(threshold)) then
      return
    endif

    cur_entry => self%firstChild

    do while (associated(cur_entry))
      cur_value = cur_entry%get_value()
      if (cur_value%micros * 1e-6_rk < threshold) then
        sum_time = sum_time + cur_value
      endif
      cur_entry => cur_entry%nextSibling
    enddo
  end function

  ! Insert node into the sibling list starting at head, which is expected to
  ! be sorted by descending value%micros. head must be associated.
  subroutine insert_into_sorted_list(head, node)
    type(node_t), pointer, intent(inout) :: head
    type(node_t), target, intent(inout) :: node
    type(node_t), pointer :: cur

    ! new largest element: becomes the new head
    if (node%value%micros >= head%value%micros) then
      node%nextSibling => head
      head => node
      return
    endif

    cur => head
    do while (associated(cur%nextSibling))
      if (cur%value%micros > node%value%micros .and. node%value%micros >= cur%nextSibling%value%micros) then
        node%nextSibling => cur%nextSibling
        cur%nextSibling => node
        return
      endif
      cur => cur%nextSibling
    end do

    ! node has to be appended at the end
    cur%nextSibling => node
    node%nextSibling => NULL()
  end subroutine

  ! Unlink node from the sibling list starting at head. The node itself is
  ! not deallocated; nothing happens if it is not part of the list.
  subroutine remove_from_list(head, node)
    type(node_t), pointer, intent(inout) :: head
    type(node_t), pointer, intent(in) :: node
    type(node_t), pointer :: cur

    if (associated(head,node)) then
      head => head%nextSibling
      return
    endif

    cur => head
    do while (associated(cur%nextSibling))
      if (associated(cur%nextSibling,node)) then
        cur%nextSibling => cur%nextSibling%nextSibling
        return
      endif
      cur => cur%nextSibling
    end do
  end subroutine

  ! Print the line for this node: its name, marked as "(running)" and with
  ! the number of invocations where applicable, followed by its values.
  subroutine node_print(self, indent_level, total, unit)
    class(node_t), intent(inout) :: self
    integer, intent(in) :: indent_level
    type(value_t), intent(in) :: total
    type(value_t) :: val
    integer, intent(in) :: unit
    character(len=name_length) :: name, suffix

    if (self%is_running) then
      name = trim(self%name) // " (running)"
    else
      name = self%name
    endif

    if (self%count > 1) then
      write(suffix, '(" (",i0,"x)")') self%count
      name = trim(name) // " " // trim(suffix)
    endif

    if (self%is_running) then
      val = self%value + self%now()
    else
      val = self%value
    endif
    call print_value(val, self%timer, indent_level, name, total, unit)
  end subroutine

  ! Print this node and, recursively, all its children.
  !
  ! indent_level  nesting depth used for the indentation
  ! threshold     children below this many seconds are not printed
  !               individually but summed up in a "(below threshold)" line
  ! is_sorted     if .true., the "(own)" and "(below threshold)" entries are
  !               temporarily inserted into the (sorted) list of children,
  !               otherwise they are printed before the children
  ! total         value that fractions are relative to, default: this node
  ! unit          unit to print on, default: output_unit
  recursive subroutine node_print_graph(self, indent_level, threshold, is_sorted, total, unit)
    use, intrinsic :: iso_fortran_env, only : output_unit
    class(node_t), intent(inout) :: self
    integer, intent(in) :: indent_level
    real(kind=rk), intent(in), optional :: threshold
    logical, intent(in), optional :: is_sorted
    type(value_t), intent(in), optional :: total
    integer, intent(in), optional :: unit

    type(node_t), pointer :: node
    type(value_t) :: cur_value, node_value, own_value, below_threshold_value, total_act
    type(node_t), pointer :: own_node, threshold_node
    real(kind=rk) :: threshold_act
    logical :: is_sorted_act, print_own, print_threshold
    integer :: unit_act

    nullify(own_node)
    nullify(threshold_node)

    if (present(threshold)) then
      threshold_act = threshold
    else
      threshold_act = 0
    endif

    if (present(is_sorted)) then
      is_sorted_act = is_sorted
    else
      is_sorted_act = .false.
    endif

    cur_value = self%get_value()

    if (present(total)) then
      total_act = total
    else
      total_act = cur_value
    endif

    if (present(unit)) then
      unit_act = unit
    else
      unit_act = output_unit
    endif

    call self%print(indent_level, total_act, unit_act)

    own_value = cur_value - self%sum_of_children()
    below_threshold_value = self%sum_of_children_below(threshold)

    print_own = associated(self%firstChild)
    print_threshold = below_threshold_value%micros > 0

    ! Deal with "(own)" and "(below threshold)" entries
    if (is_sorted_act) then
      ! sort them in
      if (print_own) then
        ! insert an "(own)" node
        allocate(own_node)
        own_node%value = own_value
        own_node%name = own
        own_node%timer => self%timer
        call insert_into_sorted_list(self%firstChild, own_node)
      endif

      if (print_threshold) then
        ! insert a "(below threshold)" node
        allocate(threshold_node)
        threshold_node%value = below_threshold_value
        threshold_node%name = below
        threshold_node%timer => self%timer
        call insert_into_sorted_list(self%firstChild, threshold_node)
      endif

    else
      ! print them first
      if (print_own) then
        call print_value(own_value, self%timer, indent_level + 1, own, cur_value, unit_act)
      endif
      if (print_threshold) then
        call print_value(below_threshold_value, self%timer, indent_level + 1, below, cur_value, unit_act)
      endif
    endif

    ! print children
    node => self%firstChild
    do while (associated(node))
      node_value = node%get_value()
      ! the inserted dummy nodes are always printed, whatever their value
      if (node_value%micros * 1e-6_rk >= threshold_act &
            .or. associated(node, threshold_node) &
            .or. associated(node, own_node)) then
        call node%print_graph(indent_level + 1, threshold, is_sorted, cur_value, unit_act)
      endif
      node => node%nextSibling
    end do

    if (is_sorted_act) then
      ! remove inserted dummy nodes again
      if (print_own) then
        call remove_from_list(self%firstChild, own_node)
        deallocate(own_node)
      endif
      if (print_threshold) then
        call remove_from_list(self%firstChild, threshold_node)
        deallocate(threshold_node)
      endif
    endif

  end subroutine

  !
! In-place sort a node_t linked list and return the first and last element,
  ! ordered by descending get_value()%micros.
  !
  ! This is a bottom-up merge sort: runs of length insize are merged
  ! pairwise, then insize is doubled, until a pass needs at most one merge.
  ! For elements with equal values the element of the first run is taken
  ! first.
  subroutine sort_nodes(head, tail)
    type(node_t), pointer, intent(inout) :: head, tail

    type(node_t), pointer :: p, q, e
    type(value_t) :: p_val, q_val
    integer :: insize, nmerges, psize, qsize, i

    if (.not. associated(head)) then
      nullify(tail)
      return
    endif

    insize = 1

    do while (.true.)
      p => head
      nullify(head)
      nullify(tail)
      nmerges = 0

      do while(associated(p))
        nmerges = nmerges + 1
        ! step q at most insize elements ahead of p; psize is the number
        ! of elements actually available in the first run
        q => p
        psize = 0
        do i = 1, insize
          psize = psize + 1
          q => q%nextSibling
          if (.not. associated(q)) then
            exit
          endif
        end do

        qsize = insize

        ! merge the run starting at p with the run starting at q
        do while (psize > 0 .or. (qsize > 0 .and. associated(q)))
          if (psize == 0) then
            ! first run exhausted
            e => q
            q => q%nextSibling
            qsize = qsize - 1

          else if (qsize == 0 .or. (.not. associated(q))) then
            ! second run exhausted
            e => p;
            p => p%nextSibling
            psize = psize - 1

          else
            p_val = p%get_value()
            q_val = q%get_value()
            if (p_val%micros >= q_val%micros) then
              e => p
              p => p%nextSibling
              psize = psize - 1

            else
              e => q
              q => q%nextSibling
              qsize = qsize - 1

            end if
          end if

          ! append e to the output list
          if (associated(tail)) then
            tail%nextSibling => e
          else
            head => e
          endif
          tail => e

        end do

        p => q

      end do

      nullify(tail%nextSibling)

      if (nmerges <= 1) then
        return
      endif

      insize = insize * 2

    end do
  end subroutine


  ! Sort the children of this node and, recursively, their children.
  recursive subroutine node_sort_children(self)
    class(node_t), intent(inout) :: self
    type(node_t), pointer :: node

    call sort_nodes(self%firstChild, self%lastChild)

    node => self%firstChild
    do while (associated(node))
      call node%sort_children()
      node => node%nextSibling
    enddo
  end subroutine

  ! Print one line of the timing graph.
  !
  ! value         the values to print
  ! timer         the timer, its print_* switches select the columns
  ! indent_level  nesting depth, each level indents by two characters
  ! label         text for the name column
  ! total         value that the printed fraction is relative to
  ! unit          unit to print on
  !
  ! All quotients are formed with ratio(), which yields 0 for a zero
  ! denominator (e.g. a section that took less than a microsecond) instead
  ! of dividing by zero.
  subroutine print_value(value, timer, indent_level, label, total, unit)
    type(value_t), intent(in) :: value
    type(timer_t), intent(in) :: timer
    integer, intent(in) :: indent_level
    character(len=name_length), intent(in) :: label
    type(value_t), intent(in) :: total
    integer, intent(in) :: unit

    character(len=64) :: format_spec

    ! the format depends on the indentation and on name_length
    write(format_spec,'("(",i0,"x,""|_ "",a",i0,",2x,f12.6,1x,f12.3)")') indent_level * 2 + 1, name_length
    write(unit,format_spec,advance='no') &
      label, &
      real(value%micros, kind=rk) * 1e-6_rk, &
      ratio(real(value%micros, kind=rk), real(total%micros, kind=rk))

    if (timer%print_allocated_memory) then
      write(unit,'(1x,a12)',advance='no') &
        nice_format(real(value%rsssize, kind=C_DOUBLE))
    endif

    if (timer%print_virtual_memory) then
      write(unit,'(1x,a12)',advance='no') &
        nice_format(real(value%virtualmem, kind=C_DOUBLE))
    endif

    if (timer%print_max_allocated_memory) then
      write(unit,'(1x,a12)',advance='no') &
        nice_format(real(value%maxrsssize, kind=C_DOUBLE))
    endif

    if (timer%print_flop_count) then
      write(unit,'(1x,f12.2)',advance='no') real(value%flop_count, kind=rk) / 1e6_rk
    endif
    if (timer%print_flop_rate) then
      ! operations per microsecond, i.e. Mflop/s
      write(unit,'(1x,f12.2)',advance='no') &
        ratio(real(value%flop_count, kind=rk), real(value%micros, kind=rk))
    endif
    if (timer%print_ldst) then
      write(unit,'(1x,a12)',advance='no') nice_format(real(value%ldst, kind=rk))
    endif
    if (timer%print_memory_bandwidth) then
      write(unit,'(1x,a12)',advance='no') &
        nice_format(ratio(real(value%ldst*timer%bytes_per_ldst, kind=rk), value%micros * 1e-6_rk))
    endif
    if (timer%print_ai) then
      write(unit,'(1x,f12.4)',advance='no') &
        ratio(ratio(real(value%flop_count, kind=rk), real(value%ldst, kind=rk)), &
              real(timer%bytes_per_ldst, kind=rk))
    endif

    write(unit,'(a)') ""

  contains

    ! num / den, or 0 if den is zero
    pure function ratio(num, den) result(r)
      real(kind=rk), intent(in) :: num, den
      real(kind=rk) :: r
      if (den == 0._rk) then
        r = 0._rk
      else
        r = num / den
      endif
    end function

  end subroutine

  ! Format a number into 12 characters, scaled with a binary prefix
  ! (ki, Mi, Gi, Ti); values of 2**50 and above are printed in exponential
  ! notation, values below 2**10 as they are.
  pure elemental function nice_format(number) result(string)
    real(kind=C_DOUBLE), intent(in) :: number
    character(len=12) :: string
    real(kind=C_DOUBLE), parameter :: &
        kibi = 2.0_C_DOUBLE**10, &
        mebi = 2.0_C_DOUBLE**20, &
        gibi = 2.0_C_DOUBLE**30, &
        tebi = 2.0_C_DOUBLE**40, &
        pebi = 2.0_C_DOUBLE**50

    if (abs(number) >= pebi) then
      write(string,'(es12.2)') number
    else if (abs(number) >= tebi) then
      write(string,'(f9.2,'' Ti'')') number / tebi
    else if (abs(number) >= gibi) then
      write(string,'(f9.2,'' Gi'')') number / gibi
    else if (abs(number) >= mebi) then
      write(string,'(f9.2,'' Mi'')') number / mebi
    else if (abs(number) >= kibi) then
      write(string,'(f9.2,'' ki'')') number / kibi
    else
      write(string,'(f12.2)') number
    endif
  end function

end module
elpa-2016.05.001/src/ftimings/papi.c0000644000312500001440000000766712664056454013562
00000000000000/* Copyright 2014 Lorenz Hüdepohl * * This file is part of ftimings. * * ftimings is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * ftimings is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with ftimings. If not, see . */ #include #include #ifdef HAVE_CONFIG_H #include "config.h" #endif static int event_set; static int tried_papi_init = 0; static int papi_available = 0; static int flops_available = 0; static int ldst_available = 0; #ifdef HAVE_LIBPAPI #include int ftimings_papi_init(void) { int ret; if (tried_papi_init) { return papi_available; } #pragma omp critical { /* Think about it :) */ if (tried_papi_init) { goto end; } tried_papi_init = 1; event_set = PAPI_NULL; if ((ret = PAPI_library_init(PAPI_VER_CURRENT)) < 0) { fprintf(stderr, "ftimings: %s:%d: PAPI_library_init(%d): %s\n", __FILE__, __LINE__, PAPI_VER_CURRENT, PAPI_strerror(ret)); goto error; } if ((ret = PAPI_create_eventset(&event_set)) < 0) { fprintf(stderr, "ftimings: %s:%d PAPI_create_eventset(): %s\n", __FILE__, __LINE__, PAPI_strerror(ret)); goto error; } /* Check FLOP counter availability */ if ((ret = PAPI_query_event(PAPI_DP_OPS)) < 0) { fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_DP_OPS): %s\n", __FILE__, __LINE__, PAPI_strerror(ret)); flops_available = 0; } else if ((ret = PAPI_add_event(event_set, PAPI_DP_OPS)) < 0) { fprintf(stderr, "ftimings: %s:%d PAPI_add_event(): %s\n", __FILE__, __LINE__, PAPI_strerror(ret)); flops_available = 0; } else { flops_available = 1; } /* Loads + Stores */ if ((ret = 
PAPI_query_event(PAPI_LD_INS)) < 0) { fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_LD_INS): %s\n", __FILE__, __LINE__, PAPI_strerror(ret)); ldst_available = 0; } else if ((ret = PAPI_query_event(PAPI_SR_INS)) < 0) { fprintf(stderr, "ftimings: %s:%d: PAPI_query_event(PAPI_SR_INS): %s\n", __FILE__, __LINE__, PAPI_strerror(ret)); ldst_available = 0; } else if ((ret = PAPI_add_event(event_set, PAPI_LD_INS)) < 0) { fprintf(stderr, "ftimings: %s:%d PAPI_add_event(event_set, PAPI_LD_INS): %s\n", __FILE__, __LINE__, PAPI_strerror(ret)); ldst_available = 0; } else if ((ret = PAPI_add_event(event_set, PAPI_SR_INS)) < 0) { fprintf(stderr, "ftimings: %s:%d PAPI_add_event(event_set, PAPI_SR_INS): %s\n", __FILE__, __LINE__, PAPI_strerror(ret)); ldst_available = 0; } else { ldst_available = 1; } /* Start */ if ((ret = PAPI_start(event_set)) < 0) { fprintf(stderr, "ftimings: %s:%d PAPI_start(): %s\n", __FILE__, __LINE__, PAPI_strerror(ret)); goto error; } goto end; error: /* PAPI works */ papi_available = 0; end: /* PAPI works */ papi_available = 1; } /* End of critical region */ return papi_available; } int ftimings_flop_init(void) { int ret; if (!tried_papi_init) { ftimings_papi_init(); } return flops_available; } int ftimings_loads_stores_init(void) { int ret; if (!tried_papi_init) { ftimings_papi_init(); } return ldst_available; } void ftimings_papi_counters(long long *flops, long long *ldst) { long long res[3]; int i, ret; if ((ret = PAPI_read(event_set, &res[0])) < 0) { fprintf(stderr, "PAPI_read: %s\n", PAPI_strerror(ret)); exit(1); } i = 0; if (flops_available) { *flops = res[i++]; } else { *flops = 0LL; } if (ldst_available) { *ldst = res[i++]; *ldst += res[i++]; } else { *ldst = 0LL; } } #endif elpa-2016.05.001/src/ftimings/highwater_mark.c0000644000312500001440000000231312664056454015604 00000000000000/* Copyright 2014 Andreas Marek, Lorenz Hüdepohl * * This file is part of ftimings. 
* * ftimings is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * ftimings is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with ftimings. If not, see . */ #include #define _GNU_SOURCE #include #include #include #include long ftimings_highwater_mark() { long hwm = 0L; char line[1024]; FILE* fp = NULL; if ((fp = fopen( "/proc/self/status", "r" )) == NULL ) { return 0L; } /* Read memory size data from /proc/pid/status */ while(fgets(line, sizeof line, fp)) { if (sscanf(line, "VmHWM: %ld kB", &hwm) == 1) { break; } } fclose(fp); return hwm * 1024L; } elpa-2016.05.001/src/ftimings/time.c0000644000312500001440000000275112717402663013550 00000000000000/* Copyright 2014 Lorenz Hüdepohl * * This file is part of ftimings. * * ftimings is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * ftimings is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with ftimings. If not, see . */ #include #include #include #include #include #include #ifdef HAVE_CONFIG_H #include "config-f90.h" #endif /* Return number of microseconds since 1.1.1970, in a 64 bit integer. 
* (with 2^64 us ~ 6 * 10^5 years, this should be sufficiently overflow safe) */ int64_t ftimings_microseconds_since_epoch(void) { struct timeval tv; if (gettimeofday(&tv, NULL) != 0) { perror("gettimeofday"); exit(1); } return (int64_t) (tv.tv_sec) * ((int64_t) 1000000) + (int64_t)(tv.tv_usec); } #ifndef WITH_MPI int64_t t0 = 0; void __attribute__((constructor)) init_time(void) { t0 = ftimings_microseconds_since_epoch(); } double seconds(void) { return (ftimings_microseconds_since_epoch() - t0) / 1e6; } #endif elpa-2016.05.001/src/ftimings/virtual_memory.c0000644000312500001440000000206712664056454015674 00000000000000/* Copyright 2014 Andreas Marek, Lorenz Hüdepohl * * This file is part of ftimings. * * ftimings is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * ftimings is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with ftimings. If not, see . */ #include #include long ftimings_virtual_memory() { long rss = 0L; FILE* fp = NULL; if ((fp = fopen( "/proc/self/statm", "r" )) == NULL ) { return 0L; } if (fscanf(fp, "%ld", &rss) != 1) { fclose(fp); return (size_t)0L; /* Can't read? */ } fclose(fp); return rss * sysconf( _SC_PAGESIZE); } elpa-2016.05.001/src/ftimings/resident_set_size.c0000644000312500001440000000205512664056454016335 00000000000000/* Copyright 2014 Lorenz Hüdepohl * * This file is part of ftimings. 
*
 * ftimings is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * ftimings is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with ftimings. If not, see <http://www.gnu.org/licenses/>.
 */
/* NOTE(review): the header names of the #include directives were lost; the
 * headers below are what this file visibly needs (fopen/fscanf, sysconf).
 * Confirm against the upstream sources. */
#include <stdio.h>
#include <unistd.h>

/* Return the second field of /proc/self/statm multiplied by the page size,
 * i.e. the resident set size of this process in bytes.
 * Returns 0 if the file cannot be opened or read. */
long ftimings_resident_set_size() {
	long rss = 0L;
	FILE* fp = NULL;
	if ((fp = fopen( "/proc/self/statm", "r" )) == NULL ) {
		return 0L;
	}
	/* skip the first field, read the second */
	if (fscanf(fp, "%*s%ld", &rss) != 1) {
		fclose(fp);
		return 0L; /* Can't read? */
	}
	fclose(fp);
	return rss * sysconf( _SC_PAGESIZE);
}
elpa-2016.05.001/src/ftimings/ftimings_value.F900000644000312500001440000000602612664056454015745 00000000000000
! Copyright 2014 Lorenz Hüdepohl
!
! This file is part of ftimings.
!
! ftimings is free software: you can redistribute it and/or modify
! it under the terms of the GNU Lesser General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.
!
! ftimings is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ftimings. If not, see <http://www.gnu.org/licenses/>.

#ifdef HAVE_CONFIG_H
#include "config-f90.h"
#endif

! The set of counters that is recorded for every node of the timing graph,
! together with the arithmetic needed to accumulate them.
module ftimings_value
  use ftimings_type
  implicit none
  public

  type value_t
    integer(kind=C_INT64_T) :: micros = 0 ! microseconds spent in this node
    integer(kind=C_LONG) :: virtualmem = 0 ! newly created virtual memory
    integer(kind=C_LONG) :: maxrsssize = 0 ! newly used max. resident mem ("high water mark")
    integer(kind=C_LONG) :: rsssize = 0 ! newly used resident memory
    integer(kind=C_LONG_LONG) :: flop_count = 0 ! floating point operations done in this node
    integer(kind=C_LONG_LONG) :: ldst = 0 ! number of loads and stores
  end type

  interface operator(+)
    module procedure value_add
  end interface

  interface operator(-)
    module procedure value_minus
    module procedure value_inverse
  end interface

  ! A value with all counters zero. Every component is listed explicitly;
  ! ldst used to be left to its default initialization.
  type(value_t), parameter :: null_value = value_t(micros = 0, &
                                                   rsssize = 0, &
                                                   virtualmem = 0, &
                                                   maxrsssize = 0, &
                                                   flop_count = 0, &
                                                   ldst = 0)

  contains

  ! Component-wise a + b. The flop and load/store counters are only
  ! handled if HAVE_LIBPAPI is defined, otherwise they keep their default
  ! value in the result.
  pure elemental function value_add(a,b) result(c)
    class(value_t), intent(in) :: a, b
    type(value_t) :: c
    c%micros = a%micros + b%micros
    c%rsssize = a%rsssize + b%rsssize
    c%virtualmem = a%virtualmem + b%virtualmem
    c%maxrsssize = a%maxrsssize + b%maxrsssize
#ifdef HAVE_LIBPAPI
    c%flop_count = a%flop_count + b%flop_count
    c%ldst = a%ldst + b%ldst
#endif
  end function

  ! Component-wise a - b, see value_add() for the PAPI counters.
  pure elemental function value_minus(a,b) result(c)
    class(value_t), intent(in) :: a, b
    type(value_t) :: c
    c%micros = a%micros - b%micros
    c%rsssize = a%rsssize - b%rsssize
    c%virtualmem = a%virtualmem - b%virtualmem
    c%maxrsssize = a%maxrsssize - b%maxrsssize
#ifdef HAVE_LIBPAPI
    c%flop_count = a%flop_count - b%flop_count
    c%ldst = a%ldst - b%ldst
#endif
  end function

  ! Component-wise -a (unary minus), see value_add() for the PAPI counters.
  pure elemental function value_inverse(a) result(neg_a)
    class(value_t), intent(in) :: a
    type(value_t) :: neg_a
    neg_a%micros = - a%micros
    neg_a%rsssize = - a%rsssize
    neg_a%virtualmem = - a%virtualmem
    neg_a%maxrsssize = - a%maxrsssize
#ifdef HAVE_LIBPAPI
    neg_a%flop_count = - a%flop_count
    neg_a%ldst = - a%ldst
#endif
  end function
end module
elpa-2016.05.001/src/ftimings/ftimings_type.F900000644000312500001440000000164412664056454015613 00000000000000
! Copyright 2014 Lorenz Hüdepohl
!
! This file is part of ftimings.
!
! ftimings is free software: you can redistribute it and/or modify
!
! it under the terms of the GNU Lesser General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.
!
! ftimings is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ftimings. If not, see <http://www.gnu.org/licenses/>.

! Kind and length constants shared by the ftimings modules.
module ftimings_type
  use, intrinsic :: iso_c_binding, only : C_INT64_T, C_DOUBLE, C_LONG_LONG, C_LONG, C_INT
  implicit none
  integer, parameter :: rk = C_DOUBLE      ! kind of the reals used by ftimings
  integer, parameter :: name_length = 40   ! length of the names of the timing entries
end module
elpa-2016.05.001/src/mod_mpi_stubs.F900000644000312500001440000000633212717402663013751 00000000000000
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! Author Andreas Marek, MPCDF

#include "config-f90.h"

! Single-process stand-ins for the few MPI routines that are needed when
! ELPA is built without MPI: one task with rank 0.
module elpa_mpi_stubs
  use precision
  implicit none

  public

  integer(kind=ik), parameter :: MPI_COMM_SELF=1, MPI_COMM_WORLD=1

  contains

    ! Wall clock time in seconds.
    ! Without MPI this is the value of seconds() from module time_c. With
    ! MPI there is no such source in this module; the result is then set to
    ! zero instead of being left undefined.
    function MPI_WTIME() result(time)
      use iso_c_binding
#ifndef WITH_MPI
      use time_c
#endif
      implicit none

      real(kind=c_double) :: time
#ifndef WITH_MPI
      time = seconds()
#else
      time = 0.0_c_double
#endif
    end function

    ! Number of tasks in the communicator: always 1. mpierr is set to 0.
    subroutine mpi_comm_size(mpi_comm_world, ntasks, mpierr)
      use precision
      implicit none

      integer(kind=ik), intent(in) :: mpi_comm_world
      integer(kind=ik), intent(inout) :: ntasks
      integer(kind=ik), intent(inout) :: mpierr

      ntasks = 1
      mpierr = 0

      return
    end subroutine mpi_comm_size

    ! Rank of the calling task: always 0. mpierr is set to 0.
    subroutine mpi_comm_rank(mpi_comm_world, myid, mpierr)
      use precision
      implicit none

      integer(kind=ik), intent(in) :: mpi_comm_world
      integer(kind=ik), intent(inout) :: mpierr
      integer(kind=ik), intent(inout) :: myid

      myid = 0
      mpierr = 0

      return
    end subroutine mpi_comm_rank

    ! "Split" a communicator: the new communicator is the given one,
    ! color and key are ignored. mpierr is set to 0.
    subroutine mpi_comm_split(mpi_communicator, color, key, new_comm, mpierr)
      use precision
      implicit none

      integer(kind=ik), intent(in) :: mpi_communicator, color, key
      integer(kind=ik), intent(inout) :: new_comm, mpierr

      new_comm = mpi_communicator
      mpierr = 0
      return
    end subroutine mpi_comm_split

end module
elpa-2016.05.001/src/elpa_qr/0000755000312500001440000000000012717541041012316 500000000000000elpa-2016.05.001/src/elpa_qr/elpa_pdlarfb.F900000644000312500001440000005323712717516040015144 00000000000000
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
#include "config-f90.h"

!> Block-reflector helpers for the ELPA QR decomposition on a 1D process
!> column/row. All routines follow the LAPACK workspace convention used in
!> this file: calling with lwork = -1 only stores the required workspace size
!> (as a real) in the first element of work and returns.
!> In builds without WITH_MPI the allreduce is replaced by a plain copy of the
!> send part of work into its receive part.
module elpa_pdlarfb

    use elpa1_compute
    use qr_utils_mod
    use elpa_mpi
    implicit none

    PRIVATE

    public :: qr_pdlarfb_1dcomm
    public :: qr_pdlarft_pdlarfb_1dcomm
    public :: qr_pdlarft_set_merge_1dcomm
    public :: qr_pdlarft_tree_merge_1dcomm
    public :: qr_pdlarfl_1dcomm
    public :: qr_pdlarfl2_tmatrix_1dcomm
    public :: qr_tmerge_pdlarfb_1dcomm

contains

!> Applies a block of k Householder vectors (columns of v, triangular factor t)
!> to the n local columns of a.
!> k = 1 and k = 2 are forwarded to qr_pdlarfl_1dcomm and
!> qr_pdlarfl2_tmatrix_1dcomm; larger k computes Z' = Y' * A locally, sums it
!> over mpicomm and hands the result to qr_pdlarfb_kernel_local.
!> Returns immediately (before the workspace query) if idx <= 1 or n <= 0.
!> Workspace: 2*k*n reals, addressed as work(k,*).
subroutine qr_pdlarfb_1dcomm(m,mb,n,k,a,lda,v,ldv,tau,t,ldt,baseidx,idx,rev,mpicomm,work,lwork)
    use precision
    use qr_utils_mod

    implicit none

    ! input variables (global)
    ! (declared before the arrays because k is used as an array bound)
    integer(kind=ik)  :: m,mb,n,k,baseidx,idx,rev,mpicomm

    ! input variables (local)
    integer(kind=ik)  :: lda,ldv,ldt,lwork
    real(kind=rk)     :: a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(k,*)

    ! local scalars
    integer(kind=ik)  :: localsize,offset,baseoffset
    integer(kind=ik)  :: mpirank,mpiprocs,mpierr

    if (idx .le. 1) return

    if (n .le. 0) return ! nothing to do

    if (k .eq. 1) then
        call qr_pdlarfl_1dcomm(v,1,baseidx,a,lda,tau(1), &
                               work,lwork,m,n,idx,mb,rev,mpicomm)
        return
    else if (k .eq. 2) then
        call qr_pdlarfl2_tmatrix_1dcomm(v,ldv,baseidx,a,lda,t,ldt, &
                                        work,lwork,m,n,idx,mb,rev,mpicomm)
        return
    end if

    ! workspace query
    if (lwork .eq. -1) then
        work(1,1) = DBLE(2*k*n)
        return
    end if

    call MPI_Comm_rank(mpicomm,mpirank,mpierr)
    call MPI_Comm_size(mpicomm,mpiprocs,mpierr)

    ! use baseidx as idx here, otherwise the upper triangle part will be lost
    ! during the calculation, especially in the reversed case
    call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, &
                              localsize,baseoffset,offset)

    ! Z' = Y' * A
    if (localsize .gt. 0) then
        call dgemm("Trans","Notrans",k,n,localsize,1.0d0,v(baseoffset,1),ldv, &
                   a(offset,1),lda,0.0d0,work(1,1),k)
    else
        work(1:k,1:n) = 0.0d0
    end if

    ! data exchange: columns 1..n are the send part, n+1..2n the receive part
#ifdef WITH_MPI
    call mpi_allreduce(work(1,1),work(1,n+1),k*n,mpi_real8,mpi_sum,mpicomm,mpierr)
#else
    ! same memory as the former work(1:k*n,n+1) = work(1:k*n,1), but without
    ! running the first index past its declared extent k
    work(1:k,n+1:2*n) = work(1:k,1:n)
#endif

    call qr_pdlarfb_kernel_local(localsize,n,k,a(offset,1),lda,v(baseoffset,1),ldv,t,ldt,work(1,n+1),k)
end subroutine qr_pdlarfb_1dcomm

!> Generalized pdlarfl2 version: builds the k x k triangular factor t from tau
!> and the inner products of the Householder vectors in v, then applies the
!> block reflector to the n local columns of a.
!> The buffer holds, per half, k columns of Y'*Y, n columns of Y'*A and oldk
!> zeroed columns reserved for a future T merge.
!> Workspace: 2*k*(k+n+oldk) reals, addressed as work(k,*).
! TODO: include T merge here (separate by "old" and "new" index)
subroutine qr_pdlarft_pdlarfb_1dcomm(m,mb,n,oldk,k,v,ldv,tau,t,ldt,a,lda,baseidx,rev,mpicomm,work,lwork)
    use precision
    use qr_utils_mod

    implicit none

    ! input variables (global)
    ! (declared before the arrays because k is used as an array bound)
    integer(kind=ik)  :: m,mb,n,k,oldk,baseidx,rev,mpicomm

    ! input variables (local)
    integer(kind=ik)  :: ldv,ldt,lda,lwork
    real(kind=rk)     :: v(ldv,*),tau(*),t(ldt,*),work(k,*),a(lda,*)

    ! local scalars
    integer(kind=ik)  :: localsize,offset,baseoffset
    integer(kind=ik)  :: mpirank,mpiprocs,mpierr
    integer(kind=ik)  :: icol
    integer(kind=ik)  :: sendoffset,recvoffset,sendsize
    integer(kind=ik)  :: ncols

    ! number of buffer columns in each half: Y'*Y | Y'*A | reserved
    ncols      = k+n+oldk
    sendoffset = 1
    sendsize   = k*ncols
    recvoffset = sendoffset+ncols

    ! workspace query: send half plus receive half
    ! (the former value 2*(k*k+k*n+oldk) did not cover the k*oldk reserved part)
    if (lwork .eq. -1) then
        work(1,1) = DBLE(2*sendsize)
        return
    end if

    call MPI_Comm_rank(mpicomm,mpirank,mpierr)
    call MPI_Comm_size(mpicomm,mpiprocs,mpierr)

    call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, &
                              localsize,baseoffset,offset)

    if (localsize .gt. 0) then
        ! calculate inner product of householdervectors
        call dsyrk("Upper","Trans",k,localsize,1.0d0,v(baseoffset,1),ldv,0.0d0,work(1,1),k)

        ! calculate matrix matrix product of householder vectors and target matrix
        ! Z' = Y' * A
        call dgemm("Trans","Notrans",k,n,localsize,1.0d0,v(baseoffset,1),ldv, &
                   a(offset,1),lda,0.0d0,work(1,k+1),k)

        ! TODO: reserved for T merge parts
        work(1:k,n+k+1:n+k+oldk) = 0.0d0
    else
        work(1:k,1:(n+k+oldk)) = 0.0d0
    end if

    ! exchange data
#ifdef WITH_MPI
    call mpi_allreduce(work(1,sendoffset),work(1,recvoffset),sendsize,mpi_real8,mpi_sum,mpicomm,mpierr)
#else
    ! same memory as the former work(1:sendsize,recvoffset) = work(1:sendsize,sendoffset),
    ! written as conforming sections of the declared k-row buffer
    work(1:k,recvoffset:recvoffset+ncols-1) = work(1:k,sendoffset:sendoffset+ncols-1)
#endif

    ! generate T matrix (pdlarft)
    t(1:k,1:k) = 0.0d0 ! DEBUG: clear buffer first

    ! T1 = tau1
    ! | tauk  Tk-1' * (-tauk * Y(:,1,k+1:n) * Y(:,k))' |
    ! | 0           Tk-1                               |
    t(k,k) = tau(k)
    do icol=k-1,1,-1
        t(icol,icol+1:k) = -tau(icol)*work(icol,recvoffset+icol:recvoffset+k-1)
        call dtrmv("Upper","Trans","Nonunit",k-icol,t(icol+1,icol+1),ldt,t(icol,icol+1),ldt)
        t(icol,icol) = tau(icol)
    end do

    ! TODO: elmroth and gustavson

    ! update matrix (pdlarfb)
    ! Z' = T * Z'
    call dtrmm("Left","Upper","Notrans","Nonunit",k,n,1.0d0,t,ldt,work(1,recvoffset+k),k)

    ! A = A - Y * V'
    call dgemm("Notrans","Notrans",localsize,n,k,-1.0d0,v(baseoffset,1),ldv, &
               work(1,recvoffset+k),k,1.0d0,a(offset,1),lda)

end subroutine qr_pdlarft_pdlarfb_1dcomm

!> Computes the inner products Y'*Y of the n Householder vectors in v, sums them
!> over mpicomm and merges the blocksize-sized T blocks stored in t with
!> qr_tmerge_set_kernel.
!> Workspace: 2*n*n reals, addressed as work(n,*).
subroutine qr_pdlarft_set_merge_1dcomm(m,mb,n,blocksize,v,ldv,t,ldt,baseidx,rev,mpicomm,work,lwork)
    use precision
    use qr_utils_mod

    implicit none

    ! input variables (global)
    ! (declared before the arrays because n is used as an array bound)
    integer(kind=ik)  :: m,mb,n,blocksize,baseidx,rev,mpicomm

    ! input variables (local)
    integer(kind=ik)  :: ldv,ldt,lwork
    real(kind=rk)     :: v(ldv,*),t(ldt,*),work(n,*)

    ! local scalars
    integer(kind=ik)  :: localsize,offset,baseoffset
    integer(kind=ik)  :: mpirank,mpiprocs,mpierr

    ! workspace query
    if (lwork .eq. -1) then
        work(1,1) = DBLE(2*n*n)
        return
    end if

    call MPI_Comm_rank(mpicomm,mpirank,mpierr)
    call MPI_Comm_size(mpicomm,mpiprocs,mpierr)

    call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, &
                              localsize,baseoffset,offset)

    if (localsize .gt. 0) then
        call dsyrk("Upper","Trans",n,localsize,1.0d0,v(baseoffset,1),ldv,0.0d0,work(1,1),n)
    else
        work(1:n,1:n) = 0.0d0
    end if

#ifdef WITH_MPI
    call mpi_allreduce(work(1,1),work(1,n+1),n*n,mpi_real8,mpi_sum,mpicomm,mpierr)
#else
    work(1:n,n+1:n+1+n-1) = work(1:n,1:n)
#endif

    ! skip Y4'*Y4 part
    offset = mod(n,blocksize)
    if (offset .eq. 0) offset=blocksize
    call qr_tmerge_set_kernel(n,blocksize,t,ldt,work(1,n+1+offset),n)

end subroutine qr_pdlarft_set_merge_1dcomm

!> Same as qr_pdlarft_set_merge_1dcomm, but merges the T blocks with
!> qr_tmerge_tree_kernel using the given treeorder.
!> Returns without any work if n <= blocksize (after the workspace query).
!> Workspace: 2*n*n reals, addressed as work(n,*).
subroutine qr_pdlarft_tree_merge_1dcomm(m,mb,n,blocksize,treeorder,v,ldv,t,ldt,baseidx,rev,mpicomm,work,lwork)
    use precision
    use qr_utils_mod

    implicit none

    ! input variables (global)
    ! (declared before the arrays because n is used as an array bound)
    integer(kind=ik) :: m,mb,n,blocksize,treeorder,baseidx,rev,mpicomm

    ! input variables (local)
    integer(kind=ik) :: ldv,ldt,lwork
    real(kind=rk)    :: v(ldv,*),t(ldt,*),work(n,*)

    ! local scalars
    integer(kind=ik) :: localsize,offset,baseoffset
    integer(kind=ik) :: mpirank,mpiprocs,mpierr

    ! workspace query
    if (lwork .eq. -1) then
        work(1,1) = DBLE(2*n*n)
        return
    end if

    if (n .le. blocksize) return ! nothing to do

    call MPI_Comm_rank(mpicomm,mpirank,mpierr)
    call MPI_Comm_size(mpicomm,mpiprocs,mpierr)

    call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, &
                              localsize,baseoffset,offset)

    if (localsize .gt. 0) then
        call dsyrk("Upper","Trans",n,localsize,1.0d0,v(baseoffset,1),ldv,0.0d0,work(1,1),n)
    else
        work(1:n,1:n) = 0.0d0
    end if

#ifdef WITH_MPI
    call mpi_allreduce(work(1,1),work(1,n+1),n*n,mpi_real8,mpi_sum,mpicomm,mpierr)
#else
    work(1:n,n+1:n+1+n-1) = work(1:n,1:n)
#endif

    ! skip Y4'*Y4 part
    offset = mod(n,blocksize)
    if (offset .eq. 0) offset=blocksize
    call qr_tmerge_tree_kernel(n,blocksize,treeorder,t,ldt,work(1,n+1+offset),n)

end subroutine qr_pdlarft_tree_merge_1dcomm

!> Applies a single Householder vector to the left:
!> A = A - tau * v * (v' * A) for the n local columns of a.
!> - assume unitary matrix
!> - assume right positions for v
!> Returns without any work if n <= 0 or idx <= 1 (after the workspace query).
!> Workspace: 2*n reals (first half send buffer, second half receive buffer).
!> NOTE(review): v is sliced contiguously below, so only incv = 1 appears to be
!> handled correctly - confirm against callers.
subroutine qr_pdlarfl_1dcomm(v,incv,baseidx,a,lda,tau,work,lwork,m,n,idx,mb,rev,mpicomm)
    use precision
    use ELPA1
    use qr_utils_mod

    implicit none

    ! input variables (local)
    integer(kind=ik) :: incv,lda,lwork,baseidx
    real(kind=rk)    :: v(*),a(lda,*),work(*)

    ! input variables (global)
    integer(kind=ik) :: m,n,mb,rev,idx,mpicomm
    real(kind=rk)    :: tau

    ! local scalars
    integer(kind=ik) :: mpierr,mpirank,mpiprocs
    integer(kind=ik) :: sendsize,recvsize,icol
    integer(kind=ik) :: local_size,local_offset
    integer(kind=ik) :: v_local_offset

    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

    sendsize = n
    recvsize = sendsize

    ! workspace query
    if (lwork .eq. -1) then
        work(1) = DBLE(sendsize + recvsize)
        return
    end if

    if (n .le. 0) return

    if (idx .le. 1) return

    call local_size_offset_1d(m,mb,baseidx,idx,rev,mpirank,mpiprocs, &
                              local_size,v_local_offset,local_offset)

    v_local_offset = v_local_offset * incv

    ! local part of v' * A, one entry per column
    if (local_size > 0) then
        do icol=1,n
            work(icol) = dot_product(v(v_local_offset:v_local_offset+local_size-1), &
                                     a(local_offset:local_offset+local_size-1,icol))
        end do
    else
        work(1:n) = 0.0d0
    end if

#ifdef WITH_MPI
    call mpi_allreduce(work, work(sendsize+1), sendsize, mpi_real8, mpi_sum, mpicomm, mpierr)
#else
    ! copy the send half into the receive half; the former left-hand section
    ! had 2*sendsize+1 elements and did not conform to the right-hand side
    work(sendsize+1:2*sendsize) = work(1:sendsize)
#endif

    if (local_size > 0) then
        do icol=1,n
            a(local_offset:local_offset+local_size-1,icol) = a(local_offset:local_offset+local_size-1,icol) &
                - tau*work(sendsize+icol)*v(v_local_offset:v_local_offset+local_size-1)
        enddo
    end if

end subroutine qr_pdlarfl_1dcomm

!> Applies two Householder vectors (columns 1 and 2 of v, 2x2 factor t) to the
!> n local columns of a with a single reduction over mpicomm.
!> For idx <= 2 only one vector exists and the call is forwarded to
!> qr_pdlarfl_1dcomm with v(:,2) and t(2,2).
!> Workspace: 4*n reals (2*n send buffer followed by 2*n receive buffer).
subroutine qr_pdlarfl2_tmatrix_1dcomm(v,ldv,baseidx,a,lda,t,ldt,work,lwork,m,n,idx,mb,rev,mpicomm)
    use precision
    use ELPA1
    use qr_utils_mod

    implicit none

    ! input variables (local)
    integer(kind=ik) :: ldv,lda,lwork,baseidx,ldt
    real(kind=rk)    :: v(ldv,*),a(lda,*),work(*),t(ldt,*)

    ! input variables (global)
    integer(kind=ik) :: m,n,mb,rev,idx,mpicomm

    ! local scalars
    integer(kind=ik) :: mpierr,mpirank,mpiprocs,mpirank_top1,mpirank_top2
    integer(kind=ik) :: dgemv1_offset,dgemv2_offset
    integer(kind=ik) :: sendsize, recvsize
    integer(kind=ik) :: local_size1,local_offset1
    integer(kind=ik) :: local_size2,local_offset2
    integer(kind=ik) :: local_size_dger,local_offset_dger
    integer(kind=ik) :: v1_local_offset,v2_local_offset
    integer(kind=ik) :: v_local_offset_dger
    real(kind=rk)    :: hvdot
    integer(kind=ik) :: irow,icol,v1col,v2col

    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

    sendsize = 2*n
    recvsize = sendsize

    ! workspace query
    if (lwork .eq. -1) then
        work(1) = DBLE(sendsize + recvsize)
        return
    end if

    dgemv1_offset = 1
    dgemv2_offset = dgemv1_offset + n

    ! in 2x2 matrix case only one householder vector was generated
    if (idx .le. 2) then
        call qr_pdlarfl_1dcomm(v(1,2),1,baseidx,a,lda,t(2,2), &
                               work,lwork,m,n,idx,mb,rev,mpicomm)
        return
    end if

    call local_size_offset_1d(m,mb,baseidx,idx,rev,mpirank,mpiprocs, &
                              local_size1,v1_local_offset,local_offset1)
    call local_size_offset_1d(m,mb,baseidx,idx-1,rev,mpirank,mpiprocs, &
                              local_size2,v2_local_offset,local_offset2)

    v1col = 2
    v2col = 1

    ! keep buffers clean in case that local_size1/local_size2 are zero
    work(1:sendsize) = 0.0d0

    call dgemv("Trans",local_size1,n,1.0d0,a(local_offset1,1),lda, &
               v(v1_local_offset,v1col),1,0.0d0,work(dgemv1_offset),1)
    call dgemv("Trans",local_size2,n,t(v2col,v2col),a(local_offset2,1),lda, &
               v(v2_local_offset,v2col),1,0.0d0,work(dgemv2_offset),1)

#ifdef WITH_MPI
    call mpi_allreduce(work, work(sendsize+1), sendsize, mpi_real8, mpi_sum, mpicomm, mpierr)
#else
    work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize)
#endif

    ! update second vector
    call daxpy(n,t(1,2),work(sendsize+dgemv1_offset),1,work(sendsize+dgemv2_offset),1)

    call local_size_offset_1d(m,mb,baseidx,idx-2,rev,mpirank,mpiprocs, &
                              local_size_dger,v_local_offset_dger,local_offset_dger)

    ! get ranks of processes with topelements
    mpirank_top1 = MOD((idx-1)/mb,mpiprocs)
    mpirank_top2 = MOD((idx-2)/mb,mpiprocs)

    if (mpirank_top1 .eq. mpirank) local_offset1 = local_size1
    if (mpirank_top2 .eq. mpirank) then
        local_offset2 = local_size2
        v2_local_offset = local_size2
    end if

    ! use hvdot as temporary variable
    hvdot = t(v1col,v1col)
    do icol=1,n
        ! make use of "1" entries in householder vectors
        if (mpirank_top1 .eq. mpirank) then
            a(local_offset1,icol) = a(local_offset1,icol) &
                - work(sendsize+dgemv1_offset+icol-1)*hvdot
        end if

        if (mpirank_top2 .eq. mpirank) then
            a(local_offset2,icol) = a(local_offset2,icol) &
                - v(v2_local_offset,v1col)*work(sendsize+dgemv1_offset+icol-1)*hvdot &
                - work(sendsize+dgemv2_offset+icol-1)
        end if

        do irow=1,local_size_dger
            a(local_offset_dger+irow-1,icol) = a(local_offset_dger+irow-1,icol) &
                - work(sendsize+dgemv1_offset+icol-1)*v(v_local_offset_dger+irow-1,v1col)*hvdot &
                - work(sendsize+dgemv2_offset+icol-1)*v(v_local_offset_dger+irow-1,v2col)
        end do
    end do

end subroutine qr_pdlarfl2_tmatrix_1dcomm

!> Generalized pdlarfl2 version: merges the T factor of k new Householder
!> vectors with that of oldk older ones (if oldk > 0) and applies the block
!> reflector to the n local columns of a, using one reduction over mpicomm.
!> updatemode = ichar('I') updates with all k+oldk vectors, any other value
!> updates with the k new vectors only.
!> Workspace: 2*(updatelda*n + k*oldk) reals, where updatelda is oldk+k for
!> updatemode 'I' and k otherwise.
! TODO: include T merge here (separate by "old" and "new" index)
subroutine qr_tmerge_pdlarfb_1dcomm(m,mb,n,oldk,k,v,ldv,t,ldt,a,lda,baseidx,rev,updatemode,mpicomm,work,lwork)
    use precision
    use qr_utils_mod

    implicit none

    ! input variables (local)
    integer(kind=ik) :: ldv,ldt,lda,lwork
    real(kind=rk)    :: v(ldv,*),t(ldt,*),work(*),a(lda,*)

    ! input variables (global)
    integer(kind=ik) :: m,mb,n,k,oldk,baseidx,rev,updatemode,mpicomm

    ! local scalars
    integer(kind=ik) :: localsize,offset,baseoffset
    integer(kind=ik) :: mpirank,mpiprocs,mpierr

    integer(kind=ik) :: sendoffset,recvoffset,sendsize
    integer(kind=ik) :: updateoffset,updatelda,updatesize
    integer(kind=ik) :: mergeoffset,mergelda,mergesize
    integer(kind=ik) :: tgenoffset,tgenlda,tgensize

    if (updatemode .eq. ichar('I')) then
        updatelda = oldk+k
    else
        updatelda = k
    end if

    updatesize = updatelda*n

    mergelda = k
    mergesize = mergelda*oldk

    ! no T generation part is exchanged yet
    tgenlda = 0
    tgensize = 0

    sendsize = updatesize + mergesize + tgensize

    ! workspace query
    if (lwork .eq. -1) then
        work(1) = DBLE(2*sendsize)
        return
    end if

    call MPI_Comm_rank(mpicomm,mpirank,mpierr)
    call MPI_Comm_size(mpicomm,mpiprocs,mpierr)

    ! use baseidx as idx here, otherwise the upper triangle part will be lost
    ! during the calculation, especially in the reversed case
    call local_size_offset_1d(m,mb,baseidx,baseidx,rev,mpirank,mpiprocs, &
                              localsize,baseoffset,offset)

    sendoffset = 1

    if (oldk .gt. 0) then
        updateoffset = 0
        mergeoffset = updateoffset + updatesize
        tgenoffset = mergeoffset + mergesize

        sendsize = updatesize + mergesize + tgensize

        if (localsize .gt. 0) then
            ! calculate matrix matrix product of householder vectors and target matrix
            if (updatemode .eq. ichar('I')) then
                ! Z' = (Y1,Y2)' * A
                call dgemm("Trans","Notrans",k+oldk,n,localsize,1.0d0,v(baseoffset,1),ldv,a(offset,1),lda,0.0d0, &
                           work(sendoffset+updateoffset),updatelda)
            else
                ! Z' = Y1' * A
                call dgemm("Trans","Notrans",k,n,localsize,1.0d0,v(baseoffset,1),ldv,a(offset,1),lda,0.0d0, &
                           work(sendoffset+updateoffset),updatelda)
            end if

            ! calculate parts needed for T merge
            call dgemm("Trans","Notrans",k,oldk,localsize,1.0d0,v(baseoffset,1),ldv,v(baseoffset,k+1),ldv,0.0d0, &
                       work(sendoffset+mergeoffset),mergelda)
        else
            ! cleanup buffer
            work(sendoffset:sendoffset+sendsize-1) = 0.0d0
        end if
    else
        ! do not calculate parts for T merge as there is nothing to merge
        mergeoffset = 0
        updateoffset = 0

        tgenoffset = updateoffset + updatesize

        sendsize = updatesize + tgensize

        if (localsize .gt. 0) then
            ! calculate matrix matrix product of householder vectors and target matrix
            ! Z' = (Y1)' * A
            call dgemm("Trans","Notrans",k,n,localsize,1.0d0,v(baseoffset,1),ldv,a(offset,1),lda,0.0d0, &
                       work(sendoffset+updateoffset),updatelda)
        else
            ! cleanup buffer
            work(sendoffset:sendoffset+sendsize-1) = 0.0d0
        end if
    end if

    recvoffset = sendoffset + sendsize

    if (sendsize .le. 0) return ! nothing to do

    ! exchange data
#ifdef WITH_MPI
    call mpi_allreduce(work(sendoffset),work(recvoffset),sendsize,mpi_real8,mpi_sum,mpicomm,mpierr)
#else
    work(recvoffset:recvoffset+sendsize-1) = work(sendoffset:sendoffset+sendsize-1)
#endif

    ! from here on the offsets address the receive half of the buffer
    updateoffset = recvoffset+updateoffset
    mergeoffset = recvoffset+mergeoffset
    tgenoffset = recvoffset+tgenoffset

    if (oldk .gt. 0) then
        call qr_pdlarft_merge_kernel_local(oldk,k,t,ldt,work(mergeoffset),mergelda)

        if (localsize .gt. 0) then
            if (updatemode .eq. ichar('I')) then
                ! update matrix (pdlarfb) with complete T
                call qr_pdlarfb_kernel_local(localsize,n,k+oldk,a(offset,1),lda,v(baseoffset,1),ldv,t(1,1),ldt, &
                                             work(updateoffset),updatelda)
            else
                ! update matrix (pdlarfb) with small T (same as update with no old T TODO)
                call qr_pdlarfb_kernel_local(localsize,n,k,a(offset,1),lda,v(baseoffset,1),ldv,t(1,1),ldt, &
                                             work(updateoffset),updatelda)
            end if
        end if
    else
        if (localsize .gt. 0) then
            ! update matrix (pdlarfb) with small T
            call qr_pdlarfb_kernel_local(localsize,n,k,a(offset,1),lda,v(baseoffset,1),ldv,t(1,1),ldt, &
                                         work(updateoffset),updatelda)
        end if
    end if

end subroutine qr_tmerge_pdlarfb_1dcomm

end module elpa_pdlarfb
elpa-2016.05.001/src/elpa_qr/elpa_qrkernels.f900000644000312500001440000006103612717516040015574 00000000000000! This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
! 
- Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! calculates A = A - Y*T'*Z (rev=0)
! calculates A = A - Y*T*Z  (rev=1)
! T upper triangle matrix
! assuming zero entries in matrix in upper kxk block
!
! Local update kernel: a is m x n, v holds the k Householder vectors as
! columns, t is the k x k upper triangular factor and z is the k x n matrix
! of (already reduced) inner products. k = 1..4 use hand-written code, any
! other k falls back to dtrmm + dgemm, which overwrites z with T*Z.
subroutine qr_pdlarfb_kernel_local(m,n,k,a,lda,v,ldv,t,ldt,z,ldz)
    use precision
    implicit none

    ! input variables (local)
    integer(kind=ik) :: lda,ldv,ldt,ldz
    real(kind=rk)    :: a(lda,*),v(ldv,*),t(ldt,*),z(ldz,*)

    ! input variables (global)
    integer(kind=ik) :: m,n,k

    ! local variables
    real(kind=rk)    :: t11
    real(kind=rk)    :: t12,t22,sum1,sum2
    real(kind=rk)    :: t13,t23,t33,sum3
    real(kind=rk)    :: sum4,t44
    real(kind=rk)    :: y1,y2,y3,y4
    real(kind=rk)    :: a1
    integer(kind=ik) :: icol,irow,v1col,v2col,v3col

    if (k .eq. 1) then
        t11 = t(1,1)
        do icol=1,n
            sum1 = z(1,icol)
            a(1:m,icol) = a(1:m,icol) - t11*sum1*v(1:m,1)
        enddo
    else if (k .eq. 2) then
        ! the t?? names are indexed in reversed vector order:
        ! t11 belongs to column v1col = 2, t22 to column v2col = 1
        v1col = 2
        v2col = 1
        t22 = t(1,1)
        t12 = t(1,2)
        t11 = t(2,2)

        do icol=1,n
            sum1 = t11 * z(v1col,icol)
            sum2 = t12 * z(v1col,icol) + t22 * z(v2col,icol)

            do irow=1,m
                a(irow,icol) = a(irow,icol) - v(irow,v1col) * sum1 - v(irow,v2col) * sum2
            end do
        end do
    else if (k .eq. 3) then
        v1col = 3
        v2col = 2
        v3col = 1

        t33 = t(1,1)

        t23 = t(1,2)
        t22 = t(2,2)

        t13 = t(1,3)
        t12 = t(2,3)
        t11 = t(3,3)

        do icol=1,n
            ! misusing variables for fetch of z parts
            y1 = z(v1col,icol)
            y2 = z(v2col,icol)
            y3 = z(v3col,icol)

            sum1 = t11 * y1 ! + 0 * y2 + 0 * y3
            sum2 = t12 * y1 + t22 * y2 ! + 0 * y3
            sum3 = t13 * y1 + t23 * y2 + t33 * y3

            do irow=1,m
                a(irow,icol) = a(irow,icol) - v(irow,v1col) * sum1 - v(irow,v2col) * sum2 &
                                            - v(irow,v3col) * sum3
            end do
        end do
    else if (k .eq. 4) then
        do icol=1,n
            ! misusing variables for fetch of z parts
            y1 = z(1,icol)
            y2 = z(2,icol)
            y3 = z(3,icol)
            y4 = z(4,icol)

            ! dtrmv like - starting from main diagonal and working
            ! upwards
            t11 = t(1,1)
            t22 = t(2,2)
            t33 = t(3,3)
            t44 = t(4,4)

            sum1 = t11 * y1
            sum2 = t22 * y2
            sum3 = t33 * y3
            sum4 = t44 * y4

            t11 = t(1,2)
            t22 = t(2,3)
            t33 = t(3,4)

            sum1 = sum1 + t11 * y2
            sum2 = sum2 + t22 * y3
            sum3 = sum3 + t33 * y4

            t11 = t(1,3)
            t22 = t(2,4)

            sum1 = sum1 + t11 * y3
            sum2 = sum2 + t22 * y4

            t11 = t(1,4)
            sum1 = sum1 + t11 * y4

            ! one column of V is calculated
            ! time to calculate A - Y * V
            do irow=1,m ! TODO: loop unrolling
                y1 = v(irow,1)
                y2 = v(irow,2)
                y3 = v(irow,3)
                y4 = v(irow,4)

                a1 = a(irow,icol)

                a1 = a1 - y1*sum1
                a1 = a1 - y2*sum2
                a1 = a1 - y3*sum3
                a1 = a1 - y4*sum4

                a(irow,icol) = a1
            end do
        end do
    else
        ! reference implementation
        ! V' = T * Z'
        call dtrmm("Left","Upper","Notrans","Nonunit",k,n,1.0d0,t,ldt,z,ldz)
        ! A = A - Y * V'
        call dgemm("Notrans","Notrans",m,n,k,-1.0d0,v,ldv,z,ldz,1.0d0,a,lda)
    end if

end subroutine

! Merges two triangular factors stored on the diagonal of t: the k x k block
! T1 = t(1:k,1:k) and the oldk x oldk block T2 = t(k+1:k+oldk,k+1:k+oldk).
! The off-diagonal block t(1:k,k+1:k+oldk) is overwritten with
! -T1 * yty * T2, where yty(1:k,1:oldk) holds the inner products Y1'*Y2.
! k = 1, 2, 4 and 8 use unrolled code, any other k uses two dtrmm calls.
subroutine qr_pdlarft_merge_kernel_local(oldk,k,t,ldt,yty,ldy)
    use precision
    implicit none

    ! input variables (local)
    integer(kind=ik) :: ldt,ldy
    real(kind=rk)    :: t(ldt,*),yty(ldy,*)

    ! input variables (global)
    integer(kind=ik) :: k,oldk

    ! local scalars
    integer(kind=ik) :: icol,leftk,rightk

    ! local scalars for optimized versions
    integer(kind=ik) :: irow
    real(kind=rk)    :: t11
    real(kind=rk)    :: yty1,yty2,yty3,yty4,yty5,yty6,yty7,yty8
    real(kind=rk)    :: reg01,reg02,reg03,reg04,reg05,reg06,reg07,reg08
    real(kind=rk)    :: final01,final02,final03,final04,final05,final06,final07,final08

    if (oldk .eq. 0) return ! nothing to be done

    leftk = k
    rightk = oldk

    ! optimized implementations:
    if (leftk .eq. 1) then
        do icol=1,rightk
            ! multiply inner products with right t matrix
            ! (dtrmv like)
            yty1 = yty(1,1)
            t11 = t(leftk+1,leftk+icol)

            reg01 = yty1 * t11

            do irow=2,icol
                yty1 = yty(1,irow)
                t11 = t(leftk+irow,leftk+icol)

                reg01 = reg01 + yty1 * t11
            end do

            ! multiply intermediate results with left t matrix and store in final t
            ! matrix
            t11 = -t(1,1)
            final01 = t11 * reg01
            t(1,leftk+icol) = final01
        end do
    else if (leftk .eq. 2) then
        do icol=1,rightk
            ! multiply inner products with right t matrix
            ! (dtrmv like)
            yty1 = yty(1,1)
            yty2 = yty(2,1)

            t11 = t(leftk+1,leftk+icol)

            reg01 = yty1 * t11
            reg02 = yty2 * t11

            do irow=2,icol
                yty1 = yty(1,irow)
                yty2 = yty(2,irow)
                t11 = t(leftk+irow,leftk+icol)

                reg01 = reg01 + yty1 * t11
                reg02 = reg02 + yty2 * t11
            end do

            ! multiply intermediate results with left t matrix and store in final t
            ! matrix
            yty1 = -t(1,1)
            yty2 = -t(1,2)
            yty3 = -t(2,2)

            final01 = reg02 * yty2
            final02 = reg02 * yty3

            final01 = final01 + reg01 * yty1

            t(1,leftk+icol) = final01
            t(2,leftk+icol) = final02
        end do
    else if (leftk .eq. 4) then
        do icol=1,rightk
            ! multiply inner products with right t matrix
            ! (dtrmv like)
            yty1 = yty(1,1)
            yty2 = yty(2,1)
            yty3 = yty(3,1)
            yty4 = yty(4,1)

            t11 = t(leftk+1,leftk+icol)

            reg01 = yty1 * t11
            reg02 = yty2 * t11
            reg03 = yty3 * t11
            reg04 = yty4 * t11

            do irow=2,icol
                yty1 = yty(1,irow)
                yty2 = yty(2,irow)
                yty3 = yty(3,irow)
                yty4 = yty(4,irow)

                t11 = t(leftk+irow,leftk+icol)

                reg01 = reg01 + yty1 * t11
                reg02 = reg02 + yty2 * t11
                reg03 = reg03 + yty3 * t11
                reg04 = reg04 + yty4 * t11
            end do

            ! multiply intermediate results with left t matrix and store in final t
            ! matrix (start from diagonal and move upwards)
            yty1 = -t(1,1)
            yty2 = -t(2,2)
            yty3 = -t(3,3)
            yty4 = -t(4,4)

            ! main diagonal
            final01 = reg01 * yty1
            final02 = reg02 * yty2
            final03 = reg03 * yty3
            final04 = reg04 * yty4

            ! above main diagonal
            yty1 = -t(1,2)
            yty2 = -t(2,3)
            yty3 = -t(3,4)

            final01 = final01 + reg02 * yty1
            final02 = final02 + reg03 * yty2
            final03 = final03 + reg04 * yty3

            ! above first side diagonal
            yty1 = -t(1,3)
            yty2 = -t(2,4)

            final01 = final01 + reg03 * yty1
            final02 = final02 + reg04 * yty2

            ! above second side diagonal
            yty1 = -t(1,4)

            final01 = final01 + reg04 * yty1

            ! write back to final matrix
            t(1,leftk+icol) = final01
            t(2,leftk+icol) = final02
            t(3,leftk+icol) = final03
            t(4,leftk+icol) = final04
        end do
    else if (leftk .eq. 8) then
        do icol=1,rightk
            ! multiply inner products with right t matrix
            ! (dtrmv like)
            yty1 = yty(1,1)
            yty2 = yty(2,1)
            yty3 = yty(3,1)
            yty4 = yty(4,1)
            yty5 = yty(5,1)
            yty6 = yty(6,1)
            yty7 = yty(7,1)
            yty8 = yty(8,1)

            t11 = t(leftk+1,leftk+icol)

            reg01 = yty1 * t11
            reg02 = yty2 * t11
            reg03 = yty3 * t11
            reg04 = yty4 * t11
            reg05 = yty5 * t11
            reg06 = yty6 * t11
            reg07 = yty7 * t11
            reg08 = yty8 * t11

            do irow=2,icol
                yty1 = yty(1,irow)
                yty2 = yty(2,irow)
                yty3 = yty(3,irow)
                yty4 = yty(4,irow)
                yty5 = yty(5,irow)
                yty6 = yty(6,irow)
                yty7 = yty(7,irow)
                yty8 = yty(8,irow)

                t11 = t(leftk+irow,leftk+icol)

                reg01 = reg01 + yty1 * t11
                reg02 = reg02 + yty2 * t11
                reg03 = reg03 + yty3 * t11
                reg04 = reg04 + yty4 * t11
                reg05 = reg05 + yty5 * t11
                reg06 = reg06 + yty6 * t11
                reg07 = reg07 + yty7 * t11
                reg08 = reg08 + yty8 * t11
            end do

            ! multiply intermediate results with left t matrix and store in final t
            ! matrix (start from diagonal and move upwards)
            yty1 = -t(1,1)
            yty2 = -t(2,2)
            yty3 = -t(3,3)
            yty4 = -t(4,4)
            yty5 = -t(5,5)
            yty6 = -t(6,6)
            yty7 = -t(7,7)
            yty8 = -t(8,8)

            ! main diagonal
            final01 = reg01 * yty1
            final02 = reg02 * yty2
            final03 = reg03 * yty3
            final04 = reg04 * yty4
            final05 = reg05 * yty5
            final06 = reg06 * yty6
            final07 = reg07 * yty7
            final08 = reg08 * yty8

            ! above main diagonal
            yty1 = -t(1,2)
            yty2 = -t(2,3)
            yty3 = -t(3,4)
            yty4 = -t(4,5)
            yty5 = -t(5,6)
            yty6 = -t(6,7)
            yty7 = -t(7,8)

            final01 = final01 + reg02 * yty1
            final02 = final02 + reg03 * yty2
            final03 = final03 + reg04 * yty3
            final04 = final04 + reg05 * yty4
            final05 = final05 + reg06 * yty5
            final06 = final06 + reg07 * yty6
            final07 = final07 + reg08 * yty7

            ! above first side diagonal
            yty1 = -t(1,3)
            yty2 = -t(2,4)
            yty3 = -t(3,5)
            yty4 = -t(4,6)
            yty5 = -t(5,7)
            yty6 = -t(6,8)

            final01 = final01 + reg03 * yty1
            final02 = final02 + reg04 * yty2
            final03 = final03 + reg05 * yty3
            final04 = final04 + reg06 * yty4
            final05 = final05 + reg07 * yty5
            final06 = final06 + reg08 * yty6

            ! above second side diagonal
            yty1 = -t(1,4)
            yty2 = -t(2,5)
            yty3 = -t(3,6)
            yty4 = -t(4,7)
            yty5 = -t(5,8)

            final01 = final01 + reg04 * yty1
            final02 = final02 + reg05 * yty2
            final03 = final03 + reg06 * yty3
            final04 = final04 + reg07 * yty4
            final05 = final05 + reg08 * yty5

            ! above third side diagonal
            yty1 = -t(1,5)
            yty2 = -t(2,6)
            yty3 = -t(3,7)
            yty4 = -t(4,8)

            final01 = final01 + reg05 * yty1
            final02 = final02 + reg06 * yty2
            final03 = final03 + reg07 * yty3
            final04 = final04 + reg08 * yty4

            ! above fourth side diagonal
            yty1 = -t(1,6)
            yty2 = -t(2,7)
            yty3 = -t(3,8)

            final01 = final01 + reg06 * yty1
            final02 = final02 + reg07 * yty2
            final03 = final03 + reg08 * yty3

            ! above fifth side diagonal
            yty1 = -t(1,7)
            yty2 = -t(2,8)

            final01 = final01 + reg07 * yty1
            final02 = final02 + reg08 * yty2

            ! above sixth side diagonal
            yty1 = -t(1,8)

            final01 = final01 + reg08 * yty1

            ! write back to final matrix
            t(1,leftk+icol) = final01
            t(2,leftk+icol) = final02
            t(3,leftk+icol) = final03
            t(4,leftk+icol) = final04
            t(5,leftk+icol) = final05
            t(6,leftk+icol) = final06
            t(7,leftk+icol) = final07
            t(8,leftk+icol) = final08
        end do
    else
        ! reference implementation
        do icol=1,rightk
            t(1:leftk,leftk+icol) = yty(1:leftk,icol)
        end do

        ! -T1 * Y1'*Y2
        call dtrmm("Left","Upper","Notrans","Nonunit",leftk,rightk,-1.0d0,t(1,1),ldt,t(1,leftk+1),ldt)
        ! (-T1 * Y1'*Y2) * T2
        call dtrmm("Right","Upper","Notrans","Nonunit",leftk,rightk,1.0d0,t(leftk+1,leftk+1),ldt,t(1,leftk+1),ldt)
    end if

end subroutine

! yty structure
! Y1'*Y2   Y1'*Y3  Y1'*Y4 ...
!    0     Y2'*Y3  Y2'*Y4 ...
!    0        0    Y3'*Y4 ...
!    0        0       0   ...
!
! Merges the T blocks on the diagonal of the k x k matrix t one after another
! by repeated calls of qr_pdlarft_merge_kernel_local. A leading block of size
! mod(k,blocksize) is treated first if k is not a multiple of blocksize.
! Nothing is done for k <= blocksize.
subroutine qr_tmerge_set_kernel(k,blocksize,t,ldt,yty,ldy)
    use precision
    implicit none

    ! input variables (local)
    integer(kind=ik) :: ldt,ldy
    real(kind=rk)    :: t(ldt,*),yty(ldy,*)

    ! input variables (global)
    integer(kind=ik) :: k,blocksize

    ! local scalars
    integer(kind=ik) :: nr_blocks,current_block
    integer(kind=ik) :: remainder,oldk
    integer(kind=ik) :: yty_column,toffset

    if (k .le. blocksize) return ! nothing to merge

    nr_blocks = k / blocksize
    remainder = k - nr_blocks*blocksize

    ! work in "negative" direction:
    ! start with latest T matrix part and add older ones
    toffset = 1
    yty_column = 1

    if (remainder .gt. 0) then
        call qr_pdlarft_merge_kernel_local(blocksize,remainder,t(toffset,toffset),ldt,yty(1,yty_column),ldy)
        current_block = 1
        oldk = remainder+blocksize
        yty_column = yty_column + blocksize
    else
        call qr_pdlarft_merge_kernel_local(blocksize,blocksize,t(toffset,toffset),ldt,yty(1,yty_column),ldy)
        current_block = 2
        oldk = 2*blocksize
        yty_column = yty_column + blocksize
    end if

    ! oldk is the size of the already merged leading part; it is passed as the
    ! "k" (left block) argument while the next block is the "oldk" argument
    do while (current_block .lt. nr_blocks)
        call qr_pdlarft_merge_kernel_local(blocksize,oldk,t(toffset,toffset),ldt,yty(toffset,yty_column),ldy)

        current_block = current_block + 1
        oldk = oldk + blocksize
        yty_column = yty_column + blocksize
    end do

end subroutine

! yty structure
! Y1'*Y2   Y1'*Y3  Y1'*Y4 ...
!    0     Y2'*Y3  Y2'*Y4 ...
!    0        0    Y3'*Y4 ...
!    0        0       0   ...
!
! Merges the T blocks of t in sets of up to treeorder blocks per level, the
! merged sets becoming the blocks of the next level. treeorder = 0 disables
! merging; treeorder = 1, or fewer than two blocks in k, falls back to
! qr_tmerge_set_kernel.
subroutine qr_tmerge_tree_kernel(k,blocksize,treeorder,t,ldt,yty,ldy)
    use precision
    implicit none

    ! input variables (local)
    integer(kind=ik) :: ldt,ldy
    real(kind=rk)    :: t(ldt,*),yty(ldy,*)

    ! input variables (global)
    integer(kind=ik) :: k,blocksize,treeorder

    ! local scalars
    ! (kind=ik like everywhere else in this file: these values are passed to
    !  integer(kind=ik) dummies and combined with treeorder in min())
    integer(kind=ik) :: temp_blocksize,nr_sets,current_set,setsize,nr_blocks
    integer(kind=ik) :: remainder,max_treeorder,remaining_size
    integer(kind=ik) :: toffset,yty_column
    integer(kind=ik) :: toffset_start,yty_column_start
    integer(kind=ik) :: total_remainder,yty_remainder

    if (treeorder .eq. 0) return ! no merging

    if (treeorder .eq. 1) then
        call qr_tmerge_set_kernel(k,blocksize,t,ldt,yty,ldy)
        return
    end if

    nr_blocks = k / blocksize
    max_treeorder = min(nr_blocks,treeorder)

    if (max_treeorder .eq. 1) then
        call qr_tmerge_set_kernel(k,blocksize,t,ldt,yty,ldy)
        return
    end if

    ! work in "negative" direction: from latest set to oldest set
    ! implementation differs from rev=0 version due to issues with
    ! calculating the remainder parts
    ! compared to the rev=0 version we split remainder parts directly from
    ! parts which can be easily merged in a recursive way

    yty_column_start = 1
    toffset_start = 1

    ! is there a remainder block?
    nr_blocks = k / blocksize
    remainder = k - nr_blocks * blocksize
    if (remainder .eq. 0) then
        ! set offsets to the very beginning as there is no remainder part
        yty_column_start = 1
        toffset_start = 1
        total_remainder = 0
        remaining_size = k
        yty_remainder = 0
    else
        ! select submatrix and make remainder block public
        yty_column_start = 1 + blocksize
        toffset_start = 1 + remainder
        total_remainder = remainder
        remaining_size = k - remainder
        yty_remainder = 1
    end if

    ! from now on it is a clean set of blocks with sizes of multiple of
    ! blocksize

    temp_blocksize = blocksize

    !-------------------------------
    do while (remaining_size .gt. 0)
        nr_blocks = remaining_size / temp_blocksize
        max_treeorder = min(nr_blocks,treeorder)

        if (max_treeorder .eq. 1) then
            remainder = 0
            nr_sets = 0
            setsize = 0

            ! final level: merge the collected remainder part with the rest;
            ! without a remainder nothing is left to merge
            if (yty_remainder .gt. 0) then
                yty_column = yty_remainder
                call qr_tmerge_set_kernel(k,temp_blocksize,t,ldt,yty(1,yty_column),ldy)
            endif

            remaining_size = 0

            return ! done
        else
            nr_sets = nr_blocks / max_treeorder
            setsize = max_treeorder*temp_blocksize
            remainder = remaining_size - nr_sets*setsize
        end if

        if (remainder .gt. 0) then
            if (remainder .gt. temp_blocksize) then
                toffset = toffset_start
                yty_column = yty_column_start

                call qr_tmerge_set_kernel(remainder,temp_blocksize,t(toffset,toffset),ldt,yty(toffset,yty_column),ldy)

                if (total_remainder .gt. 0) then
                    ! merge with existing global remainder part
                    call qr_pdlarft_merge_kernel_local(remainder,total_remainder,t(1,1),ldt,yty(1,yty_remainder),ldy)

                    yty_remainder = yty_remainder + remainder
                    toffset_start = toffset_start + remainder

                    yty_column_start = yty_column_start + remainder
                else
                    ! create new remainder part
                    yty_remainder = yty_column_start + remainder - temp_blocksize
                    yty_column_start = yty_column_start + remainder
                    toffset_start = toffset_start + remainder
                end if
            else
                if (total_remainder .gt. 0) then
                    ! merge with existing global remainder part
                    call qr_pdlarft_merge_kernel_local(remainder,total_remainder,t(1,1),ldt,yty(1,yty_remainder),ldy)

                    yty_remainder = yty_remainder + remainder
                    toffset_start = toffset_start + remainder

                    yty_column_start = yty_column_start + remainder
                else
                    ! create new remainder part
                    yty_remainder = yty_column_start
                    yty_column_start = yty_column_start + temp_blocksize
                    toffset_start = toffset_start + remainder
                end if
            end if

            total_remainder = total_remainder + remainder
            remaining_size = remaining_size - remainder
        end if

        ! merge every complete set of this level
        current_set = 0
        do while (current_set .lt. nr_sets)
            toffset = toffset_start + current_set * setsize
            yty_column = yty_column_start + current_set * setsize

            call qr_tmerge_set_kernel(setsize,temp_blocksize,t(toffset,toffset),ldt,yty(toffset,yty_column),ldy)

            current_set = current_set + 1
        end do

        ! the merged sets are the blocks of the next level
        yty_column_start = yty_column_start + (setsize - temp_blocksize)
        temp_blocksize = setsize
    end do
end subroutine

! yty should not contain the inner products vi'*vi
!
! Builds the n x n upper triangular factor t from the scalars tau and the
! inner products of the Householder vectors in yty, working from the last
! column backwards. Nothing is done for n <= 0.
subroutine qr_dlarft_kernel(n,tau,yty,ldy,t,ldt)
    use precision
    implicit none

    ! input variables
    integer(kind=ik) :: n,ldy,ldt
    real(kind=rk)    :: tau(*),yty(ldy,*)

    ! output variables
    real(kind=rk)    :: t(ldt,*)

    ! local variables
    integer(kind=ik) :: icol

    ! guard: t(n,n) below would otherwise address t(0,0)
    if (n .le. 0) return

    ! DEBUG: clear buffer first
    !t(1:n,1:n) = 0.0d0

    ! T1 = tau1
    ! | tauk  Tk-1' * (-tauk * Y(:,1,k+1:n) * Y(:,k))' |
    ! | 0           Tk-1                               |
    t(n,n) = tau(n)
    do icol=n-1,1,-1
        t(icol,icol+1:n) = -tau(icol)*yty(icol,icol:n-1)
        call dtrmv("Upper","Trans","Nonunit",n-icol,t(icol+1,icol+1),ldt,t(icol,icol+1),ldt)
        t(icol,icol) = tau(icol)
    end do
end subroutine
elpa-2016.05.001/src/elpa_qr/elpa_pdgeqrf.F900000644000312500001440000024465712717516040015172 00000000000000! This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! 
Informatik, ! - Technische Universität München, Lehrstuhl für Informatik mit ! Schwerpunkt Wissenschaftliches Rechnen, ! - Fritz-Haber-Institut, Berlin, Abt. Theorie, ! - Max-Planck-Institut für Mathematik in den Naturwissenschaften, ! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, ! and ! - IBM Deutschland GmbH ! ! ! More information can be found here: ! http://elpa.mpcdf.mpg.de/ ! ! ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see http://www.gnu.org/licenses/ ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! #include "config-f90.h" module elpa_pdgeqrf use elpa1_compute use elpa_pdlarfb use qr_utils_mod use elpa_mpi implicit none PRIVATE public :: qr_pdgeqrf_2dcomm public :: qr_pqrparam_init public :: qr_pdlarfg2_1dcomm_check contains subroutine qr_pdgeqrf_2dcomm(a, lda, matrixCols, v, ldv, vmrCols, tau, lengthTau, t, ldt, colsT, & work, workLength, lwork, m, n, mb, nb, rowidx, colidx, & rev, trans, PQRPARAM, mpicomm_rows, mpicomm_cols, blockheuristic) use precision use ELPA1 use qr_utils_mod #ifdef HAVE_DETAILED_TIMINGS use timings #endif implicit none ! parameter setup INTEGER(kind=ik), parameter :: gmode_ = 1, rank_ = 2, eps_ = 3 !
input variables (local) integer(kind=ik), intent(in) :: lda, lwork, ldv, ldt, matrixCols, m, vmrCols, lengthTau, & colsT, workLength ! input variables (global) integer(kind=ik) :: n, mb, nb, rowidx, colidx, rev, trans, mpicomm_cols, mpicomm_rows #ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR integer(kind=ik) :: PQRPARAM(*) real(kind=rk) :: a(lda,*), v(ldv,*), tau(*), t(ldt,*), work(*) #else integer(kind=ik) :: PQRPARAM(1:11) real(kind=rk) :: a(1:lda,1:matrixCols), v(1:ldv,1:vmrCols), tau(1:lengthTau), & t(1:ldt,1:colsT), work(1:workLength) #endif ! output variables (global) real(kind=rk) :: blockheuristic(*) ! input variables derived from PQRPARAM integer(kind=ik) :: updatemode,tmerge,size2d ! local scalars integer(kind=ik) :: mpierr,mpirank_cols,broadcast_size,mpirank_rows integer(kind=ik) :: mpirank_cols_qr,mpiprocs_cols integer(kind=ik) :: lcols_temp,lcols,icol,lastcol integer(kind=ik) :: baseoffset,offset,idx,voffset integer(kind=ik) :: update_voffset,update_tauoffset integer(kind=ik) :: update_lcols integer(kind=ik) :: work_offset real(kind=rk) :: dbroadcast_size(1),dtmat_bcast_size(1) real(kind=rk) :: pdgeqrf_size(1),pdlarft_size(1),pdlarfb_size(1),tmerge_pdlarfb_size(1) integer(kind=ik) :: temptau_offset,temptau_size,broadcast_offset,tmat_bcast_size integer(kind=ik) :: remaining_cols integer(kind=ik) :: total_cols integer(kind=ik) :: incremental_update_size ! needed for incremental update mode #ifdef HAVE_DETAILED_TIMINGS call timer%start("qr_pdgeqrf_2dcomm") #endif size2d = PQRPARAM(1) updatemode = PQRPARAM(2) tmerge = PQRPARAM(3) ! 
copy value before we are going to filter it total_cols = n call mpi_comm_rank(mpicomm_cols,mpirank_cols,mpierr) call mpi_comm_rank(mpicomm_rows,mpirank_rows,mpierr) call mpi_comm_size(mpicomm_cols,mpiprocs_cols,mpierr) #ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR call qr_pdgeqrf_1dcomm(a,lda,v,ldv,tau,t,ldt,pdgeqrf_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,trans, & PQRPARAM(4),mpicomm_rows,blockheuristic) #else call qr_pdgeqrf_1dcomm(a,lda,v,ldv,tau,t,ldt,pdgeqrf_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,trans, & PQRPARAM(4:11),mpicomm_rows,blockheuristic) #endif call qr_pdgeqrf_pack_unpack(v,ldv,dbroadcast_size(1),-1,m,total_cols,mb,rowidx,rowidx,rev,0,mpicomm_rows) call qr_pdgeqrf_pack_unpack_tmatrix(tau,t,ldt,dtmat_bcast_size(1),-1,total_cols,0) pdlarft_size(1) = 0.0d0 call qr_pdlarfb_1dcomm(m,mb,total_cols,total_cols,a,lda,v,ldv,tau,t,ldt,rowidx,rowidx,rev,mpicomm_rows, & pdlarfb_size(1),-1) call qr_tmerge_pdlarfb_1dcomm(m,mb,total_cols,total_cols,total_cols,v,ldv,t,ldt,a,lda,rowidx,rev,updatemode, & mpicomm_rows,tmerge_pdlarfb_size(1),-1) temptau_offset = 1 temptau_size = total_cols broadcast_offset = temptau_offset + temptau_size broadcast_size = dbroadcast_size(1) + dtmat_bcast_size(1) work_offset = broadcast_offset + broadcast_size if (lwork .eq. -1) then work(1) = (DBLE(temptau_size) + DBLE(broadcast_size) + max(pdgeqrf_size(1),pdlarft_size(1),pdlarfb_size(1), & tmerge_pdlarfb_size(1))) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("qr_pdgeqrf_2dcomm") #endif return end if lastcol = colidx-total_cols+1 voffset = total_cols incremental_update_size = 0 ! clear v buffer: just ensure that there is no junk in the upper triangle ! part, otherwise pdlarfb gets some problems ! pdlarfl(2) do not have these problems as they are working more on a vector ! basis v(1:ldv,1:total_cols) = 0.0d0 icol = colidx remaining_cols = total_cols !print *,'start decomposition',m,rowidx,colidx do while (remaining_cols .gt. 0) ! 
determine rank of process column with next qr block mpirank_cols_qr = MOD((icol-1)/nb,mpiprocs_cols) ! lcols can't be larger than than nb ! exception: there is only one process column ! however, we might not start at the first local column. ! therefore assume a matrix of size (1xlcols) starting at (1,icol) ! determine the real amount of local columns lcols_temp = min(nb,(icol-lastcol+1)) ! blocking parameter lcols_temp = max(min(lcols_temp,size2d),1) ! determine size from last decomposition column ! to first decomposition column call local_size_offset_1d(icol,nb,icol-lcols_temp+1,icol-lcols_temp+1,0, & mpirank_cols_qr,mpiprocs_cols, & lcols,baseoffset,offset) voffset = remaining_cols - lcols + 1 idx = rowidx - colidx + icol if (mpirank_cols .eq. mpirank_cols_qr) then ! qr decomposition part tau(offset:offset+lcols-1) = 0.0d0 #ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR call qr_pdgeqrf_1dcomm(a(1,offset),lda,v(1,voffset),ldv,tau(offset),t(voffset,voffset),ldt, & work(work_offset),lwork,m,lcols,mb,rowidx,idx,rev,trans,PQRPARAM(4), & mpicomm_rows,blockheuristic) #else call qr_pdgeqrf_1dcomm(a(1,offset),lda,v(1,voffset),ldv,tau(offset),t(voffset,voffset),ldt, & work(work_offset),lwork,m,lcols,mb,rowidx,idx,rev,trans,PQRPARAM(4:11), & mpicomm_rows,blockheuristic) #endif ! pack broadcast buffer (v + tau) call qr_pdgeqrf_pack_unpack(v(1,voffset),ldv,work(broadcast_offset),lwork,m,lcols,mb,rowidx,& idx,rev,0,mpicomm_rows) ! determine broadcast size call qr_pdgeqrf_pack_unpack(v(1,voffset),ldv,dbroadcast_size(1),-1,m,lcols,mb,rowidx,idx,rev,& 0,mpicomm_rows) broadcast_size = dbroadcast_size(1) !if (mpirank_rows .eq. 0) then ! pack tmatrix into broadcast buffer and calculate new size call qr_pdgeqrf_pack_unpack_tmatrix(tau(offset),t(voffset,voffset),ldt, & work(broadcast_offset+broadcast_size),lwork,lcols,0) call qr_pdgeqrf_pack_unpack_tmatrix(tau(offset),t(voffset,voffset),ldt,dtmat_bcast_size(1),-1,lcols,0) broadcast_size = broadcast_size + dtmat_bcast_size(1) !end if ! 
initiate broadcast (send part) #ifdef WITH_MPI call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real8, & mpirank_cols_qr,mpicomm_cols,mpierr) #endif ! copy tau parts into temporary tau buffer work(temptau_offset+voffset-1:temptau_offset+(voffset-1)+lcols-1) = tau(offset:offset+lcols-1) !print *,'generated tau:', tau(offset) else ! vector exchange part ! determine broadcast size call qr_pdgeqrf_pack_unpack(v(1,voffset),ldv,dbroadcast_size(1),-1,m,lcols,mb,rowidx,idx,rev,1,mpicomm_rows) broadcast_size = dbroadcast_size(1) call qr_pdgeqrf_pack_unpack_tmatrix(work(temptau_offset+voffset-1),t(voffset,voffset),ldt, & dtmat_bcast_size(1),-1,lcols,0) tmat_bcast_size = dtmat_bcast_size(1) !print *,'broadcast_size (nonqr)',broadcast_size broadcast_size = dbroadcast_size(1) + dtmat_bcast_size(1) ! initiate broadcast (recv part) #ifdef WITH_MPI call MPI_Bcast(work(broadcast_offset),broadcast_size,mpi_real8, & mpirank_cols_qr,mpicomm_cols,mpierr) #endif ! last n*n elements in buffer are (still empty) T matrix elements ! fetch from first process in each column ! unpack broadcast buffer (v + tau) call qr_pdgeqrf_pack_unpack(v(1,voffset),ldv,work(broadcast_offset),lwork,m,lcols,mb,rowidx,idx,rev,1,mpicomm_rows) ! now send t matrix to other processes in our process column broadcast_size = dbroadcast_size(1) tmat_bcast_size = dtmat_bcast_size(1) ! t matrix should now be available on all processes => unpack call qr_pdgeqrf_pack_unpack_tmatrix(work(temptau_offset+voffset-1),t(voffset,voffset),ldt, & work(broadcast_offset+broadcast_size),lwork,lcols,1) end if remaining_cols = remaining_cols - lcols ! apply householder vectors to whole trailing matrix parts (if any) update_voffset = voffset update_tauoffset = icol update_lcols = lcols incremental_update_size = incremental_update_size + lcols icol = icol - lcols ! 
count colums from first column of global block to current index call local_size_offset_1d(icol,nb,colidx-n+1,colidx-n+1,0, & mpirank_cols,mpiprocs_cols, & lcols,baseoffset,offset) if (lcols .gt. 0) then !print *,'updating trailing matrix' if (updatemode .eq. ichar('I')) then print *,'pdgeqrf_2dcomm: incremental update not yet implemented! rev=1' else if (updatemode .eq. ichar('F')) then ! full update no merging call qr_pdlarfb_1dcomm(m,mb,lcols,update_lcols,a(1,offset),lda,v(1,update_voffset),ldv, & work(temptau_offset+update_voffset-1), & t(update_voffset,update_voffset),ldt, & rowidx,idx,1,mpicomm_rows,work(work_offset),lwork) else ! full update + merging default call qr_tmerge_pdlarfb_1dcomm(m,mb,lcols,n-(update_voffset+update_lcols-1),update_lcols, & v(1,update_voffset),ldv, & t(update_voffset,update_voffset),ldt, & a(1,offset),lda,rowidx,1,updatemode,mpicomm_rows, & work(work_offset),lwork) end if else if (updatemode .eq. ichar('I')) then print *,'sole merging of (incremental) T matrix', mpirank_cols, & n-(update_voffset+incremental_update_size-1) call qr_tmerge_pdlarfb_1dcomm(m,mb,0,n-(update_voffset+incremental_update_size-1), & incremental_update_size,v(1,update_voffset),ldv, & t(update_voffset,update_voffset),ldt, & a,lda,rowidx,1,updatemode,mpicomm_rows,work(work_offset),lwork) ! reset for upcoming incremental updates incremental_update_size = 0 else if (updatemode .eq. ichar('M')) then ! final merge call qr_tmerge_pdlarfb_1dcomm(m,mb,0,n-(update_voffset+update_lcols-1),update_lcols, & v(1,update_voffset),ldv, & t(update_voffset,update_voffset),ldt, & a,lda,rowidx,1,updatemode,mpicomm_rows,work(work_offset),lwork) else ! full updatemode - nothing to update end if ! reset for upcoming incremental updates incremental_update_size = 0 end if end do if ((tmerge .gt. 0) .and. (updatemode .eq. ichar('F'))) then ! 
! finally merge all small T parts
        call qr_pdlarft_tree_merge_1dcomm(m,mb,n,size2d,tmerge,v,ldv,t,ldt,rowidx,rev,mpicomm_rows,work,lwork)
      end if

      !print *,'stop decomposition',rowidx,colidx

      ! Close the "qr_pdgeqrf_2dcomm" timing region that is opened with
      ! timer%start at the top of this routine. This exit path used to call
      ! timer%start a second time, so on every regular (non workspace-query)
      ! return the region was opened twice and never closed.
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("qr_pdgeqrf_2dcomm")
#endif

    end subroutine qr_pdgeqrf_2dcomm

    subroutine qr_pdgeqrf_1dcomm(a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,baseidx,rowidx,rev,trans,PQRPARAM,mpicomm,blockheuristic)
      use precision
      use ELPA1
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      ! parameter setup
      INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2,eps_ = 3

      ! input variables (local)
      integer(kind=ik)            :: lda,lwork,ldv,ldt
      real(kind=rk)               :: a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(*)

      ! input variables (global)
      integer(kind=ik)            :: m,n,mb,baseidx,rowidx,rev,trans,mpicomm
#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR
      integer(kind=ik)            :: PQRPARAM(*)
#else
      integer(kind=ik)            :: PQRPARAM(:)
#endif
      ! derived input variables

      ! derived further input variables from QR_PQRPARAM
      integer(kind=ik)            :: size1d,updatemode,tmerge

      ! output variables (global)
      real(kind=rk)               :: blockheuristic(*)

      ! local scalars
      integer(kind=ik)            :: nr_blocks,remainder,current_block,aoffset,idx,updatesize
      real(kind=rk)               :: pdgeqr2_size(1),pdlarfb_size(1),tmerge_tree_size(1)
#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("qr_pdgeqrf_1dcomm")
#endif
      ! the first three PQRPARAM entries select block size, update mode and
      ! T-merge order; the block size is clamped to the range 1..n
      size1d     = max(min(PQRPARAM(1),n),1)
      updatemode = PQRPARAM(2)
      tmerge     = PQRPARAM(3)

      ! lwork == -1: the callee is invoked with lwork = -1 as well and
      ! reports its size through pdgeqr2_size
      if (lwork .eq. -1) then
#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR
        call qr_pdgeqr2_1dcomm(a,lda,v,ldv,tau,t,ldt,pdgeqr2_size,-1, &
                                m,size1d,mb,baseidx,baseidx,rev,trans,PQRPARAM(4),mpicomm,blockheuristic)
#else
        call qr_pdgeqr2_1dcomm(a,lda,v,ldv,tau,t,ldt,pdgeqr2_size,-1, &
                                m,size1d,mb,baseidx,baseidx,rev,trans,PQRPARAM(4:),mpicomm,blockheuristic)
#endif
        !
reserve more space for incremental mode call qr_tmerge_pdlarfb_1dcomm(m,mb,n,n,n,v,ldv,t,ldt, & a,lda,baseidx,rev,updatemode,mpicomm,pdlarfb_size,-1) call qr_pdlarft_tree_merge_1dcomm(m,mb,n,size1d,tmerge,v,ldv,t,ldt,baseidx,rev,mpicomm,tmerge_tree_size,-1) work(1) = max(pdlarfb_size(1),pdgeqr2_size(1),tmerge_tree_size(1)) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("qr_pdgeqrf_1dcomm") #endif return end if nr_blocks = n / size1d remainder = n - nr_blocks*size1d current_block = 0 do while (current_block .lt. nr_blocks) idx = rowidx-current_block*size1d updatesize = n-(current_block+1)*size1d aoffset = 1+updatesize #ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR call qr_pdgeqr2_1dcomm(a(1,aoffset),lda,v(1,aoffset),ldv,tau(aoffset),t(aoffset,aoffset),ldt,work,lwork, & m,size1d,mb,baseidx,idx,1,trans,PQRPARAM(4),mpicomm,blockheuristic) #else call qr_pdgeqr2_1dcomm(a(1,aoffset),lda,v(1,aoffset),ldv,tau(aoffset),t(aoffset,aoffset),ldt,work,lwork, & m,size1d,mb,baseidx,idx,1,trans,PQRPARAM(4:),mpicomm,blockheuristic) #endif if (updatemode .eq. ichar('M')) then ! full update + merging call qr_tmerge_pdlarfb_1dcomm(m,mb,updatesize,current_block*size1d,size1d, & v(1,aoffset),ldv,t(aoffset,aoffset),ldt, & a,lda,baseidx,1,ichar('F'),mpicomm,work,lwork) else if (updatemode .eq. ichar('I')) then if (updatesize .ge. size1d) then ! incremental update + merging call qr_tmerge_pdlarfb_1dcomm(m,mb,size1d,current_block*size1d,size1d, & v(1,aoffset),ldv,t(aoffset,aoffset),ldt, & a(1,aoffset-size1d),lda,baseidx,1,updatemode,mpicomm,work,lwork) else ! only remainder left ! incremental update + merging call qr_tmerge_pdlarfb_1dcomm(m,mb,remainder,current_block*size1d,size1d, & v(1,aoffset),ldv,t(aoffset,aoffset),ldt, & a(1,1),lda,baseidx,1,updatemode,mpicomm,work,lwork) end if else ! full update no merging is default ! full update no merging call qr_pdlarfb_1dcomm(m,mb,updatesize,size1d,a,lda,v(1,aoffset),ldv, & tau(aoffset),t(aoffset,aoffset),ldt,baseidx,idx,1,mpicomm,work,lwork) end if ! 
move on to next block current_block = current_block+1 end do if (remainder .gt. 0) then aoffset = 1 idx = rowidx-size1d*nr_blocks #ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR call qr_pdgeqr2_1dcomm(a(1,aoffset),lda,v,ldv,tau,t,ldt,work,lwork, & m,remainder,mb,baseidx,idx,1,trans,PQRPARAM(4),mpicomm,blockheuristic) #else call qr_pdgeqr2_1dcomm(a(1,aoffset),lda,v,ldv,tau,t,ldt,work,lwork, & m,remainder,mb,baseidx,idx,1,trans,PQRPARAM(4:),mpicomm,blockheuristic) #endif if ((updatemode .eq. ichar('I')) .or. (updatemode .eq. ichar('M'))) then ! final merging call qr_tmerge_pdlarfb_1dcomm(m,mb,0,size1d*nr_blocks,remainder, & v,ldv,t,ldt, & a,lda,baseidx,1,updatemode,mpicomm,work,lwork) ! updatemode argument does not matter end if end if if ((tmerge .gt. 0) .and. (updatemode .eq. ichar('F'))) then ! finally merge all small T parts call qr_pdlarft_tree_merge_1dcomm(m,mb,n,size1d,tmerge,v,ldv,t,ldt,baseidx,rev,mpicomm,work,lwork) end if #ifdef HAVE_DETAILED_TIMINGS call timer%stop("qr_pdgeqrf_1dcomm") #endif end subroutine qr_pdgeqrf_1dcomm ! local a and tau are assumed to be positioned at the right column from a local ! perspective ! TODO: if local amount of data turns to zero the algorithm might produce wrong ! results (probably due to old buffer contents) subroutine qr_pdgeqr2_1dcomm(a,lda,v,ldv,tau,t,ldt,work,lwork,m,n,mb,baseidx,rowidx,rev,trans,PQRPARAM,mpicomm,blockheuristic) use precision use ELPA1 #ifdef HAVE_DETAILED_TIMINGS use timings #endif implicit none ! parameter setup INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2 ,eps_ = 3, upmode1_ = 4 ! input variables (local) integer(kind=ik) :: lda,lwork,ldv,ldt real(kind=rk) :: a(lda,*),v(ldv,*),tau(*),t(ldt,*),work(*) ! input variables (global) integer(kind=ik) :: m,n,mb,baseidx,rowidx,rev,trans,mpicomm #ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR integer(kind=ik) :: PQRPARAM(*) #else integer(kind=ik) :: PQRPARAM(:) #endif ! output variables (global) real(kind=rk) :: blockheuristic(*) ! 
derived further input variables from QR_PQRPARAM integer(kind=ik) :: maxrank,hgmode,updatemode ! local scalars integer(kind=ik) :: icol,incx,idx real(kind=rk) :: pdlarfg_size(1),pdlarf_size(1),total_size real(kind=rk) :: pdlarfg2_size(1),pdlarfgk_size(1),pdlarfl2_size(1) real(kind=rk) :: pdlarft_size(1),pdlarfb_size(1),pdlarft_pdlarfb_size(1),tmerge_pdlarfb_size(1) integer(kind=ik) :: mpirank,mpiprocs,mpierr integer(kind=ik) :: rank,lastcol,actualrank,nextrank integer(kind=ik) :: update_cols,decomposition_cols integer(kind=ik) :: current_column #ifdef HAVE_DETAILED_TIMINGS call timer%start("qr_pdgeqr2_1dcomm") #endif maxrank = min(PQRPARAM(1),n) updatemode = PQRPARAM(2) hgmode = PQRPARAM(4) call MPI_Comm_rank(mpicomm, mpirank, mpierr) call MPI_Comm_size(mpicomm, mpiprocs, mpierr) if (trans .eq. 1) then incx = lda else incx = 1 end if if (lwork .eq. -1) then call qr_pdlarfg_1dcomm(a,incx,tau(1),pdlarfg_size(1),-1,n,rowidx,mb,hgmode,rev,mpicomm) call qr_pdlarfl_1dcomm(v,1,baseidx,a,lda,tau(1),pdlarf_size(1),-1,m,n,rowidx,mb,rev,mpicomm) #ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR call qr_pdlarfg2_1dcomm_ref(a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfg2_size(1),-1,m,rowidx,mb,PQRPARAM, & rev,mpicomm,actualrank) call qr_pdlarfgk_1dcomm(a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfgk_size(1),-1,m,n,rowidx,mb,PQRPARAM,rev,mpicomm,actualrank) #else call qr_pdlarfg2_1dcomm_ref(a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfg2_size(1),-1,m,rowidx,mb,PQRPARAM(:), & rev,mpicomm,actualrank) call qr_pdlarfgk_1dcomm(a,lda,tau,t,ldt,v,ldv,baseidx,pdlarfgk_size(1),-1,m,n,rowidx,mb,PQRPARAM(:),rev,mpicomm,actualrank) #endif call qr_pdlarfl2_tmatrix_1dcomm(v,ldv,baseidx,a,lda,t,ldt,pdlarfl2_size(1),-1,m,n,rowidx,mb,rev,mpicomm) pdlarft_size(1) = 0.0d0 call qr_pdlarfb_1dcomm(m,mb,n,n,a,lda,v,ldv,tau,t,ldt,baseidx,rowidx,1,mpicomm,pdlarfb_size(1),-1) pdlarft_pdlarfb_size(1) = 0.0d0 call qr_tmerge_pdlarfb_1dcomm(m,mb,n,n,n,v,ldv,t,ldt,a,lda,rowidx,rev,updatemode,mpicomm,tmerge_pdlarfb_size(1),-1) total_size = 
max(pdlarfg_size(1),pdlarf_size(1),pdlarfg2_size(1),pdlarfgk_size(1),pdlarfl2_size(1),pdlarft_size(1), & pdlarfb_size(1),pdlarft_pdlarfb_size(1),tmerge_pdlarfb_size(1)) work(1) = total_size #ifdef HAVE_DETAILED_TIMINGS call timer%stop("qr_pdgeqr2_1dcomm") #endif return end if icol = 1 lastcol = min(rowidx,n) decomposition_cols = lastcol update_cols = n do while (decomposition_cols .gt. 0) ! local qr block icol = lastcol-decomposition_cols+1 idx = rowidx-icol+1 ! get possible rank size ! limited by number of columns and remaining rows rank = min(n-icol+1,maxrank,idx) current_column = n-icol+1-rank+1 if (rank .eq. 1) then call qr_pdlarfg_1dcomm(a(1,current_column),incx, & tau(current_column),work,lwork, & m,idx,mb,hgmode,1,mpicomm) v(1:ldv,current_column) = 0.0d0 call qr_pdlarfg_copy_1dcomm(a(1,current_column),incx, & v(1,current_column),1, & m,baseidx,idx,mb,1,mpicomm) ! initialize t matrix part t(current_column,current_column) = tau(current_column) actualrank = 1 else if (rank .eq. 2) then #ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR call qr_pdlarfg2_1dcomm_ref(a(1,current_column),lda,tau(current_column), & t(current_column,current_column),ldt,v(1,current_column),ldv, & baseidx,work,lwork,m,idx,mb,PQRPARAM,1,mpicomm,actualrank) #else call qr_pdlarfg2_1dcomm_ref(a(1,current_column),lda,tau(current_column), & t(current_column,current_column),ldt,v(1,current_column),ldv, & baseidx,work,lwork,m,idx,mb,PQRPARAM(:),1,mpicomm,actualrank) #endif else #ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR call qr_pdlarfgk_1dcomm(a(1,current_column),lda,tau(current_column), & t(current_column,current_column),ldt,v(1,current_column),ldv, & baseidx,work,lwork,m,rank,idx,mb,PQRPARAM,1,mpicomm,actualrank) #else call qr_pdlarfgk_1dcomm(a(1,current_column),lda,tau(current_column), & t(current_column,current_column),ldt,v(1,current_column),ldv, & baseidx,work,lwork,m,rank,idx,mb,PQRPARAM(:),1,mpicomm,actualrank) #endif end if blockheuristic(actualrank) = blockheuristic(actualrank) + 1 ! 
the blocked decomposition versions already updated their non ! decomposed parts using their information after communication update_cols = decomposition_cols - rank decomposition_cols = decomposition_cols - actualrank ! needed for incremental update nextrank = min(n-(lastcol-decomposition_cols+1)+1,maxrank,rowidx-(lastcol-decomposition_cols+1)+1) if (current_column .gt. 1) then idx = rowidx-icol+1 if (updatemode .eq. ichar('I')) then ! incremental update + merging call qr_tmerge_pdlarfb_1dcomm(m,mb,nextrank-(rank-actualrank),n-(current_column+rank-1),actualrank, & v(1,current_column+(rank-actualrank)),ldv, & t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, & a(1,current_column-nextrank+(rank-actualrank)),lda,baseidx,rev,updatemode,& mpicomm,work,lwork) else ! full update + merging call qr_tmerge_pdlarfb_1dcomm(m,mb,update_cols,n-(current_column+rank-1),actualrank, & v(1,current_column+(rank-actualrank)),ldv, & t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, & a(1,1),lda,baseidx,rev,updatemode,mpicomm,work,lwork) end if else call qr_tmerge_pdlarfb_1dcomm(m,mb,0,n-(current_column+rank-1),actualrank,v(1,current_column+(rank-actualrank)), & ldv, & t(current_column+(rank-actualrank),current_column+(rank-actualrank)),ldt, & a,lda,baseidx,rev,updatemode,mpicomm,work,lwork) end if end do #ifdef HAVE_DETAILED_TIMINGS call timer%stop("qr_pdgeqr2_1dcomm") #endif end subroutine qr_pdgeqr2_1dcomm ! incx == 1: column major ! incx != 1: row major subroutine qr_pdlarfg_1dcomm(x,incx,tau,work,lwork,n,idx,nb,hgmode,rev,mpi_comm) use precision use ELPA1 use qr_utils_mod #ifdef HAVE_DETAILED_TIMINGS use timings #endif implicit none ! parameter setup INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2, eps_ = 3 ! input variables (local) integer(kind=ik) :: incx,lwork,hgmode real(kind=rk) :: x(*),work(*) ! input variables (global) integer(kind=ik) :: mpi_comm,nb,idx,n,rev ! output variables (global) real(kind=rk) :: tau ! 
! local scalars
      integer(kind=ik)        :: mpierr,mpirank,mpiprocs,mpirank_top
      integer(kind=ik)        :: sendsize,recvsize
      integer(kind=ik)        :: local_size,local_offset,baseoffset
      integer(kind=ik)        :: topidx,top,iproc
      real(kind=rk)           :: alpha,xnorm,dot,xf

      ! external functions
      real(kind=rk), external :: ddot,dlapy2,dnrm2
      external                :: dscal

      ! intrinsic
      ! intrinsic sign

      ! Flow of this part of the routine:
      !   1. determine the size of the exchange buffers for the selected
      !      hgmode ('s' allreduce, 'x' alltoall, 'g' allgather, anything
      !      else: no exchange)
      !   2. answer a workspace query (lwork == -1) with that size
      !   3. leave early with tau = 0 if idx <= 1
      !   4. collect alpha (the element at global index idx) and the norm of
      !      the local part selected by local_size_offset_1d, combined over
      !      all processes of mpi_comm
      ! The workspace query is answered BEFORE the idx <= 1 shortcut.
      ! Previously the shortcut came first, so a query issued with idx <= 1
      ! returned without ever writing work(1) and the caller went on to use
      ! an undefined value as a buffer size.

#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("qr_pdlarfg_1dcomm")
#endif

      call MPI_Comm_rank(mpi_comm, mpirank, mpierr)
      call MPI_Comm_size(mpi_comm, mpiprocs, mpierr)

      ! calculate expected work size and store in work(1)
      if (hgmode .eq. ichar('s')) then
        ! allreduce (MPI_SUM)
        sendsize = 2
        recvsize = sendsize
      else if (hgmode .eq. ichar('x')) then
        ! alltoall
        sendsize = mpiprocs*2
        recvsize = sendsize
      else if (hgmode .eq. ichar('g')) then
        ! allgather
        sendsize = 2
        recvsize = mpiprocs*sendsize
      else
        ! no exchange at all (benchmarking)
        sendsize = 2
        recvsize = sendsize
      end if

      ! workspace query: report the buffer size and touch nothing else
      if (lwork .eq. -1) then
        work(1) = DBLE(sendsize + recvsize)
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("qr_pdlarfg_1dcomm")
#endif
        return
      end if

      ! idx <= 1: return tau = 0 without looking at x
      if (idx .le. 1) then
        tau = 0.0d0
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("qr_pdlarfg_1dcomm")
#endif
        return
      end if

      ! Processor id for global index of top element
      mpirank_top = MOD((idx-1)/nb,mpiprocs)
      if (mpirank .eq. mpirank_top) then
        ! top is only defined (and only used) on the owning process
        topidx = local_index(idx,mpirank_top,mpiprocs,nb,0)
        top = 1+(topidx-1)*incx
      end if

      call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
                                local_size,baseoffset,local_offset)

      local_offset = local_offset * incx

      ! calculate and exchange information
      if (hgmode .eq. ichar('s')) then
        if (mpirank .eq. mpirank_top) then
          alpha = x(top)
        else
          alpha = 0.0d0
        end if

        dot = ddot(local_size, &
                   x(local_offset), incx, &
                   x(local_offset), incx)

        ! alpha is non-zero on one process only, so summing it over all
        ! processes just distributes it; the local dot products add up to
        ! the squared norm
        work(1) = alpha
        work(2) = dot
#ifdef WITH_MPI
        call mpi_allreduce(work(1),work(sendsize+1), &
                           sendsize,mpi_real8,mpi_sum, &
                           mpi_comm,mpierr)
#else
        work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize)
#endif
        alpha = work(sendsize+1)
        xnorm = sqrt(work(sendsize+2))
      else if (hgmode .eq. ichar('x')) then
        if (mpirank .eq. mpirank_top) then
          alpha = x(top)
        else
          alpha = 0.0d0
        end if

        xnorm = dnrm2(local_size, x(local_offset), incx)

        ! every process receives the same (alpha, xnorm) pair from us
        do iproc=0,mpiprocs-1
          work(2*iproc+1) = alpha
          work(2*iproc+2) = xnorm
        end do
#ifdef WITH_MPI
        call mpi_alltoall(work(1),2,mpi_real8, &
                          work(sendsize+1),2,mpi_real8, &
                          mpi_comm,mpierr)
#else
        work(sendsize+1:sendsize+1+2-1) = work(1:2)
#endif
        ! extract alpha value
        alpha = work(sendsize+1+mpirank_top*2)

        ! copy norm parts of buffer to beginning
        do iproc=0,mpiprocs-1
          work(iproc+1) = work(sendsize+1+2*iproc+1)
        end do

        ! norm of the per-process norms
        xnorm = dnrm2(mpiprocs, work(1), 1)
      else if (hgmode .eq. ichar('g')) then
        if (mpirank .eq. mpirank_top) then
          alpha = x(top)
        else
          alpha = 0.0d0
        end if

        xnorm = dnrm2(local_size, x(local_offset), incx)
        work(1) = alpha
        work(2) = xnorm

        ! allgather
#ifdef WITH_MPI
        call mpi_allgather(work(1),sendsize,mpi_real8, &
                           work(sendsize+1),sendsize,mpi_real8, &
                           mpi_comm,mpierr)
#else
        work(sendsize+1:sendsize+1+sendsize-1) = work(1:sendsize)
#endif
        ! extract alpha value
        alpha = work(sendsize+1+mpirank_top*2)

        ! copy norm parts of buffer to beginning
        do iproc=0,mpiprocs-1
          work(iproc+1) = work(sendsize+1+2*iproc+1)
        end do

        xnorm = dnrm2(mpiprocs, work(1), 1)
      else
        ! dnrm2
        xnorm = dnrm2(local_size, x(local_offset), incx)

        if (mpirank .eq. mpirank_top) then
          alpha = x(top)
        else
          alpha = 0.0d0
        end if

        ! no exchange at all (benchmarking)
        ! NOTE(review): the locally computed norm is discarded here, which
        ! forces the tau = 0 path below - presumably intended for
        ! benchmarking only; confirm.
        xnorm = 0.0d0
      end if

      !print *,'ref hg:', idx,xnorm,alpha
      !print *,x(1:n)

      ! calculate householder information
      if (xnorm .eq. 0.0d0) then
        ! H = I
        tau = 0.0d0
      else
        !
! General case
        call hh_transform_real(alpha,xnorm**2,xf,tau)
        if (mpirank .eq. mpirank_top) then
          x(top) = alpha
        end if
        call dscal(local_size, xf, &
                   x(local_offset), incx)

        ! TODO: reimplement norm rescale method of
        ! original PDLARFG using mpi?

      end if

      ! useful for debugging
      !print *,'hg:mpirank,idx,beta,alpha:',mpirank,idx,beta,alpha,1.0d0/(beta+alpha),tau
      !print *,x(1:n)
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("qr_pdlarfg_1dcomm")
#endif
    end subroutine qr_pdlarfg_1dcomm

    ! Generate up to two Householder vectors (column 2 of a first, then column 1)
    ! from one shared set of reduced scalars (the "seed").
    !   lwork = -1 : workspace query, the required size is returned in work(1)
    !   idx <= 1   : tau(1:2) and t(1:2,1:2) are zeroed and the routine returns
    !   actualk    : set to 2 if both vectors were generated or if idx == 2,
    !                set to 1 if the accuracy check rejected the second vector
    ! PQRPARAM(3) is read as the integer accuracy threshold eps; the check is only
    ! performed for eps > 0.
    ! NOTE(review): actualk is not assigned on the idx <= 1 path - confirm that
    ! callers do not read it in that case.
    subroutine qr_pdlarfg2_1dcomm_ref(a,lda,tau,t,ldt,v,ldv,baseidx,work,lwork,m,idx,mb,PQRPARAM,rev,mpicomm,actualk)
      use precision
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      ! parameter setup
      ! (these named constants are not referenced in this routine)
      INTEGER(kind=ik), parameter   :: gmode_ = 1,rank_ = 2,eps_ = 3, upmode1_ = 4
      ! input variables (local)
      integer(kind=ik)              :: lda,lwork,ldv,ldt
      real(kind=rk)                 :: a(lda,*),v(ldv,*),tau(*),work(*),t(ldt,*)

      ! input variables (global)
      integer(kind=ik)              :: m,idx,baseidx,mb,rev,mpicomm
#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR
      integer(kind=ik)              :: PQRPARAM(*)
#else
      integer(kind=ik)              :: PQRPARAM(:)
#endif
      ! output variables (global)
      integer(kind=ik)              :: actualk

      ! derived input variables from QR_PQRPARAM
      integer(kind=ik)              :: eps

      ! local scalars
      real(kind=rk)                 :: dseedwork_size(1)
      integer(kind=ik)              :: seedwork_size,seed_size
      integer(kind=ik)              :: seedwork_offset,seed_offset
      logical                       :: accurate
#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("qr_pdlarfg2_1dcomm")
#endif

      ! ask the seed routine for its buffer size; the reduced seed uses the same size
      call qr_pdlarfg2_1dcomm_seed(a,lda,dseedwork_size(1),-1,work,m,mb,idx,rev,mpicomm)
      seedwork_size = dseedwork_size(1)
      seed_size = seedwork_size

      if (lwork .eq. -1) then
        work(1) = seedwork_size + seed_size
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("qr_pdlarfg2_1dcomm")
#endif
        return
      end if

      ! work = [ local contribution | reduced seed ]
      seedwork_offset = 1
      seed_offset = seedwork_offset + seedwork_size

      eps = PQRPARAM(3)

      ! check for border cases (only a 2x2 matrix left)
      if (idx .le. 1) then
        tau(1:2) = 0.0d0
        t(1:2,1:2) = 0.0d0
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("qr_pdlarfg2_1dcomm")
#endif
        return
      end if

      call qr_pdlarfg2_1dcomm_seed(a,lda,work(seedwork_offset),lwork,work(seed_offset),m,mb,idx,rev,mpicomm)

      if (eps .gt. 0) then
        accurate = qr_pdlarfg2_1dcomm_check(work(seed_offset),eps)
      else
        accurate = .true.
      end if

      ! first vector (id=0) is built from column 2 of a
      call qr_pdlarfg2_1dcomm_vector(a(1,2),1,tau(2),work(seed_offset), &
                                     m,mb,idx,0,1,mpicomm)

      call qr_pdlarfg_copy_1dcomm(a(1,2),1, &
                                  v(1,2),1, &
                                  m,baseidx,idx,mb,1,mpicomm)

      ! update column 1 of a and the seed entries with the first vector
      call qr_pdlarfg2_1dcomm_update(v(1,2),1,baseidx,a(1,1),lda,work(seed_offset),m,idx,mb,rev,mpicomm)

      ! check for 2x2 matrix case => only one householder vector will be
      ! generated
      if (idx .gt. 2) then
        if (accurate .eqv. .true.) then
          ! second vector (id=1) is built from column 1 of a
          call qr_pdlarfg2_1dcomm_vector(a(1,1),1,tau(1),work(seed_offset), &
                                         m,mb,idx-1,1,1,mpicomm)

          call qr_pdlarfg_copy_1dcomm(a(1,1),1, &
                                      v(1,1),1, &
                                      m,baseidx,idx-1,mb,1,mpicomm)

          ! generate fuse element
          call qr_pdlarfg2_1dcomm_finalize_tmatrix(work(seed_offset),tau,t,ldt)

          actualk = 2
        else
          ! accuracy check failed: keep only the first vector
          t(1,1) = 0.0d0
          t(1,2) = 0.0d0
          t(2,2) = tau(2)

          actualk = 1
        end if
      else
        t(1,1) = 0.0d0
        t(1,2) = 0.0d0
        t(2,2) = tau(2)

        ! no more vectors to create
        tau(1) = 0.0d0

        actualk = 2

        !print *,'rank2: no more data'
      end if
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("qr_pdlarfg2_1dcomm")
#endif

    end subroutine qr_pdlarfg2_1dcomm_ref

    ! Collect the scalars consumed by the check/vector/update routines of the
    ! two-vector scheme and sum them over mpicomm.
    ! Layout of work (local contribution) and of seed (reduced result):
    !   (1) top11   (2) dot11   (3) top12   (4) dot12
    !   (5) top21   (6) top22   (7) dot22   (8) 0.0 (slot is overwritten with beta
    !                                           by qr_pdlarfg2_1dcomm_vector)
    ! top11/top12 are read from columns 2/1 of a at local row top11_offset,
    ! top21/top22 from columns 2/1 at local row top21_offset; ranks that do not
    ! own the row contribute 0. dot* are local ddot results.
    !   lwork = -1 : workspace query, work(1) = 8
    subroutine qr_pdlarfg2_1dcomm_seed(a,lda,work,lwork,seed,n,nb,idx,rev,mpicomm)
      use precision
      use ELPA1
      use qr_utils_mod
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      ! input variables (local)
      integer(kind=ik)        :: lda,lwork
      real(kind=rk)           :: a(lda,*),work(*),seed(*)

      ! input variables (global)
      integer(kind=ik)        :: n,nb,idx,rev,mpicomm

      ! output variables (global)

      ! external functions
      real(kind=rk), external :: ddot
      ! local scalars
      real(kind=rk)           :: top11,top21,top12,top22
      real(kind=rk)           :: dot11,dot12,dot22
      integer(kind=ik)        :: mpirank,mpiprocs,mpierr
      integer(kind=ik)        :: mpirank_top11,mpirank_top21
      integer(kind=ik)        :: top11_offset,top21_offset
      integer(kind=ik)        :: baseoffset
      integer(kind=ik)        :: local_offset1,local_size1
      integer(kind=ik)        :: local_offset2,local_size2

#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("qr_pdlarfg2_1dcomm_seed")
#endif

      if (lwork .eq. -1) then
        work(1) = DBLE(8)
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("qr_pdlarfg2_1dcomm_seed")
#endif
        return
      end if
      call MPI_Comm_rank(mpicomm, mpirank, mpierr)
      call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
      call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
                                local_size1,baseoffset,local_offset1)

      call local_size_offset_1d(n,nb,idx,idx-2,rev,mpirank,mpiprocs, &
                                local_size2,baseoffset,local_offset2)

      ! ranks owning global rows idx and idx-1 in the block distribution
      mpirank_top11 = MOD((idx-1)/nb,mpiprocs)
      mpirank_top21 = MOD((idx-2)/nb,mpiprocs)

      top11_offset = local_index(idx,mpirank_top11,mpiprocs,nb,0)
      top21_offset = local_index(idx-1,mpirank_top21,mpiprocs,nb,0)

      if (mpirank_top11 .eq. mpirank) then
        top11 = a(top11_offset,2)
        top12 = a(top11_offset,1)
      else
        top11 = 0.0d0
        top12 = 0.0d0
      end if

      if (mpirank_top21 .eq. mpirank) then
        top21 = a(top21_offset,2)
        top22 = a(top21_offset,1)
      else
        top21 = 0.0d0
        top22 = 0.0d0
      end if

      ! calculate 3 dot products
      dot11 = ddot(local_size1,a(local_offset1,2),1,a(local_offset1,2),1)
      dot12 = ddot(local_size1,a(local_offset1,2),1,a(local_offset1,1),1)
      dot22 = ddot(local_size2,a(local_offset2,1),1,a(local_offset2,1),1)
      ! store results in work buffer
      work(1) = top11
      work(2) = dot11
      work(3) = top12
      work(4) = dot12
      work(5) = top21
      work(6) = top22
      work(7) = dot22
      work(8) = 0.0d0 ! fill up buffer
      ! exchange partial results
#ifdef WITH_MPI
      call mpi_allreduce(work, seed, 8, mpi_real8, mpi_sum, &
                         mpicomm, mpierr)
#else
      seed(1:8) = work(1:8)
#endif
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("qr_pdlarfg2_1dcomm_seed")
#endif
    end subroutine qr_pdlarfg2_1dcomm_seed

    ! Accuracy test for the two-vector scheme, evaluated on the reduced seed.
    ! Returns .false. if first*second is exactly zero, otherwise returns
    ! abs(first_second**2/(first*second)) <= epsd/(1+epsd) with epsd = real(eps).
    logical function qr_pdlarfg2_1dcomm_check(seed,eps)
      use precision
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      ! input variables
      real(kind=rk)    :: seed(*)
      integer(kind=ik) :: eps

      ! local scalars
      real(kind=rk)    :: epsd,first,second,first_second,estimate
      logical          :: accurate
      real(kind=rk)    :: dot11,dot12,dot22
      real(kind=rk)    :: top11,top12,top21,top22
#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("qr_pdlarfg2_1dcomm_check")
#endif

      ! integer threshold converted to real by assignment
      EPSD = EPS

      top11 = seed(1)
      dot11 = seed(2)
      top12 = seed(3)
      dot12 = seed(4)

      top21 = seed(5)
      top22 = seed(6)
      dot22 = seed(7)

      ! reconstruct the whole inner products
      ! (including squares of the top elements)
      first = dot11 + top11*top11
      second = dot22 + top22*top22 + top12*top12
      first_second = dot12 + top11*top12

      ! zero Householder vector (zero norm) case
      if (first*second .eq. 0.0d0) then
        qr_pdlarfg2_1dcomm_check = .false.
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("qr_pdlarfg2_1dcomm_check")
#endif
        return
      end if

      estimate = abs((first_second*first_second)/(first*second))

      !print *,'estimate:',estimate

      ! if accurate the following check holds
      accurate = (estimate .LE. (epsd/(1.0d0+epsd)))
      qr_pdlarfg2_1dcomm_check = accurate
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("qr_pdlarfg2_1dcomm_check")
#endif
    end function qr_pdlarfg2_1dcomm_check

    ! id=0: first vector
    ! id=1: second vector
    ! Build one vector in place in x from the reduced seed:
    !   alpha = seed(id*5+1), dot = seed(id*5+2), xnorm = sqrt(dot)
    ! For xnorm /= 0: tau = (beta+alpha)/beta, the local part of x is scaled by
    ! 1/(beta+alpha), the rank owning the top element stores -beta there, and
    ! beta is written to seed(8).
    ! For xnorm == 0: tau = 0 and neither x nor seed is modified.
    subroutine qr_pdlarfg2_1dcomm_vector(x,incx,tau,seed,n,nb,idx,id,rev,mpicomm)
      use precision
      use ELPA1
      use qr_utils_mod
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      ! input variables (local)
      integer(kind=ik)        :: incx
      real(kind=rk)           :: x(*),seed(*),tau

      ! input variables (global)
      integer(kind=ik)        :: n,nb,idx,id,rev,mpicomm

      ! output variables (global)

      ! external functions
      real(kind=rk), external :: dlapy2
      external                :: dscal
      ! local scalars
      integer(kind=ik)        :: mpirank,mpirank_top,mpiprocs,mpierr
      real(kind=rk)           :: alpha,dot,beta,xnorm
      integer(kind=ik)        :: local_size,baseoffset,local_offset,top,topidx

#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("qr_pdlarfg2_1dcomm_vector")
#endif

      call MPI_Comm_rank(mpicomm, mpirank, mpierr)
      call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
      call local_size_offset_1d(n,nb,idx,idx-1,rev,mpirank,mpiprocs, &
                                local_size,baseoffset,local_offset)

      local_offset = local_offset * incx

      ! Processor id for global index of top element
      mpirank_top = MOD((idx-1)/nb,mpiprocs)
      ! top is only defined (and only used below) on the owning rank
      if (mpirank .eq. mpirank_top) then
        topidx = local_index(idx,mpirank_top,mpiprocs,nb,0)
        top = 1+(topidx-1)*incx
      end if

      alpha = seed(id*5+1)
      dot = seed(id*5+2)

      xnorm = sqrt(dot)

      if (xnorm .eq. 0.0d0) then
        ! H = I

        tau = 0.0d0
      else
        ! General case

        beta = sign(dlapy2(alpha, xnorm), alpha)
        tau = (beta+alpha) / beta

        !print *,'hg2',tau,xnorm,alpha

        call dscal(local_size, 1.0d0/(beta+alpha), &
                   x(local_offset), incx)

        ! TODO: reimplement norm rescale method of
        ! original PDLARFG using mpi?

        if (mpirank .eq. mpirank_top) then
          x(top) = -beta
        end if

        ! hand beta over to the update / finalize routines
        seed(8) = beta
      end if
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("qr_pdlarfg2_1dcomm_vector")
#endif
    end subroutine qr_pdlarfg2_1dcomm_vector

    ! Update column 1 of a with the first vector v (a(:,1) = a(:,1) - z*v on the
    ! local rows) and rewrite the seed entries that the second vector and the
    ! T matrix element are derived from: seed(1), seed(4), seed(6), seed(7).
    ! Returns without any change if seed(8) (beta of the first vector) is zero.
    subroutine qr_pdlarfg2_1dcomm_update(v,incv,baseidx,a,lda,seed,n,idx,nb,rev,mpicomm)
      use precision
      use ELPA1
      use qr_utils_mod
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      ! input variables (local)
      integer(kind=ik)   :: incv,lda
      real(kind=rk)      :: v(*),a(lda,*),seed(*)

      ! input variables (global)
      integer(kind=ik)   :: n,baseidx,idx,nb,rev,mpicomm

      ! output variables (global)

      ! external functions
      external daxpy

      ! local scalars
      integer(kind=ik)   :: mpirank,mpiprocs,mpierr
      integer(kind=ik)   :: local_size,local_offset,baseoffset
      real(kind=rk)      :: z,coeff,beta
      real(kind=rk)      :: dot11,dot12,dot22
      real(kind=rk)      :: top11,top12,top21,top22
#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("qr_pdlarfg2_1dcomm_update")
#endif

      call MPI_Comm_rank(mpicomm, mpirank, mpierr)
      call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
      ! seed should be updated by previous householder generation
      ! Update inner product of this column and next column vector
      top11 = seed(1)
      dot11 = seed(2)
      top12 = seed(3)
      dot12 = seed(4)

      top21 = seed(5)
      top22 = seed(6)
      dot22 = seed(7)
      beta = seed(8)

      call local_size_offset_1d(n,nb,baseidx,idx,rev,mpirank,mpiprocs, &
                                local_size,baseoffset,local_offset)
      baseoffset = baseoffset * incv

      ! zero Householder vector (zero norm) case
      if (beta .eq. 0.0d0) then
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("qr_pdlarfg2_1dcomm_update")
#endif
        return
      end if
      z = (dot12 + top11 * top12) / beta + top12

      !print *,'hg2 update:',baseidx,idx,mpirank,local_size

      call daxpy(local_size, -z, v(baseoffset),1, a(local_offset,1),1)

      ! prepare a full dot22 for update
      dot22 = dot22 + top22*top22

      ! calculate coefficient
      COEFF = z / (top11 + beta)

      ! update inner product of next vector
      dot22 = dot22 - coeff * (2*dot12 - coeff*dot11)

      ! update dot12 value to represent update with first vector
      ! (needed for T matrix)
      dot12 = dot12 - COEFF * dot11

      ! update top element of next vector
      top22 = top22 - coeff * top21
      seed(6) = top22

      ! restore separated dot22 for vector generation
      seed(7) = dot22 - top22*top22

      !------------------------------------------------------
      ! prepare elements for T matrix
      seed(4) = dot12

      ! prepare dot matrix for fuse element of T matrix
      ! replace top11 value with -beta1
      ! NOTE(review): the comment above says -beta1 but the code stores +beta;
      ! confirm which sign qr_pdlarfg2_1dcomm_finalize_tmatrix expects.
      seed(1) = beta
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("qr_pdlarfg2_1dcomm_update")
#endif
    end subroutine qr_pdlarfg2_1dcomm_update

    !
! run this function after second vector
    ! Fill the upper triangle of the 2x2 matrix t from tau(1:2) and the seed:
    !   t(1,1) = tau(1), t(2,2) = tau(2),
    !   t(1,2) = -((seed(4)/seed(8) + seed(5)) / seed(1))
    ! t(2,1) is not written. seed(1) and seed(8) are used as divisors without a
    ! zero check.
    subroutine qr_pdlarfg2_1dcomm_finalize_tmatrix(seed,tau,t,ldt)
      use precision
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      integer(kind=ik)  :: ldt
      real(kind=rk)     :: seed(*),t(ldt,*),tau(*)
      real(kind=rk)     :: dot12,beta1,top21,beta2
#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("qr_pdlarfg2_1dcomm_finalize_tmatrix")
#endif

      beta1 = seed(1)
      dot12 = seed(4)
      top21 = seed(5)
      beta2 = seed(8)

      !print *,'beta1 beta2',beta1,beta2

      dot12 = dot12 / beta2 + top21
      dot12 = -(dot12 / beta1)

      t(1,1) = tau(1)
      t(1,2) = dot12
      t(2,2) = tau(2)
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("qr_pdlarfg2_1dcomm_finalize_tmatrix")
#endif
    end subroutine qr_pdlarfg2_1dcomm_finalize_tmatrix

    ! Generate up to k Householder vectors for the columns of a from one seed
    ! exchange.
    !   lwork = -1 : workspace query; work(1) = largest size requested by the
    !                called routines + 2*k*k (space for seedC and seedD)
    !   actualk    : number of vectors that are generated, as returned by
    !                qr_pdlarfgk_1dcomm_check_improved
    ! work is split into seedC (k*k), seedD (k*k) and scratch space starting at
    ! work_offset. Vector number ivector is built from column k-ivector+1 of a
    ! and copied to the same column of v; t and tau are finalized by
    ! qr_pdlarfgk_1dcomm_generateT.
    subroutine qr_pdlarfgk_1dcomm(a,lda,tau,t,ldt,v,ldv,baseidx,work,lwork,m,k,idx,mb,PQRPARAM,rev,mpicomm,actualk)
      use precision
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif

      implicit none

      ! parameter setup

      ! input variables (local)
      integer(kind=ik)    :: lda,lwork,ldv,ldt
      real(kind=rk)       :: a(lda,*),v(ldv,*),tau(*),work(*),t(ldt,*)

      ! input variables (global)
      integer(kind=ik)    :: m,k,idx,baseidx,mb,rev,mpicomm
#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR
      integer(kind=ik)    ::PQRPARAM(*)
#else
      integer(kind=ik)    :: PQRPARAM(:)
#endif
      ! output variables (global)
      integer(kind=ik)    :: actualk

      ! local scalars
      integer(kind=ik)    :: ivector
      real(kind=rk)       :: pdlarfg_size(1),pdlarf_size(1)
      real(kind=rk)       :: pdlarfgk_1dcomm_seed_size(1),pdlarfgk_1dcomm_check_size(1)
      real(kind=rk)       :: pdlarfgk_1dcomm_update_size(1)
      integer(kind=ik)    :: seedC_size,seedC_offset
      integer(kind=ik)    :: seedD_size,seedD_offset
      integer(kind=ik)    :: work_offset
#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("qr_pdlarfgk_1dcomm")
#endif

      seedC_size = k*k
      seedC_offset = 1
      seedD_size = k*k
      seedD_offset = seedC_offset + seedC_size
      work_offset = seedD_offset + seedD_size

      if (lwork .eq. -1) then
        ! workspace query: ask every routine used below for its requirement
        call qr_pdlarfg_1dcomm(a,1,tau(1),pdlarfg_size(1),-1,m,baseidx,mb,PQRPARAM(4),rev,mpicomm)
        call qr_pdlarfl_1dcomm(v,1,baseidx,a,lda,tau(1),pdlarf_size(1),-1,m,k,baseidx,mb,rev,mpicomm)
        call qr_pdlarfgk_1dcomm_seed(a,lda,baseidx,pdlarfgk_1dcomm_seed_size(1),-1,work,work,m,k,mb,mpicomm)
#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR
        !call qr_pdlarfgk_1dcomm_check(work,work,k,PQRPARAM,pdlarfgk_1dcomm_check_size(1),-1,actualk)
        call qr_pdlarfgk_1dcomm_check_improved(work,work,k,PQRPARAM,pdlarfgk_1dcomm_check_size(1),-1,actualk)
#else
        !call qr_pdlarfgk_1dcomm_check(work,work,k,PQRPARAM(:),pdlarfgk_1dcomm_check_size(1),-1,actualk)
        call qr_pdlarfgk_1dcomm_check_improved(work,work,k,PQRPARAM(:),pdlarfgk_1dcomm_check_size(1),-1,actualk)
#endif
        call qr_pdlarfgk_1dcomm_update(a,lda,baseidx,pdlarfgk_1dcomm_update_size(1),-1,work,work,k,k,1,work,m,mb,rev,mpicomm)
        work(1) = max(pdlarfg_size(1),pdlarf_size(1),pdlarfgk_1dcomm_seed_size(1),pdlarfgk_1dcomm_check_size(1), &
                      pdlarfgk_1dcomm_update_size(1)) + real(seedC_size + seedD_size, kind=rk)
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("qr_pdlarfgk_1dcomm")
#endif

        return
      end if

      call qr_pdlarfgk_1dcomm_seed(a(1,1),lda,idx,work(work_offset),lwork,work(seedC_offset),work(seedD_offset),m,k,mb,mpicomm)
#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR
      !call qr_pdlarfgk_1dcomm_check(work(seedC_offset),work(seedD_offset),k,PQRPARAM,work(work_offset),lwork,actualk)
      call qr_pdlarfgk_1dcomm_check_improved(work(seedC_offset),work(seedD_offset),k,PQRPARAM,work(work_offset),lwork,actualk)
#else
      !call qr_pdlarfgk_1dcomm_check(work(seedC_offset),work(seedD_offset),k,PQRPARAM(:),work(work_offset),lwork,actualk)
      call qr_pdlarfgk_1dcomm_check_improved(work(seedC_offset),work(seedD_offset),k,PQRPARAM(:),work(work_offset),lwork,actualk)
#endif
      !print *,'possible rank:', actualk

      ! override useful for debugging
      !actualk = 1
      !actualk = k
      !actualk= min(actualk,2)
      do ivector=1,actualk
        call qr_pdlarfgk_1dcomm_vector(a(1,k-ivector+1),1,idx,tau(k-ivector+1), &
                                       work(seedC_offset),work(seedD_offset),k, &
                                       ivector,m,mb,rev,mpicomm)

        call qr_pdlarfgk_1dcomm_update(a(1,1),lda,idx,work(work_offset),lwork,work(seedC_offset), &
                                       work(seedD_offset),k,actualk,ivector,tau, &
                                       m,mb,rev,mpicomm)

        call qr_pdlarfg_copy_1dcomm(a(1,k-ivector+1),1, &
                                    v(1,k-ivector+1),1, &
                                    m,baseidx,idx-ivector+1,mb,1,mpicomm)
      end do

      ! generate final T matrix and convert preliminary tau values into real ones
      call qr_pdlarfgk_1dcomm_generateT(work(seedC_offset),work(seedD_offset),k,actualk,tau,t,ldt)

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("qr_pdlarfgk_1dcomm")
#endif
    end subroutine qr_pdlarfgk_1dcomm

    ! Build the two k x k seed matrices used by the rank-k vector generation.
    !   seedC : rows baseidx-k+1 .. baseidx of the first k columns of a, copied
    !           into the send buffer by the rank that owns each chunk of rows
    !   seedD : result of dsyrk("Upper","Trans") on the first k columns of a over
    !           the local rows selected by local_size_offset_1d (zero if this
    !           rank has no such rows)
    ! Both parts are summed over mpicomm in one allreduce (plain copy without MPI)
    ! and unpacked into seedC and seedD.
    !   lwork = -1 : workspace query, work(1) = 2*(2*k*k) (send + receive buffer)
    subroutine qr_pdlarfgk_1dcomm_seed(a,lda,baseidx,work,lwork,seedC,seedD,m,k,mb,mpicomm)
      use precision
      use ELPA1
      use qr_utils_mod
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      ! parameter setup

      ! input variables (local)
      integer(kind=ik)   :: lda,lwork
      real(kind=rk)      :: a(lda,*), work(*)

      ! input variables (global)
      integer(kind=ik)   :: m,k,baseidx,mb,mpicomm
      real(kind=rk)      :: seedC(k,*),seedD(k,*)

      ! output variables (global)

      ! derived input variables from QR_PQRPARAM

      ! local scalars
      integer(kind=ik)   :: mpierr,mpirank,mpiprocs,mpirank_top
      integer(kind=ik)   :: icol,irow,lidx,remsize
      integer(kind=ik)   :: remaining_rank

      integer(kind=ik)   :: C_size,D_size,sendoffset,recvoffset,sendrecv_size
      integer(kind=ik)   :: localoffset,localsize,baseoffset
#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("qr_pdlarfgk_1dcomm_seed")
#endif
      call MPI_Comm_rank(mpicomm, mpirank, mpierr)
      call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
      C_size = k*k
      D_size = k*k
      sendoffset = 1
      sendrecv_size = C_size+D_size
      recvoffset = sendoffset + sendrecv_size

      if (lwork .eq. -1) then
        work(1) = DBLE(2*sendrecv_size)
#ifdef HAVE_DETAILED_TIMINGS
        ! balance the timer%start above: without this stop the workspace query
        ! issued by qr_pdlarfgk_1dcomm leaves this timer region open
        call timer%stop("qr_pdlarfgk_1dcomm_seed")
#endif
        return
      end if

      ! clear buffer
      work(sendoffset:sendoffset+sendrecv_size-1)=0.0d0
      ! collect C part
      do icol=1,k

        remaining_rank = k
        do while (remaining_rank .gt. 0)
          irow = k - remaining_rank + 1
          lidx = baseidx - remaining_rank + 1

          ! determine chunk where the current top element is located
          mpirank_top = MOD((lidx-1)/mb,mpiprocs)

          ! limit max number of remaining elements of this chunk to the block
          ! distribution parameter
          remsize = min(remaining_rank,mb)

          ! determine the number of needed elements in this chunk
          call local_size_offset_1d(lidx+remsize-1,mb, &
                                    lidx,lidx,0, &
                                    mpirank_top,mpiprocs, &
                                    localsize,baseoffset,localoffset)

          !print *,'local rank',localsize,localoffset

          if (mpirank .eq. mpirank_top) then
            ! copy elements to buffer
            ! NOTE(review): the left side spans localsize elements, the right
            ! side remsize elements; this presumably relies on both being
            ! equal here - confirm against local_size_offset_1d.
            work(sendoffset+(icol-1)*k+irow-1:sendoffset+(icol-1)*k+irow-1+localsize-1) &
                        = a(localoffset:localoffset+remsize-1,icol)
          end if

          ! jump to next chunk
          remaining_rank = remaining_rank - localsize
        end do
      end do

      ! collect D part
      call local_size_offset_1d(m,mb,baseidx-k,baseidx-k,1, &
                                mpirank,mpiprocs, &
                                localsize,baseoffset,localoffset)

      !print *,'localsize',localsize,localoffset
      if (localsize > 0) then
        call dsyrk("Upper", "Trans", k, localsize, &
                   1.0d0, a(localoffset,1), lda, &
                   0.0d0, work(sendoffset+C_size), k)
      else
        work(sendoffset+C_size:sendoffset+C_size+k*k-1) = 0.0d0
      end if

      ! TODO: store symmetric part more efficiently

      ! allreduce operation on results
#ifdef WITH_MPI
      call mpi_allreduce(work(sendoffset),work(recvoffset),sendrecv_size, &
                         mpi_real8,mpi_sum,mpicomm,mpierr)
#else
      work(recvoffset:recvoffset+sendrecv_size-1) = work(sendoffset:sendoffset+sendrecv_size-1)
#endif
      ! unpack result from buffer into seedC and seedD
      seedC(1:k,1:k) = 0.0d0
      do icol=1,k
        seedC(1:k,icol) = work(recvoffset+(icol-1)*k:recvoffset+icol*k-1)
      end do
      seedD(1:k,1:k) = 0.0d0
      do icol=1,k
        seedD(1:k,icol) = work(recvoffset+C_size+(icol-1)*k:recvoffset+C_size+icol*k-1)
      end do

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("qr_pdlarfgk_1dcomm_seed")
#endif

    end subroutine qr_pdlarfgk_1dcomm_seed

    !
! k is assumed to be larger than two
    ! Decide how many of the k vectors may be generated from the seed data.
    !   lwork = -1       : workspace query, size returned in work(1,1)
    !   PQRPARAM(3) == 0 : check disabled, possiblerank = k
    ! Otherwise work receives seedD plus the dsyrk("Upper","Trans") product of
    ! seedC, is reordered by two reverse_matrix_local calls and mirrored, and is
    ! then factorized step by step. The loop stops at the first step i whose
    ! diagonal entry is zero or for which sum_squares >= eps * diagonal_square and
    ! returns possiblerank = max(i-1,1); if no step fails, possiblerank = k.
    subroutine qr_pdlarfgk_1dcomm_check_improved(seedC,seedD,k,PQRPARAM,work,lwork,possiblerank)
      use precision
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      ! input variables (global)
      integer(kind=ik)   :: k,lwork
#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR
      integer(kind=ik)   :: PQRPARAM(*)

#else
      integer(kind=ik)   :: PQRPARAM(:)
#endif
      real(kind=rk)      :: seedC(k,*),seedD(k,*),work(k,*)

      ! output variables (global)
      integer(kind=ik)   :: possiblerank

      ! derived input variables from QR_PQRPARAM
      integer(kind=ik)   :: eps

      ! local variables
      integer(kind=ik)   :: i,j,l
      real(kind=rk)      :: sum_squares,diagonal_square,relative_error,epsd,diagonal_root
      real(kind=rk)      :: dreverse_matrix_work(1)

      ! external functions
      real(kind=rk), external :: ddot,dlapy2,dnrm2
      external                :: dscal

#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("qr_pdlarfgk_1dcomm_check_improved")
#endif

      if (lwork .eq. -1) then
        call reverse_matrix_local(1,k,k,work,k,dreverse_matrix_work,-1)
        work(1,1) = DBLE(k*k) + dreverse_matrix_work(1)
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("qr_pdlarfgk_1dcomm_check_improved")
#endif
        return
      end if

      eps = PQRPARAM(3)

      if (eps .eq. 0) then
        possiblerank = k
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("qr_pdlarfgk_1dcomm_check_improved")
#endif
        return
      end if

      epsd = DBLE(eps)

      ! build complete inner product from seedC and seedD
      ! copy seedD to work
      work(:,1:k) = seedD(:,1:k)

      ! add inner products of seedC to work
      call dsyrk("Upper", "Trans", k, k, &
                 1.0d0, seedC(1,1), k, &
                 1.0d0, work, k)

      ! TODO: optimize this part!
      call reverse_matrix_local(0,k,k,work(1,1),k,work(1,k+1),lwork-2*k)
      call reverse_matrix_local(1,k,k,work(1,1),k,work(1,k+1),lwork-2*k)

      ! transpose matrix
      do i=1,k
        do j=i+1,k
          work(i,j) = work(j,i)
        end do
      end do

      ! do cholesky decomposition
      i = 0
      do while ((i .lt. k))
        i = i + 1

        diagonal_square = abs(work(i,i))
        diagonal_root  = sqrt(diagonal_square)

        ! zero Householder vector (zero norm) case
        if ((abs(diagonal_square) .eq. 0.0d0) .or. (abs(diagonal_root) .eq. 0.0d0)) then
          possiblerank = max(i-1,1)
#ifdef HAVE_DETAILED_TIMINGS
          call timer%stop("qr_pdlarfgk_1dcomm_check_improved")
#endif
          return
        end if

        ! check if relative error is bounded for each Householder vector
        ! Householder i is stable iff Householder i-1 is "stable" and the accuracy criterion
        ! holds.
        ! first Householder vector is considered as "stable".

        do j=i+1,k
          work(i,j) = work(i,j) / diagonal_root
          do l=i+1,j
            work(l,j) = work(l,j) - work(i,j) * work(i,l)
          end do
        end do
        !print *,'cholesky step done'

        ! build sum of squares
        if (i .eq. 1) then
          sum_squares = 0.0d0
        else
          sum_squares = ddot(i-1,work(1,i),1,work(1,i),1)
        end if
        !relative_error = sum_squares / diagonal_square
        !print *,'error ',i,sum_squares,diagonal_square,relative_error

        if (sum_squares .ge. (epsd * diagonal_square)) then
          possiblerank = max(i-1,1)
#ifdef HAVE_DETAILED_TIMINGS
          call timer%stop("qr_pdlarfgk_1dcomm_check_improved")
#endif
          return
        end if
      end do

      ! loop ran to completion, so i equals k here
      possiblerank = i
      !print *,'possible rank', possiblerank
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("qr_pdlarfgk_1dcomm_check_improved")
#endif

    end subroutine qr_pdlarfgk_1dcomm_check_improved

    ! TODO: zero Householder vector (zero norm) case
    ! - check alpha values as well (from seedC)
    ! Earlier variant of the rank check with the same interface as
    ! qr_pdlarfgk_1dcomm_check_improved.
    !   lwork = -1       : workspace query, size returned in work(1,1)
    !   PQRPARAM(3) == 0 : check disabled, possiblerank = k
    ! NOTE(review): a zero diagonal entry at column icol yields possiblerank = icol
    ! here, whereas the other failure paths yield icol-1 - confirm this is intended.
    subroutine qr_pdlarfgk_1dcomm_check(seedC,seedD,k,PQRPARAM,work,lwork,possiblerank)
      use precision
      use qr_utils_mod
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      ! parameter setup

      ! input variables (local)

      ! input variables (global)
      integer(kind=ik)   :: k,lwork
#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR
      integer(kind=ik)   :: PQRPARAM(*)
#else
      integer(kind=ik)   :: PQRPARAM(:)
#endif
      real(kind=rk)      :: seedC(k,*),seedD(k,*),work(k,*)

      ! output variables (global)
      integer(kind=ik)   :: possiblerank

      ! derived input variables from QR_PQRPARAM
      integer(kind=ik)   :: eps

      ! local scalars
      integer(kind=ik)   :: icol,isqr,iprod
      real(kind=rk)      :: epsd,sum_sqr,sum_products,diff,temp,ortho,ortho_sum
      real(kind=rk)      :: dreverse_matrix_work(1)
#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("qr_pdlarfgk_1dcomm_check")
#endif
      if (lwork .eq. -1) then
        call reverse_matrix_local(1,k,k,work,k,dreverse_matrix_work,-1)
        work(1,1) = DBLE(k*k) + dreverse_matrix_work(1)
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("qr_pdlarfgk_1dcomm_check")
#endif

        return
      end if

      eps = PQRPARAM(3)

      if (eps .eq. 0) then
        possiblerank = k
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("qr_pdlarfgk_1dcomm_check")
#endif
        return
      end if

      epsd = DBLE(eps)

      ! copy seedD to work
      work(:,1:k) = seedD(:,1:k)

      ! add inner products of seedC to work
      call dsyrk("Upper", "Trans", k, k, &
                 1.0d0, seedC(1,1), k, &
                 1.0d0, work, k)

      ! TODO: optimize this part!
      call reverse_matrix_local(0,k,k,work(1,1),k,work(1,k+1),lwork-2*k)
      call reverse_matrix_local(1,k,k,work(1,1),k,work(1,k+1),lwork-2*k)

      ! transpose matrix
      do icol=1,k
        do isqr=icol+1,k
          work(icol,isqr) = work(isqr,icol)
        end do
      end do

      ! work contains now the full inner product of the global (sub-)matrix
      do icol=1,k
        ! zero Householder vector (zero norm) case
        if (abs(work(icol,icol)) .eq. 0.0d0) then
          !print *,'too small ', icol, work(icol,icol)
          possiblerank = max(icol,1)
#ifdef HAVE_DETAILED_TIMINGS
          call timer%stop("qr_pdlarfgk_1dcomm_check")
#endif
          return
        end if

        sum_sqr = 0.0d0
        do isqr=1,icol-1
          sum_products = 0.0d0
          do iprod=1,isqr-1
            sum_products = sum_products + work(iprod,isqr)*work(iprod,icol)
          end do

          !print *,'divisor',icol,isqr,work(isqr,isqr)
          temp = (work(isqr,icol) - sum_products)/work(isqr,isqr)
          work(isqr,icol) = temp
          sum_sqr = sum_sqr + temp*temp
        end do

        ! calculate diagonal value
        diff = work(icol,icol) - sum_sqr
        if (diff .lt. 0.0d0) then
          ! we definitely have a problem now
          possiblerank = icol-1 ! only decompose to previous column (including)
#ifdef HAVE_DETAILED_TIMINGS
          call timer%stop("qr_pdlarfgk_1dcomm_check")
#endif
          return
        end if
        work(icol,icol) = sqrt(diff)
        ! calculate orthogonality
        ortho = 0.0d0
        do isqr=1,icol-1
          ortho_sum = 0.0d0
          do iprod=isqr,icol-1
            temp = work(isqr,iprod)*work(isqr,iprod)
            !print *,'ortho ', work(iprod,iprod)
            temp = temp / (work(iprod,iprod)*work(iprod,iprod))
            ortho_sum = ortho_sum + temp
          end do
          ortho = ortho + ortho_sum * (work(isqr,icol)*work(isqr,icol))
        end do

        ! ---------------- with division by zero ----------------------- !

        !ortho = ortho / diff;

        ! if current estimate is not accurate enough, the following check holds
        !if (ortho .gt. epsd) then
        !    possiblerank = icol-1 ! only decompose to previous column (including)
        !    return
        !end if

        ! ---------------- without division by zero ----------------------- !

        ! if current estimate is not accurate enough, the following check holds
        if (ortho .gt. epsd * diff) then
          possiblerank = icol-1 ! only decompose to previous column (including)
#ifdef HAVE_DETAILED_TIMINGS
          call timer%stop("qr_pdlarfgk_1dcomm_check")
#endif
          return
        end if
      end do

      ! if we get to this point, the accuracy condition holds for the whole block
      possiblerank = k
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("qr_pdlarfgk_1dcomm_check")
#endif
    end subroutine qr_pdlarfgk_1dcomm_check

    !sidx: seed idx
    !k: max rank used during seed phase
    !rank: actual rank (k >= rank)
    ! Build vector number sidx in place in x from the seed matrices:
    !   alpha = seedC(k-sidx+1,k-sidx+1),
    !   xnorm = dlapy2(sqrt(seedD(k-sidx+1,k-sidx+1)), norm of seedC(1:k-sidx,k-sidx+1))
    ! For xnorm /= 0 the local part of x is scaled by 1/(beta+alpha), the owning
    ! rank stores -beta as top element, and tau receives the preliminary value
    ! beta (converted to the final tau in qr_pdlarfgk_1dcomm_update).
    ! For xnorm == 0, tau = 0 and x is not modified.
    subroutine qr_pdlarfgk_1dcomm_vector(x,incx,baseidx,tau,seedC,seedD,k,sidx,n,nb,rev,mpicomm)
      use precision
      use ELPA1
      use qr_utils_mod
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      ! input variables (local)
      integer(kind=ik)  :: incx
      real(kind=rk)     :: x(*),tau

      ! input variables (global)
      integer(kind=ik)  :: n,nb,baseidx,rev,mpicomm,k,sidx
      real(kind=rk)     :: seedC(k,*),seedD(k,*)

      ! output variables (global)

      ! external functions
      real(kind=rk), external :: dlapy2,dnrm2
      external                :: dscal

      ! local scalars
      integer(kind=ik)  :: mpirank,mpirank_top,mpiprocs,mpierr
      real(kind=rk)     :: alpha,dot,beta,xnorm
      integer(kind=ik)  :: local_size,baseoffset,local_offset,top,topidx
      integer(kind=ik)  :: lidx
#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("qr_pdlarfgk_1dcomm_vector")
#endif
      call MPI_Comm_rank(mpicomm, mpirank, mpierr)
      call MPI_Comm_size(mpicomm, mpiprocs, mpierr)
      ! global row index of the top element of this vector
      lidx = baseidx-sidx+1
      call local_size_offset_1d(n,nb,baseidx,lidx-1,rev,mpirank,mpiprocs, &
                                local_size,baseoffset,local_offset)

      local_offset = local_offset * incx

      ! Processor id for global index of top element
      mpirank_top = MOD((lidx-1)/nb,mpiprocs)
      ! top is only defined (and only used below) on the owning rank
      if (mpirank .eq. mpirank_top) then
        topidx = local_index((lidx),mpirank_top,mpiprocs,nb,0)
        top = 1+(topidx-1)*incx
      end if

      alpha = seedC(k-sidx+1,k-sidx+1)
      dot = seedD(k-sidx+1,k-sidx+1)
      ! assemble actual norm from both seed parts
      xnorm = dlapy2(sqrt(dot), dnrm2(k-sidx,seedC(1,k-sidx+1),1))

      if (xnorm .eq. 0.0d0) then
        tau = 0.0d0
      else
        ! General case

        beta = sign(dlapy2(alpha, xnorm), alpha)
        ! store a preliminary version of beta in tau
        tau = beta

        ! update global part
        call dscal(local_size, 1.0d0/(beta+alpha), &
                   x(local_offset), incx)

        ! do not update local part here due to
        ! dependency of c vector during update process

        ! TODO: reimplement norm rescale method of
        ! original PDLARFG using mpi?

        if (mpirank .eq. mpirank_top) then
          x(top) = -beta
        end if
      end if
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("qr_pdlarfgk_1dcomm_vector")
#endif

    end subroutine qr_pdlarfgk_1dcomm_vector

    !k: original max rank used during seed function
    !rank: possible rank as from check function
    ! TODO: if rank is less than k, reduce buffersize in such a way
    ! that only the required entries for the next pdlarfg steps are
    ! computed
    ! After vector number sidx was generated: convert the preliminary beta stored
    ! in tau(k-sidx+1) into the final tau, update the remaining k-sidx columns of
    ! seedC, seedD and of the local part of a, and store 1/(alpha+beta) in
    ! seedC(k,k-sidx+1) for qr_pdlarfgk_1dcomm_generateT.
    !   lwork = -1 : workspace query, work(1) = 4*k (buffers c, z, y, v)
    ! Returns early without update if sidx > rank, if baseidx-sidx < 1, or if
    ! alpha or beta is zero (then tau(k-sidx+1) and seedC(k,k-sidx+1) are zeroed).
    subroutine qr_pdlarfgk_1dcomm_update(a,lda,baseidx,work,lwork,seedC,seedD,k,rank,sidx,tau,n,nb,rev,mpicomm)
      use precision
      use ELPA1
      use qr_utils_mod
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      ! parameter setup
      ! (these named constants are not referenced in this routine)
      INTEGER(kind=ik), parameter :: gmode_ = 1,rank_ = 2,eps_ = 3, upmode1_ = 4

      ! input variables (local)
      integer(kind=ik)            :: lda,lwork
      real(kind=rk)               :: a(lda,*),work(*)

      ! input variables (global)
      integer(kind=ik)            :: k,rank,sidx,n,baseidx,nb,rev,mpicomm
      ! beta is not a dummy argument; it is a local scalar
      real(kind=rk)               :: beta

      ! output variables (global)
      real(kind=rk)               :: seedC(k,*),seedD(k,*),tau(*)

      ! derived input variables from QR_PQRPARAM

      ! local scalars
      real(kind=rk)               :: alpha
      integer(kind=ik)            :: coffset,zoffset,yoffset,voffset,buffersize
      integer(kind=ik)            :: mpirank,mpierr,mpiprocs,mpirank_top
      integer(kind=ik)            :: localsize,baseoffset,localoffset,topidx
      integer(kind=ik)            :: lidx
#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("qr_pdlarfgk_1dcomm_update")
#endif
      if (lwork .eq. -1) then
        ! buffer for c,z,y,v
        work(1) = 4*k
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("qr_pdlarfgk_1dcomm_update")
#endif

        return
      end if

      ! nothing to update anymore
      if (sidx .gt. rank) then
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("qr_pdlarfgk_1dcomm_update")
#endif
        return
      endif
      call MPI_Comm_rank(mpicomm, mpirank, mpierr)
      call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

      lidx = baseidx-sidx
      if (lidx .lt. 1) then
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("qr_pdlarfgk_1dcomm_update")
#endif
        return
      endif

      call local_size_offset_1d(n,nb,baseidx,lidx,rev,mpirank,mpiprocs, &
                                localsize,baseoffset,localoffset)

      ! work = [ c (k) | z (k) | y (k) | v (k) ]
      coffset = 1
      zoffset = coffset + k
      yoffset = zoffset + k
      voffset = yoffset + k

      ! number of columns still to be updated
      buffersize = k - sidx

      ! finalize tau values
      alpha = seedC(k-sidx+1,k-sidx+1)
      beta = tau(k-sidx+1)

      ! zero Householder vector (zero norm) case
      !print *,'k update: alpha,beta',alpha,beta
      if ((beta .eq. 0.0d0) .or. (alpha .eq. 0.0d0)) then
        tau(k-sidx+1) = 0.0d0
        seedC(k,k-sidx+1) = 0.0d0
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("qr_pdlarfgk_1dcomm_update")
#endif
        return
      end if

      tau(k-sidx+1) = (beta+alpha) / beta

      ! ---------------------------------------
      ! calculate c vector (extra vector or encode in seedC/seedD?
      work(coffset:coffset+buffersize-1) = seedD(1:buffersize,k-sidx+1)
      call dgemv("Trans", buffersize+1, buffersize, &
                 1.0d0,seedC(1,1),k,seedC(1,k-sidx+1),1, &
                 1.0d0,work(coffset),1)

      ! calculate z using tau,seedD,seedC and c vector
      work(zoffset:zoffset+buffersize-1) = seedC(k-sidx+1,1:buffersize)
      call daxpy(buffersize, 1.0d0/beta, work(coffset), 1, work(zoffset), 1)

      ! update A1(local copy) and generate part of householder vectors for use
      call daxpy(buffersize, -1.0d0, work(zoffset),1,seedC(k-sidx+1,1),k)
      call dscal(buffersize, 1.0d0/(alpha+beta), seedC(1,k-sidx+1),1)
      call dger(buffersize, buffersize, -1.0d0, seedC(1,k-sidx+1),1, work(zoffset), 1, seedC(1,1), k)

      ! update A global (householder vector already generated by pdlarfgk)
      mpirank_top = MOD(lidx/nb,mpiprocs)
      if (mpirank .eq. mpirank_top) then
        ! handle first row separately
        topidx = local_index(lidx+1,mpirank_top,mpiprocs,nb,0)
        call daxpy(buffersize,-1.0d0,work(zoffset),1,a(topidx,1),lda)
      end if

      call dger(localsize, buffersize,-1.0d0, &
                a(localoffset,k-sidx+1),1,work(zoffset),1, &
                a(localoffset,1),lda)

      ! update D (symmetric) => two buffer vectors of size rank
      ! generate y vector
      work(yoffset:yoffset+buffersize-1) = 0.d0
      call daxpy(buffersize,1.0d0/(alpha+beta),work(zoffset),1,work(yoffset),1)

      ! generate v vector
      work(voffset:voffset+buffersize-1) = seedD(1:buffersize,k-sidx+1)
      call daxpy(buffersize, -0.5d0*seedD(k-sidx+1,k-sidx+1), work(yoffset), 1, work(voffset),1)

      ! symmetric update of D using y and v
      call dsyr2("Upper", buffersize,-1.0d0, &
                 work(yoffset),1,work(voffset),1, &
                 seedD(1,1), k)

      ! prepare T matrix inner products
      ! D_k(1:k,k+1:n) = D_(k-1)(1:k,k+1:n) - D_(k-1)(1:k,k) * y'
      ! store coefficient 1.0d0/(alpha+beta) in C diagonal elements
      call dger(k-sidx,sidx,-1.0d0,work(yoffset),1,seedD(k-sidx+1,k-sidx+1),k,seedD(1,k-sidx+1),k)
      seedC(k,k-sidx+1) = 1.0d0/(alpha+beta)
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("qr_pdlarfgk_1dcomm_update")
#endif
    end subroutine qr_pdlarfgk_1dcomm_update

    ! Build the T matrix for the actualk generated vectors: for each column the
    ! seedC part is multiplied with dtrmv and the scaled seedD entries are added,
    ! then qr_dlarft_kernel assembles t(k-actualk+1:, k-actualk+1:) from tau and
    ! the prepared seedC block. seedC is overwritten.
    subroutine qr_pdlarfgk_1dcomm_generateT(seedC,seedD,k,actualk,tau,t,ldt)
      use precision
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      integer(kind=ik)  :: k,actualk,ldt
      real(kind=rk)     :: seedC(k,*),seedD(k,*),tau(*),t(ldt,*)

      integer(kind=ik)  :: irow,icol
      real(kind=rk)     :: column_coefficient
#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("qr_pdlarfgk_1dcomm_generateT")
#endif

      !print *,'reversed on the fly T generation NYI'

      do icol=1,actualk-1
        ! calculate inner product of householder vector parts in seedC
        ! (actually calculating more than necessary, if actualk < k)
        ! => a lot of junk from row 1 to row k-actualk
        call dtrmv('Upper','Trans','Unit',k-icol,seedC(1,1),k,seedC(1,k-icol+1),1)
        ! add scaled D parts to current column of C (will become later T rows)
        ! (row k of seedC holds the 1/(alpha+beta) coefficients)
        column_coefficient = seedC(k,k-icol+1)
        do irow=k-actualk+1,k-1
          seedC(irow,k-icol+1) = ( seedC(irow,k-icol+1) ) +  ( seedD(irow,k-icol+1) * column_coefficient * seedC(k,irow) )
        end do
      end do

      call qr_dlarft_kernel(actualk,tau(k-actualk+1),seedC(k-actualk+1,k-actualk+2),k,t(k-actualk+1,k-actualk+1),ldt)
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("qr_pdlarfgk_1dcomm_generateT")
#endif

    end subroutine qr_pdlarfgk_1dcomm_generateT

    !direction=0: pack into work buffer
    !direction=1: unpack from work buffer
    ! Copy the local rows baseoffset .. baseoffset+local_size-1 of the first n
    ! columns of v to (direction 0) or from (any other direction value) the
    ! contiguous buffer work, column after column.
    !   lwork = -1 : workspace query, work(1) = local_size * n
    subroutine qr_pdgeqrf_pack_unpack(v,ldv,work,lwork,m,n,mb,baseidx,rowidx,rev,direction,mpicomm)
      use precision
      use ELPA1
      use qr_utils_mod
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      ! input variables (local)
      integer(kind=ik)   :: ldv,lwork
      real(kind=rk)      :: v(ldv,*), work(*)

      ! input variables (global)
      integer(kind=ik)   :: m,n,mb,baseidx,rowidx,rev,direction,mpicomm

      ! output variables (global)

      ! local scalars
      integer(kind=ik)   :: mpierr,mpirank,mpiprocs
      integer(kind=ik)   :: buffersize,icol
      integer(kind=ik)   :: local_size,baseoffset,offset

      ! external functions
#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("qr_pdgeqrf_pack_unpack")
#endif
      call mpi_comm_rank(mpicomm,mpirank,mpierr)
      call mpi_comm_size(mpicomm,mpiprocs,mpierr)
      call local_size_offset_1d(m,mb,baseidx,rowidx,rev,mpirank,mpiprocs, &
                                local_size,baseoffset,offset)

      !print *,'pack/unpack',local_size,baseoffset,offset

      ! rough approximate for buffer size
      if (lwork .eq. -1) then
        buffersize = local_size * n ! vector elements
        work(1) = DBLE(buffersize)
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("qr_pdgeqrf_pack_unpack")
#endif

        return
      end if

      if (direction .eq. 0) then
        ! copy v part to buffer (including zeros)
        do icol=1,n
          work(1+local_size*(icol-1):local_size*icol) = v(baseoffset:baseoffset+local_size-1,icol)
        end do
      else
        ! copy v part from buffer (including zeros)
        do icol=1,n
          v(baseoffset:baseoffset+local_size-1,icol) = work(1+local_size*(icol-1):local_size*icol)
        end do
      end if
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("qr_pdgeqrf_pack_unpack")
#endif

      return

    end subroutine qr_pdgeqrf_pack_unpack

    !direction=0: pack into work buffer
    !direction=1: unpack from work buffer
    ! Copy the leading n x n part of t to (direction 0) or from (any other
    ! direction value) the buffer work, column after column. When unpacking,
    ! tau(icol) is additionally set to the diagonal element t(icol,icol).
    !   lwork = -1 : workspace query, work(1) = n*n
    subroutine qr_pdgeqrf_pack_unpack_tmatrix(tau,t,ldt,work,lwork,n,direction)
      use precision
      use ELPA1
      use qr_utils_mod
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      ! input variables (local)
      integer(kind=ik)  :: ldt,lwork
      real(kind=rk)     :: work(*), t(ldt,*),tau(*)

      ! input variables (global)
      integer(kind=ik)  :: n,direction

      ! output variables (global)

      ! local scalars
      integer(kind=ik)  :: icol

      ! external functions
#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("qr_pdgeqrf_pack_unpack_tmatrix")
#endif

      if (lwork .eq. -1) then
        work(1) = DBLE(n*n)
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("qr_pdgeqrf_pack_unpack_tmatrix")
#endif

        return
      end if

      if (direction .eq. 0) then
        ! append t matrix to buffer (including zeros)
        do icol=1,n
          work(1+(icol-1)*n:icol*n) = t(1:n,icol)
        end do
      else
        ! append t matrix from buffer (including zeros)
        do icol=1,n
          t(1:n,icol) = work(1+(icol-1)*n:icol*n)
          tau(icol) = t(icol,icol)
        end do
      end if
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("qr_pdgeqrf_pack_unpack_tmatrix")
#endif
    end subroutine qr_pdgeqrf_pack_unpack_tmatrix

    ! TODO: encode following functionality
    !   - Direction? BOTTOM UP or TOP DOWN ("Up", "Down")
    !        => influences all related kernels (including DLARFT / DLARFB)
    !   - rank-k parameter (k=1,2,...,b)
    !        => influences possible update strategies
    !        => parameterize the function itself? (FUNCPTR, FUNCARG)
    !   - Norm mode? Allreduce, Allgather, AlltoAll, "AllHouse", (ALLNULL = benchmarking local kernels)
    !   - subblocking
    !         (maximum block size bounded by data distribution along rows)
    !   - blocking method (householder vectors only or compact WY?)
    !   - update strategy of trailing parts (incremental, complete)
    !        - difference for subblocks and normal blocks? (UPDATE and UPDATESUB)
    !        o "Incremental"
    !        o "Full"
    !   - final T generation (recursive: subblock wise, block wise, end) (TMERGE)
    !        ' (implicitly given by / influences update strategies?)
    !        => alternative: during update: iterate over sub t parts
    !           => advantage: smaller (cache aware T parts)
    !           => disadvantage: more memory write backs
    !                (number of T parts * matrix elements)
    !   - partial/sub T generation (TGEN)
    !        o add vectors right after creation (Vector)
    !        o add set of vectors (Set)
    !   - bcast strategy of householder vectors to other process columns
    !        (influences T matrix generation and trailing update
    !         in other process columns)
    !        o no broadcast (NONE = benchmarking?,
    !          or not needed due to 1D process grid)
    !        o after every housegen (VECTOR)
    !        o after every subblk   (SUBBLOCK)
    !        o after full local column block decomposition (BLOCK)
    !  LOOP Housegen -> BCAST -> GENT/EXTENDT -> LOOP HouseLeft

    !subroutine qr_pqrparam_init(PQRPARAM, DIRECTION, RANK, NORMMODE, &
    !
! Fill the integer parameter array PQRPARAM from the individual QR options.
!
! Slot layout written by this routine (character options are stored as their
! ichar() code):
!    1: size2d      2: update2d    3: tmerge2d
!    4: size1d      5: update1d    6: tmerge1d
!    7: maxrank     8: update      9: eps        10: hgmode
! Slot 11 of the (1:11) declaration is not written here.
!
! Meaning of the options, as described by the notes accompanying this routine:
!   hgmode:  controls the communication pattern of dlarfg
!   maxrank: controls the maximum number of householder vectors per
!            communication
!   eps:     error threshold (integer)
!   update*: update pattern in pdgeqr2_1dcomm ('incremental','full','merge');
!            merging = full update with tmatrix merging
!   tmerge*: 0: do not merge, 1: incremental merge, >1: recursive merge;
!            only matters if update* == full
subroutine qr_pqrparam_init(pqrparam,size2d,update2d,tmerge2d,size1d,update1d,tmerge1d,maxrank,update,eps,hgmode)
  use precision
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
  implicit none

  ! output
#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR
  integer(kind=ik) :: pqrparam(*)
#else
  integer(kind=ik) :: pqrparam(1:11)
#endif

  ! input
  integer(kind=ik) :: size2d,tmerge2d
  integer(kind=ik) :: size1d,tmerge1d
  integer(kind=ik) :: maxrank,eps
  character        :: update2d,update1d,update,hgmode

#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("qr_pqrparam_init")
#endif

  ! TODO: broadcast T yes/no
  ! The typed constructor converts the ichar() results to kind ik, exactly as
  ! element-wise assignment would.
  pqrparam(1:10) = (/ integer(kind=ik) ::                 &
                      size2d, ichar(update2d), tmerge2d,  &
                      size1d, ichar(update1d), tmerge1d,  &
                      maxrank, ichar(update), eps,        &
                      ichar(hgmode) /)

#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("qr_pqrparam_init")
#endif
end subroutine qr_pqrparam_init
local scalars integer(kind=ik) :: mpierr,mpiprocs integer(kind=ik) :: mpirank,mpirank_top integer(kind=ik) :: irow,x_offset integer(kind=ik) :: v_offset,local_size #ifdef HAVE_DETAILED_TIMINGS call timer%start("qr_pdlarfg_copy_1dcomm") #endif call MPI_Comm_rank(mpicomm, mpirank, mpierr) call MPI_Comm_size(mpicomm, mpiprocs, mpierr) call local_size_offset_1d(n,nb,baseidx,idx,rev,mpirank,mpiprocs, & local_size,v_offset,x_offset) v_offset = v_offset * incv !print *,'copy:',mpirank,baseidx,v_offset,x_offset,local_size ! copy elements do irow=1,local_size v((irow-1)*incv+v_offset) = x((irow-1)*incx+x_offset) end do ! replace top element to build an unitary vector mpirank_top = MOD((idx-1)/nb,mpiprocs) if (mpirank .eq. mpirank_top) then v(local_size*incv) = 1.0d0 end if #ifdef HAVE_DETAILED_TIMINGS call timer%stop("qr_pdlarfg_copy_1dcomm") #endif end subroutine qr_pdlarfg_copy_1dcomm end module elpa_pdgeqrf elpa-2016.05.001/src/elpa_qr/qr_utils.F900000644000312500001440000003256712717516040014376 00000000000000! This file is part of ELPA. ! ! The ELPA library was originally created by the ELPA consortium, ! consisting of the following organizations: ! ! - Max Planck Computing and Data Facility (MPCDF), formerly known as ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte ! Informatik, ! - Technische Universität München, Lehrstuhl für Informatik mit ! Schwerpunkt Wissenschaftliches Rechnen , ! - Fritz-Haber-Institut, Berlin, Abt. Theorie, ! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, ! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, ! and ! - IBM Deutschland GmbH ! ! ! More information can be found here: ! http://elpa.mpcdf.mpg.de/ ! ! ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! 
#include "config-f90.h"

! Helper routines for the QR code path: local size/offset computation for a
! 1D block-cyclic distribution and routines that reverse the order of vector
! entries / matrix rows / matrix columns, locally or across a communicator.
module qr_utils_mod
  use elpa_mpi
  implicit none

  PRIVATE

  public :: local_size_offset_1d
  public :: reverse_vector_local
  public :: reverse_matrix_local
  public :: reverse_matrix_1dcomm
  public :: reverse_matrix_2dcomm_ref

  contains

  ! rev parameter is critical, even in rev only mode!
  ! pdgeqrf_2dcomm uses rev=0 version to determine the process columns
  ! involved in the qr decomposition
  !
  ! Computes the local size (lsize) and two local offsets (baseoffset, offset)
  ! for process 'rank' out of 'nprocs', with block size nb, using
  ! local_index() from ELPA1_compute.
  !   rev == 1: lsize = local_index(idx,...,-1) (0 if idx <= 0), both offsets 1
  !   otherwise: offset is the local index of idx, lsize counts from offset up
  !              to the local index of n, and baseoffset is offset expressed
  !              relative to the local index of baseidx (1-based)
  ! NOTE(review): the exact rounding semantics of the last argument of
  ! local_index (+1 / -1) are defined in ELPA1_compute, not visible here.
  subroutine local_size_offset_1d(n,nb,baseidx,idx,rev,rank,nprocs, &
                                  lsize,baseoffset,offset)
    use precision
    use ELPA1_compute
    implicit none

    ! input
    integer(kind=ik) :: n,nb,baseidx,idx,rev,rank,nprocs

    ! output
    integer(kind=ik) :: lsize,baseoffset,offset

    ! local scalars
    integer(kind=ik) :: rank_idx

    ! NOTE(review): rank_idx is computed but not used further in this routine
    rank_idx = MOD((idx-1)/nb,nprocs)

    ! calculate local size and offsets
    if (rev .eq. 1) then
      if (idx > 0) then
        lsize = local_index(idx,rank,nprocs,nb,-1)
      else
        lsize = 0
      end if

      baseoffset = 1
      offset = 1
    else
      offset = local_index(idx,rank,nprocs,nb,1)
      baseoffset = local_index(baseidx,rank,nprocs,nb,1)

      lsize = local_index(n,rank,nprocs,nb,-1)
      !print *,'baseidx,idx',baseidx,idx,lsize,n

      ! convert "last local index" into a count, and make baseoffset relative
      lsize = lsize - offset + 1

      baseoffset = offset - baseoffset + 1
    end if
  end subroutine local_size_offset_1d

  ! Reverses, in place, the order of the n entries of x that are spaced incx
  ! apart (x(1), x(1+incx), ...).
  ! lwork == -1 is a workspace query: work(1) is set to 0 and x is untouched.
  ! work is not otherwise used by this routine.
  subroutine reverse_vector_local(n,x,incx,work,lwork)
    use precision
    implicit none

    ! input
    integer(kind=ik) :: incx,n,lwork
    real(kind=rk) :: x(*),work(*)

    ! local scalars
    real(kind=rk) :: temp
    integer(kind=ik) :: srcoffset,destoffset,ientry

    if (lwork .eq. -1) then
      work(1) = 0.0d0
      return
    end if

    ! swap entry ientry with its mirror entry n-ientry+1
    do ientry=1,n/2
      srcoffset=1+(ientry-1)*incx
      destoffset=1+(n-ientry)*incx

      temp = x(srcoffset)
      x(srcoffset) = x(destoffset)
      x(destoffset) = temp
    end do
  end subroutine reverse_vector_local

  ! Reverses an m x n local matrix a (leading dimension lda) in place.
  !   trans == 1: every one of the m rows is reversed (stride lda, length n),
  !               i.e. the column order is reversed
  !   otherwise:  every one of the n columns is reversed (stride 1, length m),
  !               i.e. the row order is reversed
  ! lwork == -1 is a workspace query; the requirement reported by
  ! reverse_vector_local is passed back in work(1).
  subroutine reverse_matrix_local(trans,m,n,a,lda,work,lwork)
    use precision
    implicit none

    ! input
    integer(kind=ik) :: lda,m,n,lwork,trans
    real(kind=rk) :: a(lda,*),work(*)

    ! local scalars
    real(kind=rk) :: temp, dworksize(1)
    integer(kind=ik) :: incx
    integer(kind=ik) :: dimsize
    integer(kind=ik) :: i

    ! stride and length of the vectors handed to reverse_vector_local
    if (trans .eq. 1) then
      incx = lda
      dimsize = n
    else
      incx = 1
      dimsize = m
    end if

    if (lwork .eq. -1) then
      call reverse_vector_local(dimsize,a,incx,dworksize,-1)
      work(1) = dworksize(1)
      return
    end if

    if (trans .eq. 1) then
      do i=1,m
        call reverse_vector_local(dimsize,a(i,1),incx,work,lwork)
      end do
    else
      do i=1,n
        call reverse_vector_local(dimsize,a(1,i),incx,work,lwork)
      end do
    end if
  end subroutine reverse_matrix_local

  ! Reference implementation of a 2D reversal: calls reverse_matrix_1dcomm
  ! first with trans=0 over mpicomm_cols (block size mb), then with trans=1
  ! over mpicomm_rows (block size nb).
  ! lwork == -1 is a workspace query; work(1) receives the larger of the two
  ! 1D requirements.
  subroutine reverse_matrix_2dcomm_ref(m,n,mb,nb,a,lda,work,lwork,mpicomm_cols,mpicomm_rows)
    use precision
    implicit none

    ! input
    integer(kind=ik) :: m,n,lda,lwork,mpicomm_cols,mpicomm_rows,mb,nb
    real(kind=rk) :: a(lda,*),work(*)

    ! local scalars
    real(kind=rk) :: reverse_column_size(1)
    real(kind=rk) :: reverse_row_size(1)

    integer(kind=ik) :: mpirank_cols,mpirank_rows
    integer(kind=ik) :: mpiprocs_cols,mpiprocs_rows
    integer(kind=ik) :: mpierr
    integer(kind=ik) :: lrows,lcols,offset,baseoffset

    call MPI_Comm_rank(mpicomm_cols,mpirank_cols,mpierr)
    call MPI_Comm_rank(mpicomm_rows,mpirank_rows,mpierr)
    call MPI_Comm_size(mpicomm_cols,mpiprocs_cols,mpierr)
    call MPI_Comm_size(mpicomm_rows,mpiprocs_rows,mpierr)

    ! local row / column counts of this process (offsets are not used here)
    call local_size_offset_1d(m,mb,1,1,0,mpirank_cols,mpiprocs_cols, &
                              lrows,baseoffset,offset)

    call local_size_offset_1d(n,nb,1,1,0,mpirank_rows,mpiprocs_rows, &
                              lcols,baseoffset,offset)

    if (lwork .eq. -1) then
      call reverse_matrix_1dcomm(0,m,lcols,mb,a,lda,reverse_column_size,-1,mpicomm_cols)
      call reverse_matrix_1dcomm(1,lrows,n,nb,a,lda,reverse_row_size,-1,mpicomm_rows)
      work(1) = max(reverse_column_size(1),reverse_row_size(1))
      return
    end if

    call reverse_matrix_1dcomm(0,m,lcols,mb,a,lda,work,lwork,mpicomm_cols)
    call reverse_matrix_1dcomm(1,lrows,n,nb,a,lda,work,lwork,mpicomm_rows)
  end subroutine reverse_matrix_2dcomm_ref

  ! b: if trans = 'N': b is size of block distribution between rows
  ! b: if trans = 'T': b is size of block distribution between columns
  !
  ! Reverses a matrix that is distributed over mpicomm along one dimension
  ! (columns if trans == 1, rows otherwise). Chunks are packed on the source
  ! process, sent/received, reversed locally, collected in a temporary matrix
  ! inside work and finally copied back into a.
  !
  ! work is partitioned into four consecutive regions:
  !   send buffer | receive buffer | new (temporary) matrix | scratch
  ! the first three of size lrows*lcols each.
  ! lwork == -1 is a workspace query; work(1) receives
  ! 3*lrows*lcols plus the requirement of reverse_matrix_local.
  ! (in the code below 'trans' is compared against the integer 1)
  subroutine reverse_matrix_1dcomm(trans,m,n,b,a,lda,work,lwork,mpicomm)
    use precision
    use elpa_mpi
    implicit none

    ! input
    integer(kind=ik) :: trans
    integer(kind=ik) :: m,n,b,lda,lwork,mpicomm
    real(kind=rk) :: a(lda,*),work(*)

    ! local scalars
    integer(kind=ik) :: mpirank,mpiprocs,mpierr
#ifdef WITH_MPI
    integer(kind=ik) :: mpistatus(MPI_STATUS_SIZE)
#endif
    integer(kind=ik) :: nr_blocks,dest_process,src_process,step
    integer(kind=ik) :: lsize,baseoffset,offset
    integer(kind=ik) :: current_index,destblk,srcblk,icol,next_index
    integer(kind=ik) :: sendcount,recvcount
    integer(kind=ik) :: sendoffset,recvoffset
    integer(kind=ik) :: newmatrix_offset,work_offset
    integer(kind=ik) :: lcols,lrows,lroffset,lcoffset,dimsize,fixedsize
    real(kind=rk) :: dworksize(1)

    call MPI_Comm_rank(mpicomm, mpirank, mpierr)
    call MPI_Comm_size(mpicomm, mpiprocs, mpierr)

    ! local extent along the distributed dimension; the other one is global
    if (trans .eq. 1) then
      call local_size_offset_1d(n,b,1,1,0,mpirank,mpiprocs, &
                                lcols,baseoffset,lcoffset)
      lrows = m
    else
      call local_size_offset_1d(m,b,1,1,0,mpirank,mpiprocs, &
                                lrows,baseoffset,lroffset)
      lcols = n
    end if

    if (lwork .eq. -1) then
      call reverse_matrix_local(trans,lrows,lcols,a,max(lrows,lcols),dworksize,-1)
      work(1) = DBLE(3*lrows*lcols) + dworksize(1)
      return
    end if

    ! start positions of the four regions inside work
    sendoffset = 1
    recvoffset = sendoffset + lrows*lcols
    newmatrix_offset = recvoffset + lrows*lcols
    work_offset = newmatrix_offset + lrows*lcols

    ! dimsize: global extent of the distributed (reversed) dimension
    ! fixedsize: extent of the other dimension
    if (trans .eq. 1) then
      dimsize = n
      fixedsize = m
    else
      dimsize = m
      fixedsize = n
    end if

    if (dimsize .le. 1) then
      return ! nothing to do
    end if

    ! 1. adjust step size to remainder size
    ! (step = dimsize mod b, or b if dimsize is a multiple of b)
    nr_blocks = dimsize / b
    nr_blocks = nr_blocks * b
    step = dimsize - nr_blocks
    if (step .eq. 0) step = b

    ! 2. iterate over destination blocks starting with process 0
    ! NOTE(review): step is not modified inside the loop, so every chunk has
    ! the size determined above - confirm this is intended when step < b.
    current_index = 1
    do while (current_index .le. dimsize)
      destblk = (current_index-1) / b
      dest_process = mod(destblk,mpiprocs)
      ! the source is the mirrored position counted from the end
      srcblk = (dimsize-current_index) / b
      src_process = mod(srcblk,mpiprocs)

      next_index = current_index+step

      ! block for dest_process is located on mpirank if lsize > 0
      call local_size_offset_1d(dimsize-current_index+1,b,dimsize-next_index+2,dimsize-next_index+2,0, &
                                src_process,mpiprocs,lsize,baseoffset,offset)

      sendcount = lsize*fixedsize
      recvcount = sendcount

      ! TODO: this send/recv stuff seems to blow up on BlueGene/P
      ! TODO: is there actually room for the requested matrix part? the target
      ! process might not have any parts at all (thus no room)
      if ((src_process .eq. mpirank) .and. (dest_process .eq. src_process)) then
        ! source and destination are this process: no communication needed
        ! 5. pack data
        if (trans .eq. 1) then
          do icol=offset,offset+lsize-1
            work(sendoffset+(icol-offset)*lrows:sendoffset+(icol-offset+1)*lrows-1) = &
              a(1:lrows,icol)
          end do
        else
          do icol=1,lcols
            work(sendoffset+(icol-1)*lsize:sendoffset+icol*lsize-1) = &
              a(offset:offset+lsize-1,icol)
          end do
        end if

        ! 7. reverse data
        if (trans .eq. 1) then
          call reverse_matrix_local(1,lrows,lsize,work(sendoffset),lrows,work(work_offset),lwork)
        else
          call reverse_matrix_local(0,lsize,lcols,work(sendoffset),lsize,work(work_offset),lwork)
        end if

        ! 8. store in temp matrix
        if (trans .eq. 1) then
          do icol=1,lsize
            work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+icol*lrows-1) = &
              work(sendoffset+(icol-1)*lrows:sendoffset+icol*lrows-1)
          end do

          newmatrix_offset = newmatrix_offset + lsize*lrows
        else
          do icol=1,lcols
            work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+(icol-1)*lrows+lsize-1) = &
              work(sendoffset+(icol-1)*lsize:sendoffset+icol*lsize-1)
          end do

          newmatrix_offset = newmatrix_offset + lsize
        end if
      else
        ! source and destination differ: the receiver handles the chunk first,
        ! the sender afterwards (a process is never both in this branch)
        if (dest_process .eq. mpirank) then
          ! 6b. call MPI_Recv
          ! (the message tag is current_index, matching the send below)
#ifdef WITH_MPI
          call MPI_Recv(work(recvoffset), recvcount, mpi_real8, &
                        src_process, current_index, mpicomm, mpistatus, mpierr)
#else
          work(recvoffset:recvoffset+recvcount-1) = work(sendoffset:sendoffset+sendcount-1)
#endif

          ! 7. reverse data
          if (trans .eq. 1) then
            call reverse_matrix_local(1,lrows,lsize,work(recvoffset),lrows,work(work_offset),lwork)
          else
            call reverse_matrix_local(0,lsize,lcols,work(recvoffset),lsize,work(work_offset),lwork)
          end if

          ! 8. store in temp matrix
          if (trans .eq. 1) then
            do icol=1,lsize
              work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+icol*lrows-1) = &
                work(recvoffset+(icol-1)*lrows:recvoffset+icol*lrows-1)
            end do

            newmatrix_offset = newmatrix_offset + lsize*lrows
          else
            do icol=1,lcols
              work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+(icol-1)*lrows+lsize-1) = &
                work(recvoffset+(icol-1)*lsize:recvoffset+icol*lsize-1)
            end do

            newmatrix_offset = newmatrix_offset + lsize
          end if
        end if

        if (src_process .eq. mpirank) then
          ! 5. pack data
          if (trans .eq. 1) then
            do icol=offset,offset+lsize-1
              work(sendoffset+(icol-offset)*lrows:sendoffset+(icol-offset+1)*lrows-1) = &
                a(1:lrows,icol)
            end do
          else
            do icol=1,lcols
              work(sendoffset+(icol-1)*lsize:sendoffset+icol*lsize-1) = &
                a(offset:offset+lsize-1,icol)
            end do
          end if

          ! 6a. call MPI_Send
#ifdef WITH_MPI
          call MPI_Send(work(sendoffset), sendcount, mpi_real8, &
                        dest_process, current_index, mpicomm, mpierr)
#endif
        end if
      end if

      current_index = next_index
    end do

    ! 9. copy temp matrix to real matrix
    ! (newmatrix_offset was advanced while storing; reset it to the region start)
    newmatrix_offset = recvoffset + lrows*lcols
    do icol=1,lcols
      a(1:lrows,icol) = &
        work(newmatrix_offset+(icol-1)*lrows:newmatrix_offset+icol*lrows-1)
    end do
  end subroutine reverse_matrix_1dcomm
end module
More information can be found here: ! http://elpa.mpcdf.mpg.de/ ! ! ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see <http://www.gnu.org/licenses/> ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! !
Author: Andreas Marek, MPCDF module compute_hh_trafo_complex #include "config-f90.h" use elpa_mpi implicit none #ifdef WITH_OPENMP public compute_hh_trafo_complex_cpu_openmp #else public compute_hh_trafo_complex_cpu #endif contains #ifdef WITH_OPENMP subroutine compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & off, ncols, istripe, & my_thread, thread_width, THIS_COMPLEX_ELPA_KERNEL) #else subroutine compute_hh_trafo_complex_cpu (a, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & off, ncols, istripe, last_stripe_width, & THIS_COMPLEX_ELPA_KERNEL) #endif use precision use elpa2_utilities #if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) use complex_generic_simple_kernel, only : single_hh_trafo_complex_generic_simple #endif #if defined(WITH_COMPLEX_GENERIC_KERNEL) use complex_generic_kernel, only : single_hh_trafo_complex_generic #endif #ifdef HAVE_DETAILED_TIMINGS use timings #endif #if defined(HAVE_AVX) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY) use kernel_interfaces #endif implicit none real(kind=rk), intent(inout) :: kernel_time integer(kind=lik) :: kernel_flops integer(kind=ik), intent(in) :: nbw, max_blk_size complex(kind=ck) :: bcast_buffer(nbw,max_blk_size) integer(kind=ik), intent(in) :: a_off integer(kind=ik), intent(in) :: stripe_width, a_dim2, stripe_count #ifndef WITH_OPENMP integer(kind=ik), intent(in) :: last_stripe_width complex(kind=ck) :: a(stripe_width,a_dim2,stripe_count) #else integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width complex(kind=ck) :: a(stripe_width,a_dim2,stripe_count,max_threads) #endif integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL ! Private variables in OMP regions (my_thread) should better be in the argument list! 
! Fortran bindings to two C library routines, declared with bind(C):
!   posix_memalign(memptr, alignment, size) -> integer(c_int) result
!   free(ptr)
! Both take their scalar size/pointer arguments by value where marked so;
! memptr is passed by reference and is intent(inout).
module aligned_mem
  use, intrinsic :: iso_c_binding

  interface

    ! int posix_memalign(void **memptr, size_t alignment, size_t size)
    function posix_memalign(memptr, alignment, size) result(error) bind(C, name="posix_memalign")
      import :: c_int, c_size_t, c_ptr
      type(c_ptr),            intent(inout)     :: memptr
      integer(kind=c_size_t), intent(in), value :: alignment
      integer(kind=c_size_t), intent(in), value :: size
      integer(kind=c_int)                       :: error
    end function posix_memalign

    ! void free(void *ptr)
    subroutine free(ptr) bind(C, name="free")
      import :: c_ptr
      type(c_ptr), value :: ptr
    end subroutine free

  end interface

end module aligned_mem
#if REALCASE==1
subroutine elpa_transpose_vectors_real(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvs,nvr,nvc,nblk)
#endif
#if COMPLEXCASE==1
subroutine elpa_transpose_vectors_complex(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvs,nvr,nvc,nblk)
#endif

!-------------------------------------------------------------------------------
! This routine transposes an array of vectors which are distributed in
! communicator comm_s into its transposed form distributed in communicator comm_t.
! There must be an identical copy of vmat_s in every communicator comm_s.
! After this routine, there is an identical copy of vmat_t in every communicator comm_t.
!
! vmat_s    original array of vectors
! ld_s      leading dimension of vmat_s
! comm_s    communicator over which vmat_s is distributed
! vmat_t    array of vectors in transposed form
! ld_t      leading dimension of vmat_t
! comm_t    communicator over which vmat_t is distributed
! nvs       global index where to start in vmat_s/vmat_t
!           Please note: this is kind of a hint, some values before nvs will be
!           accessed in vmat_s/put into vmat_t
! nvr       global length of vmat_s/vmat_t
! nvc       number of columns in vmat_s/vmat_t
! nblk      block size of block cyclic distribution
!
! This is a template: the including file selects the routine name through
! REALCASE / COMPLEXCASE and must define DATATYPE accordingly.
! least_common_multiple is expected to be accessible from the including scope.
!
!-------------------------------------------------------------------------------
   use precision
!   use ELPA1 ! for least_common_multiple
#ifdef WITH_OPENMP
   use omp_lib
#endif
   use elpa_mpi
   implicit none

   integer(kind=ik), intent(in) :: ld_s, comm_s, ld_t, comm_t, nvs, nvr, nvc, nblk
   DATATYPE, intent(in) :: vmat_s(ld_s,nvc)
   DATATYPE, intent(inout) :: vmat_t(ld_t,nvc)

   ! aux: contiguous buffer holding the blocks exchanged in one step
   DATATYPE, allocatable :: aux(:)
   ! myps/mypt: rank of this process in comm_s/comm_t; nps/npt: their sizes
   integer(kind=ik) :: myps, mypt, nps, npt
   integer(kind=ik) :: n, lc, k, i, ips, ipt, ns, nl, mpierr
   integer(kind=ik) :: lcm_s_t, nblks_tot, nblks_comm, nblks_skip
   integer(kind=ik) :: auxstride

   call mpi_comm_rank(comm_s,myps,mpierr)
   call mpi_comm_size(comm_s,nps ,mpierr)
   call mpi_comm_rank(comm_t,mypt,mpierr)
   call mpi_comm_size(comm_t,npt ,mpierr)

   ! The basic idea of this routine is that for every block (in the block cyclic
   ! distribution), the processor within comm_t which owns the diagonal
   ! broadcasts its values of vmat_s to all processors within comm_t.
   ! Of course this has not to be done for every block separately, since
   ! the communication pattern repeats in the global matrix after
   ! the least common multiple of (nps,npt) blocks
   ! NOTE(review): the MPI_Bcast below is issued on comm_s with root ips;
   ! confirm how that relates to the "within comm_t" wording above.

   lcm_s_t = least_common_multiple(nps,npt) ! least common multiple of nps, npt

   nblks_tot = (nvr+nblk-1)/nblk ! number of blocks corresponding to nvr

   ! Get the number of blocks to be skipped at the begin.
   ! This must be a multiple of lcm_s_t (else it is getting complicated),
   ! thus some elements before nvs will be accessed/set.

   nblks_skip = ((nvs-1)/(nblk*lcm_s_t))*lcm_s_t

   ! large enough for the blocks of one residue class n, for all nvc columns
   allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc ))

#ifdef WITH_OPENMP
   !$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
#endif
   ! n enumerates the residue classes of block indices modulo lcm_s_t
   do n = 0, lcm_s_t-1

      ips = mod(n,nps)
      ipt = mod(n,npt)

      if(mypt == ipt) then

         ! number of blocks in residue class n that have to be handled
         nblks_comm = (nblks_tot-nblks_skip-n+lcm_s_t-1)/lcm_s_t
         ! distance in aux between consecutive columns
         auxstride = nblk * nblks_comm
!        if(nblks_comm==0) cycle
         if (nblks_comm .ne. 0) then
            if(myps == ips) then
               ! this process is the broadcast root: gather its blocks into aux
!              k = 0
#ifdef WITH_OPENMP
               !$omp do
#endif
               do lc=1,nvc
                  do i = nblks_skip+n, nblks_tot-1, lcm_s_t
                     k = (i - nblks_skip - n)/lcm_s_t * nblk + (lc - 1) * auxstride
                     ns = (i/nps)*nblk ! local start of block i
                     nl = min(nvr-i*nblk,nblk) ! length
                     aux(k+1:k+nl) = vmat_s(ns+1:ns+nl,lc)
!                    k = k+nblk
                  enddo
               enddo
            endif

            ! all threads must have finished filling aux before the master
            ! thread communicates, and wait again before aux is read
#ifdef WITH_OPENMP
            !$omp barrier
            !$omp master
#endif

#ifdef WITH_MPI

#if COMPLEXCASE==1
            call MPI_Bcast(aux,nblks_comm*nblk*nvc,MPI_DOUBLE_COMPLEX,ips,comm_s,mpierr)
#endif

#if REALCASE==1
            call MPI_Bcast(aux,nblks_comm*nblk*nvc,MPI_REAL8,ips,comm_s,mpierr)
#endif

#endif /* WITH_MPI */

#ifdef WITH_OPENMP
            !$omp end master
            !$omp barrier

            !$omp do
#endif
!           k = 0
            ! scatter the received blocks into vmat_t (local start uses npt)
            do lc=1,nvc
               do i = nblks_skip+n, nblks_tot-1, lcm_s_t
                  k = (i - nblks_skip - n)/lcm_s_t * nblk + (lc - 1) * auxstride
                  ns = (i/npt)*nblk ! local start of block i
                  nl = min(nvr-i*nblk,nblk) ! length
                  vmat_t(ns+1:ns+nl,lc) = aux(k+1:k+nl)
!                 k = k+nblk
               enddo
            enddo
         endif
      endif

   enddo
#ifdef WITH_OPENMP
   !$omp end parallel
#endif
   deallocate(aux)

end subroutine
The ELPA library was originally created by the ELPA consortium, ! consisting of the following organizations: ! ! - Max Planck Computing and Data Facility (MPCDF), formerly known as ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte ! Informatik, ! - Technische Universität München, Lehrstuhl für Informatik mit ! Schwerpunkt Wissenschaftliches Rechnen, ! - Fritz-Haber-Institut, Berlin, Abt. Theorie, ! - Max-Planck-Institut für Mathematik in den Naturwissenschaften, ! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, ! and ! - IBM Deutschland GmbH ! ! ! More information can be found here: ! http://elpa.mpcdf.mpg.de/ ! ! ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see <http://www.gnu.org/licenses/> ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! !
! Central definition of the kind parameters used throughout the library.
! All kinds are taken from the C-interoperable kinds of iso_c_binding:
!   rk  - real kind          (C_DOUBLE)
!   ck  - complex kind       (C_DOUBLE)
!   ik  - integer kind       (C_INT32_T)
!   lik - long integer kind  (C_INT64_T)
module precision
  use iso_c_binding, only : C_FLOAT, C_DOUBLE, C_INT32_T, C_INT64_T

  implicit none

  ! floating point kinds (real and complex share the same kind value)
  integer, parameter :: rk = C_DOUBLE, &
                        ck = C_DOUBLE

  ! integer kinds
  integer, parameter :: ik  = C_INT32_T, &
                        lik = C_INT64_T
end module precision
#if REALCASE==1
subroutine elpa_reduce_add_vectors_real(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvr,nvc,nblk)
#endif
#if COMPLEXCASE==1
subroutine elpa_reduce_add_vectors_complex(vmat_s,ld_s,comm_s,vmat_t,ld_t,comm_t,nvr,nvc,nblk)
#endif

!-------------------------------------------------------------------------------
! This routine does a reduce of all vectors in vmat_s over the communicator comm_t.
! The result of the reduce is gathered on the processors owning the diagonal
! and added to the array of vectors vmat_t (which is distributed over comm_t).
!
! Opposed to elpa_transpose_vectors, there is NO identical copy of vmat_s
! in the different members within vmat_t (else a reduce wouldn't be necessary).
! After this routine, an allreduce of vmat_t has to be done.
!
! vmat_s    array of vectors to be reduced and added
! ld_s      leading dimension of vmat_s
! comm_s    communicator over which vmat_s is distributed
! vmat_t    array of vectors to which vmat_s is added
! ld_t      leading dimension of vmat_t
! comm_t    communicator over which vmat_t is distributed
! nvr       global length of vmat_s/vmat_t
! nvc       number of columns in vmat_s/vmat_t
! nblk      block size of block cyclic distribution
!
! This is a template: the including file selects the routine name through
! REALCASE / COMPLEXCASE and must define DATATYPE accordingly.
! least_common_multiple is expected to be accessible from the including scope.
!
!-------------------------------------------------------------------------------

   use precision
!   use ELPA1 ! for least_common_multiple
#ifdef WITH_OPENMP
   use omp_lib
#endif
   use elpa_mpi
   implicit none

   integer(kind=ik), intent(in) :: ld_s, comm_s, ld_t, comm_t, nvr, nvc, nblk
   DATATYPE, intent(in) :: vmat_s(ld_s,nvc)
   DATATYPE, intent(inout) :: vmat_t(ld_t,nvc)

   ! aux1: local contribution that is reduced; aux2: result of the reduction
   DATATYPE, allocatable :: aux1(:), aux2(:)
   ! myps/mypt: rank of this process in comm_s/comm_t; nps/npt: their sizes
   integer(kind=ik) :: myps, mypt, nps, npt
   integer(kind=ik) :: n, lc, k, i, ips, ipt, ns, nl, mpierr
   integer(kind=ik) :: lcm_s_t, nblks_tot
   integer(kind=ik) :: auxstride

   call mpi_comm_rank(comm_s,myps,mpierr)
   call mpi_comm_size(comm_s,nps ,mpierr)
   call mpi_comm_rank(comm_t,mypt,mpierr)
   call mpi_comm_size(comm_t,npt ,mpierr)

   ! Look to elpa_transpose_vectors for the basic idea!

   ! The communication pattern repeats in the global matrix after
   ! the least common multiple of (nps,npt) blocks

   lcm_s_t = least_common_multiple(nps,npt) ! least common multiple of nps, npt

   nblks_tot = (nvr+nblk-1)/nblk ! number of blocks corresponding to nvr

   ! both buffers hold the blocks of one residue class n for all nvc columns
   allocate(aux1( ((nblks_tot+lcm_s_t-1)/lcm_s_t) * nblk * nvc ))
   allocate(aux2( ((nblks_tot+lcm_s_t-1)/lcm_s_t) * nblk * nvc ))
   aux1(:) = 0
   aux2(:) = 0

   ! The sequential loop variable n is private in the parallel region anyway;
   ! it is listed explicitly to match elpa_transpose_vectors.
#ifdef WITH_OPENMP
   !$omp parallel private(ips, ipt, auxstride, lc, i, k, ns, nl, n)
#endif
   ! n enumerates the residue classes of block indices modulo lcm_s_t
   do n = 0, lcm_s_t-1

      ips = mod(n,nps)
      ipt = mod(n,npt)

      ! distance in aux1/aux2 between consecutive columns
      auxstride = nblk * ((nblks_tot - n + lcm_s_t - 1)/lcm_s_t)

      if(myps == ips) then

         ! gather the local blocks of residue class n into aux1
!        k = 0
#ifdef WITH_OPENMP
         !$omp do
#endif
         do lc=1,nvc
            do i = n, nblks_tot-1, lcm_s_t
               k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride
               ns = (i/nps)*nblk ! local start of block i
               nl = min(nvr-i*nblk,nblk) ! length
               aux1(k+1:k+nl) = vmat_s(ns+1:ns+nl,lc)
!              k = k+nblk
            enddo
         enddo

         ! number of elements taking part in the reduction
         k = nvc * auxstride

         ! all threads must have finished filling aux1 before the master
         ! thread communicates, and wait again before aux2 is read
#ifdef WITH_OPENMP
         !$omp barrier
         !$omp master
#endif

#ifdef WITH_MPI

#if REALCASE==1
         if(k>0) call mpi_reduce(aux1,aux2,k,MPI_REAL8,MPI_SUM,ipt,comm_t,mpierr)
#endif

#if COMPLEXCASE==1
         if(k>0) call mpi_reduce(aux1,aux2,k,MPI_DOUBLE_COMPLEX,MPI_SUM,ipt,comm_t,mpierr)
#endif

#else /* WITH_MPI */
         ! without MPI the "reduction" is a plain copy
         if(k>0) aux2 = aux1
#endif /* WITH_MPI */

#ifdef WITH_OPENMP
         !$omp end master
         !$omp barrier
#endif
         ! only the root of the reduction adds the result to vmat_t
         if (mypt == ipt) then
!           k = 0
#ifdef WITH_OPENMP
            !$omp do
#endif
            do lc=1,nvc
               do i = n, nblks_tot-1, lcm_s_t
                  k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride
                  ns = (i/npt)*nblk ! local start of block i
                  nl = min(nvr-i*nblk,nblk) ! length
                  vmat_t(ns+1:ns+nl,lc) = vmat_t(ns+1:ns+nl,lc) + aux2(k+1:k+nl)
!                 k = k+nblk
               enddo
            enddo
         endif

      endif

   enddo
#ifdef WITH_OPENMP
   !$omp end parallel
#endif

   deallocate(aux1)
   deallocate(aux2)

end subroutine
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! Author: Andreas Marek, MPCDF
#include "config-f90.h"
!c> #include <complex.h>

!c> /*! \brief C old, deprecated interface to create the MPI communicators for ELPA
!c> *
!c> * \param mpi_comm_world   MPI global communicator (in)
!c> * \param my_prow          Row coordinate of the calling process in the process grid (in)
!c> * \param my_pcol          Column coordinate of the calling process in the process grid (in)
!c> * \param mpi_comm_rows    Communicator for communicating within rows of processes (out)
!c> * \param mpi_comm_cols    Communicator for communicating within columns of processes (out)
!c> * \result int             integer error value of mpi_comm_split function
!c> */
!c> int elpa_get_communicators(int mpi_comm_world, int my_prow, int my_pcol, int *mpi_comm_rows, int *mpi_comm_cols);
  ! bind(C) shim exported under the C symbol "elpa_get_communicators".
  ! It forwards its five arguments unchanged to get_elpa_row_col_comms (module
  ! elpa1) and returns that function's integer result to the C caller.
  function get_elpa_row_col_comms_wrapper_c_name1(mpi_comm_world, my_prow, my_pcol, &
                                                  mpi_comm_rows, mpi_comm_cols)     &
                                                  result(ierr) bind(C,name="elpa_get_communicators")
    use, intrinsic :: iso_c_binding
    use elpa1, only : get_elpa_row_col_comms

    implicit none
    ! received by value from C
    integer(kind=c_int), value :: mpi_comm_world
    integer(kind=c_int), value :: my_prow, my_pcol
    ! received by reference from C (int *)
    integer(kind=c_int)        :: mpi_comm_rows
    integer(kind=c_int)        :: mpi_comm_cols
    ! function result
    integer(kind=c_int)        :: ierr

    ierr = get_elpa_row_col_comms(mpi_comm_world, my_prow, my_pcol, &
                                  mpi_comm_rows, mpi_comm_cols)

  end function
!c> #include <complex.h>

!c> /*!
!c>  \brief C interface to create the MPI communicators for ELPA
!c> *
!c> * \param mpi_comm_world   MPI global communicator (in)
!c> * \param my_prow          Row coordinate of the calling process in the process grid (in)
!c> * \param my_pcol          Column coordinate of the calling process in the process grid (in)
!c> * \param mpi_comm_rows    Communicator for communicating within rows of processes (out)
!c> * \param mpi_comm_cols    Communicator for communicating within columns of processes (out)
!c> * \result int             integer error value of mpi_comm_split function
!c> */
!c> int get_elpa_communicators(int mpi_comm_world, int my_prow, int my_pcol, int *mpi_comm_rows, int *mpi_comm_cols);
  ! bind(C) shim exported under the C symbol "get_elpa_communicators".
  ! Same forwarding as the "elpa_get_communicators" entry point: all five
  ! arguments go unchanged to get_elpa_row_col_comms (module elpa1) and its
  ! integer result is returned to the C caller.
  function get_elpa_row_col_comms_wrapper_c_name2(mpi_comm_world, my_prow, my_pcol, &
                                                  mpi_comm_rows, mpi_comm_cols)     &
                                                  result(ierr) bind(C,name="get_elpa_communicators")
    use, intrinsic :: iso_c_binding
    use elpa1, only : get_elpa_row_col_comms

    implicit none
    ! received by value from C
    integer(kind=c_int), value :: mpi_comm_world
    integer(kind=c_int), value :: my_prow, my_pcol
    ! received by reference from C (int *)
    integer(kind=c_int)        :: mpi_comm_rows
    integer(kind=c_int)        :: mpi_comm_cols
    ! function result
    integer(kind=c_int)        :: ierr

    ierr = get_elpa_row_col_comms(mpi_comm_world, my_prow, my_pcol, &
                                  mpi_comm_rows, mpi_comm_cols)

  end function

!c> /*! \brief C interface to solve the real eigenvalue problem with 1-stage solver
!c> *
!c> * \param  na          Order of matrix a
!c> * \param  nev         Number of eigenvalues needed.
!c> *                     The smallest nev eigenvalues/eigenvectors are calculated.
!c> * \param  a           Distributed matrix for which eigenvalues are to be computed.
!c> *                     Distribution is like in Scalapack.
!c> *                     The full matrix must be set (not only one half like in scalapack).
!c> * \param lda          Leading dimension of a
!c> * \param ev(na)       On output: eigenvalues of a, every processor gets the complete set
!c> * \param q            On output: Eigenvectors of a
!c> *                     Distribution is like in Scalapack.
!c> *                     Must be always dimensioned to the full size (corresponding to (na,na))
!c> *                     even if only a part of the eigenvalues is needed.
!c> * \param ldq          Leading dimension of q
!c> * \param nblk         blocksize of cyclic distribution, must be the same in both directions!
!c> * \param matrixCols   distributed number of matrix columns
!c> * \param mpi_comm_rows        MPI-Communicator for rows
!c> * \param mpi_comm_cols        MPI-Communicator for columns
!c> *
!c> * \result int: 1 on success (the Fortran solver returned .true.), otherwise 0
!c> */
!c> int elpa_solve_evp_real_1stage(int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols);
  ! bind(C) shim exported as "elpa_solve_evp_real_1stage".
  ! Passes every argument straight through to solve_evp_real (module elpa1)
  ! and converts its logical result to a C int: 1 when the result is .true.,
  ! 0 when it is .false.
  function solve_elpa1_evp_real_wrapper(na, nev, a, lda, ev, q, ldq, nblk, &
                                        matrixCols, mpi_comm_rows, mpi_comm_cols) &
                                        result(success) bind(C,name="elpa_solve_evp_real_1stage")

    use, intrinsic :: iso_c_binding
    use elpa1, only : solve_evp_real

    implicit none
    integer(kind=c_int)                    :: success
    integer(kind=c_int), value, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_cols, mpi_comm_rows
    ! local parts of the distributed matrices; extents are taken from the
    ! lda/ldq/matrixCols/na arguments supplied by the caller
    real(kind=c_double)                    :: a(1:lda,1:matrixCols), ev(1:na), q(1:ldq,1:matrixCols)

    logical                                :: successFortran

    successFortran = solve_evp_real(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, &
                                    mpi_comm_rows, mpi_comm_cols)

    ! logical -> C int (1 = .true., 0 = .false.)
    success = merge(1_c_int, 0_c_int, successFortran)

  end function

!c> /*! \brief C interface to solve the complex eigenvalue problem with 1-stage solver
!c> *
!c> * \param  na          Order of matrix a
!c> * \param  nev         Number of eigenvalues needed.
!c> *                     The smallest nev eigenvalues/eigenvectors are calculated.
!c> * \param  a           Distributed matrix for which eigenvalues are to be computed.
!c> *                     Distribution is like in Scalapack.
!c> *                     The full matrix must be set (not only one half like in scalapack).
!c> * \param lda          Leading dimension of a
!c> * \param ev(na)       On output: eigenvalues of a, every processor gets the complete set
!c> * \param q            On output: Eigenvectors of a
!c> *                     Distribution is like in Scalapack.
!c> *                     Must be always dimensioned to the full size (corresponding to (na,na))
!c> *                     even if only a part of the eigenvalues is needed.
!c> * \param ldq          Leading dimension of q
!c> * \param nblk         blocksize of cyclic distribution, must be the same in both directions!
!c> * \param matrixCols   distributed number of matrix columns
!c> * \param mpi_comm_rows        MPI-Communicator for rows
!c> * \param mpi_comm_cols        MPI-Communicator for columns
!c> *
!c> * \result int: 1 on success (the Fortran solver returned .true.), otherwise 0
!c> */
!c> int elpa_solve_evp_complex_1stage(int na, int nev, double complex *a, int lda, double *ev, double complex *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols);
  ! bind(C) shim exported as "elpa_solve_evp_complex_1stage".
  ! NOTE: despite the "real" in its Fortran name this function wraps the
  ! COMPLEX 1-stage solver (solve_evp_complex from module elpa1). The Fortran
  ! name is kept unchanged so that existing references keep working; C callers
  ! only see the bind(C) name.
  ! The logical solver result is converted to a C int: 1 for .true., 0 for
  ! .false.
  function solve_evp_real_wrapper(na, nev, a, lda, ev, q, ldq, nblk, &
                                  matrixCols, mpi_comm_rows, mpi_comm_cols) &
                                  result(success) bind(C,name="elpa_solve_evp_complex_1stage")

    use, intrinsic :: iso_c_binding
    use elpa1, only : solve_evp_complex

    implicit none
    integer(kind=c_int)                    :: success
    integer(kind=c_int), value, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_cols, mpi_comm_rows
    ! local parts of the distributed complex matrices
    complex(kind=c_double_complex)         :: a(1:lda,1:matrixCols), q(1:ldq,1:matrixCols)
    ! eigenvalues are real
    real(kind=c_double)                    :: ev(1:na)

    logical                                :: successFortran

    successFortran = solve_evp_complex(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, &
                                       mpi_comm_rows, mpi_comm_cols)

    ! logical -> C int (1 = .true., 0 = .false.)
    success = merge(1_c_int, 0_c_int, successFortran)

  end function

!c> /*! \brief C interface to solve the real eigenvalue problem with 2-stage solver
!c> *
!c> * \param  na          Order of matrix a
!c> * \param  nev         Number of eigenvalues needed.
!c> *                     The smallest nev eigenvalues/eigenvectors are calculated.
!c> * \param  a           Distributed matrix for which eigenvalues are to be computed.
!c> *                     Distribution is like in Scalapack.
!c> *                     The full matrix must be set (not only one half like in scalapack).
!c> * \param lda          Leading dimension of a
!c> * \param ev(na)       On output: eigenvalues of a, every processor gets the complete set
!c> * \param q            On output: Eigenvectors of a
!c> *                     Distribution is like in Scalapack.
!c> *                     Must be always dimensioned to the full size (corresponding to (na,na))
!c> *                     even if only a part of the eigenvalues is needed.
!c> * \param ldq          Leading dimension of q
!c> * \param nblk         blocksize of cyclic distribution, must be the same in both directions!
!c> * \param matrixCols   distributed number of matrix columns
!c> * \param mpi_comm_rows        MPI-Communicator for rows
!c> * \param mpi_comm_cols        MPI-Communicator for columns
!c> * \param mpi_comm_all         MPI communicator for the total processor set
!c> * \param THIS_REAL_ELPA_KERNEL_API  specify used ELPA2 kernel via API
!c> * \param useQR                use QR decomposition: 0 = no, any other value = yes
!c> *
!c> * \result int: 1 on success (the Fortran solver returned .true.), otherwise 0
!c> */
!c> int elpa_solve_evp_real_2stage(int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_REAL_ELPA_KERNEL_API, int useQR);
  ! bind(C) shim exported as "elpa_solve_evp_real_2stage".
  ! Converts the integer useQR flag to a Fortran logical, calls
  ! solve_evp_real_2stage (module elpa2) with otherwise unchanged arguments
  ! and converts its logical result to a C int: 1 for .true., 0 for .false.
  function solve_elpa2_evp_real_wrapper(na, nev, a, lda, ev, q, ldq, nblk,    &
                                        matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
                                        THIS_REAL_ELPA_KERNEL_API, useQR)    &
                                        result(success) bind(C,name="elpa_solve_evp_real_2stage")

    use, intrinsic :: iso_c_binding
    use elpa2, only : solve_evp_real_2stage

    implicit none
    integer(kind=c_int)                    :: success
    integer(kind=c_int), value, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_cols, mpi_comm_rows, &
                                              mpi_comm_all
    integer(kind=c_int), value, intent(in) :: THIS_REAL_ELPA_KERNEL_API, useQR
    ! local parts of the distributed matrices
    real(kind=c_double)                    :: a(1:lda,1:matrixCols), ev(1:na), q(1:ldq,1:matrixCols)

    logical                                :: successFortran, useQRFortran

    ! C int flag -> logical: only the value 0 disables QR
    useQRFortran = (useQR .ne. 0)

    successFortran = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, &
                                           mpi_comm_cols, mpi_comm_all,                                  &
                                           THIS_REAL_ELPA_KERNEL_API, useQRFortran)

    ! logical -> C int (1 = .true., 0 = .false.)
    success = merge(1_c_int, 0_c_int, successFortran)

  end function

!c> /*! \brief C interface to solve the complex eigenvalue problem with 2-stage solver
!c> *
!c> * \param  na          Order of matrix a
!c> * \param  nev         Number of eigenvalues needed.
!c> *                     The smallest nev eigenvalues/eigenvectors are calculated.
!c> * \param  a           Distributed matrix for which eigenvalues are to be computed.
!c> *                     Distribution is like in Scalapack.
!c> *                     The full matrix must be set (not only one half like in scalapack).
!c> * \param lda          Leading dimension of a
!c> * \param ev(na)       On output: eigenvalues of a, every processor gets the complete set
!c> * \param q            On output: Eigenvectors of a
!c> *                     Distribution is like in Scalapack.
!c> *                     Must be always dimensioned to the full size (corresponding to (na,na))
!c> *                     even if only a part of the eigenvalues is needed.
!c> * \param ldq          Leading dimension of q
!c> * \param nblk         blocksize of cyclic distribution, must be the same in both directions!
!c> * \param matrixCols   distributed number of matrix columns
!c> * \param mpi_comm_rows        MPI-Communicator for rows
!c> * \param mpi_comm_cols        MPI-Communicator for columns
!c> * \param mpi_comm_all         MPI communicator for the total processor set
!c> * \param THIS_COMPLEX_ELPA_KERNEL_API  specify used ELPA2 kernel via API
!c> *
!c> * \result int: 1 on success (the Fortran solver returned .true.), otherwise 0
!c> */
!c> int elpa_solve_evp_complex_2stage(int na, int nev, double complex *a, int lda, double *ev, double complex *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_COMPLEX_ELPA_KERNEL_API);
  ! bind(C) shim exported as "elpa_solve_evp_complex_2stage".
  ! Passes every argument straight through to solve_evp_complex_2stage
  ! (module elpa2) and converts its logical result to a C int: 1 for .true.,
  ! 0 for .false. Unlike the real 2-stage wrapper there is no QR switch.
  function solve_elpa2_evp_complex_wrapper(na, nev, a, lda, ev, q, ldq, nblk,    &
                                           matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
                                           THIS_COMPLEX_ELPA_KERNEL_API)        &
                                           result(success) bind(C,name="elpa_solve_evp_complex_2stage")

    use, intrinsic :: iso_c_binding
    use elpa2, only : solve_evp_complex_2stage

    implicit none
    integer(kind=c_int)                    :: success
    integer(kind=c_int), value, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_cols, mpi_comm_rows, &
                                              mpi_comm_all
    integer(kind=c_int), value, intent(in) :: THIS_COMPLEX_ELPA_KERNEL_API
    ! local parts of the distributed complex matrices
    complex(kind=c_double_complex)         :: a(1:lda,1:matrixCols), q(1:ldq,1:matrixCols)
    ! eigenvalues are real
    real(kind=c_double)                    :: ev(1:na)

    logical                                :: successFortran

    successFortran = solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, &
                                              mpi_comm_rows, mpi_comm_cols,                  &
                                              mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API)

    ! logical -> C int (1 = .true., 0 = .false.)
    success = merge(1_c_int, 0_c_int, successFortran)

  end function
elpa-2016.05.001/src/elpa2_utilities.F900000644000312500001440000007422012717516040014177 00000000000000! This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".

! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".

#include "config-f90.h"
! NOTE(review): the header name was missing in this copy of the file; it is
! restored here as the header that provides the ELPA2_*_KERNEL_* constants
! used below -- confirm against the installed include directory.
#include <elpa/elpa_kernel_constants.h>

! Module ELPA2_utilities
!
! Declares the integer identifiers and printable names of the ELPA2 real and
! complex kernels, the compile-time default kernel for each case, and the
! tables that record which kernels were enabled at configure time. The
! contained procedures (further below) query these tables and the
! environment.
module ELPA2_utilities
  use ELPA_utilities
  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

  public :: get_actual_real_kernel_name, get_actual_complex_kernel_name
  public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GENERIC_SIMPLE, &
            REAL_ELPA_KERNEL_BGP, REAL_ELPA_KERNEL_BGQ,                &
            REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_SSE_BLOCK2,         &
            REAL_ELPA_KERNEL_SSE_BLOCK4, REAL_ELPA_KERNEL_SSE_BLOCK6,  &
            REAL_ELPA_KERNEL_AVX_BLOCK2,                               &
            REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6,  &
            REAL_ELPA_KERNEL_AVX2_BLOCK2,                              &
            REAL_ELPA_KERNEL_AVX2_BLOCK4, REAL_ELPA_KERNEL_AVX2_BLOCK6,&
            DEFAULT_REAL_ELPA_KERNEL

  public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, &
            COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ,                &
            COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_SSE_BLOCK1,         &
            COMPLEX_ELPA_KERNEL_SSE_BLOCK2,                                  &
            COMPLEX_ELPA_KERNEL_AVX_BLOCK1,COMPLEX_ELPA_KERNEL_AVX_BLOCK2,   &
            COMPLEX_ELPA_KERNEL_AVX2_BLOCK1,COMPLEX_ELPA_KERNEL_AVX2_BLOCK2, &
            DEFAULT_COMPLEX_ELPA_KERNEL

  public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES

  public :: get_actual_complex_kernel, get_actual_real_kernel

  public :: check_allowed_complex_kernels, check_allowed_real_kernels

  public :: AVAILABLE_COMPLEX_ELPA_KERNELS, AVAILABLE_REAL_ELPA_KERNELS

  public :: print_available_real_kernels, print_available_complex_kernels
  public :: query_available_real_kernels, query_available_complex_kernels

  public :: qr_decomposition_via_environment_variable

  ! ---------------------------------------------------------------------------
  ! Real kernels: identifiers (values come from the preprocessor constants)
  ! ---------------------------------------------------------------------------
  integer, parameter :: number_of_real_kernels           = ELPA2_NUMBER_OF_REAL_KERNELS
  integer, parameter :: REAL_ELPA_KERNEL_GENERIC         = ELPA2_REAL_KERNEL_GENERIC
  integer, parameter :: REAL_ELPA_KERNEL_GENERIC_SIMPLE  = ELPA2_REAL_KERNEL_GENERIC_SIMPLE
  integer, parameter :: REAL_ELPA_KERNEL_BGP             = ELPA2_REAL_KERNEL_BGP
  integer, parameter :: REAL_ELPA_KERNEL_BGQ             = ELPA2_REAL_KERNEL_BGQ
  integer, parameter :: REAL_ELPA_KERNEL_SSE             = ELPA2_REAL_KERNEL_SSE
  integer, parameter :: REAL_ELPA_KERNEL_SSE_BLOCK2      = ELPA2_REAL_KERNEL_SSE_BLOCK2
  integer, parameter :: REAL_ELPA_KERNEL_SSE_BLOCK4      = ELPA2_REAL_KERNEL_SSE_BLOCK4
  integer, parameter :: REAL_ELPA_KERNEL_SSE_BLOCK6      = ELPA2_REAL_KERNEL_SSE_BLOCK6
  integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2      = ELPA2_REAL_KERNEL_AVX_BLOCK2
  integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4      = ELPA2_REAL_KERNEL_AVX_BLOCK4
  integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6      = ELPA2_REAL_KERNEL_AVX_BLOCK6
  integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK2     = ELPA2_REAL_KERNEL_AVX2_BLOCK2
  integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK4     = ELPA2_REAL_KERNEL_AVX2_BLOCK4
  integer, parameter :: REAL_ELPA_KERNEL_AVX2_BLOCK6     = ELPA2_REAL_KERNEL_AVX2_BLOCK6

  ! ---------------------------------------------------------------------------
  ! Default real kernel.
  ! Without a single pre-selected kernel the generic kernel is the default;
  ! otherwise the default is whichever kernel was enabled. Within the SSE and
  ! AVX block families the largest enabled block size wins.
  ! (This selection used to be written out twice inside an outer conditional
  ! on the AVX block2 kernel with two identical branches; it is now stated
  ! once. The BGP/BGQ entries previously referred to names that are not
  ! declared anywhere in this module.)
  ! ---------------------------------------------------------------------------
#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */

#ifdef WITH_REAL_GENERIC_KERNEL
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE
#endif

#if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL)
#ifdef WITH_REAL_SSE_BLOCK6_KERNEL
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK6
#else
#ifdef WITH_REAL_SSE_BLOCK4_KERNEL
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK4
#else
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE_BLOCK2
#endif
#endif
#endif
#endif /* defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) */

#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL)
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6
#else
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4
#else
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2
#endif
#endif
#endif
#endif /* defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) */

#ifdef WITH_REAL_BGP_KERNEL
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_BGP
#endif
#ifdef WITH_REAL_BGQ_KERNEL
  integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_BGQ
#endif

#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */

  ! Printable names of the real kernels, indexed by the kernel identifier.
  ! The explicit character(35) type-spec pads every entry to the declared
  ! length, so the literals need not all have the same length.
  character(35), parameter, dimension(number_of_real_kernels) :: &
  REAL_ELPA_KERNEL_NAMES =    (/ character(35) ::                &
                                "REAL_ELPA_KERNEL_GENERIC ",       &
                                "REAL_ELPA_KERNEL_GENERIC_SIMPLE ",&
                                "REAL_ELPA_KERNEL_BGP ",           &
                                "REAL_ELPA_KERNEL_BGQ ",           &
                                "REAL_ELPA_KERNEL_SSE ",           &
                                "REAL_ELPA_KERNEL_SSE_BLOCK2 ",    &
                                "REAL_ELPA_KERNEL_SSE_BLOCK4 ",    &
                                "REAL_ELPA_KERNEL_SSE_BLOCK6 ",    &
                                "REAL_ELPA_KERNEL_AVX_BLOCK2 ",    &
                                "REAL_ELPA_KERNEL_AVX_BLOCK4 ",    &
                                "REAL_ELPA_KERNEL_AVX_BLOCK6 ",    &
                                "REAL_ELPA_KERNEL_AVX2_BLOCK2 ",   &
                                "REAL_ELPA_KERNEL_AVX2_BLOCK4 ",   &
                                "REAL_ELPA_KERNEL_AVX2_BLOCK6 "/)

  ! ---------------------------------------------------------------------------
  ! Complex kernels: identifiers (values come from the preprocessor constants)
  ! ---------------------------------------------------------------------------
  integer, parameter :: number_of_complex_kernels           = ELPA2_NUMBER_OF_COMPLEX_KERNELS
  integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC         = ELPA2_COMPLEX_KERNEL_GENERIC
  integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE  = ELPA2_COMPLEX_KERNEL_GENERIC_SIMPLE
  integer, parameter :: COMPLEX_ELPA_KERNEL_BGP             = ELPA2_COMPLEX_KERNEL_BGP
  integer, parameter :: COMPLEX_ELPA_KERNEL_BGQ             = ELPA2_COMPLEX_KERNEL_BGQ
  integer, parameter :: COMPLEX_ELPA_KERNEL_SSE             = ELPA2_COMPLEX_KERNEL_SSE
  integer, parameter :: COMPLEX_ELPA_KERNEL_SSE_BLOCK1      = ELPA2_COMPLEX_KERNEL_SSE_BLOCK1
  integer, parameter :: COMPLEX_ELPA_KERNEL_SSE_BLOCK2      = ELPA2_COMPLEX_KERNEL_SSE_BLOCK2
  integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK1      = ELPA2_COMPLEX_KERNEL_AVX_BLOCK1
  integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK2      = ELPA2_COMPLEX_KERNEL_AVX_BLOCK2
  integer, parameter :: COMPLEX_ELPA_KERNEL_AVX2_BLOCK1     = ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1
  integer, parameter :: COMPLEX_ELPA_KERNEL_AVX2_BLOCK2     = ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2

  ! ---------------------------------------------------------------------------
  ! Default complex kernel; same scheme as for the real case. As before, the
  ! outer conditional with two identical branches is stated only once.
  ! ---------------------------------------------------------------------------
#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
  integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */

  ! go through all kernels and set them
#ifdef WITH_COMPLEX_GENERIC_KERNEL
  integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
  integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
  integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE
#endif

#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
  integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK2
#else
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
  integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) */

#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
  integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2
#else
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
  integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) */

#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */

  ! Printable names of the complex kernels, indexed by the kernel identifier.
  ! The explicit character(35) type-spec pads every entry to the declared
  ! length.
  character(35), parameter, dimension(number_of_complex_kernels) :: &
  COMPLEX_ELPA_KERNEL_NAMES = (/ character(35) ::                   &
                                "COMPLEX_ELPA_KERNEL_GENERIC ",        &
                                "COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE ", &
                                "COMPLEX_ELPA_KERNEL_BGP ",            &
                                "COMPLEX_ELPA_KERNEL_BGQ ",            &
                                "COMPLEX_ELPA_KERNEL_SSE ",            &
                                "COMPLEX_ELPA_KERNEL_SSE_BLOCK1 ",     &
                                "COMPLEX_ELPA_KERNEL_SSE_BLOCK2 ",     &
                                "COMPLEX_ELPA_KERNEL_AVX_BLOCK1 ",     &
                                "COMPLEX_ELPA_KERNEL_AVX_BLOCK2 ",     &
                                "COMPLEX_ELPA_KERNEL_AVX2_BLOCK1 ",    &
                                "COMPLEX_ELPA_KERNEL_AVX2_BLOCK2 "/)

integer, parameter :: & AVAILABLE_REAL_ELPA_KERNELS(number_of_real_kernels) = & (/ & #if WITH_REAL_GENERIC_KERNEL 1 & #else 0 & #endif #if WITH_REAL_GENERIC_SIMPLE_KERNEL ,1 & #else ,0 & #endif #if WITH_REAL_BGP_KERNEL ,1 & #else ,0 & #endif #if WITH_REAL_BGQ_KERNEL ,1 & #else ,0 & #endif #if WITH_REAL_SSE_ASSEMBLY_KERNEL ,1 & #else ,0 & #endif #if WITH_REAL_SSE_BLOCK2_KERNEL ,1 & #else ,0 & #endif #if WITH_REAL_SSE_BLOCK4_KERNEL ,1 & #else ,0 & #endif #if WITH_REAL_SSE_BLOCK6_KERNEL ,1 & #else ,0 & #endif #if WITH_REAL_AVX_BLOCK2_KERNEL ,1 & #else ,0 & #endif #if WITH_REAL_AVX_BLOCK4_KERNEL ,1 & #else ,0 & #endif #if WITH_REAL_AVX_BLOCK6_KERNEL ,1 & #else ,0 & #endif #if WITH_REAL_AVX2_BLOCK2_KERNEL ,1 & #else ,0 & #endif #if WITH_REAL_AVX2_BLOCK4_KERNEL ,1 & #else ,0 & #endif #if WITH_REAL_AVX2_BLOCK6_KERNEL ,1 & #else ,0 & #endif /) integer, parameter :: & AVAILABLE_COMPLEX_ELPA_KERNELS(number_of_complex_kernels) = & (/ & #if WITH_COMPLEX_GENERIC_KERNEL 1 & #else 0 & #endif #if WITH_COMPLEX_GENERIC_SIMPLE_KERNEL ,1 & #else ,0 & #endif #if WITH_COMPLEX_BGP_KERNEL ,1 & #else ,0 & #endif #if WITH_COMPLEX_BGQ_KERNEL ,1 & #else ,0 & #endif #if WITH_COMPLEX_SSE_ASSEMBLY_KERNEL ,1 & #else ,0 & #endif #if WITH_COMPLEX_SSE_BLOCK1_KERNEL ,1 & #else ,0 & #endif #if WITH_COMPLEX_SSE_BLOCK2_KERNEL ,1 & #else ,0 & #endif #if WITH_COMPLEX_AVX_BLOCK1_KERNEL ,1 & #else ,0 & #endif #if WITH_COMPLEX_AVX_BLOCK2_KERNEL ,1 & #else ,0 & #endif #if WITH_COMPLEX_AVX2_BLOCK1_KERNEL ,1 & #else ,0 & #endif #if WITH_COMPLEX_AVX2_BLOCK2_KERNEL ,1 & #else ,0 & #endif /) !****** contains subroutine print_available_real_kernels #ifdef HAVE_DETAILED_TIMINGS use timings #endif implicit none integer :: i #ifdef HAVE_DETAILED_TIMINGS call timer%start("print_available_real_kernels") #endif do i=1, number_of_real_kernels if (AVAILABLE_REAL_ELPA_KERNELS(i) .eq. 
1) then write(*,*) REAL_ELPA_KERNEL_NAMES(i) endif enddo write(*,*) " " write(*,*) " At the moment the following kernel would be choosen:" write(*,*) get_actual_real_kernel_name() #ifdef HAVE_DETAILED_TIMINGS call timer%stop("print_available_real_kernels") #endif end subroutine print_available_real_kernels subroutine query_available_real_kernels #ifdef HAVE_DETAILED_TIMINGS use timings #endif implicit none integer :: i #ifdef HAVE_DETAILED_TIMINGS call timer%start("query_available_real_kernels") #endif do i=1, number_of_real_kernels if (AVAILABLE_REAL_ELPA_KERNELS(i) .eq. 1) then write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i) endif enddo write(error_unit,*) " " write(error_unit,*) " At the moment the following kernel would be choosen:" write(error_unit,*) get_actual_real_kernel_name() #ifdef HAVE_DETAILED_TIMINGS call timer%stop("query_available_real_kernels") #endif end subroutine query_available_real_kernels subroutine print_available_complex_kernels #ifdef HAVE_DETAILED_TIMINGS use timings #endif implicit none integer :: i #ifdef HAVE_DETAILED_TIMINGS call timer%start("print_available_complex_kernels") #endif do i=1, number_of_complex_kernels if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .eq. 1) then write(*,*) COMPLEX_ELPA_KERNEL_NAMES(i) endif enddo write(*,*) " " write(*,*) " At the moment the following kernel would be choosen:" write(*,*) get_actual_complex_kernel_name() #ifdef HAVE_DETAILED_TIMINGS call timer%stop("print_available_complex_kernels") #endif end subroutine print_available_complex_kernels subroutine query_available_complex_kernels #ifdef HAVE_DETAILED_TIMINGS use timings #endif implicit none integer :: i #ifdef HAVE_DETAILED_TIMINGS call timer%start("query_available_complex_kernels") #endif do i=1, number_of_complex_kernels if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .eq. 
1) then
          write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
        endif
      enddo
      write(error_unit,*) " "
      write(error_unit,*) " At the moment the following kernel would be choosen:"
      write(error_unit,*) get_actual_complex_kernel_name()

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("query_available_complex_kernels")
#endif

    end subroutine query_available_complex_kernels

    ! Returns the id of the real kernel that will actually be used:
    ! the one requested via the REAL_ELPA_KERNEL environment variable if it
    ! names a known kernel, otherwise DEFAULT_REAL_ELPA_KERNEL.
    function get_actual_real_kernel() result(actual_kernel)
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      integer :: actual_kernel

#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("get_actual_real_kernel")
#endif

      ! if kernel is not chosen via api
      ! check whether set by environment variable
      actual_kernel = real_kernel_via_environment_variable()

      if (actual_kernel .eq. 0) then
        ! if not then set default kernel
        actual_kernel = DEFAULT_REAL_ELPA_KERNEL
      endif

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("get_actual_real_kernel")
#endif

    end function get_actual_real_kernel

    ! Returns the name (entry of REAL_ELPA_KERNEL_NAMES) of the real kernel
    ! selected by get_actual_real_kernel().
    function get_actual_real_kernel_name() result(actual_kernel_name)
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      character(35) :: actual_kernel_name
      integer       :: actual_kernel

#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("get_actual_real_kernel_name")
#endif

      actual_kernel      = get_actual_real_kernel()
      actual_kernel_name = REAL_ELPA_KERNEL_NAMES(actual_kernel)

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("get_actual_real_kernel_name")
#endif

    end function get_actual_real_kernel_name

    ! Returns the id of the complex kernel that will actually be used:
    ! the one requested via the COMPLEX_ELPA_KERNEL environment variable if it
    ! names a known kernel, otherwise DEFAULT_COMPLEX_ELPA_KERNEL.
    function get_actual_complex_kernel() result(actual_kernel)
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      integer :: actual_kernel

#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("get_actual_complex_kernel")
#endif

      ! if kernel is not chosen via api
      ! check whether set by environment variable
      actual_kernel = complex_kernel_via_environment_variable()

      if (actual_kernel .eq. 0) then
        ! if not then set default kernel
        actual_kernel = DEFAULT_COMPLEX_ELPA_KERNEL
      endif

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("get_actual_complex_kernel")
#endif

    end function get_actual_complex_kernel

    ! Returns the name (entry of COMPLEX_ELPA_KERNEL_NAMES) of the complex
    ! kernel selected by get_actual_complex_kernel().
    function get_actual_complex_kernel_name() result(actual_kernel_name)
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      character(35) :: actual_kernel_name
      integer       :: actual_kernel

#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("get_actual_complex_kernel_name")
#endif

      actual_kernel      = get_actual_complex_kernel()
      actual_kernel_name = COMPLEX_ELPA_KERNEL_NAMES(actual_kernel)

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("get_actual_complex_kernel_name")
#endif

    end function get_actual_complex_kernel_name

    ! Checks whether a real kernel id may be used.
    !
    ! THIS_REAL_ELPA_KERNEL  kernel id to check (in)
    ! err                    .false. if AVAILABLE_REAL_ELPA_KERNELS marks the
    !                        kernel with 1, .true. otherwise. An id outside the
    !                        bounds of AVAILABLE_REAL_ELPA_KERNELS is reported
    !                        as not allowed instead of being used as an index.
    function check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL) result(err)
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      integer, intent(in) :: THIS_REAL_ELPA_KERNEL
      logical             :: err

#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("check_allowed_real_kernels")
#endif

      ! Treat everything as an error until the id is proven valid and available
      err = .true.
      if (THIS_REAL_ELPA_KERNEL .ge. lbound(AVAILABLE_REAL_ELPA_KERNELS,dim=1) .and. &
          THIS_REAL_ELPA_KERNEL .le. ubound(AVAILABLE_REAL_ELPA_KERNELS,dim=1)) then
        err = (AVAILABLE_REAL_ELPA_KERNELS(THIS_REAL_ELPA_KERNEL) .ne. 1)
      endif

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("check_allowed_real_kernels")
#endif

    end function check_allowed_real_kernels

    ! Checks whether a complex kernel id may be used.
    !
    ! THIS_COMPLEX_ELPA_KERNEL  kernel id to check (in)
    ! err                       .false. if AVAILABLE_COMPLEX_ELPA_KERNELS marks
    !                           the kernel with 1, .true. otherwise. An id
    !                           outside the bounds of the array is reported as
    !                           not allowed instead of being used as an index.
    function check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL) result(err)
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      integer, intent(in) :: THIS_COMPLEX_ELPA_KERNEL
      logical             :: err

#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("check_allowed_complex_kernels")
#endif

      ! Treat everything as an error until the id is proven valid and available
      err = .true.
      if (THIS_COMPLEX_ELPA_KERNEL .ge. lbound(AVAILABLE_COMPLEX_ELPA_KERNELS,dim=1) .and. &
          THIS_COMPLEX_ELPA_KERNEL .le. ubound(AVAILABLE_COMPLEX_ELPA_KERNELS,dim=1)) then
        err = (AVAILABLE_COMPLEX_ELPA_KERNELS(THIS_COMPLEX_ELPA_KERNEL) .ne. 1)
      endif

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("check_allowed_complex_kernels")
#endif

    end function check_allowed_complex_kernels

    ! Reads the environment variable ELPA_QR_DECOMPOSITION.
    !
    ! useQR  (out) .true. if the variable is "yes", .false. if it is "no";
    !              also .false. (a defined default) when isSet is .false.
    ! isSet        .true. only if the variable holds "yes" or "no".
    !
    ! Without HAVE_ENVIRONMENT_CHECKING the environment is not consulted and
    ! the function always returns isSet = .false.
    function qr_decomposition_via_environment_variable(useQR) result(isSet)
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      logical, intent(out) :: useQR
      logical              :: isSet
      CHARACTER(len=255)   :: ELPA_QR_DECOMPOSITION

#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("qr_decomposition_via_environment_variable")
#endif

      isSet = .false.
      ! give the intent(out) argument a defined value on every path
      useQR = .false.
      ! blank the buffer so that it is defined even if the environment
      ! is never queried
      ELPA_QR_DECOMPOSITION = " "

#if defined(HAVE_ENVIRONMENT_CHECKING)
      call get_environment_variable("ELPA_QR_DECOMPOSITION",ELPA_QR_DECOMPOSITION)
#endif

      if (trim(ELPA_QR_DECOMPOSITION) .eq. "yes") then
        useQR = .true.
        isSet = .true.
      else if (trim(ELPA_QR_DECOMPOSITION) .eq. "no") then
        useQR = .false.
        isSet = .true.
      endif

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("qr_decomposition_via_environment_variable")
#endif

    end function qr_decomposition_via_environment_variable

    ! Looks up the real kernel requested via the environment variable
    ! REAL_ELPA_KERNEL.
    !
    ! kernel  index into REAL_ELPA_KERNEL_NAMES of the first name equal to the
    !         value of the variable, or 0 if no name matches (including the
    !         case that the environment is not consulted because
    !         HAVE_ENVIRONMENT_CHECKING is not defined).
    function real_kernel_via_environment_variable() result(kernel)
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      integer            :: kernel
      CHARACTER(len=255) :: REAL_KERNEL_ENVIRONMENT
      integer            :: i

#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("real_kernel_via_environment_variable")
#endif

      ! defined defaults: "no kernel requested" and a blank buffer
      kernel = 0
      REAL_KERNEL_ENVIRONMENT = " "

#if defined(HAVE_ENVIRONMENT_CHECKING)
      call get_environment_variable("REAL_ELPA_KERNEL",REAL_KERNEL_ENVIRONMENT)
#endif

      do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
        if (trim(REAL_KERNEL_ENVIRONMENT) .eq. trim(REAL_ELPA_KERNEL_NAMES(i))) then
          kernel = i
          exit
        endif
      enddo

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("real_kernel_via_environment_variable")
#endif

    end function real_kernel_via_environment_variable

    ! Looks up the complex kernel requested via the environment variable
    ! COMPLEX_ELPA_KERNEL.
    !
    ! kernel  index into COMPLEX_ELPA_KERNEL_NAMES of the first name equal to
    !         the value of the variable, or 0 if no name matches (including the
    !         case that the environment is not consulted because
    !         HAVE_ENVIRONMENT_CHECKING is not defined).
    function complex_kernel_via_environment_variable() result(kernel)
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#endif
      implicit none

      integer            :: kernel
      CHARACTER(len=255) :: COMPLEX_KERNEL_ENVIRONMENT
      integer            :: i

#ifdef HAVE_DETAILED_TIMINGS
      call timer%start("complex_kernel_via_environment_variable")
#endif

      ! defined defaults: "no kernel requested" and a blank buffer
      kernel = 0
      COMPLEX_KERNEL_ENVIRONMENT = " "

#if defined(HAVE_ENVIRONMENT_CHECKING)
      call get_environment_variable("COMPLEX_ELPA_KERNEL",COMPLEX_KERNEL_ENVIRONMENT)
#endif

      do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
        if (trim(COMPLEX_ELPA_KERNEL_NAMES(i)) .eq. trim(COMPLEX_KERNEL_ENVIRONMENT)) then
          kernel = i
          exit
        endif
      enddo

#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("complex_kernel_via_environment_variable")
#endif

    end function complex_kernel_via_environment_variable
!-------------------------------------------------------------------------------

end module ELPA2_utilities
elpa-2016.05.001/src/elpa1.F900000644000312500001440000004542512717516040012110 00000000000000! This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!
ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see <http://www.gnu.org/licenses/> ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines ! ! Copyright of the original code rests with the authors inside the ELPA ! consortium. The copyright of any additional modifications shall rest ! with their original authors, but shall adhere to the licensing terms ! distributed along with the original code in the file "COPYING". !> \mainpage !> Eigenvalue SoLvers for Petaflop-Applications (ELPA) !> \par !> http://elpa.mpcdf.mpg.de !> !> \par !> The ELPA library was originally created by the ELPA consortium, !> consisting of the following organizations: !> !> - Max Planck Computing and Data Facility (MPCDF) formerly known as !> Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), !> - Bergische Universität Wuppertal, Lehrstuhl für angewandte !> Informatik, !> - Technische Universität München, Lehrstuhl für Informatik mit !> Schwerpunkt Wissenschaftliches Rechnen, !> - Fritz-Haber-Institut, Berlin, Abt. Theorie, !> - Max-Planck-Institut für Mathematik in den Naturwissenschaften, !> Leipzig, Abt.
!>      Komplexe Strukturen in Biologie und Kognition,
!>      and
!> - IBM Deutschland GmbH
!>
!> Some parts and enhancements of ELPA have been contributed and authored
!> by the Intel Corporation which is not part of the ELPA consortium.
!>
!> Contributions to the ELPA source have been authored by (in alphabetical order):
!>
!> \author T. Auckenthaler, Volker Blum, A. Heinecke, L. Huedepohl, R. Johanni, Werner Jürgens, and A. Marek

#include "config-f90.h"

!> \brief Fortran module which provides the routines to use the one-stage ELPA solver
module ELPA1
  use precision
  use elpa_utilities
  use elpa1_compute

#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
  use elpa_mpi
  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

  public :: get_elpa_row_col_comms     !< old, deprecated interface: Sets MPI row/col communicators
  public :: get_elpa_communicators     !< Sets MPI row/col communicators

  public :: solve_evp_real             !< old, deprecated interface: Driver routine for real eigenvalue problem
  public :: solve_evp_real_1stage      !< Driver routine for real eigenvalue problem
  public :: solve_evp_complex          !< old, deprecated interface: Driver routine for complex eigenvalue problem
  public :: solve_evp_complex_1stage   !< Driver routine for complex eigenvalue problem

  ! Timing results, set by every call to solve_evp_xxx

  real(kind=rk), public :: time_evp_fwd    !< time for forward transformations (to tridiagonal form)
  real(kind=rk), public :: time_evp_solve  !< time for solving the tridiagonal system
  real(kind=rk), public :: time_evp_back   !< time for back transformations of eigenvectors

  logical, public :: elpa_print_times = .false. !< Set elpa_print_times to .true. for explicit timing outputs

!> \brief get_elpa_row_col_comms:  old, deprecated Fortran function to create the MPI communicators for ELPA.
!>        Better use "get_elpa_communicators"
!> \details
!> This generic name resolves to "get_elpa_communicators"; the interface and
!> variable definition is the same as in "get_elpa_communicators"
!> \param  mpi_comm_global   Global communicator for the calculations (in)
!>
!> \param  my_prow           Row coordinate of the calling process in the process grid (in)
!>
!> \param  my_pcol           Column coordinate of the calling process in the process grid (in)
!>
!> \param  mpi_comm_rows     Communicator for communicating within rows of processes (out)
!>
!> \param  mpi_comm_cols     Communicator for communicating within columns of processes (out)
!> \result mpierr            integer error value of mpi_comm_split function
  interface get_elpa_row_col_comms
    module procedure get_elpa_communicators
  end interface

!> \brief solve_evp_real: old, deprecated Fortran function to solve the real eigenvalue problem with 1-stage solver.
!>        Better use "solve_evp_real_1stage"
!>
!> \details
!> This generic name resolves to "solve_evp_real_1stage"; the interface and
!> variable definition is the same as in "solve_evp_real_1stage"
!  Parameters
!
!> \param  na                   Order of matrix a
!>
!> \param  nev                  Number of eigenvalues needed.
!>                              The smallest nev eigenvalues/eigenvectors are calculated.
!>
!> \param  a(lda,matrixCols)    Distributed matrix for which eigenvalues are to be computed.
!>                              Distribution is like in Scalapack.
!>                              The full matrix must be set (not only one half like in scalapack).
!>                              Destroyed on exit (upper and lower half).
!>
!> \param  lda                  Leading dimension of a
!>
!> \param  ev(na)               On output: eigenvalues of a, every processor gets the complete set
!>
!> \param  q(ldq,matrixCols)    On output: Eigenvectors of a
!>                              Distribution is like in Scalapack.
!>                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                              even if only a part of the eigenvalues is needed.
!>
!> \param  ldq                  Leading dimension of q
!>
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param  matrixCols           distributed number of matrix columns
!>
!> \param  mpi_comm_rows        MPI-Communicator for rows
!> \param  mpi_comm_cols        MPI-Communicator for columns
!>
!> \result                      success
  interface solve_evp_real
    module procedure solve_evp_real_1stage
  end interface

!> \brief solve_evp_complex: old, deprecated Fortran function to solve the complex eigenvalue problem with 1-stage solver.
!>        Better use "solve_evp_complex_1stage"
!>
!> \details
!> This generic name resolves to "solve_evp_complex_1stage"; the interface and
!> variable definition is the same as in "solve_evp_complex_1stage"
!  Parameters
!
!> \param  na                   Order of matrix a
!>
!> \param  nev                  Number of eigenvalues needed.
!>                              The smallest nev eigenvalues/eigenvectors are calculated.
!>
!> \param  a(lda,matrixCols)    Distributed matrix for which eigenvalues are to be computed.
!>                              Distribution is like in Scalapack.
!>                              The full matrix must be set (not only one half like in scalapack).
!>                              Destroyed on exit (upper and lower half).
!>
!> \param  lda                  Leading dimension of a
!>
!> \param  ev(na)               On output: eigenvalues of a, every processor gets the complete set
!>
!> \param  q(ldq,matrixCols)    On output: Eigenvectors of a
!>                              Distribution is like in Scalapack.
!>                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                              even if only a part of the eigenvalues is needed.
!>
!> \param  ldq                  Leading dimension of q
!>
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param  matrixCols           distributed number of matrix columns
!>
!> \param  mpi_comm_rows        MPI-Communicator for rows
!> \param  mpi_comm_cols        MPI-Communicator for columns
!>
!> \result                      success
  interface solve_evp_complex
    module procedure solve_evp_complex_1stage
  end interface

contains

!-------------------------------------------------------------------------------

!> \brief Fortran function to create the MPI communicators for ELPA.
! All ELPA routines need MPI communicators for communicating within
! rows or columns of processes, these are set here.
!
! mpi_comm_rows/mpi_comm_cols can be freed with MPI_Comm_free if not used any more.
!
!  Parameters
!
!> \param  mpi_comm_global   Global communicator for the calculations (in)
!>
!> \param  my_prow           Row coordinate of the calling process in the process grid (in)
!>
!> \param  my_pcol           Column coordinate of the calling process in the process grid (in)
!>
!> \param  mpi_comm_rows     Communicator for communicating within rows of processes (out)
!>
!> \param  mpi_comm_cols     Communicator for communicating within columns of processes (out)
!> \result mpierr            integer error value of mpi_comm_split function: the error of the
!>                           first failing split, or the value returned by the second split
!>                           if the first one succeeded
  function get_elpa_communicators(mpi_comm_global, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols) result(mpierr)
    use precision
    implicit none

    integer(kind=ik), intent(in)  :: mpi_comm_global, my_prow, my_pcol
    integer(kind=ik), intent(out) :: mpi_comm_rows, mpi_comm_cols

    integer(kind=ik)              :: mpierr
    integer(kind=ik)              :: mpierr_cols   ! error code of the second split

    ! mpi_comm_rows is used for communicating WITHIN rows, i.e. all processes
    ! having the same column coordinate share one mpi_comm_rows.
    ! So the "color" for splitting is my_pcol and the "key" is my row coordinate.
    ! Analogous for mpi_comm_cols

    call mpi_comm_split(mpi_comm_global,my_pcol,my_prow,mpi_comm_rows,mpierr)
    call mpi_comm_split(mpi_comm_global,my_prow,my_pcol,mpi_comm_cols,mpierr_cols)

    ! Do not let a successful second split hide a failure of the first one
    ! (0 is MPI_SUCCESS).
    if (mpierr .eq. 0) mpierr = mpierr_cols

  end function get_elpa_communicators

!> \brief solve_evp_real_1stage: Fortran function to solve the real eigenvalue problem with 1-stage solver
!>
!  Parameters
!
!> \param  na                   Order of matrix a
!>
!> \param  nev                  Number of eigenvalues needed.
!>                              The smallest nev eigenvalues/eigenvectors are calculated.
!>
!> \param  a(lda,matrixCols)    Distributed matrix for which eigenvalues are to be computed.
!>                              Distribution is like in Scalapack.
!>                              The full matrix must be set (not only one half like in scalapack).
!>                              Destroyed on exit (upper and lower half).
!>
!> \param  lda                  Leading dimension of a
!>
!> \param  ev(na)               On output: eigenvalues of a, every processor gets the complete set
!>
!> \param  q(ldq,matrixCols)    On output: Eigenvectors of a
!>                              Distribution is like in Scalapack.
!>                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                              even if only a part of the eigenvalues is needed.
!>
!> \param  ldq                  Leading dimension of q
!>
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param  matrixCols           distributed number of matrix columns
!>
!> \param  mpi_comm_rows        MPI-Communicator for rows
!> \param  mpi_comm_cols        MPI-Communicator for columns
!>
!> \result                      success  (.false. if solve_tridi reported a failure; in that
!>                              case q is not back-transformed and time_evp_solve /
!>                              time_evp_back are not updated)
  function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) result(success)
    use precision
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    implicit none

    integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
    real(kind=rk)                :: a(lda,matrixCols), ev(na), q(ldq,matrixCols)
    ! was
    ! real a(lda,*), q(ldq,*)

    integer(kind=ik)             :: my_prow, my_pcol, mpierr
    real(kind=rk), allocatable   :: e(:), tau(:)
    real(kind=rk)                :: ttt0, ttt1
    logical                      :: success
    logical, save                :: firstCall = .true.
    logical                      :: wantDebug

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("solve_evp_real_1stage")
#endif

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)

    success = .true.

    ! NOTE(review): the environment is only consulted on the very first call,
    ! so debug messages are enabled for that call only - confirm this is intended.
    wantDebug = .false.
    if (firstCall) then
      ! are debug messages desired?
      wantDebug = debug_messages_via_environment_variable()
      firstCall = .false.
    endif

    allocate(e(na), tau(na))

    ! Step 1: reduction to tridiagonal form
    ttt0 = MPI_Wtime()
    call tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)

    ttt1 = MPI_Wtime()
    if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0
    time_evp_fwd = ttt1-ttt0

    ! Step 2: solve the tridiagonal eigenproblem
    ttt0 = MPI_Wtime()
    call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
                     mpi_comm_cols, wantDebug, success)
    if (.not.(success)) then
      ! Leave cleanly: release the work arrays and close the timer region
      ! that was opened at the top, so start/stop calls stay balanced.
      deallocate(e, tau)
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("solve_evp_real_1stage")
#endif
      return
    endif

    ttt1 = MPI_Wtime()
    if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0
    time_evp_solve = ttt1-ttt0

    ! Step 3: back transformation of the eigenvectors
    ttt0 = MPI_Wtime()
    call trans_ev_real(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
    ttt1 = MPI_Wtime()
    if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_real:',ttt1-ttt0
    time_evp_back = ttt1-ttt0

    deallocate(e, tau)

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("solve_evp_real_1stage")
#endif

  end function solve_evp_real_1stage

!> \brief solve_evp_complex_1stage: Fortran function to solve the complex eigenvalue problem with 1-stage solver
!>
!  Parameters
!
!> \param  na                   Order of matrix a
!>
!> \param  nev                  Number of eigenvalues needed.
!>                              The smallest nev eigenvalues/eigenvectors are calculated.
!>
!> \param  a(lda,matrixCols)    Distributed matrix for which eigenvalues are to be computed.
!>                              Distribution is like in Scalapack.
!>                              The full matrix must be set (not only one half like in scalapack).
!>                              Destroyed on exit (upper and lower half).
!>
!> \param  lda                  Leading dimension of a
!>
!> \param  ev(na)               On output: eigenvalues of a, every processor gets the complete set
!>
!> \param  q(ldq,matrixCols)    On output: Eigenvectors of a
!>                              Distribution is like in Scalapack.
!>                              Must be always dimensioned to the full size (corresponding to (na,na))
!>                              even if only a part of the eigenvalues is needed.
!>
!> \param  ldq                  Leading dimension of q
!>
!> \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param  matrixCols           distributed number of matrix columns
!>
!> \param  mpi_comm_rows        MPI-Communicator for rows
!> \param  mpi_comm_cols        MPI-Communicator for columns
!>
!> \result                      success  (.false. if solve_tridi reported a failure; in that
!>                              case q is not filled and time_evp_solve / time_evp_back
!>                              are not updated)
  function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) result(success)
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    use precision
    implicit none

    integer(kind=ik), intent(in)  :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
    complex(kind=ck)              :: a(lda,matrixCols), q(ldq,matrixCols)
    ! was
    ! complex a(lda,*), q(ldq,*)
    real(kind=rk)                 :: ev(na)

    integer(kind=ik)              :: my_prow, my_pcol, np_rows, np_cols, mpierr
    integer(kind=ik)              :: l_rows, l_cols, l_cols_nev
    real(kind=rk), allocatable    :: q_real(:,:), e(:)
    complex(kind=ck), allocatable :: tau(:)
    real(kind=rk)                 :: ttt0, ttt1

    logical                       :: success
    logical, save                 :: firstCall = .true.
    logical                       :: wantDebug

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("solve_evp_complex_1stage")
#endif

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

    success = .true.

    ! NOTE(review): the environment is only consulted on the very first call,
    ! so debug messages are enabled for that call only - confirm this is intended.
    wantDebug = .false.
    if (firstCall) then
      ! are debug messages desired?
      wantDebug = debug_messages_via_environment_variable()
      firstCall = .false.
    endif

    l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
    l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q

    l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

    allocate(e(na), tau(na))
    ! real work array receiving the eigenvectors of the (real) tridiagonal matrix
    allocate(q_real(l_rows,l_cols))

    ! Step 1: reduction to tridiagonal form
    ttt0 = MPI_Wtime()
    call tridiag_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
    ttt1 = MPI_Wtime()
    if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_complex :',ttt1-ttt0
    time_evp_fwd = ttt1-ttt0

    ! Step 2: solve the tridiagonal eigenproblem
    ttt0 = MPI_Wtime()
    call solve_tridi(na, nev, ev, e, q_real, l_rows, nblk, matrixCols, mpi_comm_rows, &
                     mpi_comm_cols, wantDebug, success)
    if (.not.(success)) then
      ! Leave cleanly: release the work arrays and close the timer region
      ! that was opened at the top, so start/stop calls stay balanced.
      deallocate(q_real)
      deallocate(e, tau)
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop("solve_evp_complex_1stage")
#endif
      return
    endif

    ttt1 = MPI_Wtime()
    if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0
    time_evp_solve = ttt1-ttt0

    ! Step 3: copy the real eigenvectors into q and transform them back
    ttt0 = MPI_Wtime()
    q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

    call trans_ev_complex(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
    ttt1 = MPI_Wtime()
    if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_complex:',ttt1-ttt0
    time_evp_back = ttt1-ttt0

    deallocate(q_real)
    deallocate(e, tau)

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("solve_evp_complex_1stage")
#endif

  end function solve_evp_complex_1stage

end module ELPA1
elpa-2016.05.001/src/elpa1_compute.F900000644000312500001440000046141112717516040013641 00000000000000! This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!
it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines ! ! Copyright of the original code rests with the authors inside the ELPA ! consortium. The copyright of any additional modifications shall rest ! with their original authors, but shall adhere to the licensing terms ! distributed along with the original code in the file "COPYING". #include "config-f90.h" module ELPA1_compute use elpa_utilities #ifdef HAVE_DETAILED_TIMINGS use timings #endif use elpa_mpi implicit none PRIVATE ! set default to private public :: tridiag_real ! Transform real symmetric matrix to tridiagonal form public :: trans_ev_real ! Transform eigenvectors of a tridiagonal matrix back public :: mult_at_b_real ! Multiply real matrices A**T * B public :: tridiag_complex ! Transform complex hermitian matrix to tridiagonal form public :: trans_ev_complex ! Transform eigenvectors of a tridiagonal matrix back public :: mult_ah_b_complex ! Multiply complex matrices A**H * B public :: solve_tridi ! Solve tridiagonal eigensystem with divide and conquer method public :: cholesky_real ! 
Cholesky factorization of a real matrix public :: invert_trm_real ! Invert real triangular matrix public :: cholesky_complex ! Cholesky factorization of a complex matrix public :: invert_trm_complex ! Invert complex triangular matrix public :: local_index ! Get local index of a block cyclic distributed matrix public :: least_common_multiple ! Get least common multiple public :: hh_transform_real public :: hh_transform_complex public :: elpa_reduce_add_vectors_complex, elpa_reduce_add_vectors_real public :: elpa_transpose_vectors_complex, elpa_transpose_vectors_real contains #define DATATYPE REAL(kind=rk) #define BYTESIZE 8 #define REALCASE 1 #include "elpa_transpose_vectors.X90" #include "elpa_reduce_add_vectors.X90" #undef DATATYPE #undef BYTESIZE #undef REALCASE subroutine tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau) !------------------------------------------------------------------------------- ! tridiag_real: Reduces a distributed symmetric matrix to tridiagonal form ! (like Scalapack Routine PDSYTRD) ! ! Parameters ! ! na Order of matrix ! ! a(lda,matrixCols) Distributed matrix which should be reduced. ! Distribution is like in Scalapack. ! Opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half) ! a(:,:) is overwritten on exit with the Householder vectors ! ! lda Leading dimension of a ! matrixCols local columns of matrix ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ! mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns ! ! d(na) Diagonal elements (returned), identical on all processors ! ! e(na) Off-Diagonal elements (returned), identical on all processors ! ! tau(na) Factors for the Householder vectors (returned), needed for back transformation ! 
!------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols real(kind=rk) :: d(na), e(na), tau(na) #ifdef DESPERATELY_WANT_ASSUMED_SIZE real(kind=rk) :: a(lda,*) #else real(kind=rk) :: a(lda,matrixCols) #endif integer(kind=ik), parameter :: max_stored_rows = 32 integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols integer(kind=ik) :: l_cols, l_rows, nstor integer(kind=ik) :: istep, i, j, lcs, lce, lrs, lre integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile #ifdef WITH_OPENMP integer(kind=ik) :: my_thread, n_threads, max_threads, n_iter integer(kind=ik) :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads #endif real(kind=rk) :: vav, vnorm2, x, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf real(kind=rk), allocatable :: tmp(:), vr(:), vc(:), ur(:), uc(:), vur(:,:), uvc(:,:) #ifdef WITH_OPENMP real(kind=rk), allocatable :: ur_p(:,:), uc_p(:,:) #endif integer(kind=ik) :: istat character(200) :: errorMessage #ifdef HAVE_DETAILED_TIMINGS call timer%start("tridiag_real") #endif call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) ! Matrix is split into tiles; work is done only for tiles on the diagonal or above tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide l_rows_tile = tile_size/np_rows ! local rows of a tile l_cols_tile = tile_size/np_cols ! 
local cols of a tile totalblocks = (na-1)/nblk + 1 max_blocks_row = (totalblocks-1)/np_rows + 1 max_blocks_col = (totalblocks-1)/np_cols + 1 max_local_rows = max_blocks_row*nblk max_local_cols = max_blocks_col*nblk allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_real: error when allocating tmp "//errorMessage stop endif allocate(vr(max_local_rows+1), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_real: error when allocating vr "//errorMessage stop endif allocate(ur(max_local_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_real: error when allocating ur "//errorMessage stop endif allocate(vc(max_local_cols), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_real: error when allocating vc "//errorMessage stop endif allocate(uc(max_local_cols), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_real: error when allocating uc "//errorMessage stop endif #ifdef WITH_OPENMP max_threads = omp_get_max_threads() allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_real: error when allocating ur_p "//errorMessage stop endif allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_real: error when allocating uc_p "//errorMessage stop endif #endif tmp = 0 vr = 0 ur = 0 vc = 0 uc = 0 allocate(vur(max_local_rows,2*max_stored_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_real: error when allocating vur "//errorMessage stop endif allocate(uvc(max_local_cols,2*max_stored_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_real: error when allocating uvc "//errorMessage stop endif d(:) = 0 e(:) = 0 tau(:) = 0 nstor = 0 l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! 
Local rows of a l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a if(my_prow==prow(na, nblk, np_rows) .and. my_pcol==pcol(na, nblk, np_cols)) d(na) = a(l_rows,l_cols) do istep=na,3,-1 ! Calculate number of local rows and columns of the still remaining matrix ! on the local processor l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1) l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1) ! Calculate vector for Householder transformation on all procs ! owning column istep if(my_pcol==pcol(istep, nblk, np_cols)) then ! Get vector to be transformed; distribute last element and norm of ! remaining elements to all procs in current column vr(1:l_rows) = a(1:l_rows,l_cols+1) if(nstor>0 .and. l_rows>0) then call DGEMV('N',l_rows,2*nstor,1.d0,vur,ubound(vur,dim=1), & uvc(l_cols+1,1),ubound(uvc,dim=1),1.d0,vr,1) endif if(my_prow==prow(istep-1, nblk, np_rows)) then aux1(1) = dot_product(vr(1:l_rows-1),vr(1:l_rows-1)) aux1(2) = vr(l_rows) else aux1(1) = dot_product(vr(1:l_rows),vr(1:l_rows)) aux1(2) = 0. endif #ifdef WITH_MPI call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) #else aux2 = aux1 #endif vnorm2 = aux2(1) vrl = aux2(2) ! Householder transformation call hh_transform_real(vrl, vnorm2, xf, tau(istep)) ! Scale vr and store Householder vector for back transformation vr(1:l_rows) = vr(1:l_rows) * xf if(my_prow==prow(istep-1, nblk, np_rows)) then vr(l_rows) = 1. e(istep-1) = vrl endif a(1:l_rows,l_cols+1) = vr(1:l_rows) ! store Householder vector for back transformation endif ! Broadcast the Householder vector (and tau) along columns if(my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep) #ifdef WITH_MPI call MPI_Bcast(vr,l_rows+1,MPI_REAL8,pcol(istep, nblk, np_cols),mpi_comm_cols,mpierr) #endif tau(istep) = vr(l_rows+1) ! Transpose Householder vector vr -> vc call elpa_transpose_vectors_real (vr, ubound(vr,dim=1), mpi_comm_rows, & vc, ubound(vc,dim=1), mpi_comm_cols, & 1, istep-1, 1, nblk) ! 
Calculate u = (A + VU**T + UV**T)*v ! For cache efficiency, we use only the upper half of the matrix tiles for this, ! thus the result is partly in uc(:) and partly in ur(:) uc(1:l_cols) = 0 ur(1:l_rows) = 0 if (l_rows>0 .and. l_cols>0) then #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,lcs,lce,j,lrs,lre) my_thread = omp_get_thread_num() n_threads = omp_get_num_threads() n_iter = 0 uc_p(1:l_cols,my_thread) = 0. ur_p(1:l_rows,my_thread) = 0. #endif do i=0,(istep-2)/tile_size lcs = i*l_cols_tile+1 lce = min(l_cols,(i+1)*l_cols_tile) if (lce0) then call DGEMV('T',l_rows,2*nstor,1.d0,vur,ubound(vur,dim=1),vr,1,0.d0,aux,1) call DGEMV('N',l_cols,2*nstor,1.d0,uvc,ubound(uvc,dim=1),aux,1,1.d0,uc,1) endif endif ! Sum up all ur(:) parts along rows and add them to the uc(:) parts ! on the processors containing the diagonal ! This is only necessary if ur has been calculated, i.e. if the ! global tile size is smaller than the global remaining matrix if (tile_size < istep-1) then call elpa_reduce_add_vectors_REAL (ur, ubound(ur,dim=1), mpi_comm_rows, & uc, ubound(uc,dim=1), mpi_comm_cols, & istep-1, 1, nblk) endif ! Sum up all the uc(:) parts, transpose uc -> ur if (l_cols>0) then tmp(1:l_cols) = uc(1:l_cols) #ifdef WITH_MPI call mpi_allreduce(tmp,uc,l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) #else uc = tmp #endif endif call elpa_transpose_vectors_real (uc, ubound(uc,dim=1), mpi_comm_cols, & ur, ubound(ur,dim=1), mpi_comm_rows, & 1, istep-1, 1, nblk) ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v ) x = 0 if (l_cols>0) x = dot_product(vc(1:l_cols),uc(1:l_cols)) #ifdef WITH_MPI call mpi_allreduce(x,vav,1,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) #else vav = x #endif ! store u and v in the matrices U and V ! 
these matrices are stored combined in one here do j=1,l_rows vur(j,2*nstor+1) = tau(istep)*vr(j) vur(j,2*nstor+2) = 0.5*tau(istep)*vav*vr(j) - ur(j) enddo do j=1,l_cols uvc(j,2*nstor+1) = 0.5*tau(istep)*vav*vc(j) - uc(j) uvc(j,2*nstor+2) = tau(istep)*vc(j) enddo nstor = nstor+1 ! If the limit of max_stored_rows is reached, calculate A + VU**T + UV**T if (nstor==max_stored_rows .or. istep==3) then do i=0,(istep-2)/tile_size lcs = i*l_cols_tile+1 lce = min(l_cols,(i+1)*l_cols_tile) lrs = 1 lre = min(l_rows,(i+1)*l_rows_tile) if (lce0) a(l_rows,l_cols) = a(l_rows,l_cols) & + dot_product(vur(l_rows,1:2*nstor),uvc(l_cols,1:2*nstor)) d(istep-1) = a(l_rows,l_cols) endif enddo ! Store e(1) and d(1) if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) e(1) = a(1,l_cols) ! use last l_cols value of loop above if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) d(1) = a(1,1) deallocate(tmp, vr, ur, vc, uc, vur, uvc, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_real: error when deallocating uvc "//errorMessage stop endif ! distribute the arrays d and e to all processors allocate(tmp(na), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_real: error when allocating tmp "//errorMessage stop endif #ifdef WITH_MPI tmp = d call mpi_allreduce(tmp,d,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) tmp = d call mpi_allreduce(tmp,d,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) tmp = e call mpi_allreduce(tmp,e,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) tmp = e call mpi_allreduce(tmp,e,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) #endif deallocate(tmp, stat=istat, errmsg=errorMessage) if (istat .ne. 
0) then print *,"tridiag_real: error when deallocating tmp "//errorMessage stop endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("tridiag_real") #endif end subroutine tridiag_real subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) !------------------------------------------------------------------------------- ! trans_ev_real: Transforms the eigenvectors of a tridiagonal matrix back ! to the eigenvectors of the original matrix ! (like Scalapack Routine PDORMTR) ! ! Parameters ! ! na Order of matrix a, number of rows of matrix q ! ! nqc Number of columns of matrix q ! ! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_real) ! Distribution is like in Scalapack. ! ! lda Leading dimension of a ! matrixCols local columns of matrix a and q ! ! tau(na) Factors of the Householder vectors ! ! q On input: Eigenvectors of tridiagonal matrix ! On output: Transformed eigenvectors ! Distribution is like in Scalapack. ! ! ldq Leading dimension of q ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ! mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns ! 
!------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols real(kind=rk) :: tau(na) #ifdef DESPERATELY_WANT_ASSUMED_SIZE real(kind=rk) :: a(lda,*), q(ldq,*) #else real(kind=rk) :: a(lda,matrixCols), q(ldq,matrixCols) #endif integer(kind=ik) :: max_stored_rows integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols integer(kind=ik) :: l_cols, l_rows, l_colh, nstor integer(kind=ik) :: istep, i, n, nc, ic, ics, ice, nb, cur_pcol real(kind=rk), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:) real(kind=rk), allocatable :: tmat(:,:), h1(:), h2(:) integer(kind=ik) :: istat character(200) :: errorMessage #ifdef HAVE_DETAILED_TIMINGS call timer%start("trans_ev_real") #endif call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) totalblocks = (na-1)/nblk + 1 max_blocks_row = (totalblocks-1)/np_rows + 1 max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q! max_local_rows = max_blocks_row*nblk max_local_cols = max_blocks_col*nblk max_stored_rows = (63/nblk+1)*nblk allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_real: error when allocating tmat "//errorMessage stop endif allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_real: error when allocating h1 "//errorMessage stop endif allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 
0) then print *,"trans_ev_real: error when allocating h2 "//errorMessage stop endif allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_real: error when allocating tmp1 "//errorMessage stop endif allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_real: error when allocating tmp2 "//errorMessage stop endif allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_real: error when allocating hvn "//errorMessage stop endif allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_real: error when allocating hvm "//errorMessage stop endif hvm = 0 ! Must be set to 0 !!! hvb = 0 ! Safety only l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q nstor = 0 do istep=1,na,nblk ics = MAX(istep,3) ice = MIN(istep+nblk-1,na) if (ice0) & call MPI_Bcast(hvb,nb,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr) #endif nb = 0 do ic=ics,ice l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows) nstor = nstor+1 nb = nb+l_rows enddo ! Please note: for smaller matix sizes (na/np_rows<=256), a value of 32 for nstor is enough! if (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32)) then ! Calculate scalar products of stored vectors. ! This can be done in different ways, we use dsyrk tmat = 0 if (l_rows>0) & call dsyrk('U','T',nstor,l_rows,1.d0,hvm,ubound(hvm,dim=1),0.d0,tmat,max_stored_rows) nc = 0 do n=1,nstor-1 h1(nc+1:nc+n) = tmat(1:n,n+1) nc = nc+n enddo #ifdef WITH_MPI if (nc>0) call mpi_allreduce(h1,h2,nc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) #else if (nc>0) h2 = h1 #endif ! 
Calculate triangular matrix T nc = 0 tmat(1,1) = tau(ice-nstor+1) do n=1,nstor-1 call dtrmv('L','T','N',n,tmat,max_stored_rows,h2(nc+1),1) tmat(n+1,1:n) = -h2(nc+1:nc+n)*tau(ice-nstor+n+1) tmat(n+1,n+1) = tau(ice-nstor+n+1) nc = nc+n enddo ! Q = Q - V * T * V**T * Q if (l_rows>0) then call dgemm('T','N',nstor,l_cols,l_rows,1.d0,hvm,ubound(hvm,dim=1), & q,ldq,0.d0,tmp1,nstor) else tmp1(1:l_cols*nstor) = 0 endif #ifdef WITH_MPI call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) #else tmp2 = tmp1 #endif if (l_rows>0) then call dtrmm('L','L','N','N',nstor,l_cols,1.0d0,tmat,max_stored_rows,tmp2,nstor) call dgemm('N','N',l_rows,l_cols,nstor,-1.d0,hvm,ubound(hvm,dim=1), & tmp2,nstor,1.d0,q,ldq) endif nstor = 0 endif enddo deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_real: error when deallocating hvm "//errorMessage stop endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("trans_ev_real") #endif end subroutine trans_ev_real

subroutine mult_at_b_real(uplo_a, uplo_c, na, ncb, a, lda, b, ldb, nblk, mpi_comm_rows, mpi_comm_cols, c, ldc)

  !-------------------------------------------------------------------------------
  !  mult_at_b_real:  Performs C := A**T * B
  !
  !      where:  A is a square matrix (na,na) which is optionally upper or lower triangular
  !              B is a (na,ncb) matrix
  !              C is a (na,ncb) matrix where optionally only the upper or lower
  !                triangle may be computed
  !
  !  Parameters
  !
  !  uplo_a      'U' if A is upper triangular
  !              'L' if A is lower triangular
  !              anything else if A is a full matrix
  !              Please note: This pertains to the original A (as set in the calling program)
  !                           whereas the transpose of A is used for calculations
  !              If uplo_a is 'U' or 'L', the other triangle is not used at all,
  !              i.e. it may contain arbitrary numbers
  !
  !  uplo_c      'U' if only the upper diagonal part of C is needed
  !              'L' if only the lower diagonal part of C is needed
  !              anything else if the full matrix C is needed
  !              Please note: Even when uplo_c is 'U' or 'L', the other triangle may be
  !                           written to a certain extent, i.e. one shouldn't rely on the content there!
  !
  !  na          Number of rows/columns of A, number of rows of B and C
  !
  !  ncb         Number of columns of B and C
  !
  !  a           Matrix A
  !
  !  lda         Leading dimension of a
  !
  !  b           Matrix B
  !
  !  ldb         Leading dimension of b
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !
  !  c           Matrix C
  !
  !  ldc         Leading dimension of c
  !
  !  Outline of the algorithm (as implemented below):
  !    For every processor row np in turn, the block columns of A that correspond
  !    to the local rows of C on row np are broadcast along mpi_comm_cols, collected
  !    in aux_mat (at most nblk_mult columns at a time), multiplied with the local
  !    part of B, and the partial products are summed over mpi_comm_rows onto
  !    processor row np, which stores them in C.
  !
  !  On any allocate/deallocate failure the routine prints a message and stops.
  !
  !-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif
  use precision
  implicit none

  character*1                   :: uplo_a, uplo_c

  integer(kind=ik)              :: na, ncb, lda, ldb, nblk, mpi_comm_rows, mpi_comm_cols, ldc
  real(kind=rk)                 :: a(lda,*), b(ldb,*), c(ldc,*) ! remove assumed size!

  integer(kind=ik)              :: my_prow, my_pcol, np_rows, np_cols, mpierr
  integer(kind=ik)              :: l_cols, l_rows, l_rows_np
  integer(kind=ik)              :: np, n, nb, nblk_mult, lrs, lre, lcs, lce
  integer(kind=ik)              :: gcol_min, gcol, goff
  integer(kind=ik)              :: nstor, nr_done, noff, np_bc, n_aux_bc, nvals
  integer(kind=ik), allocatable :: lrs_save(:), lre_save(:)

  logical                       :: a_lower, a_upper, c_lower, c_upper

  real(kind=rk), allocatable    :: aux_mat(:,:), aux_bc(:), tmp1(:,:), tmp2(:,:)
  integer(kind=ik)              :: istat
  character(200)                :: errorMessage

#ifdef HAVE_DETAILED_TIMINGS
  call timer%start("mult_at_b_real")
#endif

  ! Position of this process in the process grid and the grid dimensions
  call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
  call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
  call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
  call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

  l_rows = local_index(na,  my_prow, np_rows, nblk, -1) ! Local rows of a and b
  l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b

  ! Block factor for matrix multiplications, must be a multiple of nblk
  ! (smaller factor for small local matrices)

  if (na/np_rows<=256) then
    nblk_mult = (31/nblk+1)*nblk
  else
    nblk_mult = (63/nblk+1)*nblk
  endif

  ! aux_mat : columns of A collected for one multiplication
  allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"mult_at_b_real: error when allocating aux_mat "//errorMessage
    stop
  endif

  ! aux_bc : send/receive buffer for one block column of A
  allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"mult_at_b_real: error when allocating aux_bc "//errorMessage
    stop
  endif

  ! lrs_save/lre_save : row ranges used when packing aux_bc, needed again for unpacking
  allocate(lrs_save(nblk), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"mult_at_b_real: error when allocating lrs_save "//errorMessage
    stop
  endif

  allocate(lre_save(nblk), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"mult_at_b_real: error when allocating lre_save "//errorMessage
    stop
  endif

  ! Decode the (case-insensitive) triangle selectors
  a_lower = .false.
  a_upper = .false.
  c_lower = .false.
  c_upper = .false.

  if (uplo_a=='u' .or. uplo_a=='U') a_upper = .true.
  if (uplo_a=='l' .or. uplo_a=='L') a_lower = .true.
  if (uplo_c=='u' .or. uplo_c=='U') c_upper = .true.
  if (uplo_c=='l' .or. uplo_c=='L') c_lower = .true.

  ! Build up the result matrix by processor rows

  do np = 0, np_rows-1

    ! In this turn, procs of row np assemble the result

    l_rows_np = local_index(na, np, np_rows, nblk, -1) ! local rows on receiving processors

    ! A processor row which owns no rows of the result has nothing to assemble.
    ! Without this guard the block loop below would still be entered once for
    ! nblk>1 (integer division (0-1)/nblk truncates to 0) and would then use
    ! gcol/gcol_min values which are stale (or undefined if na==0).
    ! l_rows_np depends only on na, np, np_rows and nblk, so the same branch is
    ! taken by all processes of the grid and no collective call is left unmatched.
    ! NOTE(review): assumes local_index returns 0 when row np owns no rows - confirm.
    if (l_rows_np <= 0) cycle

    nr_done = 0 ! Number of rows done
    aux_mat = 0
    nstor = 0   ! Number of columns stored in aux_mat

    ! Loop over the blocks on row np

    do nb=0,(l_rows_np-1)/nblk

      goff  = nb*np_rows + np ! Global offset in blocks corresponding to nb

      ! Get the processor column which owns this block (A is transposed, so we need the column)
      ! and the offset in blocks within this column.
      ! The corresponding block column in A is then broadcast to all for multiplication with B

      np_bc = MOD(goff,np_cols)
      noff = goff/np_cols
      n_aux_bc = 0

      ! Gather up the complete block column of A on the owner

      do n = 1, min(l_rows_np-nb*nblk,nblk) ! Loop over columns to be broadcast

        gcol = goff*nblk + n ! global column corresponding to n
        if (nstor==0 .and. n==1) gcol_min = gcol ! first global column of the current aux_mat batch

        lrs = 1       ! 1st local row number for broadcast
        lre = l_rows  ! last local row number for broadcast
        if (a_lower) lrs = local_index(gcol, my_prow, np_rows, nblk, +1)
        if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1)

        if (lrs<=lre) then
          nvals = lre-lrs+1
          ! Only the owning process column fills the buffer, but all processes
          ! advance n_aux_bc so that the broadcast count agrees everywhere
          if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n)
          n_aux_bc = n_aux_bc + nvals
        endif

        lrs_save(n) = lrs
        lre_save(n) = lre

      enddo

      ! Broadcast block column
#ifdef WITH_MPI
      call MPI_Bcast(aux_bc,n_aux_bc,MPI_REAL8,np_bc,mpi_comm_cols,mpierr)
#endif
      ! Insert what we got in aux_mat

      n_aux_bc = 0
      do n = 1, min(l_rows_np-nb*nblk,nblk)
        nstor = nstor+1
        lrs = lrs_save(n)
        lre = lre_save(n)
        if (lrs<=lre) then
          nvals = lre-lrs+1
          aux_mat(lrs:lre,nstor) = aux_bc(n_aux_bc+1:n_aux_bc+nvals)
          n_aux_bc = n_aux_bc + nvals
        endif
      enddo

      ! If we got nblk_mult columns in aux_mat or this is the last block
      ! do the matrix multiplication

      if (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np) then

        lrs = 1       ! 1st local row number for multiply
        lre = l_rows  ! last local row number for multiply
        if (a_lower) lrs = local_index(gcol_min, my_prow, np_rows, nblk, +1)
        if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1)

        lcs = 1       ! 1st local col number for multiply
        lce = l_cols  ! last local col number for multiply
        if (c_upper) lcs = local_index(gcol_min, my_pcol, np_cols, nblk, +1)
        if (c_lower) lce = MIN(local_index(gcol, my_pcol, np_cols, nblk, -1),l_cols)

        if (lcs<=lce) then
          allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"mult_at_b_real: error when allocating tmp1 "//errorMessage
            stop
          endif

          ! Local contribution: tmp1 = aux_mat(lrs:lre,1:nstor)**T * b(lrs:lre,lcs:lce)
          if (lrs<=lre) then
            call dgemm('T','N',nstor,lce-lcs+1,lre-lrs+1,1.d0,aux_mat(lrs,1),ubound(aux_mat,dim=1), &
                       b(lrs,lcs),ldb,0.d0,tmp1,nstor)
          else
            tmp1 = 0
          endif

          ! Sum up the results and send to processor row np
#ifdef WITH_MPI
          call mpi_reduce(tmp1,tmp2,nstor*(lce-lcs+1),MPI_REAL8,MPI_SUM,np,mpi_comm_rows,mpierr)
#else
          tmp2 = tmp1
#endif
          ! Put the result into C
          if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)

          deallocate(tmp1,tmp2, stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"mult_at_b_real: error when deallocating tmp1 "//errorMessage
            stop
          endif

        endif

        ! Start a new batch of columns
        nr_done = nr_done+nstor
        nstor=0
        aux_mat(:,:)=0
      endif
    enddo
  enddo

  deallocate(aux_mat, aux_bc, lrs_save, lre_save, stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"mult_at_b_real: error when deallocating aux_mat "//errorMessage
    stop
  endif

#ifdef HAVE_DETAILED_TIMINGS
  call timer%stop("mult_at_b_real")
#endif

end subroutine mult_at_b_real

#define DATATYPE COMPLEX(kind=ck) #define BYTESIZE 16 #define COMPLEXCASE 1 #include "elpa_transpose_vectors.X90" #include "elpa_reduce_add_vectors.X90" #undef DATATYPE #undef BYTESIZE #undef COMPLEXCASE subroutine tridiag_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau) !------------------------------------------------------------------------------- ! tridiag_complex: Reduces a distributed hermitian matrix to tridiagonal form ! (like Scalapack Routine PZHETRD) ! ! Parameters ! ! na Order of matrix ! ! a(lda,matrixCols) Distributed matrix which should be reduced. ! Distribution is like in Scalapack. ! Opposed to PZHETRD, a(:,:) must be set completely (upper and lower half) ! a(:,:) is overwritten on exit with the Householder vectors ! ! lda Leading dimension of a ! matrixCols local columns of matrix a ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ! mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns ! ! d(na) Diagonal elements (returned), identical on all processors ! ! e(na) Off-Diagonal elements (returned), identical on all processors ! ! 
tau(na) Factors for the Householder vectors (returned), needed for back transformation ! !------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols complex(kind=ck) :: tau(na) #ifdef DESPERATELY_WANT_ASSUMED_SIZE complex(kind=ck) :: a(lda,*) #else complex(kind=ck) :: a(lda,matrixCols) #endif real(kind=rk) :: d(na), e(na) integer(kind=ik), parameter :: max_stored_rows = 32 complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols integer(kind=ik) :: l_cols, l_rows, nstor integer(kind=ik) :: istep, i, j, lcs, lce, lrs, lre integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile #ifdef WITH_OPENMP integer(kind=ik) :: my_thread, n_threads, max_threads, n_iter integer(kind=ik) :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads #endif real(kind=rk) :: vnorm2 complex(kind=ck) :: vav, xc, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf complex(kind=ck), allocatable :: tmp(:), vr(:), vc(:), ur(:), uc(:), vur(:,:), uvc(:,:) #ifdef WITH_OPENMP complex(kind=ck), allocatable :: ur_p(:,:), uc_p(:,:) #endif real(kind=rk), allocatable :: tmpr(:) integer(kind=ik) :: istat character(200) :: errorMessage #ifdef HAVE_DETAILED_TIMINGS call timer%start("tridiag_complex") #endif call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) ! Matrix is split into tiles; work is done only for tiles on the diagonal or above tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! 
make local tiles at least 128 wide l_rows_tile = tile_size/np_rows ! local rows of a tile l_cols_tile = tile_size/np_cols ! local cols of a tile totalblocks = (na-1)/nblk + 1 max_blocks_row = (totalblocks-1)/np_rows + 1 max_blocks_col = (totalblocks-1)/np_cols + 1 max_local_rows = max_blocks_row*nblk max_local_cols = max_blocks_col*nblk allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_complex: error when allocating tmp "//errorMessage stop endif allocate(vr(max_local_rows+1), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_complex: error when allocating vr "//errorMessage stop endif allocate(ur(max_local_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_complex: error when allocating ur "//errorMessage stop endif allocate(vc(max_local_cols), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_complex: error when allocating vc "//errorMessage stop endif allocate(uc(max_local_cols), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_complex: error when allocating uc "//errorMessage stop endif #ifdef WITH_OPENMP max_threads = omp_get_max_threads() allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_complex: error when allocating ur_p "//errorMessage stop endif allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_complex: error when allocating uc_p "//errorMessage stop endif #endif tmp = 0 vr = 0 ur = 0 vc = 0 uc = 0 allocate(vur(max_local_rows,2*max_stored_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_complex: error when allocating vur "//errorMessage stop endif allocate(uvc(max_local_cols,2*max_stored_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 
0) then print *,"tridiag_complex: error when allocating uvc "//errorMessage stop endif d(:) = 0 e(:) = 0 tau(:) = 0 nstor = 0 l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a if (my_prow==prow(na, nblk, np_rows) .and. my_pcol==pcol(na, nblk, np_cols)) d(na) = a(l_rows,l_cols) do istep=na,3,-1 ! Calculate number of local rows and columns of the still remaining matrix ! on the local processor l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1) l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1) ! Calculate vector for Householder transformation on all procs ! owning column istep if (my_pcol==pcol(istep, nblk, np_cols)) then ! Get vector to be transformed; distribute last element and norm of ! remaining elements to all procs in current column vr(1:l_rows) = a(1:l_rows,l_cols+1) if (nstor>0 .and. l_rows>0) then aux(1:2*nstor) = conjg(uvc(l_cols+1,1:2*nstor)) call ZGEMV('N',l_rows,2*nstor,CONE,vur,ubound(vur,dim=1), & aux,1,CONE,vr,1) endif if (my_prow==prow(istep-1, nblk, np_rows)) then aux1(1) = dot_product(vr(1:l_rows-1),vr(1:l_rows-1)) aux1(2) = vr(l_rows) else aux1(1) = dot_product(vr(1:l_rows),vr(1:l_rows)) aux1(2) = 0. endif #ifdef WITH_MPI call mpi_allreduce(aux1,aux2,2,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) #else aux2 = aux1 #endif vnorm2 = aux2(1) vrl = aux2(2) ! Householder transformation call hh_transform_complex(vrl, vnorm2, xf, tau(istep)) ! Scale vr and store Householder vector for back transformation vr(1:l_rows) = vr(1:l_rows) * xf if (my_prow==prow(istep-1, nblk, np_rows)) then vr(l_rows) = 1. e(istep-1) = vrl endif a(1:l_rows,l_cols+1) = vr(1:l_rows) ! store Householder vector for back transformation endif ! 
Broadcast the Householder vector (and tau) along columns if (my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep) #ifdef WITH_MPI call MPI_Bcast(vr,l_rows+1,MPI_DOUBLE_COMPLEX,pcol(istep, nblk, np_cols),mpi_comm_cols,mpierr) #endif tau(istep) = vr(l_rows+1) ! Transpose Householder vector vr -> vc ! call elpa_transpose_vectors (vr, 2*ubound(vr,dim=1), mpi_comm_rows, & ! vc, 2*ubound(vc,dim=1), mpi_comm_cols, & ! 1, 2*(istep-1), 1, 2*nblk) call elpa_transpose_vectors_complex (vr, ubound(vr,dim=1), mpi_comm_rows, & vc, ubound(vc,dim=1), mpi_comm_cols, & 1, (istep-1), 1, nblk) ! Calculate u = (A + VU**T + UV**T)*v ! For cache efficiency, we use only the upper half of the matrix tiles for this, ! thus the result is partly in uc(:) and partly in ur(:) uc(1:l_cols) = 0 ur(1:l_rows) = 0 if (l_rows>0 .and. l_cols>0) then #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,lcs,lce,j,lrs,lre) my_thread = omp_get_thread_num() n_threads = omp_get_num_threads() n_iter = 0 uc_p(1:l_cols,my_thread) = 0. ur_p(1:l_rows,my_thread) = 0. #endif do i=0,(istep-2)/tile_size lcs = i*l_cols_tile+1 lce = min(l_cols,(i+1)*l_cols_tile) if (lce0) then call ZGEMV('C',l_rows,2*nstor,CONE,vur,ubound(vur,dim=1),vr,1,CZERO,aux,1) call ZGEMV('N',l_cols,2*nstor,CONE,uvc,ubound(uvc,dim=1),aux,1,CONE,uc,1) endif endif ! Sum up all ur(:) parts along rows and add them to the uc(:) parts ! on the processors containing the diagonal ! This is only necessary if ur has been calculated, i.e. if the ! global tile size is smaller than the global remaining matrix if (tile_size < istep-1) then call elpa_reduce_add_vectors_COMPLEX (ur, ubound(ur,dim=1), mpi_comm_rows, & uc, ubound(uc,dim=1), mpi_comm_cols, & (istep-1), 1, nblk) endif ! 
Sum up all the uc(:) parts, transpose uc -> ur if (l_cols>0) then tmp(1:l_cols) = uc(1:l_cols) #ifdef WITH_MPI call mpi_allreduce(tmp,uc,l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) #else uc = tmp #endif endif ! call elpa_transpose_vectors (uc, 2*ubound(uc,dim=1), mpi_comm_cols, & ! ur, 2*ubound(ur,dim=1), mpi_comm_rows, & ! 1, 2*(istep-1), 1, 2*nblk) call elpa_transpose_vectors_complex (uc, ubound(uc,dim=1), mpi_comm_cols, & ur, ubound(ur,dim=1), mpi_comm_rows, & 1, (istep-1), 1, nblk) ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v ) xc = 0 if (l_cols>0) xc = dot_product(vc(1:l_cols),uc(1:l_cols)) #ifdef WITH_MPI call mpi_allreduce(xc,vav,1,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_cols,mpierr) #else vav = xc #endif ! store u and v in the matrices U and V ! these matrices are stored combined in one here do j=1,l_rows vur(j,2*nstor+1) = conjg(tau(istep))*vr(j) vur(j,2*nstor+2) = 0.5*conjg(tau(istep))*vav*vr(j) - ur(j) enddo do j=1,l_cols uvc(j,2*nstor+1) = 0.5*conjg(tau(istep))*vav*vc(j) - uc(j) uvc(j,2*nstor+2) = conjg(tau(istep))*vc(j) enddo nstor = nstor+1 ! If the limit of max_stored_rows is reached, calculate A + VU**T + UV**T if (nstor==max_stored_rows .or. istep==3) then do i=0,(istep-2)/tile_size lcs = i*l_cols_tile+1 lce = min(l_cols,(i+1)*l_cols_tile) lrs = 1 lre = min(l_rows,(i+1)*l_rows_tile) if (lce0) a(l_rows,l_cols) = a(l_rows,l_cols) & + dot_product(vur(l_rows,1:2*nstor),uvc(l_cols,1:2*nstor)) d(istep-1) = a(l_rows,l_cols) endif enddo ! istep ! Store e(1) and d(1) if (my_pcol==pcol(2, nblk, np_cols)) then if (my_prow==prow(1, nblk, np_rows)) then ! We use last l_cols value of loop above vrl = a(1,l_cols) call hh_transform_complex(vrl, 0.d0, xf, tau(2)) e(1) = vrl a(1,l_cols) = 1. ! 
for consistency only endif #ifdef WITH_MPI call mpi_bcast(tau(2),1,MPI_DOUBLE_COMPLEX,prow(1, nblk, np_rows),mpi_comm_rows,mpierr) #endif endif #ifdef WITH_MPI call mpi_bcast(tau(2),1,MPI_DOUBLE_COMPLEX,pcol(2, nblk, np_cols),mpi_comm_cols,mpierr) #endif if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) d(1) = a(1,1) deallocate(tmp, vr, ur, vc, uc, vur, uvc, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_complex: error when deallocating tmp "//errorMessage stop endif ! distribute the arrays d and e to all processors allocate(tmpr(na), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_complex: error when allocating tmpr "//errorMessage stop endif #ifdef WITH_MPI tmpr = d call mpi_allreduce(tmpr,d,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) tmpr = d call mpi_allreduce(tmpr,d,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) tmpr = e call mpi_allreduce(tmpr,e,na,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr) tmpr = e call mpi_allreduce(tmpr,e,na,MPI_REAL8,MPI_SUM,mpi_comm_cols,mpierr) #endif deallocate(tmpr, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"tridiag_complex: error when deallocating tmpr "//errorMessage stop endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("tridiag_complex") #endif end subroutine tridiag_complex subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) !------------------------------------------------------------------------------- ! trans_ev_complex: Transforms the eigenvectors of a tridiagonal matrix back ! to the eigenvectors of the original matrix ! (like Scalapack Routine PZUNMTR) ! ! Parameters ! ! na Order of matrix a, number of rows of matrix q ! ! nqc Number of columns of matrix q ! ! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_complex) ! Distribution is like in Scalapack. ! ! lda Leading dimension of a ! ! tau(na) Factors of the Householder vectors ! ! 
q On input: Eigenvectors of tridiagonal matrix ! On output: Transformed eigenvectors ! Distribution is like in Scalapack. ! ! ldq Leading dimension of q ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ! mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns ! !------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols complex(kind=ck) :: tau(na) #ifdef DESPERATELY_WANT_ASSUMED_SIZE complex(kind=ck) :: a(lda,*), q(ldq,*) #else complex(kind=ck) :: a(lda,matrixCols), q(ldq,matrixCols) #endif integer(kind=ik) :: max_stored_rows complex(kind=ck), parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols integer(kind=ik) :: l_cols, l_rows, l_colh, nstor integer(kind=ik) :: istep, i, n, nc, ic, ics, ice, nb, cur_pcol complex(kind=ck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:) complex(kind=ck), allocatable :: tmat(:,:), h1(:), h2(:) integer(kind=ik) :: istat character(200) :: errorMessage #ifdef HAVE_DETAILED_TIMINGS call timer%start("trans_ev_complex") #endif call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) totalblocks = (na-1)/nblk + 1 max_blocks_row = (totalblocks-1)/np_rows + 1 max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q! max_local_rows = max_blocks_row*nblk max_local_cols = max_blocks_col*nblk max_stored_rows = (63/nblk+1)*nblk allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 
0) then print *,"trans_ev_complex: error when allocating tmat "//errorMessage stop endif allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_complex: error when allocating h1 "//errorMessage stop endif allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_complex: error when allocating h2 "//errorMessage stop endif allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_complex: error when allocating tmp1 "//errorMessage stop endif allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_complex: error when allocating tmp2 "//errorMessage stop endif allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_complex: error when allocating hvb "//errorMessage stop endif allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_complex: error when allocating hvm "//errorMessage stop endif hvm = 0 ! Must be set to 0 !!! hvb = 0 ! Safety only l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q nstor = 0 ! In the complex case tau(2) /= 0 if (my_prow == prow(1, nblk, np_rows)) then q(1,1:l_cols) = q(1,1:l_cols)*((1.d0,0.d0)-tau(2)) endif do istep=1,na,nblk ics = MAX(istep,3) ice = MIN(istep+nblk-1,na) if (ice0) & call MPI_Bcast(hvb,nb,MPI_DOUBLE_COMPLEX,cur_pcol,mpi_comm_cols,mpierr) #endif nb = 0 do ic=ics,ice l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows) nstor = nstor+1 nb = nb+l_rows enddo ! Please note: for smaller matix sizes (na/np_rows<=256), a value of 32 for nstor is enough! if (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32)) then ! 
Calculate scalar products of stored vectors. ! This can be done in different ways, we use zherk tmat = 0 if (l_rows>0) & call zherk('U','C',nstor,l_rows,CONE,hvm,ubound(hvm,dim=1),CZERO,tmat,max_stored_rows) nc = 0 do n=1,nstor-1 h1(nc+1:nc+n) = tmat(1:n,n+1) nc = nc+n enddo #ifdef WITH_MPI if (nc>0) call mpi_allreduce(h1,h2,nc,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) #else if (nc>0) h2=h1 #endif ! Calculate triangular matrix T nc = 0 tmat(1,1) = tau(ice-nstor+1) do n=1,nstor-1 call ztrmv('L','C','N',n,tmat,max_stored_rows,h2(nc+1),1) tmat(n+1,1:n) = -conjg(h2(nc+1:nc+n))*tau(ice-nstor+n+1) tmat(n+1,n+1) = tau(ice-nstor+n+1) nc = nc+n enddo ! Q = Q - V * T * V**T * Q if (l_rows>0) then call zgemm('C','N',nstor,l_cols,l_rows,CONE,hvm,ubound(hvm,dim=1), & q,ldq,CZERO,tmp1,nstor) else tmp1(1:l_cols*nstor) = 0 endif #ifdef WITH_MPI call mpi_allreduce(tmp1,tmp2,nstor*l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr) #else tmp2 = tmp1 #endif if (l_rows>0) then call ztrmm('L','L','N','N',nstor,l_cols,CONE,tmat,max_stored_rows,tmp2,nstor) call zgemm('N','N',l_rows,l_cols,nstor,-CONE,hvm,ubound(hvm,dim=1), & tmp2,nstor,CONE,q,ldq) endif nstor = 0 endif enddo deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_complex: error when deallocating hvb "//errorMessage stop endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("trans_ev_complex") #endif end subroutine trans_ev_complex subroutine mult_ah_b_complex(uplo_a, uplo_c, na, ncb, a, lda, b, ldb, nblk, mpi_comm_rows, mpi_comm_cols, c, ldc) !------------------------------------------------------------------------------- ! mult_ah_b_complex: Performs C := A**H * B ! ! where: A is a square matrix (na,na) which is optionally upper or lower triangular ! B is a (na,ncb) matrix ! C is a (na,ncb) matrix where optionally only the upper or lower ! triangle may be computed ! ! Parameters ! ! uplo_a 'U' if A is upper triangular ! 
'L' if A is lower triangular ! anything else if A is a full matrix ! Please note: This pertains to the original A (as set in the calling program) ! whereas the transpose of A is used for calculations ! If uplo_a is 'U' or 'L', the other triangle is not used at all, ! i.e. it may contain arbitrary numbers ! ! uplo_c 'U' if only the upper diagonal part of C is needed ! 'L' if only the upper diagonal part of C is needed ! anything else if the full matrix C is needed ! Please note: Even when uplo_c is 'U' or 'L', the other triangle may be ! written to a certain extent, i.e. one shouldn't rely on the content there! ! ! na Number of rows/columns of A, number of rows of B and C ! ! ncb Number of columns of B and C ! ! a Matrix A ! ! lda Leading dimension of a ! ! b Matrix B ! ! ldb Leading dimension of b ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ! mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns ! ! c Matrix C ! ! ldc Leading dimension of c ! !------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none character*1 :: uplo_a, uplo_c integer(kind=ik) :: na, ncb, lda, ldb, nblk, mpi_comm_rows, mpi_comm_cols, ldc complex(kind=ck) :: a(lda,*), b(ldb,*), c(ldc,*) ! remove assumed size! 
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: l_cols, l_rows, l_rows_np integer(kind=ik) :: np, n, nb, nblk_mult, lrs, lre, lcs, lce integer(kind=ik) :: gcol_min, gcol, goff integer(kind=ik) :: nstor, nr_done, noff, np_bc, n_aux_bc, nvals integer(kind=ik), allocatable :: lrs_save(:), lre_save(:) logical :: a_lower, a_upper, c_lower, c_upper complex(kind=ck), allocatable :: aux_mat(:,:), aux_bc(:), tmp1(:,:), tmp2(:,:) integer(kind=ik) :: istat character(200) :: errorMessage #ifdef HAVE_DETAILED_TIMINGS call timer%start("mult_ah_b_complex") #endif call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and b l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b ! Block factor for matrix multiplications, must be a multiple of nblk if (na/np_rows<=256) then nblk_mult = (31/nblk+1)*nblk else nblk_mult = (63/nblk+1)*nblk endif allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"mult_ah_b_complex: error when allocating aux_mat "//errorMessage stop endif allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"mult_ah_b_complex: error when allocating aux_bc "//errorMessage stop endif allocate(lrs_save(nblk), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"mult_ah_b_complex: error when allocating lrs_save "//errorMessage stop endif allocate(lre_save(nblk), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"mult_ah_b_complex: error when allocating lre_save "//errorMessage stop endif a_lower = .false. a_upper = .false. c_lower = .false. c_upper = .false. if (uplo_a=='u' .or. uplo_a=='U') a_upper = .true. if (uplo_a=='l' .or. uplo_a=='L') a_lower = .true. if (uplo_c=='u' .or. 
uplo_c=='U') c_upper = .true. if (uplo_c=='l' .or. uplo_c=='L') c_lower = .true. ! Build up the result matrix by processor rows do np = 0, np_rows-1 ! In this turn, procs of row np assemble the result l_rows_np = local_index(na, np, np_rows, nblk, -1) ! local rows on receiving processors nr_done = 0 ! Number of rows done aux_mat = 0 nstor = 0 ! Number of columns stored in aux_mat ! Loop over the blocks on row np do nb=0,(l_rows_np-1)/nblk goff = nb*np_rows + np ! Global offset in blocks corresponding to nb ! Get the processor column which owns this block (A is transposed, so we need the column) ! and the offset in blocks within this column. ! The corresponding block column in A is then broadcast to all for multiplication with B np_bc = MOD(goff,np_cols) noff = goff/np_cols n_aux_bc = 0 ! Gather up the complete block column of A on the owner do n = 1, min(l_rows_np-nb*nblk,nblk) ! Loop over columns to be broadcast gcol = goff*nblk + n ! global column corresponding to n if (nstor==0 .and. n==1) gcol_min = gcol lrs = 1 ! 1st local row number for broadcast lre = l_rows ! last local row number for broadcast if (a_lower) lrs = local_index(gcol, my_prow, np_rows, nblk, +1) if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) if (lrs<=lre) then nvals = lre-lrs+1 if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n) n_aux_bc = n_aux_bc + nvals endif lrs_save(n) = lrs lre_save(n) = lre enddo ! Broadcast block column #ifdef WITH_MPI call MPI_Bcast(aux_bc,n_aux_bc,MPI_DOUBLE_COMPLEX,np_bc,mpi_comm_cols,mpierr) #endif ! Insert what we got in aux_mat n_aux_bc = 0 do n = 1, min(l_rows_np-nb*nblk,nblk) nstor = nstor+1 lrs = lrs_save(n) lre = lre_save(n) if (lrs<=lre) then nvals = lre-lrs+1 aux_mat(lrs:lre,nstor) = aux_bc(n_aux_bc+1:n_aux_bc+nvals) n_aux_bc = n_aux_bc + nvals endif enddo ! If we got nblk_mult columns in aux_mat or this is the last block ! do the matrix multiplication if (nstor==nblk_mult .or. 
nb*nblk+nblk >= l_rows_np) then lrs = 1 ! 1st local row number for multiply lre = l_rows ! last local row number for multiply if (a_lower) lrs = local_index(gcol_min, my_prow, np_rows, nblk, +1) if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) lcs = 1 ! 1st local col number for multiply lce = l_cols ! last local col number for multiply if (c_upper) lcs = local_index(gcol_min, my_pcol, np_cols, nblk, +1) if (c_lower) lce = MIN(local_index(gcol, my_pcol, np_cols, nblk, -1),l_cols) if (lcs<=lce) then allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"mult_ah_b_complex: error when allocating tmp1 "//errorMessage stop endif if (lrs<=lre) then call zgemm('C','N',nstor,lce-lcs+1,lre-lrs+1,(1.d0,0.d0),aux_mat(lrs,1),ubound(aux_mat,dim=1), & b(lrs,lcs),ldb,(0.d0,0.d0),tmp1,nstor) else tmp1 = 0 endif ! Sum up the results and send to processor row np #ifdef WITH_MPI call mpi_reduce(tmp1,tmp2,nstor*(lce-lcs+1),MPI_DOUBLE_COMPLEX,MPI_SUM,np,mpi_comm_rows,mpierr) #else tmp2 = tmp1 #endif ! Put the result into C if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce) deallocate(tmp1,tmp2, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"mult_ah_b_complex: error when deallocating tmp1 "//errorMessage stop endif endif nr_done = nr_done+nstor nstor=0 aux_mat(:,:)=0 endif enddo enddo deallocate(aux_mat, aux_bc, lrs_save, lre_save, stat=istat, errmsg=errorMessage) if (istat .ne. 
0) then print *,"mult_ah_b_complex: error when deallocating aux_mat "//errorMessage stop endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("mult_ah_b_complex") #endif end subroutine mult_ah_b_complex subroutine solve_tridi( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success ) #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik) :: na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols real(kind=rk) :: d(na), e(na) #ifdef DESPERATELY_WANT_ASSUMED_SIZE real(kind=rk) :: q(ldq,*) #else real(kind=rk) :: q(ldq,matrixCols) #endif integer(kind=ik) :: i, j, n, np, nc, nev1, l_cols, l_rows integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik), allocatable :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:) logical, intent(in) :: wantDebug logical, intent(out) :: success integer(kind=ik) :: istat character(200) :: errorMessage #ifdef HAVE_DETAILED_TIMINGS call timer%start("solve_tridi") #endif call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) success = .true. l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q ! Set Q to 0 q(1:l_rows, 1:l_cols) = 0. ! Get the limits of the subdivisons, each subdivison has as many cols ! as fit on the respective processor column allocate(limits(0:np_cols), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"solve_tridi: error when allocating limits "//errorMessage stop endif limits(0) = 0 do np=0,np_cols-1 nc = local_index(na, np, np_cols, nblk, -1) ! number of columns on proc column np ! Check for the case that a column has have zero width. ! This is not supported! ! Scalapack supports it but delivers no results for these columns, ! 
which is rather annoying if (nc==0) then #ifdef HAVE_DETAILED_TIMINGS call timer%stop("solve_tridi") #endif if (wantDebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width' success = .false. return endif limits(np+1) = limits(np) + nc enddo ! Subdivide matrix by subtracting rank 1 modifications do i=1,np_cols-1 n = limits(i) d(n) = d(n)-abs(e(n)) d(n+1) = d(n+1)-abs(e(n)) enddo ! Solve sub problems on processsor columns nc = limits(my_pcol) ! column after which my problem starts if (np_cols>1) then nev1 = l_cols ! all eigenvectors are needed else nev1 = MIN(nev,l_cols) endif call solve_tridi_col(l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, & matrixCols, mpi_comm_rows, wantDebug, success) if (.not.(success)) then #ifdef HAVE_DETAILED_TIMINGS call timer%stop("solve_tridi") #endif return endif ! If there is only 1 processor column, we are done if (np_cols==1) then deallocate(limits, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"solve_tridi: error when deallocating limits "//errorMessage stop endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("solve_tridi") #endif return endif ! Set index arrays for Q columns ! Dense distribution scheme: allocate(l_col(na), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"solve_tridi: error when allocating l_col "//errorMessage stop endif allocate(p_col(na), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"solve_tridi: error when allocating p_col "//errorMessage stop endif n = 0 do np=0,np_cols-1 nc = local_index(na, np, np_cols, nblk, -1) do i=1,nc n = n+1 l_col(n) = i p_col(n) = np enddo enddo ! Block cyclic distribution scheme, only nev columns are set: allocate(l_col_bc(na), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"solve_tridi: error when allocating l_col_bc "//errorMessage stop endif allocate(p_col_bc(na), stat=istat, errmsg=errorMessage) if (istat .ne. 
0) then print *,"solve_tridi: error when allocating p_col_bc "//errorMessage stop endif p_col_bc(:) = -1 l_col_bc(:) = -1 do i = 0, na-1, nblk*np_cols do j = 0, np_cols-1 do n = 1, nblk if (i+j*nblk+n <= MIN(nev,na)) then p_col_bc(i+j*nblk+n) = j l_col_bc(i+j*nblk+n) = i/np_cols + n endif enddo enddo enddo ! Recursively merge sub problems call merge_recursive(0, np_cols, wantDebug, success) if (.not.(success)) then #ifdef HAVE_DETAILED_TIMINGS call timer%stop("solve_tridi") #endif return endif deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"solve_tridi: error when deallocating l_col "//errorMessage stop endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("solve_tridi") #endif return contains recursive subroutine merge_recursive(np_off, nprocs, wantDebug, success) use precision implicit none ! noff is always a multiple of nblk_ev ! nlen-noff is always > nblk_ev integer(kind=ik) :: np_off, nprocs integer(kind=ik) :: np1, np2, noff, nlen, nmid, n #ifdef WITH_MPI integer(kind=ik) :: mpi_status(mpi_status_size) #endif logical, intent(in) :: wantDebug logical, intent(out) :: success success = .true. if (nprocs<=1) then ! Safety check only if (wantDebug) write(error_unit,*) "ELPA1_merge_recursive: INTERNAL error merge_recursive: nprocs=",nprocs success = .false. return endif ! Split problem into 2 subproblems of size np1 / np2 np1 = nprocs/2 np2 = nprocs-np1 if (np1 > 1) call merge_recursive(np_off, np1, wantDebug, success) if (.not.(success)) return if (np2 > 1) call merge_recursive(np_off+np1, np2, wantDebug, success) if (.not.(success)) return noff = limits(np_off) nmid = limits(np_off+np1) - noff nlen = limits(np_off+nprocs) - noff #ifdef WITH_MPI if (my_pcol==np_off) then do n=np_off+np1,np_off+nprocs-1 call mpi_send(d(noff+1),nmid,MPI_REAL8,n,1,mpi_comm_cols,mpierr) enddo endif #endif if (my_pcol>=np_off+np1 .and. my_pcol=np_off .and. my_pcol2*min_submatrix_size) n = ((n+3)/4)*2 ! 
the bigger one of the two halves, we want EVEN boundaries ndiv = ndiv*2 enddo ! If there is only 1 processor row and not all eigenvectors are needed ! and the matrix size is big enough, then use 2 subdivisions ! so that merge_systems is called once and only the needed ! eigenvectors are calculated for the final problem. if (np_rows==1 .and. nev2*min_submatrix_size) ndiv = 2 allocate(limits(0:ndiv), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"solve_tridi_col: error when allocating limits "//errorMessage stop endif limits(0) = 0 limits(ndiv) = na n = ndiv do while(n>1) n = n/2 ! n is always a power of 2 do i=0,ndiv-1,2*n ! We want to have even boundaries (for cache line alignments) limits(i+n) = limits(i) + ((limits(i+2*n)-limits(i)+3)/4)*2 enddo enddo ! Calculate the maximum size of a subproblem max_size = 0 do i=1,ndiv max_size = MAX(max_size,limits(i)-limits(i-1)) enddo ! Subdivide matrix by subtracting rank 1 modifications do i=1,ndiv-1 n = limits(i) d(n) = d(n)-abs(e(n)) d(n+1) = d(n+1)-abs(e(n)) enddo if (np_rows==1) then ! For 1 processor row there may be 1 or 2 subdivisions do n=0,ndiv-1 noff = limits(n) ! Start of subproblem nlen = limits(n+1)-noff ! Size of subproblem call solve_tridi_single(nlen,d(noff+1),e(noff+1), & q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success) if (.not.(success)) return enddo else ! Solve sub problems in parallel with solve_tridi_single ! There is at maximum 1 subproblem per processor allocate(qmat1(max_size,max_size), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"solve_tridi_col: error when allocating qmat1 "//errorMessage stop endif allocate(qmat2(max_size,max_size), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"solve_tridi_col: error when allocating qmat2 "//errorMessage stop endif qmat1 = 0 ! Make sure that all elements are defined if (my_prow < ndiv) then noff = limits(my_prow) ! Start of subproblem nlen = limits(my_prow+1)-noff ! 
Size of subproblem call solve_tridi_single(nlen,d(noff+1),e(noff+1),qmat1, & ubound(qmat1,dim=1), wantDebug, success) if (.not.(success)) return endif ! Fill eigenvectors in qmat1 into global matrix q do np = 0, ndiv-1 noff = limits(np) nlen = limits(np+1)-noff #ifdef WITH_MPI call MPI_Bcast(d(noff+1),nlen,MPI_REAL8,np,mpi_comm_rows,mpierr) #endif qmat2 = qmat1 #ifdef WITH_MPI call MPI_Bcast(qmat2,max_size*max_size,MPI_REAL8,np,mpi_comm_rows,mpierr) #endif do i=1,nlen call distribute_global_column(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk) enddo enddo deallocate(qmat1, qmat2, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"solve_tridi_col: error when deallocating qmat2 "//errorMessage stop endif endif ! Allocate and set index arrays l_col and p_col allocate(l_col(na), p_col_i(na), p_col_o(na), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"solve_tridi_col: error when allocating l_col "//errorMessage stop endif do i=1,na l_col(i) = i p_col_i(i) = 0 p_col_o(i) = 0 enddo ! Merge subproblems n = 1 do while(n 1d-14) then write(error_unit,'(a,i8,2g25.16)') '***WARNING: Monotony error dste**:',i+1,d(i),d(i+1) else write(error_unit,'(a,i8,2g25.16)') 'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1) write(error_unit,'(a)') 'The eigenvalues from a lapack call are not sorted to machine precision.' write(error_unit,'(a)') 'In this extent, this is completely harmless.' write(error_unit,'(a)') 'Still, we keep this info message just in case.' end if allocate(qtmp(nlen), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"solve_tridi_single: error when allocating qtmp "//errorMessage stop endif dtmp = d(i+1) qtmp(1:nlen) = q(1:nlen,i+1) do j=i,1,-1 if (dtmp=npc_0+npc_n) then #ifdef HAVE_DETAILED_TIMINGS call timer%stop("merge_systems") #endif return endif ! 
Determine number of "next" and "prev" column for ring sends if (my_pcol == npc_0+npc_n-1) then np_next = npc_0 else np_next = my_pcol + 1 endif if (my_pcol == npc_0) then np_prev = npc_0+npc_n-1 else np_prev = my_pcol - 1 endif call check_monotony(nm,d,'Input1',wantDebug, success) if (.not.(success)) then #ifdef HAVE_DETAILED_TIMINGS call timer%stop("merge_systems") #endif return endif call check_monotony(na-nm,d(nm+1),'Input2',wantDebug, success) if (.not.(success)) then #ifdef HAVE_DETAILED_TIMINGS call timer%stop("merge_systems") #endif return endif ! Get global number of processors and my processor number. ! Please note that my_proc does not need to match any real processor number, ! it is just used for load balancing some loops. n_procs = np_rows*npc_n my_proc = my_prow*npc_n + (my_pcol-npc_0) ! Row major ! Local limits of the rows of Q l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1) ! First row of Q l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1) ! Last row <= nm l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1) ! Last row of Q l_rnm = l_rqm-l_rqs+1 ! Number of local rows <= nm l_rows = l_rqe-l_rqs+1 ! Total number of local rows ! My number of local columns l_cols = COUNT(p_col(1:na)==my_pcol) ! Get max number of local columns max_local_cols = 0 do np = npc_0, npc_0+npc_n-1 max_local_cols = MAX(max_local_cols,COUNT(p_col(1:na)==np)) enddo ! Calculations start here beta = abs(e) sig = sign(1.d0,e) ! Calculate rank-1 modifier z z(:) = 0 if (MOD((nqoff+nm-1)/nblk,np_rows)==my_prow) then ! nm is local on my row do i = 1, na if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i)) enddo endif if (MOD((nqoff+nm)/nblk,np_rows)==my_prow) then ! nm+1 is local on my row do i = 1, na if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i)) enddo endif call global_gather(z, na) ! Normalize z so that norm(z) = 1. Since z is the concatenation of ! two normalized vectors, norm2(z) = sqrt(2). z = z/sqrt(2.0d0) rho = 2.*beta ! 
Calculate index for merging both systems by ascending eigenvalues call DLAMRG( nm, na-nm, d, 1, 1, idx ) ! Calculate the allowable deflation tolerance zmax = maxval(abs(z)) dmax = maxval(abs(d)) EPS = DLAMCH( 'Epsilon' ) TOL = 8.*EPS*MAX(dmax,zmax) ! If the rank-1 modifier is small enough, no more needs to be done ! except to reorganize D and Q IF ( RHO*zmax <= TOL ) THEN ! Rearrange eigenvalues tmp = d do i=1,na d(i) = tmp(idx(i)) enddo ! Rearrange eigenvectors call resort_ev(idx, na) #ifdef HAVE_DETAILED_TIMINGS call timer%stop("merge_systems") #endif return ENDIF ! Merge and deflate system na1 = 0 na2 = 0 ! COLTYP: ! 1 : non-zero in the upper half only; ! 2 : dense; ! 3 : non-zero in the lower half only; ! 4 : deflated. coltyp(1:nm) = 1 coltyp(nm+1:na) = 3 do i=1,na if (rho*abs(z(idx(i))) <= tol) then ! Deflate due to small z component. na2 = na2+1 d2(na2) = d(idx(i)) idx2(na2) = idx(i) coltyp(idx(i)) = 4 else if (na1>0) then ! Check if eigenvalues are close enough to allow deflation. S = Z(idx(i)) C = Z1(na1) ! Find sqrt(a**2+b**2) without overflow or ! destructive underflow. TAU = DLAPY2( C, S ) T = D1(na1) - D(idx(i)) C = C / TAU S = -S / TAU IF ( ABS( T*C*S ) <= TOL ) THEN ! Deflation is possible. na2 = na2+1 Z1(na1) = TAU d2new = D(idx(i))*C**2 + D1(na1)*S**2 d1new = D(idx(i))*S**2 + D1(na1)*C**2 ! D(idx(i)) >= D1(na1) and C**2 + S**2 == 1.0 ! This means that after the above transformation it must be ! D1(na1) <= d1new <= D(idx(i)) ! D1(na1) <= d2new <= D(idx(i)) ! ! D1(na1) may get bigger but it is still smaller than the next D(idx(i+1)) ! so there is no problem with sorting here. ! d2new <= D(idx(i)) which means that it might be smaller than D2(na2-1) ! which makes a check (and possibly a resort) necessary. ! ! The above relations may not hold exactly due to numeric differences ! so they have to be enforced in order not to get troubles with sorting. 
if (d1newD(idx(i))) d1new = D(idx(i)) if (d2newD(idx(i))) d2new = D(idx(i)) D1(na1) = d1new do j=na2-1,1,-1 if (d2new2) then ! Solve secular equation z(1:na1) = 1 #ifdef WITH_OPENMP z_p(1:na1,:) = 1 #endif dbase(1:na1) = 0 ddiff(1:na1) = 0 info = 0 #ifdef WITH_OPENMP #ifdef HAVE_DETAILED_TIMINGS call timer%start("OpenMP parallel") #endif !$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j) my_thread = omp_get_thread_num() !$OMP DO #endif DO i = my_proc+1, na1, n_procs ! work distributed over all processors call DLAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used! if (info/=0) then ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2) ! use the more stable bisection algorithm in solve_secular_equation ! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection' call solve_secular_equation(na1, i, d1, z1, delta, rho, s) endif ! Compute updated z #ifdef WITH_OPENMP do j=1,na1 if (i/=j) z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) ) enddo z_p(i,my_thread) = z_p(i,my_thread)*delta(i) #else do j=1,na1 if (i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) ) enddo z(i) = z(i)*delta(i) #endif ! store dbase/ddiff if (i1) then if (np_rem==npc_0) then np_rem = npc_0+npc_n-1 else np_rem = np_rem-1 endif #ifdef WITH_MPI call MPI_Sendrecv_replace(qtmp1, l_rows*max_local_cols, MPI_REAL8, & np_next, 1111, np_prev, 1111, & mpi_comm_cols, mpi_status, mpierr) #endif endif ! Gather the parts in d1 and z which are fitting to qtmp1. ! This also delivers nnzu/nnzl for proc np_rem nnzu = 0 nnzl = 0 do i=1,na1 if (p_col(idx1(i))==np_rem) then if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then nnzu = nnzu+1 d1u(nnzu) = d1(i) zu (nnzu) = z (i) endif if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then nnzl = nnzl+1 d1l(nnzl) = d1(i) zl (nnzl) = z (i) endif endif enddo ! Set the deflated eigenvectors in Q (comming from proc np_rem) ndef = MAX(nnzu,nnzl) ! 
Remote counter in input matrix do i = 1, na j = idx(i) if (j>na1) then if (p_col(idx2(j-na1))==np_rem) then ndef = ndef+1 if (p_col_out(i)==my_pcol) & q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef) endif endif enddo do ns = 0, nqcols1-1, max_strip ! strimining loop ncnt = MIN(max_strip,nqcols1-ns) ! number of columns in this strip ! Get partial result from (output) Q do i = 1, ncnt qtmp2(1:l_rows,i) = q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) enddo ! Compute eigenvectors of the rank-1 modified matrix. ! Parts for multiplying with upper half of Q: do i = 1, ncnt j = idx(idxq1(i+ns)) ! Calculate the j-th eigenvector of the deflated system ! See above why we are doing it this way! tmp(1:nnzu) = d1u(1:nnzu)-dbase(j) call v_add_s(tmp,nnzu,ddiff(j)) ev(1:nnzu,i) = zu(1:nnzu) / tmp(1:nnzu) * ev_scale(j) enddo ! Multiply old Q with eigenvectors (upper half) if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) & call dgemm('N','N',l_rnm,ncnt,nnzu,1.d0,qtmp1,ubound(qtmp1,dim=1),ev,ubound(ev,dim=1), & 1.d0,qtmp2(1,1),ubound(qtmp2,dim=1)) ! Compute eigenvectors of the rank-1 modified matrix. ! Parts for multiplying with lower half of Q: do i = 1, ncnt j = idx(idxq1(i+ns)) ! Calculate the j-th eigenvector of the deflated system ! See above why we are doing it this way! tmp(1:nnzl) = d1l(1:nnzl)-dbase(j) call v_add_s(tmp,nnzl,ddiff(j)) ev(1:nnzl,i) = zl(1:nnzl) / tmp(1:nnzl) * ev_scale(j) enddo ! Multiply old Q with eigenvectors (lower half) if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) & call dgemm('N','N',l_rows-l_rnm,ncnt,nnzl,1.d0,qtmp1(l_rnm+1,1),ubound(qtmp1,dim=1),ev,ubound(ev,dim=1), & 1.d0,qtmp2(l_rnm+1,1),ubound(qtmp2,dim=1)) ! Put partial result into (output) Q do i = 1, ncnt q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i) enddo enddo enddo deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errorMessage) if (istat .ne. 
0) then print *,"merge_systems: error when deallocating ev "//errorMessage stop endif endif #ifdef WITH_OPENMP deallocate(z_p, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"merge_systems: error when deallocating z_p "//errorMessage stop endif #endif #ifdef HAVE_DETAILED_TIMINGS call timer%stop("merge_systems") #endif return contains subroutine add_tmp(d1, dbase, ddiff, z, ev_scale_value, na1,i) use precision implicit none integer(kind=ik), intent(in) :: na1, i real(kind=rk), intent(in) :: d1(:), dbase(:), ddiff(:), z(:) real(kind=rk), intent(inout) :: ev_scale_value real(kind=rk) :: tmp(1:na1) ! tmp(1:na1) = z(1:na1) / delta(1:na1,i) ! original code ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i) ! in exactly this order, but we want to prevent compiler optimization tmp(1:na1) = d1(1:na1) -dbase(i) call v_add_s(tmp(1:na1),na1,ddiff(i)) tmp(1:na1) = z(1:na1) / tmp(1:na1) ev_scale_value = 1.0/sqrt(dot_product(tmp(1:na1),tmp(1:na1))) end subroutine add_tmp subroutine resort_ev(idx_ev, nLength) use precision implicit none integer(kind=ik), intent(in) :: nLength integer(kind=ik) :: idx_ev(nLength) integer(kind=ik) :: i, nc, pc1, pc2, lc1, lc2, l_cols_out real(kind=rk), allocatable :: qtmp(:,:) integer(kind=ik) :: istat character(200) :: errorMessage if (l_rows==0) return ! My processor column has no work to do ! Resorts eigenvectors so that q_new(:,i) = q_old(:,idx_ev(i)) l_cols_out = COUNT(p_col_out(1:na)==my_pcol) allocate(qtmp(l_rows,l_cols_out), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"resort_ev: error when allocating qtmp "//errorMessage stop endif nc = 0 do i=1,na pc1 = p_col(idx_ev(i)) lc1 = l_col(idx_ev(i)) pc2 = p_col_out(i) if (pc2<0) cycle ! This column is not needed in output if (pc2==my_pcol) nc = nc+1 ! Counter for output columns if (pc1==my_pcol) then if (pc2==my_pcol) then ! 
send and recieve column are local qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1) else #ifdef WITH_MPI call mpi_send(q(l_rqs,lc1),l_rows,MPI_REAL8,pc2,mod(i,4096),mpi_comm_cols,mpierr) #endif endif else if (pc2==my_pcol) then #ifdef WITH_MPI call mpi_recv(qtmp(1,nc),l_rows,MPI_REAL8,pc1,mod(i,4096),mpi_comm_cols,mpi_status,mpierr) #else qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,nc) #endif endif enddo ! Insert qtmp into (output) q nc = 0 do i=1,na pc2 = p_col_out(i) lc2 = l_col_out(i) if (pc2==my_pcol) then nc = nc+1 q(l_rqs:l_rqe,lc2) = qtmp(1:l_rows,nc) endif enddo deallocate(qtmp, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"resort_ev: error when deallocating qtmp "//errorMessage stop endif end subroutine resort_ev subroutine transform_columns(col1, col2) use precision implicit none integer(kind=ik) :: col1, col2 integer(kind=ik) :: pc1, pc2, lc1, lc2 if (l_rows==0) return ! My processor column has no work to do pc1 = p_col(col1) lc1 = l_col(col1) pc2 = p_col(col2) lc2 = l_col(col2) if (pc1==my_pcol) then if (pc2==my_pcol) then ! both columns are local tmp(1:l_rows) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + q(l_rqs:l_rqe,lc2)*qtrans(2,1) q(l_rqs:l_rqe,lc2) = q(l_rqs:l_rqe,lc1)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2) q(l_rqs:l_rqe,lc1) = tmp(1:l_rows) else #ifdef WITH_MPI call mpi_sendrecv(q(l_rqs,lc1),l_rows,MPI_REAL8,pc2,1, & tmp,l_rows,MPI_REAL8,pc2,1, & mpi_comm_cols,mpi_status,mpierr) #else tmp(1:l_rows) = q(l_rqs:l_rqe,lc1) #endif q(l_rqs:l_rqe,lc1) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + tmp(1:l_rows)*qtrans(2,1) endif else if (pc2==my_pcol) then #ifdef WITH_MPI call mpi_sendrecv(q(l_rqs,lc2),l_rows,MPI_REAL8,pc1,1, & tmp,l_rows,MPI_REAL8,pc1,1, & mpi_comm_cols,mpi_status,mpierr) #else tmp(1:l_rows) = q(l_rqs:l_rqe,lc2) #endif q(l_rqs:l_rqe,lc2) = tmp(1:l_rows)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2) endif end subroutine transform_columns subroutine global_gather(z, n) ! This routine sums up z over all processors. ! 
It should only be used for gathering distributed results,
        ! i.e. z(i) should be nonzero on exactly 1 processor column,
        ! otherwise the results may be numerically different on different columns
        !
        ! z(n)   on entry the local contribution, on return the summed vector
        ! tmp(n) scratch holding the result of the reduction over processor rows
        !
        ! npc_n, np_rows, np_cols, np, np_next, np_prev, the communicators,
        ! mpi_status and mpierr are not declared here; they are taken from the
        ! enclosing scope.

        use precision
        implicit none
        integer(kind=ik) :: n
        real(kind=rk)    :: z(n)
        real(kind=rk)    :: tmp(n)

        if (npc_n==1 .and. np_rows==1) return ! nothing to do

        ! Do an mpi_allreduce over processor rows
#ifdef WITH_MPI
        call mpi_allreduce(z, tmp, n, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
        tmp = z
#endif
        ! If only 1 processor column, we are done
        if (npc_n==1) then
          z(:) = tmp(:)
          return
        endif

        ! If all processor columns are involved, we can use mpi_allreduce
        if (npc_n==np_cols) then
#ifdef WITH_MPI
          call mpi_allreduce(tmp, z, n, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr)
#else
          ! The result is returned in z, so copy the row-reduced vector back
          ! into z (same direction as the MPI call above and as global_product).
          z = tmp
#endif
          return
        endif

        ! Do a ring send over processor columns:
        ! every pass adds the local tmp to the travelling partial sum in z
        ! and hands that sum on to np_next while receiving from np_prev.
        z(:) = 0
        do np = 1, npc_n
          z(:) = z(:) + tmp(:)
#ifdef WITH_MPI
          call MPI_Sendrecv_replace(z, n, MPI_REAL8, np_next, 1111, np_prev, 1111, &
                                    mpi_comm_cols, mpi_status, mpierr)
#endif
        enddo

      end subroutine global_gather
subroutine global_product(z, n) ! This routine calculates the global product of z. use precision implicit none integer(kind=ik) :: n real(kind=rk) :: z(n) real(kind=rk) :: tmp(n) if (npc_n==1 .and. np_rows==1) return ! nothing to do ! Do an mpi_allreduce over processor rows #ifdef WITH_MPI call mpi_allreduce(z, tmp, n, MPI_REAL8, MPI_PROD, mpi_comm_rows, mpierr) #else tmp = z #endif ! If only 1 processor column, we are done if (npc_n==1) then z(:) = tmp(:) return endif ! If all processor columns are involved, we can use mpi_allreduce if (npc_n==np_cols) then #ifdef WITH_MPI call mpi_allreduce(tmp, z, n, MPI_REAL8, MPI_PROD, mpi_comm_cols, mpierr) #else z = tmp #endif return endif ! We send all vectors to the first proc, do the product there ! and redistribute the result.
if (my_pcol == npc_0) then z(1:n) = tmp(1:n) do np = npc_0+1, npc_0+npc_n-1 #ifdef WITH_MPI call mpi_recv(tmp,n,MPI_REAL8,np,1111,mpi_comm_cols,mpi_status,mpierr) #else tmp(1:n) = z(1:n) #endif z(1:n) = z(1:n)*tmp(1:n) enddo do np = npc_0+1, npc_0+npc_n-1 #ifdef WITH_MPI call mpi_send(z,n,MPI_REAL8,np,1111,mpi_comm_cols,mpierr) #endif enddo else #ifdef WITH_MPI call mpi_send(tmp,n,MPI_REAL8,npc_0,1111,mpi_comm_cols,mpierr) call mpi_recv(z ,n,MPI_REAL8,npc_0,1111,mpi_comm_cols,mpi_status,mpierr) #else z(1:n) = tmp(1:n) #endif endif end subroutine global_product subroutine check_monotony(n,d,text, wantDebug, success) ! This is a test routine for checking if the eigenvalues are monotonically increasing. ! It is for debug purposes only, an error should never be triggered! use precision implicit none integer(kind=ik) :: n real(kind=rk) :: d(n) character*(*) :: text integer(kind=ik) :: i logical, intent(in) :: wantDebug logical, intent(out) :: success success = .true. do i=1,n-1 if (d(i+1) 0 and d(i+1) > d(i) ! ! but this routine will not terminate with error if these are not satisfied ! (it will normally converge to a pole in this case). ! ! The output in DELTA(j) is always (D(j) - lambda_I), even for the cases ! N=1 and N=2 which is not compatible with DLAED4. ! Thus this routine shouldn't be used for these cases as a simple replacement ! of DLAED4. ! ! The arguments are the same as in DLAED4 (with the exception of the INFO argument): ! ! ! N (input) INTEGER ! The length of all arrays. ! ! I (input) INTEGER ! The index of the eigenvalue to be computed. 1 <= I <= N. ! ! D (input) DOUBLE PRECISION array, dimension (N) ! The original eigenvalues. It is assumed that they are in ! order, D(I) < D(J) for I < J. ! ! Z (input) DOUBLE PRECISION array, dimension (N) ! The components of the updating vector. ! ! DELTA (output) DOUBLE PRECISION array, dimension (N) ! DELTA contains (D(j) - lambda_I) in its j-th component. ! See remark above about DLAED4 compatibility! ! ! 
RHO (input) DOUBLE PRECISION ! The scalar in the symmetric updating formula. ! ! DLAM (output) DOUBLE PRECISION ! The computed lambda_I, the I-th updated eigenvalue. !------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik) :: n, i real(kind=rk) :: d(n), z(n), delta(n), rho, dlam integer(kind=ik) :: iter real(kind=rk) :: a, b, x, y, dshift ! In order to obtain sufficient numerical accuracy we have to shift the problem ! either by d(i) or d(i+1), whichever is closer to the solution ! Upper and lower bound of the shifted solution interval are a and b #ifdef HAVE_DETAILED_TIMINGS call timer%start("solve_secular_equation") #endif if (i==n) then ! Special case: Last eigenvalue ! We shift always by d(n), lower bound is d(n), ! upper bound is determined by a guess: dshift = d(n) delta(:) = d(:) - dshift a = 0. ! delta(n) b = rho*SUM(z(:)**2) + 1. ! rho*SUM(z(:)**2) is the lower bound for the guess else ! Other eigenvalues: lower bound is d(i), upper bound is d(i+1) ! We check the sign of the function in the midpoint of the interval ! in order to determine if eigenvalue is more close to d(i) or d(i+1) x = 0.5*(d(i)+d(i+1)) y = 1. + rho*SUM(z(:)**2/(d(:)-x)) if (y>0) then ! solution is next to d(i) dshift = d(i) else ! solution is next to d(i+1) dshift = d(i+1) endif delta(:) = d(:) - dshift a = delta(i) b = delta(i+1) endif ! Bisection: do iter=1,200 ! Interval subdivision x = 0.5*(a+b) if (x==a .or. x==b) exit ! No further interval subdivisions possible if (abs(x) < 1.d-200) exit ! x next to pole ! evaluate value at x y = 1. + rho*SUM(z(:)**2/(delta(:)-x)) if (y==0) then ! found exact solution exit elseif (y>0) then b = x else a = x endif enddo ! 
Solution: dlam = x + dshift delta(:) = delta(:) - x #ifdef HAVE_DETAILED_TIMINGS call timer%stop("solve_secular_equation") #endif end subroutine solve_secular_equation !------------------------------------------------------------------------------- integer function local_index(idx, my_proc, num_procs, nblk, iflag) !------------------------------------------------------------------------------- ! local_index: returns the local index for a given global index ! If the global index has no local index on the ! processor my_proc behaviour is defined by iflag ! ! Parameters ! ! idx Global index ! ! my_proc Processor row/column for which to calculate the local index ! ! num_procs Total number of processors along row/column ! ! nblk Blocksize ! ! iflag Controls the behaviour if idx is not on local processor ! iflag< 0 : Return last local index before that row/col ! iflag==0 : Return 0 ! iflag> 0 : Return next local index after that row/col !------------------------------------------------------------------------------- use precision implicit none integer(kind=ik) :: idx, my_proc, num_procs, nblk, iflag integer(kind=ik) :: iblk iblk = (idx-1)/nblk ! global block number, 0 based if (mod(iblk,num_procs) == my_proc) then ! block is local, always return local row/col number local_index = (iblk/num_procs)*nblk + mod(idx-1,nblk) + 1 else ! non local block if (iflag == 0) then local_index = 0 else local_index = (iblk/num_procs)*nblk if (mod(iblk,num_procs) > my_proc) local_index = local_index + nblk if (iflag>0) local_index = local_index + 1 endif endif end function local_index subroutine cholesky_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success) !------------------------------------------------------------------------------- ! cholesky_real: Cholesky factorization of a real symmetric matrix ! ! Parameters ! ! na Order of matrix ! ! a(lda,matrixCols) Distributed matrix which should be factorized. ! Distribution is like in Scalapack. ! 
Only upper triangle is needs to be set. ! On return, the upper triangle contains the Cholesky factor ! and the lower triangle is set to 0. ! ! lda Leading dimension of a ! matrixCols local columns of matrix a ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ! mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns ! !------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols real(kind=rk) :: a(lda,matrixCols) ! was ! real a(lda, *) integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx integer(kind=ik) :: n, nc, i, info integer(kind=ik) :: lcs, lce, lrs, lre integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile real(kind=rk), allocatable :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:) logical, intent(in) :: wantDebug logical, intent(out) :: success integer(kind=ik) :: istat character(200) :: errorMessage #ifdef HAVE_DETAILED_TIMINGS call timer%start("cholesky_real") #endif call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) success = .true. ! Matrix is split into tiles; work is done only for tiles on the diagonal or above tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide l_rows_tile = tile_size/np_rows ! local rows of a tile l_cols_tile = tile_size/np_cols ! local cols of a tile l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage) if (istat .ne. 
0) then print *,"cholesky_real: error when allocating tmp1 "//errorMessage stop endif allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"cholesky_real: error when allocating tmp2 "//errorMessage stop endif tmp1 = 0 tmp2 = 0 allocate(tmatr(l_rows,nblk), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"cholesky_real: error when allocating tmatr "//errorMessage stop endif allocate(tmatc(l_cols,nblk), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"cholesky_real: error when allocating tmatc "//errorMessage stop endif tmatr = 0 tmatc = 0 do n = 1, na, nblk ! Calculate first local row and column of the still remaining matrix ! on the local processor l_row1 = local_index(n, my_prow, np_rows, nblk, +1) l_col1 = local_index(n, my_pcol, np_cols, nblk, +1) l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1) l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1) if (n+nblk > na) then ! This is the last step, just do a Cholesky-Factorization ! of the remaining block if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then call dpotrf('U',na-n+1,a(l_row1,l_col1),lda,info) if (info/=0) then if (wantDebug) write(error_unit,*) "ELPA1_cholesky_real: Error in dpotrf" success = .false. return endif endif exit ! Loop endif if (my_prow==prow(n, nblk, np_rows)) then if (my_pcol==pcol(n, nblk, np_cols)) then ! The process owning the upper left remaining block does the ! Cholesky-Factorization of this block call dpotrf('U',nblk,a(l_row1,l_col1),lda,info) if (info/=0) then if (wantDebug) write(error_unit,*) "ELPA1_cholesky_real: Error in dpotrf" success = .false. 
return endif nc = 0 do i=1,nblk tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1) nc = nc+i enddo endif #ifdef WITH_MPI call MPI_Bcast(tmp1,nblk*(nblk+1)/2,MPI_REAL8,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr) #endif nc = 0 do i=1,nblk tmp2(1:i,i) = tmp1(nc+1:nc+i) nc = nc+i enddo if (l_cols-l_colx+1>0) & call dtrsm('L','U','T','N',nblk,l_cols-l_colx+1,1.d0,tmp2,ubound(tmp2,dim=1),a(l_row1,l_colx),lda) endif do i=1,nblk if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = a(l_row1+i-1,l_colx:l_cols) #ifdef WITH_MPI if (l_cols-l_colx+1>0) & call MPI_Bcast(tmatc(l_colx,i),l_cols-l_colx+1,MPI_REAL8,prow(n, nblk, np_rows),mpi_comm_rows,mpierr) #endif enddo ! this has to be checked since it was changed substantially when doing type safe call elpa_transpose_vectors_real (tmatc, ubound(tmatc,dim=1), mpi_comm_cols, & tmatr, ubound(tmatr,dim=1), mpi_comm_rows, & n, na, nblk, nblk) do i=0,(na-1)/tile_size lcs = max(l_colx,i*l_cols_tile+1) lce = min(l_cols,(i+1)*l_cols_tile) lrs = l_rowx lre = min(l_rows,(i+1)*l_rows_tile) if (lce0) & call DTRMM('L','U','N','N',nb,l_cols-l_colx+1,1.d0,tmp2,ubound(tmp2,dim=1),a(l_row1,l_colx),lda) if (l_colx<=l_cols) tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols) if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! tmp2 has the lower left triangle 0 endif if (l_row1>1) then if (my_pcol==pcol(n, nblk, np_cols)) then tmat1(1:l_row1-1,1:nb) = a(1:l_row1-1,l_col1:l_col1+nb-1) a(1:l_row1-1,l_col1:l_col1+nb-1) = 0 endif do i=1,nb #ifdef WITH_MPI call MPI_Bcast(tmat1(1,i),l_row1-1,MPI_REAL8,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr) #endif enddo endif #ifdef WITH_MPI if (l_cols-l_col1+1>0) & call MPI_Bcast(tmat2(1,l_col1),(l_cols-l_col1+1)*nblk,MPI_REAL8,prow(n, nblk, np_rows),mpi_comm_rows,mpierr) #endif if (l_row1>1 .and. 
l_cols-l_col1+1>0) & call dgemm('N','N',l_row1-1,l_cols-l_col1+1,nb, -1.d0, & tmat1,ubound(tmat1,dim=1),tmat2(1,l_col1),ubound(tmat2,dim=1), & 1.d0, a(1,l_col1),lda) enddo deallocate(tmp1, tmp2, tmat1, tmat2, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"invert_trm_real: error when deallocating tmp1 "//errorMessage stop endif end subroutine invert_trm_real subroutine cholesky_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success) !------------------------------------------------------------------------------- ! cholesky_complex: Cholesky factorization of a complex hermitian matrix ! ! Parameters ! ! na Order of matrix ! ! a(lda,matriCols) Distributed matrix which should be factorized. ! Distribution is like in Scalapack. ! Only upper triangle is needs to be set. ! On return, the upper triangle contains the Cholesky factor ! and the lower triangle is set to 0. ! ! lda Leading dimension of a ! matrixCols local columns of matrix a ! ! nblk blocksize of cyclic distribution, must be the same in both directions! ! ! mpi_comm_rows ! mpi_comm_cols ! MPI-Communicators for rows/columns ! 
!------------------------------------------------------------------------------- #ifdef HAVE_DETAILED_TIMINGS use timings #endif use precision implicit none integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols #ifdef DESPERATELY_WANT_ASSUMED_SIZE complex(kind=ck) :: a(lda,*) #else complex(kind=ck) :: a(lda,matrixCols) #endif integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx integer(kind=ik) :: n, nc, i, info integer(kind=ik) :: lcs, lce, lrs, lre integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile complex(kind=ck), allocatable :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:) logical, intent(in) :: wantDebug logical, intent(out) :: success integer(kind=ik) :: istat character(200) :: errorMessage #ifdef HAVE_DETAILED_TIMINGS call timer%start("cholesky_complex") #endif success = .true. call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) ! Matrix is split into tiles; work is done only for tiles on the diagonal or above tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide l_rows_tile = tile_size/np_rows ! local rows of a tile l_cols_tile = tile_size/np_cols ! local cols of a tile l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"cholesky_complex: error when allocating tmp1 "//errorMessage stop endif allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage) if (istat .ne. 
0) then print *,"cholesky_complex: error when allocating tmp2 "//errorMessage stop endif tmp1 = 0 tmp2 = 0 allocate(tmatr(l_rows,nblk), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"cholesky_complex: error when allocating tmatr "//errorMessage stop endif allocate(tmatc(l_cols,nblk), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"cholesky_complex: error when allocating tmatc "//errorMessage stop endif tmatr = 0 tmatc = 0 do n = 1, na, nblk ! Calculate first local row and column of the still remaining matrix ! on the local processor l_row1 = local_index(n, my_prow, np_rows, nblk, +1) l_col1 = local_index(n, my_pcol, np_cols, nblk, +1) l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1) l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1) if (n+nblk > na) then ! This is the last step, just do a Cholesky-Factorization ! of the remaining block if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then call zpotrf('U',na-n+1,a(l_row1,l_col1),lda,info) if (info/=0) then if (wantDebug) write(error_unit,*) "ELPA1_cholesky_complex: Error in zpotrf" success = .false. return endif endif exit ! Loop endif if (my_prow==prow(n, nblk, np_rows)) then if (my_pcol==pcol(n, nblk, np_cols)) then ! The process owning the upper left remaining block does the ! Cholesky-Factorization of this block call zpotrf('U',nblk,a(l_row1,l_col1),lda,info) if (info/=0) then if (wantDebug) write(error_unit,*) "ELPA1_cholesky_complex: Error in zpotrf" success = .false. 
return endif nc = 0 do i=1,nblk tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1) nc = nc+i enddo endif #ifdef WITH_MPI call MPI_Bcast(tmp1,nblk*(nblk+1)/2,MPI_DOUBLE_COMPLEX,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr) #endif nc = 0 do i=1,nblk tmp2(1:i,i) = tmp1(nc+1:nc+i) nc = nc+i enddo if (l_cols-l_colx+1>0) & call ztrsm('L','U','C','N',nblk,l_cols-l_colx+1,(1.d0,0.d0),tmp2,ubound(tmp2,dim=1),a(l_row1,l_colx),lda) endif do i=1,nblk if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = conjg(a(l_row1+i-1,l_colx:l_cols)) #ifdef WITH_MPI if (l_cols-l_colx+1>0) & call MPI_Bcast(tmatc(l_colx,i),l_cols-l_colx+1,MPI_DOUBLE_COMPLEX,prow(n, nblk, np_rows),mpi_comm_rows,mpierr) #endif enddo ! this has to be checked since it was changed substantially when doing type safe call elpa_transpose_vectors_complex (tmatc, ubound(tmatc,dim=1), mpi_comm_cols, & tmatr, ubound(tmatr,dim=1), mpi_comm_rows, & n, na, nblk, nblk) do i=0,(na-1)/tile_size lcs = max(l_colx,i*l_cols_tile+1) lce = min(l_cols,(i+1)*l_cols_tile) lrs = l_rowx lre = min(l_rows,(i+1)*l_rows_tile) if (lce0) & call ZTRMM('L','U','N','N',nb,l_cols-l_colx+1,(1.d0,0.d0),tmp2,ubound(tmp2,dim=1),a(l_row1,l_colx),lda) if (l_colx<=l_cols) tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols) if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! tmp2 has the lower left triangle 0 endif if (l_row1>1) then if (my_pcol==pcol(n, nblk, np_cols)) then tmat1(1:l_row1-1,1:nb) = a(1:l_row1-1,l_col1:l_col1+nb-1) a(1:l_row1-1,l_col1:l_col1+nb-1) = 0 endif do i=1,nb #ifdef WITH_MPI call MPI_Bcast(tmat1(1,i),l_row1-1,MPI_DOUBLE_COMPLEX,pcol(n, nblk, np_cols),mpi_comm_cols,mpierr) #endif enddo endif #ifdef WITH_MPI if (l_cols-l_col1+1>0) & call MPI_Bcast(tmat2(1,l_col1),(l_cols-l_col1+1)*nblk,MPI_DOUBLE_COMPLEX,prow(n, nblk, np_rows),mpi_comm_rows,mpierr) #endif if (l_row1>1 .and. 
l_cols-l_col1+1>0) & call ZGEMM('N','N',l_row1-1,l_cols-l_col1+1,nb, (-1.d0,0.d0), & tmat1,ubound(tmat1,dim=1),tmat2(1,l_col1),ubound(tmat2,dim=1), & (1.d0,0.d0), a(1,l_col1),lda) enddo deallocate(tmp1, tmp2, tmat1, tmat2, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"invert_trm_complex: error when deallocating tmp1 "//errorMessage stop endif end subroutine invert_trm_complex integer function least_common_multiple(a, b) ! Returns the least common multiple of a and b ! There may be more efficient ways to do this, we use the most simple approach use precision implicit none integer(kind=ik), intent(in) :: a, b do least_common_multiple = a, a*(b-1), a if(mod(least_common_multiple,b)==0) exit enddo ! if the loop is left regularly, least_common_multiple = a*b end function subroutine hh_transform_real(alpha, xnorm_sq, xf, tau) ! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:) ! and returns the factor xf by which x has to be scaled. ! It also hasn't the special handling for numbers < 1.d-300 or > 1.d150 ! since this would be expensive for the parallel implementation. use precision implicit none real(kind=rk), intent(inout) :: alpha real(kind=rk), intent(in) :: xnorm_sq real(kind=rk), intent(out) :: xf, tau real(kind=rk) :: BETA if ( XNORM_SQ==0. ) then if ( ALPHA>=0. ) then TAU = 0. else TAU = 2. ALPHA = -ALPHA endif XF = 0. else BETA = SIGN( SQRT( ALPHA**2 + XNORM_SQ ), ALPHA ) ALPHA = ALPHA + BETA IF ( BETA<0 ) THEN BETA = -BETA TAU = -ALPHA / BETA ELSE ALPHA = XNORM_SQ / ALPHA TAU = ALPHA / BETA ALPHA = -ALPHA END IF XF = 1./ALPHA ALPHA = BETA endif end subroutine subroutine hh_transform_complex(alpha, xnorm_sq, xf, tau) ! Similar to LAPACK routine ZLARFP, but uses ||x||**2 instead of x(:) ! and returns the factor xf by which x has to be scaled. ! It also hasn't the special handling for numbers < 1.d-300 or > 1.d150 ! since this would be expensive for the parallel implementation. 
use precision implicit none complex(kind=ck), intent(inout) :: alpha real(kind=rk), intent(in) :: xnorm_sq complex(kind=ck), intent(out) :: xf, tau real*8 ALPHR, ALPHI, BETA ALPHR = DBLE( ALPHA ) ALPHI = DIMAG( ALPHA ) if ( XNORM_SQ==0. .AND. ALPHI==0. ) then if ( ALPHR>=0. ) then TAU = 0. else TAU = 2. ALPHA = -ALPHA endif XF = 0. else BETA = SIGN( SQRT( ALPHR**2 + ALPHI**2 + XNORM_SQ ), ALPHR ) ALPHA = ALPHA + BETA IF ( BETA<0 ) THEN BETA = -BETA TAU = -ALPHA / BETA ELSE ALPHR = ALPHI * (ALPHI/DBLE( ALPHA )) ALPHR = ALPHR + XNORM_SQ/DBLE( ALPHA ) TAU = DCMPLX( ALPHR/BETA, -ALPHI/BETA ) ALPHA = DCMPLX( -ALPHR, ALPHI ) END IF XF = 1./ALPHA ALPHA = BETA endif end subroutine end module ELPA1_compute elpa-2016.05.001/src/elpa2_print_kernels.F900000644000312500001440000001120412717516040015034 00000000000000! This file is part of ELPA. ! ! The ELPA library was originally created by the ELPA consortium, ! consisting of the following organizations: ! ! - Max Planck Computing and Data Facility (MPCDF), formerly known as ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte ! Informatik, ! - Technische Universität München, Lehrstuhl für Informatik mit ! Schwerpunkt Wissenschaftliches Rechnen , ! - Fritz-Haber-Institut, Berlin, Abt. Theorie, ! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, ! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, ! and ! - IBM Deutschland GmbH ! ! ! More information can be found here: ! http://elpa.mpcdf.mpg.de/ ! ! ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! 
GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines ! ! Copyright of the original code rests with the authors inside the ELPA ! consortium. The copyright of any additional modifications shall rest ! with their original authors, but shall adhere to the licensing terms ! distributed along with the original code in the file "COPYING". ! ELPA2 -- 2-stage solver for ELPA ! ! Copyright of the original code rests with the authors inside the ELPA ! consortium. The copyright of any additional modifications shall rest ! with their original authors, but shall adhere to the licensing terms ! distributed along with the original code in the file "COPYING". #include "config-f90.h" !> \file print_available_elpa2_kernels.F90 !> \par !> \brief Provide information which ELPA2 kernels are available on this system !> !> \details !> It is possible to configure ELPA2 such, that different compute intensive !> "ELPA2 kernels" can be choosen at runtime. !> The service binary print_available_elpa2_kernels will query the library and tell !> whether ELPA2 has been configured in this way, and if this is the case which kernels can be !> choosen at runtime. !> It will furthermore detail whether ELPA has been configured with OpenMP support !> !> Synopsis: print_available_elpa2_kernels !> !> \author A. 
Marek (MPCDF) program print_available_elpa2_kernels use precision use ELPA1 use ELPA2 use elpa2_utilities implicit none integer(kind=ik) :: i print *, "This program will give information on the ELPA2 kernels, " print *, "which are available with this library and it will give " print *, "information if (and how) the kernels can be choosen at " print *, "runtime" print * print * #ifdef WITH_OPENMP print *, " ELPA supports threads: yes" #else print *, " ELPA supports threads: no" #endif print *, "Information on ELPA2 real case: " print *, "=============================== " #ifdef HAVE_ENVIRONMENT_CHECKING print *, " choice via environment variable: yes" print *, " environment variable name : REAL_ELPA_KERNEL" #else print *, " choice via environment variable: no" #endif print * print *, " Available real kernels are: " #ifdef HAVE_AVX2 print *, " AVX kernels are optimized for FMA (AVX2)" #endif call print_available_real_kernels() print * print * print *, "Information on ELPA2 complex case: " print *, "=============================== " #ifdef HAVE_ENVIRONMENT_CHECKING print *, " choice via environment variable: yes" print *, " environment variable name : COMPLEX_ELPA_KERNEL" #else print *, " choice via environment variable: no" #endif print * print *, " Available complex kernels are: " #ifdef HAVE_AVX2 print *, " AVX kernels are optimized for FMA (AVX2)" #endif call print_available_complex_kernels() end program print_available_elpa2_kernels elpa-2016.05.001/src/mod_mpi.F900000644000312500001440000000374412717402663012535 00000000000000! This file is part of ELPA. ! ! The ELPA library was originally created by the ELPA consortium, ! consisting of the following organizations: ! ! - Max Planck Computing and Data Facility (MPCDF), formerly known as ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte ! Informatik, ! - Technische Universität München, Lehrstuhl für Informatik mit ! 
Schwerpunkt Wissenschaftliches Rechnen, ! - Fritz-Haber-Institut, Berlin, Abt. Theorie, ! - Max-Planck-Institut für Mathematik in den Naturwissenschaften, ! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, ! and ! - IBM Deutschland GmbH ! ! ! More information can be found here: ! http://elpa.mpcdf.mpg.de/ ! ! ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see <http://www.gnu.org/licenses/> ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! 
Author: Andreas Marek, MPCDF #include "config-f90.h" module elpa_mpi #ifndef WITH_MPI use elpa_mpi_stubs #else implicit none public include "mpif.h" #endif end module elpa-2016.05.001/m4/0000755000312500001440000000000012717541041010424 500000000000000elpa-2016.05.001/m4/ax_prog_doxygen.m40000644000312500001440000004272512717402663014022 00000000000000# =========================================================================== # http://www.gnu.org/software/autoconf-archive/ax_prog_doxygen.html # =========================================================================== # # SYNOPSIS # # DX_INIT_DOXYGEN(PROJECT-NAME, DOXYFILE-PATH, [OUTPUT-DIR]) # DX_DOXYGEN_FEATURE(ON|OFF) # DX_DOT_FEATURE(ON|OFF) # DX_HTML_FEATURE(ON|OFF) # DX_CHM_FEATURE(ON|OFF) # DX_CHI_FEATURE(ON|OFF) # DX_MAN_FEATURE(ON|OFF) # DX_RTF_FEATURE(ON|OFF) # DX_XML_FEATURE(ON|OFF) # DX_PDF_FEATURE(ON|OFF) # DX_PS_FEATURE(ON|OFF) # # DESCRIPTION # # The DX_*_FEATURE macros control the default setting for the given # Doxygen feature. Supported features are 'DOXYGEN' itself, 'DOT' for # generating graphics, 'HTML' for plain HTML, 'CHM' for compressed HTML # help (for MS users), 'CHI' for generating a seperate .chi file by the # .chm file, and 'MAN', 'RTF', 'XML', 'PDF' and 'PS' for the appropriate # output formats. The environment variable DOXYGEN_PAPER_SIZE may be # specified to override the default 'a4wide' paper size. # # By default, HTML, PDF and PS documentation is generated as this seems to # be the most popular and portable combination. MAN pages created by # Doxygen are usually problematic, though by picking an appropriate subset # and doing some massaging they might be better than nothing. CHM and RTF # are specific for MS (note that you can't generate both HTML and CHM at # the same time). The XML is rather useless unless you apply specialized # post-processing to it. # # The macros mainly control the default state of the feature. 
The user can # override the default by specifying --enable or --disable. The macros # ensure that contradictory flags are not given (e.g., # --enable-doxygen-html and --enable-doxygen-chm, # --enable-doxygen-anything with --disable-doxygen, etc.). Finally, each # feature will be automatically disabled (with a warning) if the required # programs are missing. # # Once all the feature defaults have been specified, call DX_INIT_DOXYGEN # with the following parameters: a one-word name for the project for use # as a filename base etc., an optional configuration file name (the # default is 'Doxyfile', the same as Doxygen's default), and an optional # output directory name (the default is 'doxygen-doc'). # # Automake Support # # The following is a template aminclude.am file for use with Automake. # Make targets and variable values are controlled by the various # DX_COND_* conditionals set by autoconf. # # The provided targets are: # # doxygen-doc: Generate all doxygen documentation. # # doxygen-run: Run doxygen, which will generate some of the # documentation (HTML, CHM, CHI, MAN, RTF, XML) # but will not do the post processing required # for the rest of it (PS, PDF, and some MAN). # # doxygen-man: Rename some doxygen generated man pages. # # doxygen-ps: Generate doxygen PostScript documentation. # # doxygen-pdf: Generate doxygen PDF documentation. # # Note that by default these are not integrated into the automake targets. # If doxygen is used to generate man pages, you can achieve this # integration by setting man3_MANS to the list of man pages generated and # then adding the dependency: # # $(man3_MANS): doxygen-doc # # This will cause make to run doxygen and generate all the documentation. # # The following variable is intended for use in Makefile.am: # # DX_CLEANFILES = everything to clean. # # Then add this variable to MOSTLYCLEANFILES. 
# # ----- begin aminclude.am ------------------------------------- # # ## --------------------------------- ## # ## Format-independent Doxygen rules. ## # ## --------------------------------- ## # # if DX_COND_doc # # ## ------------------------------- ## # ## Rules specific for HTML output. ## # ## ------------------------------- ## # # if DX_COND_html # # DX_CLEAN_HTML = @DX_DOCDIR@/html # # endif DX_COND_html # # ## ------------------------------ ## # ## Rules specific for CHM output. ## # ## ------------------------------ ## # # if DX_COND_chm # # DX_CLEAN_CHM = @DX_DOCDIR@/chm # # if DX_COND_chi # # DX_CLEAN_CHI = @DX_DOCDIR@/@PACKAGE@.chi # # endif DX_COND_chi # # endif DX_COND_chm # # ## ------------------------------ ## # ## Rules specific for MAN output. ## # ## ------------------------------ ## # # if DX_COND_man # # DX_CLEAN_MAN = @DX_DOCDIR@/man # # endif DX_COND_man # # ## ------------------------------ ## # ## Rules specific for RTF output. ## # ## ------------------------------ ## # # if DX_COND_rtf # # DX_CLEAN_RTF = @DX_DOCDIR@/rtf # # endif DX_COND_rtf # # ## ------------------------------ ## # ## Rules specific for XML output. ## # ## ------------------------------ ## # # if DX_COND_xml # # DX_CLEAN_XML = @DX_DOCDIR@/xml # # endif DX_COND_xml # # ## ----------------------------- ## # ## Rules specific for PS output. 
## # ## ----------------------------- ## # # if DX_COND_ps # # DX_CLEAN_PS = @DX_DOCDIR@/@PACKAGE@.ps # # DX_PS_GOAL = doxygen-ps # # doxygen-ps: @DX_DOCDIR@/@PACKAGE@.ps # # @DX_DOCDIR@/@PACKAGE@.ps: @DX_DOCDIR@/@PACKAGE@.tag # cd @DX_DOCDIR@/latex; \ # rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \ # $(DX_LATEX) refman.tex; \ # $(MAKEINDEX_PATH) refman.idx; \ # $(DX_LATEX) refman.tex; \ # countdown=5; \ # while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \ # refman.log > /dev/null 2>&1 \ # && test $$countdown -gt 0; do \ # $(DX_LATEX) refman.tex; \ # countdown=`expr $$countdown - 1`; \ # done; \ # $(DX_DVIPS) -o ../@PACKAGE@.ps refman.dvi # # endif DX_COND_ps # # ## ------------------------------ ## # ## Rules specific for PDF output. ## # ## ------------------------------ ## # # if DX_COND_pdf # # DX_CLEAN_PDF = @DX_DOCDIR@/@PACKAGE@.pdf # # DX_PDF_GOAL = doxygen-pdf # # doxygen-pdf: @DX_DOCDIR@/@PACKAGE@.pdf # # @DX_DOCDIR@/@PACKAGE@.pdf: @DX_DOCDIR@/@PACKAGE@.tag # cd @DX_DOCDIR@/latex; \ # rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \ # $(DX_PDFLATEX) refman.tex; \ # $(DX_MAKEINDEX) refman.idx; \ # $(DX_PDFLATEX) refman.tex; \ # countdown=5; \ # while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \ # refman.log > /dev/null 2>&1 \ # && test $$countdown -gt 0; do \ # $(DX_PDFLATEX) refman.tex; \ # countdown=`expr $$countdown - 1`; \ # done; \ # mv refman.pdf ../@PACKAGE@.pdf # # endif DX_COND_pdf # # ## ------------------------------------------------- ## # ## Rules specific for LaTeX (shared for PS and PDF). 
## # ## ------------------------------------------------- ## # # if DX_COND_latex # # DX_CLEAN_LATEX = @DX_DOCDIR@/latex # # endif DX_COND_latex # # .PHONY: doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL) # # .INTERMEDIATE: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL) # # doxygen-run: @DX_DOCDIR@/@PACKAGE@.tag # # doxygen-doc: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL) # # @DX_DOCDIR@/@PACKAGE@.tag: $(DX_CONFIG) $(pkginclude_HEADERS) # rm -rf @DX_DOCDIR@ # $(DX_ENV) $(DX_DOXYGEN) $(srcdir)/$(DX_CONFIG) # # DX_CLEANFILES = \ # @DX_DOCDIR@/@PACKAGE@.tag \ # -r \ # $(DX_CLEAN_HTML) \ # $(DX_CLEAN_CHM) \ # $(DX_CLEAN_CHI) \ # $(DX_CLEAN_MAN) \ # $(DX_CLEAN_RTF) \ # $(DX_CLEAN_XML) \ # $(DX_CLEAN_PS) \ # $(DX_CLEAN_PDF) \ # $(DX_CLEAN_LATEX) # # endif DX_COND_doc # # ----- end aminclude.am --------------------------------------- # # LICENSE # # Copyright (c) 2009 Oren Ben-Kiki # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. This file is offered as-is, without any # warranty. #serial 12 ## ----------## ## Defaults. ## ## ----------## DX_ENV="" AC_DEFUN([DX_FEATURE_doc], ON) AC_DEFUN([DX_FEATURE_dot], OFF) AC_DEFUN([DX_FEATURE_man], OFF) AC_DEFUN([DX_FEATURE_html], ON) AC_DEFUN([DX_FEATURE_chm], OFF) AC_DEFUN([DX_FEATURE_chi], OFF) AC_DEFUN([DX_FEATURE_rtf], OFF) AC_DEFUN([DX_FEATURE_xml], OFF) AC_DEFUN([DX_FEATURE_pdf], ON) AC_DEFUN([DX_FEATURE_ps], ON) ## --------------- ## ## Private macros. ## ## --------------- ## # DX_ENV_APPEND(VARIABLE, VALUE) # ------------------------------ # Append VARIABLE="VALUE" to DX_ENV for invoking doxygen. AC_DEFUN([DX_ENV_APPEND], [AC_SUBST([DX_ENV], ["$DX_ENV $1='$2'"])]) # DX_DIRNAME_EXPR # --------------- # Expand into a shell expression prints the directory part of a path. 
AC_DEFUN([DX_DIRNAME_EXPR], [[expr ".$1" : '\(\.\)[^/]*$' \| "x$1" : 'x\(.*\)/[^/]*$']]) # DX_IF_FEATURE(FEATURE, IF-ON, IF-OFF) # ------------------------------------- # Expands according to the M4 (static) status of the feature. AC_DEFUN([DX_IF_FEATURE], [ifelse(DX_FEATURE_$1, ON, [$2], [$3])]) # DX_REQUIRE_PROG(VARIABLE, PROGRAM) # ---------------------------------- # Require the specified program to be found for the DX_CURRENT_FEATURE to work. AC_DEFUN([DX_REQUIRE_PROG], [ AC_PATH_TOOL([$1], [$2]) if test "$DX_FLAG_[]DX_CURRENT_FEATURE$$1" = 1; then AC_MSG_WARN([$2 not found - will not DX_CURRENT_DESCRIPTION]) AC_SUBST(DX_FLAG_[]DX_CURRENT_FEATURE, 0) fi ]) # DX_TEST_FEATURE(FEATURE) # ------------------------ # Expand to a shell expression testing whether the feature is active. AC_DEFUN([DX_TEST_FEATURE], [test "$DX_FLAG_$1" = 1]) # DX_CHECK_DEPEND(REQUIRED_FEATURE, REQUIRED_STATE) # ------------------------------------------------- # Verify that a required features has the right state before trying to turn on # the DX_CURRENT_FEATURE. AC_DEFUN([DX_CHECK_DEPEND], [ test "$DX_FLAG_$1" = "$2" \ || AC_MSG_ERROR([doxygen-DX_CURRENT_FEATURE ifelse([$2], 1, requires, contradicts) doxygen-DX_CURRENT_FEATURE]) ]) # DX_CLEAR_DEPEND(FEATURE, REQUIRED_FEATURE, REQUIRED_STATE) # ---------------------------------------------------------- # Turn off the DX_CURRENT_FEATURE if the required feature is off. AC_DEFUN([DX_CLEAR_DEPEND], [ test "$DX_FLAG_$1" = "$2" || AC_SUBST(DX_FLAG_[]DX_CURRENT_FEATURE, 0) ]) # DX_FEATURE_ARG(FEATURE, DESCRIPTION, # CHECK_DEPEND, CLEAR_DEPEND, # REQUIRE, DO-IF-ON, DO-IF-OFF) # -------------------------------------------- # Parse the command-line option controlling a feature. CHECK_DEPEND is called # if the user explicitly turns the feature on (and invokes DX_CHECK_DEPEND), # otherwise CLEAR_DEPEND is called to turn off the default state if a required # feature is disabled (using DX_CLEAR_DEPEND). 
# REQUIRE performs additional
# requirement tests (DX_REQUIRE_PROG). Finally, an automake flag is set and
# DO-IF-ON or DO-IF-OFF are called according to the final state of the feature.
#
# Arguments: $1 FEATURE, $2 DESCRIPTION, $3 CHECK_DEPEND, $4 CLEAR_DEPEND,
# $5 REQUIRE, $6 DO-IF-ON, $7 DO-IF-OFF.  The "#(" shell comments balance the
# parentheses of the case patterns and must stay at the end of their lines.
AC_DEFUN([DX_ARG_ABLE], [
    AC_DEFUN([DX_CURRENT_FEATURE], [$1])
    AC_DEFUN([DX_CURRENT_DESCRIPTION], [$2])
    AC_ARG_ENABLE(doxygen-$1,
                  [AS_HELP_STRING(DX_IF_FEATURE([$1], [--disable-doxygen-$1],
                                                      [--enable-doxygen-$1]),
                                  DX_IF_FEATURE([$1], [don't $2], [$2]))],
                  [
case "$enableval" in
#(
y|Y|yes|Yes|YES)
    AC_SUBST([DX_FLAG_$1], 1)
    $3
;; #(
n|N|no|No|NO)
    AC_SUBST([DX_FLAG_$1], 0)
;; #(
*)
    AC_MSG_ERROR([invalid value '$enableval' given to doxygen-$1])
;;
esac
], [
AC_SUBST([DX_FLAG_$1], [DX_IF_FEATURE([$1], 1, 0)])
$4
])
if DX_TEST_FEATURE([$1]); then
    $5
    :
fi
AM_CONDITIONAL(DX_COND_$1, DX_TEST_FEATURE([$1]))
if DX_TEST_FEATURE([$1]); then
    $6
    :
else
    $7
    :
fi
])

## -------------- ##
## Public macros. ##
## -------------- ##

# DX_XXX_FEATURE(DEFAULT_STATE)
# -----------------------------
# Each macro redefines the static default (ON/OFF) of one feature.
AC_DEFUN([DX_DOXYGEN_FEATURE], [AC_DEFUN([DX_FEATURE_doc],  [$1])])
AC_DEFUN([DX_DOT_FEATURE],     [AC_DEFUN([DX_FEATURE_dot],  [$1])])
AC_DEFUN([DX_MAN_FEATURE],     [AC_DEFUN([DX_FEATURE_man],  [$1])])
AC_DEFUN([DX_HTML_FEATURE],    [AC_DEFUN([DX_FEATURE_html], [$1])])
AC_DEFUN([DX_CHM_FEATURE],     [AC_DEFUN([DX_FEATURE_chm],  [$1])])
AC_DEFUN([DX_CHI_FEATURE],     [AC_DEFUN([DX_FEATURE_chi],  [$1])])
AC_DEFUN([DX_RTF_FEATURE],     [AC_DEFUN([DX_FEATURE_rtf],  [$1])])
AC_DEFUN([DX_XML_FEATURE],     [AC_DEFUN([DX_FEATURE_xml],  [$1])])
AC_DEFUN([DX_PDF_FEATURE],     [AC_DEFUN([DX_FEATURE_pdf],  [$1])])
AC_DEFUN([DX_PS_FEATURE],      [AC_DEFUN([DX_FEATURE_ps],   [$1])])

# DX_INIT_DOXYGEN(PROJECT, [CONFIG-FILE], [OUTPUT-DOC-DIR])
# ---------------------------------------------------------
# PROJECT also serves as the base name for the documentation files.
# The default CONFIG-FILE is "Doxyfile" and OUTPUT-DOC-DIR is "doxygen-doc".
AC_DEFUN([DX_INIT_DOXYGEN], [

# Files:
AC_SUBST([DX_PROJECT], [$1])
AC_SUBST([DX_CONFIG], [ifelse([$2], [], Doxyfile, [$2])])
AC_SUBST([DX_DOCDIR], [ifelse([$3], [], doxygen-doc, [$3])])

# Environment variables used inside doxygen.cfg:
DX_ENV_APPEND(SRCDIR, $srcdir)
DX_ENV_APPEND(PROJECT, $DX_PROJECT)
DX_ENV_APPEND(DOCDIR, $DX_DOCDIR)
DX_ENV_APPEND(VERSION, $PACKAGE_VERSION)

# Doxygen itself:
DX_ARG_ABLE(doc, [generate any doxygen documentation],
            [],
            [],
            [DX_REQUIRE_PROG([DX_DOXYGEN], doxygen)
             DX_REQUIRE_PROG([DX_PERL], perl)],
            [DX_ENV_APPEND(PERL_PATH, $DX_PERL)])

# Dot for graphics:
DX_ARG_ABLE(dot, [generate graphics for doxygen documentation],
            [DX_CHECK_DEPEND(doc, 1)],
            [DX_CLEAR_DEPEND(doc, 1)],
            [DX_REQUIRE_PROG([DX_DOT], dot)],
            [DX_ENV_APPEND(HAVE_DOT, YES)
             DX_ENV_APPEND(DOT_PATH, [`DX_DIRNAME_EXPR($DX_DOT)`])],
            [DX_ENV_APPEND(HAVE_DOT, NO)])

# Man pages generation:
DX_ARG_ABLE(man, [generate doxygen manual pages],
            [DX_CHECK_DEPEND(doc, 1)],
            [DX_CLEAR_DEPEND(doc, 1)],
            [],
            [DX_ENV_APPEND(GENERATE_MAN, YES)],
            [DX_ENV_APPEND(GENERATE_MAN, NO)])

# RTF file generation:
DX_ARG_ABLE(rtf, [generate doxygen RTF documentation],
            [DX_CHECK_DEPEND(doc, 1)],
            [DX_CLEAR_DEPEND(doc, 1)],
            [],
            [DX_ENV_APPEND(GENERATE_RTF, YES)],
            [DX_ENV_APPEND(GENERATE_RTF, NO)])

# XML file generation:
DX_ARG_ABLE(xml, [generate doxygen XML documentation],
            [DX_CHECK_DEPEND(doc, 1)],
            [DX_CLEAR_DEPEND(doc, 1)],
            [],
            [DX_ENV_APPEND(GENERATE_XML, YES)],
            [DX_ENV_APPEND(GENERATE_XML, NO)])

# (Compressed) HTML help generation:
DX_ARG_ABLE(chm, [generate doxygen compressed HTML help documentation],
            [DX_CHECK_DEPEND(doc, 1)],
            [DX_CLEAR_DEPEND(doc, 1)],
            [DX_REQUIRE_PROG([DX_HHC], hhc)],
            [DX_ENV_APPEND(HHC_PATH, $DX_HHC)
             DX_ENV_APPEND(GENERATE_HTML, YES)
             DX_ENV_APPEND(GENERATE_HTMLHELP, YES)],
            [DX_ENV_APPEND(GENERATE_HTMLHELP, NO)])

# Separate CHI file generation.
DX_ARG_ABLE(chi, [generate doxygen seperate compressed HTML help index file],
            [DX_CHECK_DEPEND(chm, 1)],
            [DX_CLEAR_DEPEND(chm, 1)],
            [],
            [DX_ENV_APPEND(GENERATE_CHI, YES)],
            [DX_ENV_APPEND(GENERATE_CHI, NO)])

# Plain HTML pages generation:
DX_ARG_ABLE(html, [generate doxygen plain HTML documentation],
            [DX_CHECK_DEPEND(doc, 1)
             DX_CHECK_DEPEND(chm, 0)],
            [DX_CLEAR_DEPEND(doc, 1)
             DX_CLEAR_DEPEND(chm, 0)],
            [],
            [DX_ENV_APPEND(GENERATE_HTML, YES)],
            [DX_TEST_FEATURE(chm) || DX_ENV_APPEND(GENERATE_HTML, NO)])

# PostScript file generation:
DX_ARG_ABLE(ps, [generate doxygen PostScript documentation],
            [DX_CHECK_DEPEND(doc, 1)],
            [DX_CLEAR_DEPEND(doc, 1)],
            [DX_REQUIRE_PROG([DX_LATEX], latex)
             DX_REQUIRE_PROG([DX_MAKEINDEX], makeindex)
             DX_REQUIRE_PROG([DX_DVIPS], dvips)
             DX_REQUIRE_PROG([DX_EGREP], egrep)])

# PDF file generation:
DX_ARG_ABLE(pdf, [generate doxygen PDF documentation],
            [DX_CHECK_DEPEND(doc, 1)],
            [DX_CLEAR_DEPEND(doc, 1)],
            [DX_REQUIRE_PROG([DX_PDFLATEX], pdflatex)
             DX_REQUIRE_PROG([DX_MAKEINDEX], makeindex)
             DX_REQUIRE_PROG([DX_EGREP], egrep)])

# LaTeX generation for PS and/or PDF:
AM_CONDITIONAL(DX_COND_latex, DX_TEST_FEATURE(ps) || DX_TEST_FEATURE(pdf))
if DX_TEST_FEATURE(ps) || DX_TEST_FEATURE(pdf); then
    DX_ENV_APPEND(GENERATE_LATEX, YES)
else
    DX_ENV_APPEND(GENERATE_LATEX, NO)
fi

# Paper size for PS and/or PDF:
AC_ARG_VAR(DOXYGEN_PAPER_SIZE,
           [a4wide (default), a4, letter, legal or executive])
case "$DOXYGEN_PAPER_SIZE" in
#(
"")
    AC_SUBST(DOXYGEN_PAPER_SIZE, "")
;; #(
a4wide|a4|letter|legal|executive)
    DX_ENV_APPEND(PAPER_SIZE, $DOXYGEN_PAPER_SIZE)
;; #(
*)
    AC_MSG_ERROR([unknown DOXYGEN_PAPER_SIZE='$DOXYGEN_PAPER_SIZE'])
;;
esac

#For debugging:
#echo DX_FLAG_doc=$DX_FLAG_doc
#echo DX_FLAG_dot=$DX_FLAG_dot
#echo DX_FLAG_man=$DX_FLAG_man
#echo DX_FLAG_html=$DX_FLAG_html
#echo DX_FLAG_chm=$DX_FLAG_chm
#echo DX_FLAG_chi=$DX_FLAG_chi
#echo DX_FLAG_rtf=$DX_FLAG_rtf
#echo DX_FLAG_xml=$DX_FLAG_xml
#echo DX_FLAG_pdf=$DX_FLAG_pdf
#echo DX_FLAG_ps=$DX_FLAG_ps
#echo DX_ENV=$DX_ENV
])
elpa-2016.05.001/m4/ax_elpa_openmp.m40000644000312500001440000000705012664056454013611 00000000000000# openmp.m4 serial 4
dnl Copyright (C) 2006-2007 Free Software Foundation, Inc.
dnl This file is free software; the Free Software Foundation
dnl gives unlimited permission to copy and/or distribute it,
dnl with or without modifications, as long as this notice is preserved.

dnl This file can be removed once we assume autoconf >= 2.62.

# _AX_ELPA_LANG_OPENMP
# ---------------
# Expands to some language dependent source code for testing the presence of
# OpenMP.
AC_DEFUN([_AX_ELPA_LANG_OPENMP],
[_AC_LANG_DISPATCH([$0], _AC_LANG, $@)])

# _AX_ELPA_LANG_OPENMP(C)
# ------------------
# The probe fails to compile unless _OPENMP is defined, and needs <omp.h>
# for the declaration of omp_get_num_threads.
m4_define([_AX_ELPA_LANG_OPENMP(C)],
[
#ifndef _OPENMP
 choke me
#endif
#include <omp.h>
int main () { return omp_get_num_threads (); }
])

# _AX_ELPA_LANG_OPENMP(C++)
# --------------------
m4_copy([_AX_ELPA_LANG_OPENMP(C)], [_AX_ELPA_LANG_OPENMP(C++)])

# _AX_ELPA_LANG_OPENMP(Fortran 77)
# ---------------------------
# foobar is declared only on a "!$" line, so with "implicit none" the probe
# compiles only when the compiler honours OpenMP conditional compilation.
m4_define([_AX_ELPA_LANG_OPENMP(Fortran 77)],
[
      program test_openmp
      use omp_lib
      implicit none
!$    integer :: foobar
      foobar = omp_get_num_threads()
      end program
])

# _AX_ELPA_LANG_OPENMP(Fortran)
# ---------------------------
m4_copy([_AX_ELPA_LANG_OPENMP(Fortran 77)], [_AX_ELPA_LANG_OPENMP(Fortran)])

# AX_ELPA_OPENMP
# ---------
# Check which options need to be passed to the C compiler to support OpenMP.
# Set the OPENMP_CFLAGS / OPENMP_CXXFLAGS / OPENMP_FFLAGS variable to these
# options.
# The options are necessary at compile time (so the #pragmas are understood)
# and at link time (so the appropriate library is linked with).
# This macro takes care to not produce redundant options if $CC $CFLAGS already
# supports OpenMP. It also is careful to not pass options to compilers that
# misinterpret them; for example, most compilers accept "-openmp" and create
# an output file called 'penmp' rather than activating OpenMP support.
AC_DEFUN([AX_ELPA_OPENMP],
[
  OPENMP_[]_AC_LANG_PREFIX[]FLAGS=
  dnl enable_openmp is forced to "yes" here, so the guard below always passes.
  enable_openmp="yes"
  if test "$enable_openmp" != no; then
    AC_CACHE_CHECK([for _AC_LANG_ABBREV option to support OpenMP],
      [ac_cv_prog_[]_AC_LANG_ABBREV[]_openmp],
      [AC_LINK_IFELSE([AC_LANG_SOURCE([_AX_ELPA_LANG_OPENMP])],
         [ac_cv_prog_[]_AC_LANG_ABBREV[]_openmp='none needed'],
         [ac_cv_prog_[]_AC_LANG_ABBREV[]_openmp='unsupported'
          dnl Try these flags:
          dnl   GCC >= 4.2           -fopenmp
          dnl   SunPRO C             -xopenmp
          dnl   Intel C              -openmp
          dnl   SGI C, PGI C         -mp
          dnl   Tru64 Compaq C       -omp
          dnl   IBM C (AIX, Linux)   -qsmp=omp
          dnl If in this loop a compiler is passed an option that it doesn't
          dnl understand or that it misinterprets, the AC_LINK_IFELSE test
          dnl will fail (since we know that it failed without the option),
          dnl therefore the loop will continue searching for an option, and
          dnl no output file called 'penmp' or 'mp' is created.
          for ac_option in -openmp -fopenmp -xopenmp -mp -omp -qsmp=omp; do
            ac_save_[]_AC_LANG_PREFIX[]FLAGS=$[]_AC_LANG_PREFIX[]FLAGS
            _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $ac_option"
            AC_LINK_IFELSE([AC_LANG_SOURCE([_AX_ELPA_LANG_OPENMP])],
              [ac_cv_prog_[]_AC_LANG_ABBREV[]_openmp=$ac_option])
            _AC_LANG_PREFIX[]FLAGS=$ac_save_[]_AC_LANG_PREFIX[]FLAGS
            if test "$ac_cv_prog_[]_AC_LANG_ABBREV[]_openmp" != unsupported; then
              break
            fi
          done])])
    case $ac_cv_prog_[]_AC_LANG_ABBREV[]_openmp in #(
      "none needed" | unsupported)
        ;; #(
      *)
        OPENMP_[]_AC_LANG_PREFIX[]FLAGS=$ac_cv_prog_[]_AC_LANG_ABBREV[]_openmp ;;
    esac
  fi
  AC_SUBST([OPENMP_]_AC_LANG_PREFIX[FLAGS])
])
elpa-2016.05.001/m4/ax_elpa_specific_kernels.m40000644000312500001440000001477012717516040015621 00000000000000

dnl macro for testing whether the user wanted to compile only with one
dnl specific real kernel

dnl usage: DEFINE_OPTION([real-generic-kernel-only],[generic-kernel],[with_real_generic_kernel],[install_real_generic])
dnl
dnl Arguments used by the body:
dnl   $1  name of the --with-$1 configure option
dnl   $2  kernel description used in help and error messages
dnl   $3  shell variable that is set to "yes" to select the kernel
dnl Only one specific real kernel may be selected per configure run; a second
dnl selection aborts with AC_MSG_FAILURE.

AC_DEFUN([DEFINE_OPTION_SPECIFIC_REAL_KERNEL],[
  AC_ARG_WITH([$1],
               AS_HELP_STRING([--with-$1],
                              [only compile $2 for real case]),
              [with_option=yes],[with_option=no])

  if test x"${with_option}" = x"yes" ; then
    if test x"${use_specific_real_kernel}" = x"no" ; then

      dnl make sure that all the other kernels are unset
      install_real_generic=no
      install_real_generic_simple=no
      install_real_sse_assembly=no
      install_real_bgp=no
      install_real_bgq=no
      install_real_sse_block2=no
      install_real_sse_block4=no
      install_real_sse_block6=no
      install_real_avx_block2=no
      install_real_avx_block4=no
      install_real_avx_block6=no
      want_sse=no
      want_avx=no
      want_avx2=no
      # install_gpu=no

      use_specific_real_kernel=yes
      dnl now set the specific kernel
      $3=yes

      dnl take care of some dependencies
      if test x"${install_real_sse_block4}" = x"yes" ; then
        AC_MSG_NOTICE([$1 set. Also sse_block2 is needed])
        install_real_sse_block2=yes
      fi
      if test x"${install_real_avx_block4}" = x"yes" ; then
        AC_MSG_NOTICE([$1 set. Also avx_block2 is needed])
        install_real_avx_block2=yes
      fi
      if test x"${install_real_sse_block6}" = x"yes" ; then
        AC_MSG_NOTICE([$1 set. Also sse_block2 is needed])
        AC_MSG_NOTICE([$1 set. Also sse_block4 is needed])
        install_real_sse_block4=yes
        install_real_sse_block2=yes
      fi
      if test x"${install_real_avx_block6}" = x"yes" ; then
        AC_MSG_NOTICE([$1 set. Also avx_block2 is needed])
        AC_MSG_NOTICE([$1 set. Also avx_block4 is needed])
        install_real_avx_block4=yes
        install_real_avx_block2=yes
      fi

      dnl in case of SSE or AVX make sure that we can compile the chosen kernel
      if test x"${install_real_sse_assembly}" = x"yes" ; then
        if test x"${can_compile_sse_assembly}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        fi
      fi

      if test x"${install_real_sse_block2}" = x"yes" ; then
        if test x"${can_compile_sse_intrinsics}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        else
          want_sse=yes
        fi
      fi

      if test x"${install_real_sse_block4}" = x"yes" ; then
        if test x"${can_compile_sse_intrinsics}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        else
          want_sse=yes
        fi
      fi

      dnl The variable name was misspelled (can_compile_sse_inrinsics) here,
      dnl so this check could never fail; it now reads the same variable as
      dnl the block2/block4 checks above.
      if test x"${install_real_sse_block6}" = x"yes" ; then
        if test x"${can_compile_sse_intrinsics}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        else
          want_sse=yes
        fi
      fi

      if test x"${install_real_avx_block2}" = x"yes" ; then
        if test x"${can_compile_avx}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        else
          want_avx=yes
        fi
      fi

      if test x"${install_real_avx_block4}" = x"yes" ; then
        if test x"${can_compile_avx}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        else
          want_avx=yes
        fi
      fi

      if test x"${install_real_avx_block6}" = x"yes" ; then
        if test x"${can_compile_avx}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        else
          want_avx=yes
        fi
      fi

      AC_MSG_NOTICE([$1 will be the only compiled kernel for real case])
#      if test x"${want_gpu}" = x"yes" ; then
#        AC_MSG_WARN([At the moment this disables GPU support!])
#        AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific real kernel])
#      fi
    else
      AC_MSG_FAILURE([$1 failed; A specific kernel for real case has already been defined before!])
    fi
  fi
])


dnl Same as DEFINE_OPTION_SPECIFIC_REAL_KERNEL, for the complex kernels
dnl ($1 option name, $2 description, $3 variable set to "yes").
AC_DEFUN([DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL],[
  AC_ARG_WITH([$1],
                 AS_HELP_STRING([--with-$1],
                                [only compile $2 for complex case]),
              [with_option=yes],[with_option=no])

  if test x"${with_option}" = x"yes" ; then
    if test x"${use_specific_complex_kernel}" = x"no" ; then

      dnl make sure that all the other kernels are unset
      install_complex_generic=no
      install_complex_generic_simple=no
      install_complex_sse_assembly=no
      install_complex_bgp=no
      install_complex_bgq=no
      install_complex_sse_block1=no
      install_complex_sse_block2=no
      install_complex_avx_block1=no
      install_complex_avx_block2=no
      want_sse=no
      want_avx=no
      want_avx2=no

      # install_gpu=no
      use_specific_complex_kernel=yes
      dnl now set the specific kernel
      $3=yes

      dnl take care of some dependencies
      if test x"${install_complex_sse_block2}" = x"yes" ; then
        install_complex_sse_block1=yes
      fi
      if test x"${install_complex_avx_block2}" = x"yes" ; then
        install_complex_avx_block1=yes
      fi

      dnl in case of SSE or AVX make sure that we can compile the chosen kernel
      if test x"${install_complex_sse_assembly}" = x"yes" ; then
        if test x"${can_compile_sse_assembly}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        fi
      fi

      if test x"${install_complex_sse_block1}" = x"yes" ; then
        if test x"${can_compile_sse_intrinsics}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        else
          want_sse=yes
        fi
      fi

      if test x"${install_complex_sse_block2}" = x"yes" ; then
        if test x"${can_compile_sse_intrinsics}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        else
          want_sse=yes
        fi
      fi

      if test x"${install_complex_avx_block1}" = x"yes" ; then
        if test x"${can_compile_avx}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        else
          want_avx=yes
        fi
      fi

      if test x"${install_complex_avx_block2}" = x"yes" ; then
        if test x"${can_compile_avx}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        else
          want_avx=yes
        fi
      fi

      AC_MSG_NOTICE([$1 will be the only compiled kernel for complex case])
#      if test x"${want_gpu}" = x"yes" ; then
#        AC_MSG_WARN([At the moment this disables GPU support!])
#        AC_MSG_WARN([IF GPU support is wanted do NOT specify a specific complex kernel])
#      fi
    else
      AC_MSG_FAILURE([$1 failed; A specific kernel for complex case has already been defined before!])
    fi
  fi
])
elpa-2016.05.001/m4/ax_prog_fc_mpi.m40000644000312500001440000001414612664056454013602 00000000000000# ===========================================================================
#      http://www.gnu.org/software/autoconf-archive/ax_prog_fc_mpi.html
# ===========================================================================
#
# SYNOPSIS
#
#   AX_PROG_FC_MPI([MPI-WANTED-TEST[, ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]])
#
# DESCRIPTION
#
#   This macro tries to find out how to compile Fortran77 programs that use
#   MPI (Message Passing Interface), a standard API for parallel process
#   communication (see http://www-unix.mcs.anl.gov/mpi/). The macro has to
#   be used instead of the standard macro AC_PROG_FC and will replace the
#   standard variable FC with the found compiler.
#
#   MPI-WANTED-TEST is used to test whether MPI is actually wanted by the
#   user. If MPI-WANTED_TEST is omitted or if it succeeds, the macro will
#   try to find out how to use MPI, if it fails, the macro will call
#   AC_PROG_FC to find a standard Fortran compiler instead.
#
#   When MPI is found, ACTION-IF-FOUND will be executed, if MPI is not found
#   (or MPI-WANTED-TEST fails) ACTION-IF-NOT-FOUND is executed. If
#   ACTION-IF-FOUND is not set, the macro will define HAVE_MPI.
#
#   The following example demonstrates usage of the macro:
#
#     # If --with-mpi=auto is used, try to find MPI, but use standard FC compiler if it is not found.
#     # If --with-mpi=yes is used, try to find MPI and fail if it isn't found.
#     # If --with-mpi=no is used, use a standard FC compiler instead.
#     AC_ARG_WITH(mpi, [AS_HELP_STRING([--with-mpi],
#         [compile with MPI (parallelization) support. If none is found,
#         MPI is not used.
#         Default: auto])
#     ],,[with_mpi=auto])
#
#     AX_PROG_FC_MPI([test x"$with_mpi" != xno],[use_mpi=yes],[
#       use_mpi=no
#       if test x"$with_mpi" = xyes; then
#         AC_MSG_FAILURE([MPI compiler requested, but couldn't use MPI.])
#       else
#         AC_MSG_WARN([No MPI compiler found, won't use MPI.])
#       fi
#     ])
#
# LICENSE
#
#   Copyright (c) 2010,2011 Olaf Lenz
#
#   This program is free software: you can redistribute it and/or modify it
#   under the terms of the GNU General Public License as published by the
#   Free Software Foundation, either version 3 of the License, or (at your
#   option) any later version.
#
#   This program is distributed in the hope that it will be useful, but
#   WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
#   Public License for more details.
#
#   You should have received a copy of the GNU General Public License along
#   with this program. If not, see <http://www.gnu.org/licenses/>.
#
#   As a special exception, the respective Autoconf Macro's copyright owner
#   gives unlimited permission to copy, distribute and modify the configure
#   scripts that are the output of Autoconf when processing the Macro. You
#   need not follow the terms of the GNU General Public License when using
#   or distributing such scripts, even though portions of the text of the
#   Macro appear in them. The GNU General Public License (GPL) does govern
#   all other use of the material that constitutes the Autoconf Macro.
#
#   This special exception to the GPL applies to versions of the Autoconf
#   Macro released by the Autoconf Archive. When you make and distribute a
#   modified version of the Autoconf Macro, you may extend this special
#   exception to the GPL to apply to your modified version as well.

#serial 2

AC_DEFUN([AX_PROG_FC_MPI], [
AC_PREREQ(2.50)

# Check for compiler
# Needs to be split off into an extra macro to ensure right expansion
# order.
AC_REQUIRE([_AX_PROG_FC_MPI],[_AX_PROG_FC_MPI([$1])])

AS_IF([test x"$_ax_prog_fc_mpi_mpi_wanted" = xno],
  [ _ax_prog_fc_mpi_mpi_found=no ],
  [
    AC_LANG_PUSH([Fortran])

    # test whether MPI_INIT is available
    # We do not use AC_SEARCH_LIBS here, as it caches its outcome and
    # thus disallows corresponding calls in the other AX_PROG_*_MPI
    # macros.
    for lib in NONE mpichf90 fmpi fmpich; do
      save_LIBS=$LIBS
      if test x"$lib" = xNONE; then
        AC_MSG_CHECKING([for function MPI_INIT])
      else
        AC_MSG_CHECKING([for function MPI_INIT in -l$lib])
        LIBS="-l$lib $LIBS"
      fi
      AC_LINK_IFELSE([AC_LANG_CALL([],[MPI_INIT])],
        [ _ax_prog_fc_mpi_mpi_found=yes ],
        [ _ax_prog_fc_mpi_mpi_found=no ])
      AC_MSG_RESULT($_ax_prog_fc_mpi_mpi_found)
      if test "x$_ax_prog_fc_mpi_mpi_found" = "xyes"; then
        break;
      fi
      LIBS=$save_LIBS
    done

    # Check for header
    # (the include statement is indented so that it is also valid
    # fixed-form Fortran)
    AS_IF([test x"$_ax_prog_fc_mpi_mpi_found" = xyes], [
      AC_MSG_CHECKING([for mpif.h])
      AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[[
      include 'mpif.h'
]])],
        [ AC_MSG_RESULT(yes)],
        [ AC_MSG_RESULT(no)
         _ax_prog_fc_mpi_mpi_found=no
      ])
    ])
    AC_LANG_POP([Fortran])
])

# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
AS_IF([test x"$_ax_prog_fc_mpi_mpi_found" = xyes], [
        ifelse([$2],,[AC_DEFINE(HAVE_MPI,1,[Define if you have the MPI library.])],[$2])
        :
],[
        $3
        :
])

])dnl AX_PROG_FC_MPI

dnl _AX_PROG_FC_MPI is an internal macro required by AX_PROG_FC_MPI.
dnl To ensure the right expansion order, the main function AX_PROG_FC_MPI
dnl has to be split into two parts. This part looks for the MPI
dnl compiler, while the other one tests whether an MPI program can be
dnl compiled.
dnl
dnl _AX_PROG_FC_MPI([MPI-WANTED-TEST])
dnl   Stores "yes" or "no" in _ax_prog_fc_mpi_mpi_wanted depending on whether
dnl   the shell command MPI-WANTED-TEST ($1) succeeds; an omitted test counts
dnl   as "yes".  When MPI is wanted, FC is looked up with AC_CHECK_TOOLS among
dnl   the names listed below.  AC_PROG_FC is invoked afterwards in every case.
AC_DEFUN([_AX_PROG_FC_MPI], [
  ifelse([$1],,[_ax_prog_fc_mpi_mpi_wanted=yes],[
    AC_MSG_CHECKING([whether to compile using MPI])
    if $1; then
      _ax_prog_fc_mpi_mpi_wanted=yes
    else
      _ax_prog_fc_mpi_mpi_wanted=no
    fi
    AC_MSG_RESULT($_ax_prog_fc_mpi_mpi_wanted)
  ])
  if test x"$_ax_prog_fc_mpi_mpi_wanted" = xyes; then
    AC_CHECK_TOOLS([FC], [mpiifort mpifort mpif95 mpxlf95_r mpxlf95 ftn mpif90 mpxlf90_r mpxlf90 mpf90 cmpif90c sxmpif90 mpif77 hf77 mpxlf_r mpxlf mpifrt mpf77 cmpifc xlf95 pgf95 pathf95 ifort g95 f95 fort ifc efc openf95 sunf95 crayftn gfortran lf95 ftn xlf90 f90 pgf90 pghpf pathf90 epcf90 sxf90 openf90 sunf90 xlf f77 frt pgf77 pathf77 g77 cf77 fort77 fl32 af77])
  fi
  AC_PROG_FC
])dnl _AX_PROG_FC_MPI
elpa-2016.05.001/m4/ax_prog_cc_mpi.m40000644000312500001440000001361112664056454013573 00000000000000# ===========================================================================
#      http://www.gnu.org/software/autoconf-archive/ax_prog_cc_mpi.html
# ===========================================================================
#
# SYNOPSIS
#
#   AX_PROG_CC_MPI([MPI-WANTED-TEST[, ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]])
#
# DESCRIPTION
#
#   This macro tries to find out how to compile C programs that use MPI
#   (Message Passing Interface), a standard API for parallel process
#   communication (see http://www-unix.mcs.anl.gov/mpi/). The macro has to
#   be used instead of the standard macro AC_PROG_CC and will replace the
#   standard variable CC with the found compiler.
#
#   MPI-WANTED-TEST is used to test whether MPI is actually wanted by the
#   user. If MPI-WANTED_TEST is omitted or if it succeeds, the macro will
#   try to find out how to use MPI, if it fails, the macro will call
#   AC_PROG_CC to find a standard C compiler instead.
#
#   When MPI is found, ACTION-IF-FOUND will be executed, if MPI is not found
#   (or MPI-WANTED-TEST fails) ACTION-IF-NOT-FOUND is executed. If
#   ACTION-IF-FOUND is not set, the macro will define HAVE_MPI.
# # The following example demonstrates usage of the macro: # # # If --with-mpi=auto is used, try to find MPI, but use standard C compiler if it is not found. # # If --with-mpi=yes is used, try to find MPI and fail if it isn't found. # # If --with-mpi=no is used, use a standard C compiler instead. # AC_ARG_WITH(mpi, [AS_HELP_STRING([--with-mpi], # [compile with MPI (parallelization) support. If none is found, # MPI is not used. Default: auto]) # ],,[with_mpi=auto]) # # # AX_PROG_CC_MPI([test x"$with_mpi" != xno],[use_mpi=yes],[ # use_mpi=no # if test x"$with_mpi" = xyes; then # AC_MSG_FAILURE([MPI compiler requested, but couldn't use MPI.]) # else # AC_MSG_WARN([No MPI compiler found, won't use MPI.]) # fi # ]) # # LICENSE # # Copyright (c) 2010,2011 Olaf Lenz # # This program is free software: you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. If not, see . # # As a special exception, the respective Autoconf Macro's copyright owner # gives unlimited permission to copy, distribute and modify the configure # scripts that are the output of Autoconf when processing the Macro. You # need not follow the terms of the GNU General Public License when using # or distributing such scripts, even though portions of the text of the # Macro appear in them. The GNU General Public License (GPL) does govern # all other use of the material that constitutes the Autoconf Macro. # # This special exception to the GPL applies to versions of the Autoconf # Macro released by the Autoconf Archive. 
#   When you make and distribute a
#   modified version of the Autoconf Macro, you may extend this special
#   exception to the GPL to apply to your modified version as well.

#serial 1

AC_DEFUN([AX_PROG_CC_MPI], [
AC_PREREQ(2.50)

# Check for compiler
# Needs to be split off into an extra macro to ensure right expansion
# order.
AC_REQUIRE([_AX_PROG_CC_MPI],[_AX_PROG_CC_MPI([$1])])

AS_IF([test x"$_ax_prog_cc_mpi_mpi_wanted" = xno],
  [ _ax_prog_cc_mpi_mpi_found=no ],
  [
    AC_LANG_PUSH([C])
    # test whether MPI_Init is available
    # We do not use AC_SEARCH_LIBS here, as it caches its outcome and
    # thus disallows corresponding calls in the other AX_PROG_*_MPI
    # macros.
    for lib in NONE mpi mpich; do
      save_LIBS=$LIBS
      if test x"$lib" = xNONE; then
        AC_MSG_CHECKING([for function MPI_Init])
      else
        AC_MSG_CHECKING([for function MPI_Init in -l$lib])
        LIBS="-l$lib $LIBS"
      fi
      AC_LINK_IFELSE([AC_LANG_CALL([],[MPI_Init])],
        [ _ax_prog_cc_mpi_mpi_found=yes ],
        [ _ax_prog_cc_mpi_mpi_found=no ])
      AC_MSG_RESULT($_ax_prog_cc_mpi_mpi_found)
      if test "x$_ax_prog_cc_mpi_mpi_found" = "xyes"; then
        break;
      fi
      LIBS=$save_LIBS
    done

    # Check for header
    # The probe must actually include mpi.h; a bare "#include" directive
    # never compiles, which would mark MPI as not found.
    AS_IF([test x"$_ax_prog_cc_mpi_mpi_found" = xyes], [
      AC_MSG_CHECKING([for mpi.h])
      AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include <mpi.h>])],
        [ AC_MSG_RESULT(yes)],
        [ AC_MSG_RESULT(no)
         _ax_prog_cc_mpi_mpi_found=no
      ])
    ])
    AC_LANG_POP([C])
])

# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
AS_IF([test x"$_ax_prog_cc_mpi_mpi_found" = xyes], [
        ifelse([$2],,[AC_DEFINE(HAVE_MPI,1,[Define if you have the MPI library.])],[$2])
        :
],[
        $3
        :
])

])dnl AX_PROG_CC_MPI

dnl _AX_PROG_CC_MPI is an internal macro required by AX_PROG_CC_MPI.
dnl To ensure the right expansion order, the main function AX_PROG_CC_MPI
dnl has to be split into two parts.
dnl
dnl Known MPI C compilers:
dnl  mpicc
dnl  mpixlc_r
dnl  mpixlc
dnl  hcc
dnl  mpxlc_r
dnl  mpxlc
dnl  sxmpicc  NEC SX
dnl  mpifcc   Fujitsu
dnl  mpgcc
dnl  mpcc
dnl  cmpicc
dnl  cc
dnl
dnl _AX_PROG_CC_MPI([MPI-WANTED-TEST])
dnl   Stores "yes" or "no" in _ax_prog_cc_mpi_mpi_wanted depending on whether
dnl   the shell command MPI-WANTED-TEST ($1) succeeds; an omitted test counts
dnl   as "yes".  When MPI is wanted, CC is looked up with AC_CHECK_TOOLS among
dnl   the names listed below.  AC_PROG_CC is invoked afterwards in every case.
AC_DEFUN([_AX_PROG_CC_MPI], [
  ifelse([$1],,[_ax_prog_cc_mpi_mpi_wanted=yes],[
    AC_MSG_CHECKING([whether to compile using MPI])
    if $1; then
      _ax_prog_cc_mpi_mpi_wanted=yes
    else
      _ax_prog_cc_mpi_mpi_wanted=no
    fi
    AC_MSG_RESULT($_ax_prog_cc_mpi_mpi_wanted)
  ])
  if test x"$_ax_prog_cc_mpi_mpi_wanted" = xyes; then
    AC_CHECK_TOOLS([CC], [mpicc mpixlc_r mpixlc hcc mpxlc_r mpxlc sxmpicc mpifcc mpgcc mpcc cmpicc cc gcc])
  fi
  AC_PROG_CC
])dnl _AX_PROG_CC_MPI
elpa-2016.05.001/m4/ltversion.m40000644000312500001440000000127312717533401012637 00000000000000# ltversion.m4 -- version numbers -*- Autoconf -*-
#
#   Copyright (C) 2004, 2011-2015 Free Software Foundation, Inc.
#   Written by Scott James Remnant, 2004
#
# This file is free software; the Free Software Foundation gives
# unlimited permission to copy and/or distribute it, with or without
# modifications, as long as this notice is preserved.

# @configure_input@

# serial 4179 ltversion.m4
# This file is part of GNU Libtool

m4_define([LT_PACKAGE_VERSION], [2.4.6])
m4_define([LT_PACKAGE_REVISION], [2.4.6])

# LTVERSION_VERSION
# -----------------
# Sets the shell variables macro_version and macro_revision and declares
# them through _LT_DECL.
AC_DEFUN([LTVERSION_VERSION],
[macro_version='2.4.6'
macro_revision='2.4.6'
_LT_DECL(, macro_version, 0, [Which release of libtool.m4 was used?])
_LT_DECL(, macro_revision, 0)
])
elpa-2016.05.001/m4/ltoptions.m40000644000312500001440000003426212717533401012651 00000000000000# Helper functions for option handling. -*- Autoconf -*-
#
#   Copyright (C) 2004-2005, 2007-2009, 2011-2015 Free Software
#   Foundation, Inc.
#   Written by Gary V. Vaughan, 2004
#
# This file is free software; the Free Software Foundation gives
# unlimited permission to copy and/or distribute it, with or without
# modifications, as long as this notice is preserved.

# serial 8 ltoptions.m4

# This is to help aclocal find these macros, as it can't see m4_define.
AC_DEFUN([LTOPTIONS_VERSION], [m4_if([1])])


# _LT_MANGLE_OPTION(MACRO-NAME, OPTION-NAME)
# ------------------------------------------
# Expand to the name of the flag macro for the option: "_LT_OPTION_" followed
# by MACRO-NAME__OPTION-NAME with every character outside [a-zA-Z0-9_]
# replaced by "_".
m4_define([_LT_MANGLE_OPTION],
[[_LT_OPTION_]m4_bpatsubst($1__$2, [[^a-zA-Z0-9_]], [_])])


# _LT_SET_OPTION(MACRO-NAME, OPTION-NAME)
# ---------------------------------------
# Set option OPTION-NAME for macro MACRO-NAME, and if there is a
# matching handler defined, dispatch to it.  Other OPTION-NAMEs are
# saved as a flag; an option without a handler triggers an m4 warning.
m4_define([_LT_SET_OPTION],
[m4_define(_LT_MANGLE_OPTION([$1], [$2]))dnl
m4_ifdef(_LT_MANGLE_DEFUN([$1], [$2]),
        _LT_MANGLE_DEFUN([$1], [$2]),
    [m4_warning([Unknown $1 option '$2'])])[]dnl
])


# _LT_IF_OPTION(MACRO-NAME, OPTION-NAME, IF-SET, [IF-NOT-SET])
# ------------------------------------------------------------
# Execute IF-SET if OPTION is set, IF-NOT-SET otherwise.
m4_define([_LT_IF_OPTION],
[m4_ifdef(_LT_MANGLE_OPTION([$1], [$2]), [$3], [$4])])


# _LT_UNLESS_OPTIONS(MACRO-NAME, OPTION-LIST, IF-NOT-SET)
# -------------------------------------------------------
# Execute IF-NOT-SET unless at least one option in OPTION-LIST for
# MACRO-NAME is set, i.e. only when none of the listed options is set.
m4_define([_LT_UNLESS_OPTIONS],
[m4_foreach([_LT_Option], m4_split(m4_normalize([$2])),
            [m4_ifdef(_LT_MANGLE_OPTION([$1], _LT_Option),
                      [m4_define([$0_found])])])[]dnl
m4_ifdef([$0_found], [m4_undefine([$0_found])], [$3
])[]dnl
])


# _LT_SET_OPTIONS(MACRO-NAME, OPTION-LIST)
# ----------------------------------------
# OPTION-LIST is a space-separated list of Libtool options associated
# with MACRO-NAME.  If any OPTION has a matching handler declared with
# LT_OPTION_DEFINE, dispatch to that macro; otherwise warn about
# the unknown option.
m4_defun([_LT_SET_OPTIONS],
[# Set options
m4_foreach([_LT_Option], m4_split(m4_normalize([$2])),
    [_LT_SET_OPTION([$1], _LT_Option)])

m4_if([$1],[LT_INIT],[
  dnl
  dnl Simply set some default values (i.e off) if boolean options were not
  dnl specified:
  _LT_UNLESS_OPTIONS([LT_INIT], [dlopen], [enable_dlopen=no
  ])
  _LT_UNLESS_OPTIONS([LT_INIT], [win32-dll], [enable_win32_dll=no
  ])
  dnl
  dnl If no reference was made to various pairs of opposing options, then
  dnl we run the default mode handler for the pair.  For example, if neither
  dnl 'shared' nor 'disable-shared' was passed, we enable building of shared
  dnl archives by default:
  _LT_UNLESS_OPTIONS([LT_INIT], [shared disable-shared], [_LT_ENABLE_SHARED])
  _LT_UNLESS_OPTIONS([LT_INIT], [static disable-static], [_LT_ENABLE_STATIC])
  _LT_UNLESS_OPTIONS([LT_INIT], [pic-only no-pic], [_LT_WITH_PIC])
  _LT_UNLESS_OPTIONS([LT_INIT], [fast-install disable-fast-install],
                   [_LT_ENABLE_FAST_INSTALL])
  _LT_UNLESS_OPTIONS([LT_INIT], [aix-soname=aix aix-soname=both aix-soname=svr4],
                   [_LT_WITH_AIX_SONAME([aix])])
  ])
])# _LT_SET_OPTIONS



## --------------------------------- ##
## Macros to handle LT_INIT options.
##
## --------------------------------- ##

# _LT_MANGLE_DEFUN(MACRO-NAME, OPTION-NAME)
# -----------------------------------------
# Expand to the name of the handler macro for the option:
# "_LT_OPTION_DEFUN_" followed by the upper-cased MACRO-NAME__OPTION-NAME
# with every character outside [A-Z0-9_] replaced by "_".
m4_define([_LT_MANGLE_DEFUN],
[[_LT_OPTION_DEFUN_]m4_bpatsubst(m4_toupper([$1__$2]), [[^A-Z0-9_]], [_])])


# LT_OPTION_DEFINE(MACRO-NAME, OPTION-NAME, CODE)
# -----------------------------------------------
# Define CODE as the handler macro (named by _LT_MANGLE_DEFUN) for
# OPTION-NAME of MACRO-NAME.
m4_define([LT_OPTION_DEFINE],
[m4_define(_LT_MANGLE_DEFUN([$1], [$2]), [$3])[]dnl
])# LT_OPTION_DEFINE


# dlopen
# ------
LT_OPTION_DEFINE([LT_INIT], [dlopen], [enable_dlopen=yes
])

AU_DEFUN([AC_LIBTOOL_DLOPEN],
[_LT_SET_OPTION([LT_INIT], [dlopen])
AC_DIAGNOSE([obsolete],
[$0: Remove this warning and the call to _LT_SET_OPTION when you put the 'dlopen' option into LT_INIT's first parameter.])
])

dnl aclocal-1.4 backwards compatibility:
dnl AC_DEFUN([AC_LIBTOOL_DLOPEN], [])


# win32-dll
# ---------
# Declare package support for building win32 dll's.
LT_OPTION_DEFINE([LT_INIT], [win32-dll],
[enable_win32_dll=yes

case $host in
*-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-cegcc*)
  AC_CHECK_TOOL(AS, as, false)
  AC_CHECK_TOOL(DLLTOOL, dlltool, false)
  AC_CHECK_TOOL(OBJDUMP, objdump, false)
  ;;
esac

test -z "$AS" && AS=as
_LT_DECL([], [AS], [1], [Assembler program])dnl

test -z "$DLLTOOL" && DLLTOOL=dlltool
_LT_DECL([], [DLLTOOL], [1], [DLL creation program])dnl

test -z "$OBJDUMP" && OBJDUMP=objdump
_LT_DECL([], [OBJDUMP], [1], [Object dumper program])dnl
])# win32-dll

AU_DEFUN([AC_LIBTOOL_WIN32_DLL],
[AC_REQUIRE([AC_CANONICAL_HOST])dnl
_LT_SET_OPTION([LT_INIT], [win32-dll])
AC_DIAGNOSE([obsolete],
[$0: Remove this warning and the call to _LT_SET_OPTION when you put the 'win32-dll' option into LT_INIT's first parameter.])
])

dnl aclocal-1.4 backwards compatibility:
dnl AC_DEFUN([AC_LIBTOOL_WIN32_DLL], [])


# _LT_ENABLE_SHARED([DEFAULT])
# ----------------------------
# implement the --enable-shared flag, and supports the 'shared' and
# 'disable-shared' LT_INIT options.
# DEFAULT is either 'yes' or 'no'. If omitted, it defaults to 'yes'.
m4_define([_LT_ENABLE_SHARED], [m4_define([_LT_ENABLE_SHARED_DEFAULT], [m4_if($1, no, no, yes)])dnl AC_ARG_ENABLE([shared], [AS_HELP_STRING([--enable-shared@<:@=PKGS@:>@], [build shared libraries @<:@default=]_LT_ENABLE_SHARED_DEFAULT[@:>@])], [p=${PACKAGE-default} case $enableval in yes) enable_shared=yes ;; no) enable_shared=no ;; *) enable_shared=no # Look at the argument we got. We use all the common list separators. lt_save_ifs=$IFS; IFS=$IFS$PATH_SEPARATOR, for pkg in $enableval; do IFS=$lt_save_ifs if test "X$pkg" = "X$p"; then enable_shared=yes fi done IFS=$lt_save_ifs ;; esac], [enable_shared=]_LT_ENABLE_SHARED_DEFAULT) _LT_DECL([build_libtool_libs], [enable_shared], [0], [Whether or not to build shared libraries]) ])# _LT_ENABLE_SHARED LT_OPTION_DEFINE([LT_INIT], [shared], [_LT_ENABLE_SHARED([yes])]) LT_OPTION_DEFINE([LT_INIT], [disable-shared], [_LT_ENABLE_SHARED([no])]) # Old names: AC_DEFUN([AC_ENABLE_SHARED], [_LT_SET_OPTION([LT_INIT], m4_if([$1], [no], [disable-])[shared]) ]) AC_DEFUN([AC_DISABLE_SHARED], [_LT_SET_OPTION([LT_INIT], [disable-shared]) ]) AU_DEFUN([AM_ENABLE_SHARED], [AC_ENABLE_SHARED($@)]) AU_DEFUN([AM_DISABLE_SHARED], [AC_DISABLE_SHARED($@)]) dnl aclocal-1.4 backwards compatibility: dnl AC_DEFUN([AM_ENABLE_SHARED], []) dnl AC_DEFUN([AM_DISABLE_SHARED], []) # _LT_ENABLE_STATIC([DEFAULT]) # ---------------------------- # implement the --enable-static flag, and support the 'static' and # 'disable-static' LT_INIT options. # DEFAULT is either 'yes' or 'no'. If omitted, it defaults to 'yes'. m4_define([_LT_ENABLE_STATIC], [m4_define([_LT_ENABLE_STATIC_DEFAULT], [m4_if($1, no, no, yes)])dnl AC_ARG_ENABLE([static], [AS_HELP_STRING([--enable-static@<:@=PKGS@:>@], [build static libraries @<:@default=]_LT_ENABLE_STATIC_DEFAULT[@:>@])], [p=${PACKAGE-default} case $enableval in yes) enable_static=yes ;; no) enable_static=no ;; *) enable_static=no # Look at the argument we got. We use all the common list separators. 
lt_save_ifs=$IFS; IFS=$IFS$PATH_SEPARATOR, for pkg in $enableval; do IFS=$lt_save_ifs if test "X$pkg" = "X$p"; then enable_static=yes fi done IFS=$lt_save_ifs ;; esac], [enable_static=]_LT_ENABLE_STATIC_DEFAULT) _LT_DECL([build_old_libs], [enable_static], [0], [Whether or not to build static libraries]) ])# _LT_ENABLE_STATIC LT_OPTION_DEFINE([LT_INIT], [static], [_LT_ENABLE_STATIC([yes])]) LT_OPTION_DEFINE([LT_INIT], [disable-static], [_LT_ENABLE_STATIC([no])]) # Old names: AC_DEFUN([AC_ENABLE_STATIC], [_LT_SET_OPTION([LT_INIT], m4_if([$1], [no], [disable-])[static]) ]) AC_DEFUN([AC_DISABLE_STATIC], [_LT_SET_OPTION([LT_INIT], [disable-static]) ]) AU_DEFUN([AM_ENABLE_STATIC], [AC_ENABLE_STATIC($@)]) AU_DEFUN([AM_DISABLE_STATIC], [AC_DISABLE_STATIC($@)]) dnl aclocal-1.4 backwards compatibility: dnl AC_DEFUN([AM_ENABLE_STATIC], []) dnl AC_DEFUN([AM_DISABLE_STATIC], []) # _LT_ENABLE_FAST_INSTALL([DEFAULT]) # ---------------------------------- # implement the --enable-fast-install flag, and support the 'fast-install' # and 'disable-fast-install' LT_INIT options. # DEFAULT is either 'yes' or 'no'. If omitted, it defaults to 'yes'. m4_define([_LT_ENABLE_FAST_INSTALL], [m4_define([_LT_ENABLE_FAST_INSTALL_DEFAULT], [m4_if($1, no, no, yes)])dnl AC_ARG_ENABLE([fast-install], [AS_HELP_STRING([--enable-fast-install@<:@=PKGS@:>@], [optimize for fast installation @<:@default=]_LT_ENABLE_FAST_INSTALL_DEFAULT[@:>@])], [p=${PACKAGE-default} case $enableval in yes) enable_fast_install=yes ;; no) enable_fast_install=no ;; *) enable_fast_install=no # Look at the argument we got. We use all the common list separators. 
lt_save_ifs=$IFS; IFS=$IFS$PATH_SEPARATOR, for pkg in $enableval; do IFS=$lt_save_ifs if test "X$pkg" = "X$p"; then enable_fast_install=yes fi done IFS=$lt_save_ifs ;; esac], [enable_fast_install=]_LT_ENABLE_FAST_INSTALL_DEFAULT) _LT_DECL([fast_install], [enable_fast_install], [0], [Whether or not to optimize for fast installation])dnl ])# _LT_ENABLE_FAST_INSTALL LT_OPTION_DEFINE([LT_INIT], [fast-install], [_LT_ENABLE_FAST_INSTALL([yes])]) LT_OPTION_DEFINE([LT_INIT], [disable-fast-install], [_LT_ENABLE_FAST_INSTALL([no])]) # Old names: AU_DEFUN([AC_ENABLE_FAST_INSTALL], [_LT_SET_OPTION([LT_INIT], m4_if([$1], [no], [disable-])[fast-install]) AC_DIAGNOSE([obsolete], [$0: Remove this warning and the call to _LT_SET_OPTION when you put the 'fast-install' option into LT_INIT's first parameter.]) ]) AU_DEFUN([AC_DISABLE_FAST_INSTALL], [_LT_SET_OPTION([LT_INIT], [disable-fast-install]) AC_DIAGNOSE([obsolete], [$0: Remove this warning and the call to _LT_SET_OPTION when you put the 'disable-fast-install' option into LT_INIT's first parameter.]) ]) dnl aclocal-1.4 backwards compatibility: dnl AC_DEFUN([AC_ENABLE_FAST_INSTALL], []) dnl AC_DEFUN([AM_DISABLE_FAST_INSTALL], []) # _LT_WITH_AIX_SONAME([DEFAULT]) # ---------------------------------- # implement the --with-aix-soname flag, and support the `aix-soname=aix' # and `aix-soname=both' and `aix-soname=svr4' LT_INIT options. DEFAULT # is either `aix', `both' or `svr4'. If omitted, it defaults to `aix'. 
m4_define([_LT_WITH_AIX_SONAME], [m4_define([_LT_WITH_AIX_SONAME_DEFAULT], [m4_if($1, svr4, svr4, m4_if($1, both, both, aix))])dnl shared_archive_member_spec= case $host,$enable_shared in power*-*-aix[[5-9]]*,yes) AC_MSG_CHECKING([which variant of shared library versioning to provide]) AC_ARG_WITH([aix-soname], [AS_HELP_STRING([--with-aix-soname=aix|svr4|both], [shared library versioning (aka "SONAME") variant to provide on AIX, @<:@default=]_LT_WITH_AIX_SONAME_DEFAULT[@:>@.])], [case $withval in aix|svr4|both) ;; *) AC_MSG_ERROR([Unknown argument to --with-aix-soname]) ;; esac lt_cv_with_aix_soname=$with_aix_soname], [AC_CACHE_VAL([lt_cv_with_aix_soname], [lt_cv_with_aix_soname=]_LT_WITH_AIX_SONAME_DEFAULT) with_aix_soname=$lt_cv_with_aix_soname]) AC_MSG_RESULT([$with_aix_soname]) if test aix != "$with_aix_soname"; then # For the AIX way of multilib, we name the shared archive member # based on the bitwidth used, traditionally 'shr.o' or 'shr_64.o', # and 'shr.imp' or 'shr_64.imp', respectively, for the Import File. # Even when GNU compilers ignore OBJECT_MODE but need '-maix64' flag, # the AIX toolchain works better with OBJECT_MODE set (default 32). if test 64 = "${OBJECT_MODE-32}"; then shared_archive_member_spec=shr_64 else shared_archive_member_spec=shr fi fi ;; *) with_aix_soname=aix ;; esac _LT_DECL([], [shared_archive_member_spec], [0], [Shared archive member basename, for filename based shared library versioning on AIX])dnl ])# _LT_WITH_AIX_SONAME LT_OPTION_DEFINE([LT_INIT], [aix-soname=aix], [_LT_WITH_AIX_SONAME([aix])]) LT_OPTION_DEFINE([LT_INIT], [aix-soname=both], [_LT_WITH_AIX_SONAME([both])]) LT_OPTION_DEFINE([LT_INIT], [aix-soname=svr4], [_LT_WITH_AIX_SONAME([svr4])]) # _LT_WITH_PIC([MODE]) # -------------------- # implement the --with-pic flag, and support the 'pic-only' and 'no-pic' # LT_INIT options. # MODE is either 'yes' or 'no'. If omitted, it defaults to 'both'. 
m4_define([_LT_WITH_PIC], [AC_ARG_WITH([pic], [AS_HELP_STRING([--with-pic@<:@=PKGS@:>@], [try to use only PIC/non-PIC objects @<:@default=use both@:>@])], [lt_p=${PACKAGE-default} case $withval in yes|no) pic_mode=$withval ;; *) pic_mode=default # Look at the argument we got. We use all the common list separators. lt_save_ifs=$IFS; IFS=$IFS$PATH_SEPARATOR, for lt_pkg in $withval; do IFS=$lt_save_ifs if test "X$lt_pkg" = "X$lt_p"; then pic_mode=yes fi done IFS=$lt_save_ifs ;; esac], [pic_mode=m4_default([$1], [default])]) _LT_DECL([], [pic_mode], [0], [What type of objects to build])dnl ])# _LT_WITH_PIC LT_OPTION_DEFINE([LT_INIT], [pic-only], [_LT_WITH_PIC([yes])]) LT_OPTION_DEFINE([LT_INIT], [no-pic], [_LT_WITH_PIC([no])]) # Old name: AU_DEFUN([AC_LIBTOOL_PICMODE], [_LT_SET_OPTION([LT_INIT], [pic-only]) AC_DIAGNOSE([obsolete], [$0: Remove this warning and the call to _LT_SET_OPTION when you put the 'pic-only' option into LT_INIT's first parameter.]) ]) dnl aclocal-1.4 backwards compatibility: dnl AC_DEFUN([AC_LIBTOOL_PICMODE], []) ## ----------------- ## ## LTDL_INIT Options ## ## ----------------- ## m4_define([_LTDL_MODE], []) LT_OPTION_DEFINE([LTDL_INIT], [nonrecursive], [m4_define([_LTDL_MODE], [nonrecursive])]) LT_OPTION_DEFINE([LTDL_INIT], [recursive], [m4_define([_LTDL_MODE], [recursive])]) LT_OPTION_DEFINE([LTDL_INIT], [subproject], [m4_define([_LTDL_MODE], [subproject])]) m4_define([_LTDL_TYPE], []) LT_OPTION_DEFINE([LTDL_INIT], [installable], [m4_define([_LTDL_TYPE], [installable])]) LT_OPTION_DEFINE([LTDL_INIT], [convenience], [m4_define([_LTDL_TYPE], [convenience])]) elpa-2016.05.001/m4/ltsugar.m40000644000312500001440000001044012717533401012267 00000000000000# ltsugar.m4 -- libtool m4 base layer. -*-Autoconf-*- # # Copyright (C) 2004-2005, 2007-2008, 2011-2015 Free Software # Foundation, Inc. # Written by Gary V. 
Vaughan, 2004 # # This file is free software; the Free Software Foundation gives # unlimited permission to copy and/or distribute it, with or without # modifications, as long as this notice is preserved. # serial 6 ltsugar.m4 # This is to help aclocal find these macros, as it can't see m4_define. AC_DEFUN([LTSUGAR_VERSION], [m4_if([0.1])]) # lt_join(SEP, ARG1, [ARG2...]) # ----------------------------- # Produce ARG1SEPARG2...SEPARGn, omitting [] arguments and their # associated separator. # Needed until we can rely on m4_join from Autoconf 2.62, since all earlier # versions in m4sugar had bugs. m4_define([lt_join], [m4_if([$#], [1], [], [$#], [2], [[$2]], [m4_if([$2], [], [], [[$2]_])$0([$1], m4_shift(m4_shift($@)))])]) m4_define([_lt_join], [m4_if([$#$2], [2], [], [m4_if([$2], [], [], [[$1$2]])$0([$1], m4_shift(m4_shift($@)))])]) # lt_car(LIST) # lt_cdr(LIST) # ------------ # Manipulate m4 lists. # These macros are necessary as long as we still need to support # Autoconf-2.59, which quotes differently. m4_define([lt_car], [[$1]]) m4_define([lt_cdr], [m4_if([$#], 0, [m4_fatal([$0: cannot be called without arguments])], [$#], 1, [], [m4_dquote(m4_shift($@))])]) m4_define([lt_unquote], $1) # lt_append(MACRO-NAME, STRING, [SEPARATOR]) # ------------------------------------------ # Redefine MACRO-NAME to hold its former content plus 'SEPARATOR''STRING'. # Note that neither SEPARATOR nor STRING are expanded; they are appended # to MACRO-NAME as is (leaving the expansion for when MACRO-NAME is invoked). # No SEPARATOR is output if MACRO-NAME was previously undefined (different # than defined and empty). # # This macro is needed until we can rely on Autoconf 2.62, since earlier # versions of m4sugar mistakenly expanded SEPARATOR but not STRING. 
m4_define([lt_append], [m4_define([$1], m4_ifdef([$1], [m4_defn([$1])[$3]])[$2])]) # lt_combine(SEP, PREFIX-LIST, INFIX, SUFFIX1, [SUFFIX2...]) # ---------------------------------------------------------- # Produce a SEP delimited list of all paired combinations of elements of # PREFIX-LIST with SUFFIX1 through SUFFIXn. Each element of the list # has the form PREFIXmINFIXSUFFIXn. # Needed until we can rely on m4_combine added in Autoconf 2.62. m4_define([lt_combine], [m4_if(m4_eval([$# > 3]), [1], [m4_pushdef([_Lt_sep], [m4_define([_Lt_sep], m4_defn([lt_car]))])]]dnl [[m4_foreach([_Lt_prefix], [$2], [m4_foreach([_Lt_suffix], ]m4_dquote(m4_dquote(m4_shift(m4_shift(m4_shift($@)))))[, [_Lt_sep([$1])[]m4_defn([_Lt_prefix])[$3]m4_defn([_Lt_suffix])])])])]) # lt_if_append_uniq(MACRO-NAME, VARNAME, [SEPARATOR], [UNIQ], [NOT-UNIQ]) # ----------------------------------------------------------------------- # Iff MACRO-NAME does not yet contain VARNAME, then append it (delimited # by SEPARATOR if supplied) and expand UNIQ, else NOT-UNIQ. 
m4_define([lt_if_append_uniq], [m4_ifdef([$1], [m4_if(m4_index([$3]m4_defn([$1])[$3], [$3$2$3]), [-1], [lt_append([$1], [$2], [$3])$4], [$5])], [lt_append([$1], [$2], [$3])$4])]) # lt_dict_add(DICT, KEY, VALUE) # ----------------------------- m4_define([lt_dict_add], [m4_define([$1($2)], [$3])]) # lt_dict_add_subkey(DICT, KEY, SUBKEY, VALUE) # -------------------------------------------- m4_define([lt_dict_add_subkey], [m4_define([$1($2:$3)], [$4])]) # lt_dict_fetch(DICT, KEY, [SUBKEY]) # ---------------------------------- m4_define([lt_dict_fetch], [m4_ifval([$3], m4_ifdef([$1($2:$3)], [m4_defn([$1($2:$3)])]), m4_ifdef([$1($2)], [m4_defn([$1($2)])]))]) # lt_if_dict_fetch(DICT, KEY, [SUBKEY], VALUE, IF-TRUE, [IF-FALSE]) # ----------------------------------------------------------------- m4_define([lt_if_dict_fetch], [m4_if(lt_dict_fetch([$1], [$2], [$3]), [$4], [$5], [$6])]) # lt_dict_filter(DICT, [SUBKEY], VALUE, [SEPARATOR], KEY, [...]) # -------------------------------------------------------------- m4_define([lt_dict_filter], [m4_if([$5], [], [], [lt_join(m4_quote(m4_default([$4], [[, ]])), lt_unquote(m4_split(m4_normalize(m4_foreach(_Lt_key, lt_car([m4_shiftn(4, $@)]), [lt_if_dict_fetch([$1], _Lt_key, [$2], [$3], [_Lt_key ])])))))])[]dnl ]) elpa-2016.05.001/m4/lt~obsolete.m40000644000312500001440000001377412717533401013175 00000000000000# lt~obsolete.m4 -- aclocal satisfying obsolete definitions. -*-Autoconf-*- # # Copyright (C) 2004-2005, 2007, 2009, 2011-2015 Free Software # Foundation, Inc. # Written by Scott James Remnant, 2004. # # This file is free software; the Free Software Foundation gives # unlimited permission to copy and/or distribute it, with or without # modifications, as long as this notice is preserved. # serial 5 lt~obsolete.m4 # These exist entirely to fool aclocal when bootstrapping libtool. 
# # In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN), # which have later been changed to m4_define as they aren't part of the # exported API, or moved to Autoconf or Automake where they belong. # # The trouble is, aclocal is a bit thick. It'll see the old AC_DEFUN # in /usr/share/aclocal/libtool.m4 and remember it, then when it sees us # using a macro with the same name in our local m4/libtool.m4 it'll # pull the old libtool.m4 in (it doesn't see our shiny new m4_define # and doesn't know about Autoconf macros at all.) # # So we provide this file, which has a silly filename so it's always # included after everything else. This provides aclocal with the # AC_DEFUNs it wants, but when m4 processes it, it doesn't do anything # because those macros already exist, or will be overwritten later. # We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6. # # Anytime we withdraw an AC_DEFUN or AU_DEFUN, remember to add it here. # Yes, that means every name once taken will need to remain here until # we give up compatibility with versions before 1.7, at which point # we need to keep only those names which we still refer to. # This is to help aclocal find these macros, as it can't see m4_define. 
AC_DEFUN([LTOBSOLETE_VERSION], [m4_if([1])]) m4_ifndef([AC_LIBTOOL_LINKER_OPTION], [AC_DEFUN([AC_LIBTOOL_LINKER_OPTION])]) m4_ifndef([AC_PROG_EGREP], [AC_DEFUN([AC_PROG_EGREP])]) m4_ifndef([_LT_AC_PROG_ECHO_BACKSLASH], [AC_DEFUN([_LT_AC_PROG_ECHO_BACKSLASH])]) m4_ifndef([_LT_AC_SHELL_INIT], [AC_DEFUN([_LT_AC_SHELL_INIT])]) m4_ifndef([_LT_AC_SYS_LIBPATH_AIX], [AC_DEFUN([_LT_AC_SYS_LIBPATH_AIX])]) m4_ifndef([_LT_PROG_LTMAIN], [AC_DEFUN([_LT_PROG_LTMAIN])]) m4_ifndef([_LT_AC_TAGVAR], [AC_DEFUN([_LT_AC_TAGVAR])]) m4_ifndef([AC_LTDL_ENABLE_INSTALL], [AC_DEFUN([AC_LTDL_ENABLE_INSTALL])]) m4_ifndef([AC_LTDL_PREOPEN], [AC_DEFUN([AC_LTDL_PREOPEN])]) m4_ifndef([_LT_AC_SYS_COMPILER], [AC_DEFUN([_LT_AC_SYS_COMPILER])]) m4_ifndef([_LT_AC_LOCK], [AC_DEFUN([_LT_AC_LOCK])]) m4_ifndef([AC_LIBTOOL_SYS_OLD_ARCHIVE], [AC_DEFUN([AC_LIBTOOL_SYS_OLD_ARCHIVE])]) m4_ifndef([_LT_AC_TRY_DLOPEN_SELF], [AC_DEFUN([_LT_AC_TRY_DLOPEN_SELF])]) m4_ifndef([AC_LIBTOOL_PROG_CC_C_O], [AC_DEFUN([AC_LIBTOOL_PROG_CC_C_O])]) m4_ifndef([AC_LIBTOOL_SYS_HARD_LINK_LOCKS], [AC_DEFUN([AC_LIBTOOL_SYS_HARD_LINK_LOCKS])]) m4_ifndef([AC_LIBTOOL_OBJDIR], [AC_DEFUN([AC_LIBTOOL_OBJDIR])]) m4_ifndef([AC_LTDL_OBJDIR], [AC_DEFUN([AC_LTDL_OBJDIR])]) m4_ifndef([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH], [AC_DEFUN([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH])]) m4_ifndef([AC_LIBTOOL_SYS_LIB_STRIP], [AC_DEFUN([AC_LIBTOOL_SYS_LIB_STRIP])]) m4_ifndef([AC_PATH_MAGIC], [AC_DEFUN([AC_PATH_MAGIC])]) m4_ifndef([AC_PROG_LD_GNU], [AC_DEFUN([AC_PROG_LD_GNU])]) m4_ifndef([AC_PROG_LD_RELOAD_FLAG], [AC_DEFUN([AC_PROG_LD_RELOAD_FLAG])]) m4_ifndef([AC_DEPLIBS_CHECK_METHOD], [AC_DEFUN([AC_DEPLIBS_CHECK_METHOD])]) m4_ifndef([AC_LIBTOOL_PROG_COMPILER_NO_RTTI], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_NO_RTTI])]) m4_ifndef([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE], [AC_DEFUN([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE])]) m4_ifndef([AC_LIBTOOL_PROG_COMPILER_PIC], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_PIC])]) m4_ifndef([AC_LIBTOOL_PROG_LD_SHLIBS], 
[AC_DEFUN([AC_LIBTOOL_PROG_LD_SHLIBS])]) m4_ifndef([AC_LIBTOOL_POSTDEP_PREDEP], [AC_DEFUN([AC_LIBTOOL_POSTDEP_PREDEP])]) m4_ifndef([LT_AC_PROG_EGREP], [AC_DEFUN([LT_AC_PROG_EGREP])]) m4_ifndef([LT_AC_PROG_SED], [AC_DEFUN([LT_AC_PROG_SED])]) m4_ifndef([_LT_CC_BASENAME], [AC_DEFUN([_LT_CC_BASENAME])]) m4_ifndef([_LT_COMPILER_BOILERPLATE], [AC_DEFUN([_LT_COMPILER_BOILERPLATE])]) m4_ifndef([_LT_LINKER_BOILERPLATE], [AC_DEFUN([_LT_LINKER_BOILERPLATE])]) m4_ifndef([_AC_PROG_LIBTOOL], [AC_DEFUN([_AC_PROG_LIBTOOL])]) m4_ifndef([AC_LIBTOOL_SETUP], [AC_DEFUN([AC_LIBTOOL_SETUP])]) m4_ifndef([_LT_AC_CHECK_DLFCN], [AC_DEFUN([_LT_AC_CHECK_DLFCN])]) m4_ifndef([AC_LIBTOOL_SYS_DYNAMIC_LINKER], [AC_DEFUN([AC_LIBTOOL_SYS_DYNAMIC_LINKER])]) m4_ifndef([_LT_AC_TAGCONFIG], [AC_DEFUN([_LT_AC_TAGCONFIG])]) m4_ifndef([AC_DISABLE_FAST_INSTALL], [AC_DEFUN([AC_DISABLE_FAST_INSTALL])]) m4_ifndef([_LT_AC_LANG_CXX], [AC_DEFUN([_LT_AC_LANG_CXX])]) m4_ifndef([_LT_AC_LANG_F77], [AC_DEFUN([_LT_AC_LANG_F77])]) m4_ifndef([_LT_AC_LANG_GCJ], [AC_DEFUN([_LT_AC_LANG_GCJ])]) m4_ifndef([AC_LIBTOOL_LANG_C_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_C_CONFIG])]) m4_ifndef([_LT_AC_LANG_C_CONFIG], [AC_DEFUN([_LT_AC_LANG_C_CONFIG])]) m4_ifndef([AC_LIBTOOL_LANG_CXX_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_CXX_CONFIG])]) m4_ifndef([_LT_AC_LANG_CXX_CONFIG], [AC_DEFUN([_LT_AC_LANG_CXX_CONFIG])]) m4_ifndef([AC_LIBTOOL_LANG_F77_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_F77_CONFIG])]) m4_ifndef([_LT_AC_LANG_F77_CONFIG], [AC_DEFUN([_LT_AC_LANG_F77_CONFIG])]) m4_ifndef([AC_LIBTOOL_LANG_GCJ_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_GCJ_CONFIG])]) m4_ifndef([_LT_AC_LANG_GCJ_CONFIG], [AC_DEFUN([_LT_AC_LANG_GCJ_CONFIG])]) m4_ifndef([AC_LIBTOOL_LANG_RC_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_RC_CONFIG])]) m4_ifndef([_LT_AC_LANG_RC_CONFIG], [AC_DEFUN([_LT_AC_LANG_RC_CONFIG])]) m4_ifndef([AC_LIBTOOL_CONFIG], [AC_DEFUN([AC_LIBTOOL_CONFIG])]) m4_ifndef([_LT_AC_FILE_LTDLL_C], [AC_DEFUN([_LT_AC_FILE_LTDLL_C])]) m4_ifndef([_LT_REQUIRED_DARWIN_CHECKS], 
[AC_DEFUN([_LT_REQUIRED_DARWIN_CHECKS])]) m4_ifndef([_LT_AC_PROG_CXXCPP], [AC_DEFUN([_LT_AC_PROG_CXXCPP])]) m4_ifndef([_LT_PREPARE_SED_QUOTE_VARS], [AC_DEFUN([_LT_PREPARE_SED_QUOTE_VARS])]) m4_ifndef([_LT_PROG_ECHO_BACKSLASH], [AC_DEFUN([_LT_PROG_ECHO_BACKSLASH])]) m4_ifndef([_LT_PROG_F77], [AC_DEFUN([_LT_PROG_F77])]) m4_ifndef([_LT_PROG_FC], [AC_DEFUN([_LT_PROG_FC])]) m4_ifndef([_LT_PROG_CXX], [AC_DEFUN([_LT_PROG_CXX])]) elpa-2016.05.001/m4/libtool.m40000644000312500001440000112475312717533401012270 00000000000000# libtool.m4 - Configure libtool for the host system. -*-Autoconf-*- # # Copyright (C) 1996-2001, 2003-2015 Free Software Foundation, Inc. # Written by Gordon Matzigkeit, 1996 # # This file is free software; the Free Software Foundation gives # unlimited permission to copy and/or distribute it, with or without # modifications, as long as this notice is preserved. m4_define([_LT_COPYING], [dnl # Copyright (C) 2014 Free Software Foundation, Inc. # This is free software; see the source for copying conditions. There is NO # warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # GNU Libtool is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of of the License, or # (at your option) any later version. # # As a special exception to the GNU General Public License, if you # distribute this file as part of a program or library that is built # using GNU Libtool, you may include this file under the same # distribution terms that you use for the rest of that program. # # GNU Libtool is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
]) # serial 58 LT_INIT # LT_PREREQ(VERSION) # ------------------ # Complain and exit if this libtool version is less than VERSION. m4_defun([LT_PREREQ], [m4_if(m4_version_compare(m4_defn([LT_PACKAGE_VERSION]), [$1]), -1, [m4_default([$3], [m4_fatal([Libtool version $1 or higher is required], 63)])], [$2])]) # _LT_CHECK_BUILDDIR # ------------------ # Complain if the absolute build directory name contains unusual characters m4_defun([_LT_CHECK_BUILDDIR], [case `pwd` in *\ * | *\ *) AC_MSG_WARN([Libtool does not cope well with whitespace in `pwd`]) ;; esac ]) # LT_INIT([OPTIONS]) # ------------------ AC_DEFUN([LT_INIT], [AC_PREREQ([2.62])dnl We use AC_PATH_PROGS_FEATURE_CHECK AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl AC_BEFORE([$0], [LT_LANG])dnl AC_BEFORE([$0], [LT_OUTPUT])dnl AC_BEFORE([$0], [LTDL_INIT])dnl m4_require([_LT_CHECK_BUILDDIR])dnl dnl Autoconf doesn't catch unexpanded LT_ macros by default: m4_pattern_forbid([^_?LT_[A-Z_]+$])dnl m4_pattern_allow([^(_LT_EOF|LT_DLGLOBAL|LT_DLLAZY_OR_NOW|LT_MULTI_MODULE)$])dnl dnl aclocal doesn't pull ltoptions.m4, ltsugar.m4, or ltversion.m4 dnl unless we require an AC_DEFUNed macro: AC_REQUIRE([LTOPTIONS_VERSION])dnl AC_REQUIRE([LTSUGAR_VERSION])dnl AC_REQUIRE([LTVERSION_VERSION])dnl AC_REQUIRE([LTOBSOLETE_VERSION])dnl m4_require([_LT_PROG_LTMAIN])dnl _LT_SHELL_INIT([SHELL=${CONFIG_SHELL-/bin/sh}]) dnl Parse OPTIONS _LT_SET_OPTIONS([$0], [$1]) # This can be used to rebuild libtool when needed LIBTOOL_DEPS=$ltmain # Always use our own libtool. LIBTOOL='$(SHELL) $(top_builddir)/libtool' AC_SUBST(LIBTOOL)dnl _LT_SETUP # Only expand once: m4_define([LT_INIT]) ])# LT_INIT # Old names: AU_ALIAS([AC_PROG_LIBTOOL], [LT_INIT]) AU_ALIAS([AM_PROG_LIBTOOL], [LT_INIT]) dnl aclocal-1.4 backwards compatibility: dnl AC_DEFUN([AC_PROG_LIBTOOL], []) dnl AC_DEFUN([AM_PROG_LIBTOOL], []) # _LT_PREPARE_CC_BASENAME # ----------------------- m4_defun([_LT_PREPARE_CC_BASENAME], [ # Calculate cc_basename. 
Skip known compiler wrappers and cross-prefix. func_cc_basename () { for cc_temp in @S|@*""; do case $cc_temp in compile | *[[\\/]]compile | ccache | *[[\\/]]ccache ) ;; distcc | *[[\\/]]distcc | purify | *[[\\/]]purify ) ;; \-*) ;; *) break;; esac done func_cc_basename_result=`$ECHO "$cc_temp" | $SED "s%.*/%%; s%^$host_alias-%%"` } ])# _LT_PREPARE_CC_BASENAME # _LT_CC_BASENAME(CC) # ------------------- # It would be clearer to call AC_REQUIREs from _LT_PREPARE_CC_BASENAME, # but that macro is also expanded into generated libtool script, which # arranges for $SED and $ECHO to be set by different means. m4_defun([_LT_CC_BASENAME], [m4_require([_LT_PREPARE_CC_BASENAME])dnl AC_REQUIRE([_LT_DECL_SED])dnl AC_REQUIRE([_LT_PROG_ECHO_BACKSLASH])dnl func_cc_basename $1 cc_basename=$func_cc_basename_result ]) # _LT_FILEUTILS_DEFAULTS # ---------------------- # It is okay to use these file commands and assume they have been set # sensibly after 'm4_require([_LT_FILEUTILS_DEFAULTS])'. m4_defun([_LT_FILEUTILS_DEFAULTS], [: ${CP="cp -f"} : ${MV="mv -f"} : ${RM="rm -f"} ])# _LT_FILEUTILS_DEFAULTS # _LT_SETUP # --------- m4_defun([_LT_SETUP], [AC_REQUIRE([AC_CANONICAL_HOST])dnl AC_REQUIRE([AC_CANONICAL_BUILD])dnl AC_REQUIRE([_LT_PREPARE_SED_QUOTE_VARS])dnl AC_REQUIRE([_LT_PROG_ECHO_BACKSLASH])dnl _LT_DECL([], [PATH_SEPARATOR], [1], [The PATH separator for the build system])dnl dnl _LT_DECL([], [host_alias], [0], [The host system])dnl _LT_DECL([], [host], [0])dnl _LT_DECL([], [host_os], [0])dnl dnl _LT_DECL([], [build_alias], [0], [The build system])dnl _LT_DECL([], [build], [0])dnl _LT_DECL([], [build_os], [0])dnl dnl AC_REQUIRE([AC_PROG_CC])dnl AC_REQUIRE([LT_PATH_LD])dnl AC_REQUIRE([LT_PATH_NM])dnl dnl AC_REQUIRE([AC_PROG_LN_S])dnl test -z "$LN_S" && LN_S="ln -s" _LT_DECL([], [LN_S], [1], [Whether we need soft or hard links])dnl dnl AC_REQUIRE([LT_CMD_MAX_LEN])dnl _LT_DECL([objext], [ac_objext], [0], [Object file suffix (normally "o")])dnl _LT_DECL([], [exeext], [0], [Executable 
file suffix (normally "")])dnl dnl m4_require([_LT_FILEUTILS_DEFAULTS])dnl m4_require([_LT_CHECK_SHELL_FEATURES])dnl m4_require([_LT_PATH_CONVERSION_FUNCTIONS])dnl m4_require([_LT_CMD_RELOAD])dnl m4_require([_LT_CHECK_MAGIC_METHOD])dnl m4_require([_LT_CHECK_SHAREDLIB_FROM_LINKLIB])dnl m4_require([_LT_CMD_OLD_ARCHIVE])dnl m4_require([_LT_CMD_GLOBAL_SYMBOLS])dnl m4_require([_LT_WITH_SYSROOT])dnl m4_require([_LT_CMD_TRUNCATE])dnl _LT_CONFIG_LIBTOOL_INIT([ # See if we are running on zsh, and set the options that allow our # commands through without removal of \ escapes INIT. if test -n "\${ZSH_VERSION+set}"; then setopt NO_GLOB_SUBST fi ]) if test -n "${ZSH_VERSION+set}"; then setopt NO_GLOB_SUBST fi _LT_CHECK_OBJDIR m4_require([_LT_TAG_COMPILER])dnl case $host_os in aix3*) # AIX sometimes has problems with the GCC collect2 program. For some # reason, if we set the COLLECT_NAMES environment variable, the problems # vanish in a puff of smoke. if test set != "${COLLECT_NAMES+set}"; then COLLECT_NAMES= export COLLECT_NAMES fi ;; esac # Global variables: ofile=libtool can_build_shared=yes # All known linkers require a '.a' archive for static linking (except MSVC, # which needs '.lib'). libext=a with_gnu_ld=$lt_cv_prog_gnu_ld old_CC=$CC old_CFLAGS=$CFLAGS # Set sane defaults for various variables test -z "$CC" && CC=cc test -z "$LTCC" && LTCC=$CC test -z "$LTCFLAGS" && LTCFLAGS=$CFLAGS test -z "$LD" && LD=ld test -z "$ac_objext" && ac_objext=o _LT_CC_BASENAME([$compiler]) # Only perform the check for file, if the check method requires it test -z "$MAGIC_CMD" && MAGIC_CMD=file case $deplibs_check_method in file_magic*) if test "$file_magic_cmd" = '$MAGIC_CMD'; then _LT_PATH_MAGIC fi ;; esac # Use C for the default configuration in the libtool script LT_SUPPORTED_TAG([CC]) _LT_LANG_C_CONFIG _LT_LANG_DEFAULT_CONFIG _LT_CONFIG_COMMANDS ])# _LT_SETUP # _LT_PREPARE_SED_QUOTE_VARS # -------------------------- # Define a few sed substitution that help us do robust quoting. 
m4_defun([_LT_PREPARE_SED_QUOTE_VARS], [# Backslashify metacharacters that are still active within # double-quoted strings. sed_quote_subst='s/\([["`$\\]]\)/\\\1/g' # Same as above, but do not quote variable references. double_quote_subst='s/\([["`\\]]\)/\\\1/g' # Sed substitution to delay expansion of an escaped shell variable in a # double_quote_subst'ed string. delay_variable_subst='s/\\\\\\\\\\\$/\\\\\\$/g' # Sed substitution to delay expansion of an escaped single quote. delay_single_quote_subst='s/'\''/'\'\\\\\\\'\''/g' # Sed substitution to avoid accidental globbing in evaled expressions no_glob_subst='s/\*/\\\*/g' ]) # _LT_PROG_LTMAIN # --------------- # Note that this code is called both from 'configure', and 'config.status' # now that we use AC_CONFIG_COMMANDS to generate libtool. Notably, # 'config.status' has no value for ac_aux_dir unless we are using Automake, # so we pass a copy along to make sure it has a sensible value anyway. m4_defun([_LT_PROG_LTMAIN], [m4_ifdef([AC_REQUIRE_AUX_FILE], [AC_REQUIRE_AUX_FILE([ltmain.sh])])dnl _LT_CONFIG_LIBTOOL_INIT([ac_aux_dir='$ac_aux_dir']) ltmain=$ac_aux_dir/ltmain.sh ])# _LT_PROG_LTMAIN ## ------------------------------------- ## ## Accumulate code for creating libtool. ## ## ------------------------------------- ## # So that we can recreate a full libtool script including additional # tags, we accumulate the chunks of code to send to AC_CONFIG_COMMANDS # in macros and then make a single call at the end using the 'libtool' # label. # _LT_CONFIG_LIBTOOL_INIT([INIT-COMMANDS]) # ---------------------------------------- # Register INIT-COMMANDS to be passed to AC_CONFIG_COMMANDS later. m4_define([_LT_CONFIG_LIBTOOL_INIT], [m4_ifval([$1], [m4_append([_LT_OUTPUT_LIBTOOL_INIT], [$1 ])])]) # Initialize. m4_define([_LT_OUTPUT_LIBTOOL_INIT]) # _LT_CONFIG_LIBTOOL([COMMANDS]) # ------------------------------ # Register COMMANDS to be passed to AC_CONFIG_COMMANDS later. 
m4_define([_LT_CONFIG_LIBTOOL], [m4_ifval([$1], [m4_append([_LT_OUTPUT_LIBTOOL_COMMANDS], [$1 ])])]) # Initialize. m4_define([_LT_OUTPUT_LIBTOOL_COMMANDS]) # _LT_CONFIG_SAVE_COMMANDS([COMMANDS], [INIT_COMMANDS]) # ----------------------------------------------------- m4_defun([_LT_CONFIG_SAVE_COMMANDS], [_LT_CONFIG_LIBTOOL([$1]) _LT_CONFIG_LIBTOOL_INIT([$2]) ]) # _LT_FORMAT_COMMENT([COMMENT]) # ----------------------------- # Add leading comment marks to the start of each line, and a trailing # full-stop to the whole comment if one is not present already. m4_define([_LT_FORMAT_COMMENT], [m4_ifval([$1], [ m4_bpatsubst([m4_bpatsubst([$1], [^ *], [# ])], [['`$\]], [\\\&])]m4_bmatch([$1], [[!?.]$], [], [.]) )]) ## ------------------------ ## ## FIXME: Eliminate VARNAME ## ## ------------------------ ## # _LT_DECL([CONFIGNAME], VARNAME, VALUE, [DESCRIPTION], [IS-TAGGED?]) # ------------------------------------------------------------------- # CONFIGNAME is the name given to the value in the libtool script. # VARNAME is the (base) name used in the configure script. # VALUE may be 0, 1 or 2 for a computed quote escaped value based on # VARNAME. Any other value will be used directly. 
m4_define([_LT_DECL], [lt_if_append_uniq([lt_decl_varnames], [$2], [, ], [lt_dict_add_subkey([lt_decl_dict], [$2], [libtool_name], [m4_ifval([$1], [$1], [$2])]) lt_dict_add_subkey([lt_decl_dict], [$2], [value], [$3]) m4_ifval([$4], [lt_dict_add_subkey([lt_decl_dict], [$2], [description], [$4])]) lt_dict_add_subkey([lt_decl_dict], [$2], [tagged?], [m4_ifval([$5], [yes], [no])])]) ]) # _LT_TAGDECL([CONFIGNAME], VARNAME, VALUE, [DESCRIPTION]) # -------------------------------------------------------- m4_define([_LT_TAGDECL], [_LT_DECL([$1], [$2], [$3], [$4], [yes])]) # lt_decl_tag_varnames([SEPARATOR], [VARNAME1...]) # ------------------------------------------------ m4_define([lt_decl_tag_varnames], [_lt_decl_filter([tagged?], [yes], $@)]) # _lt_decl_filter(SUBKEY, VALUE, [SEPARATOR], [VARNAME1..]) # --------------------------------------------------------- m4_define([_lt_decl_filter], [m4_case([$#], [0], [m4_fatal([$0: too few arguments: $#])], [1], [m4_fatal([$0: too few arguments: $#: $1])], [2], [lt_dict_filter([lt_decl_dict], [$1], [$2], [], lt_decl_varnames)], [3], [lt_dict_filter([lt_decl_dict], [$1], [$2], [$3], lt_decl_varnames)], [lt_dict_filter([lt_decl_dict], $@)])[]dnl ]) # lt_decl_quote_varnames([SEPARATOR], [VARNAME1...]) # -------------------------------------------------- m4_define([lt_decl_quote_varnames], [_lt_decl_filter([value], [1], $@)]) # lt_decl_dquote_varnames([SEPARATOR], [VARNAME1...]) # --------------------------------------------------- m4_define([lt_decl_dquote_varnames], [_lt_decl_filter([value], [2], $@)]) # lt_decl_varnames_tagged([SEPARATOR], [VARNAME1...]) # --------------------------------------------------- m4_define([lt_decl_varnames_tagged], [m4_assert([$# <= 2])dnl _$0(m4_quote(m4_default([$1], [[, ]])), m4_ifval([$2], [[$2]], [m4_dquote(lt_decl_tag_varnames)]), m4_split(m4_normalize(m4_quote(_LT_TAGS)), [ ]))]) m4_define([_lt_decl_varnames_tagged], [m4_ifval([$3], [lt_combine([$1], [$2], [_], $3)])]) # 
lt_decl_all_varnames([SEPARATOR], [VARNAME1...]) # ------------------------------------------------ m4_define([lt_decl_all_varnames], [_$0(m4_quote(m4_default([$1], [[, ]])), m4_if([$2], [], m4_quote(lt_decl_varnames), m4_quote(m4_shift($@))))[]dnl ]) m4_define([_lt_decl_all_varnames], [lt_join($@, lt_decl_varnames_tagged([$1], lt_decl_tag_varnames([[, ]], m4_shift($@))))dnl ]) # _LT_CONFIG_STATUS_DECLARE([VARNAME]) # ------------------------------------ # Quote a variable value, and forward it to 'config.status' so that its # declaration there will have the same value as in 'configure'. VARNAME # must have a single quote delimited value for this to work. m4_define([_LT_CONFIG_STATUS_DECLARE], [$1='`$ECHO "$][$1" | $SED "$delay_single_quote_subst"`']) # _LT_CONFIG_STATUS_DECLARATIONS # ------------------------------ # We delimit libtool config variables with single quotes, so when # we write them to config.status, we have to be sure to quote all # embedded single quotes properly. In configure, this macro expands # each variable declared with _LT_DECL (and _LT_TAGDECL) into: # # VARNAME='`$ECHO "$VARNAME" | $SED "$delay_single_quote_subst"`' m4_defun([_LT_CONFIG_STATUS_DECLARATIONS], [m4_foreach([_lt_var], m4_quote(lt_decl_all_varnames), [m4_n([_LT_CONFIG_STATUS_DECLARE(_lt_var)])])]) # _LT_LIBTOOL_TAGS # ---------------- # Output comment and list of tags supported by the script m4_defun([_LT_LIBTOOL_TAGS], [_LT_FORMAT_COMMENT([The names of the tagged configurations supported by this script])dnl available_tags='_LT_TAGS'dnl ]) # _LT_LIBTOOL_DECLARE(VARNAME, [TAG]) # ----------------------------------- # Extract the dictionary values for VARNAME (optionally with TAG) and # expand to a commented shell variable setting: # # # Some comment about what VAR is for. 
#    visible_name=$lt_internal_name
#
# The right-hand side depends on the 'value' subkey recorded for VARNAME:
# 0 gives '$VARNAME', 1 or 2 give '$lt_VARNAME', anything else is emitted
# literally.  When TAG is given, '_TAG' is appended to the result.
m4_define([_LT_LIBTOOL_DECLARE],
[_LT_FORMAT_COMMENT(m4_quote(lt_dict_fetch([lt_decl_dict], [$1],
                                           [description])))[]dnl
m4_pushdef([_libtool_name],
    m4_quote(lt_dict_fetch([lt_decl_dict], [$1], [libtool_name])))[]dnl
m4_case(m4_quote(lt_dict_fetch([lt_decl_dict], [$1], [value])),
    [0], [_libtool_name=[$]$1],
    [1], [_libtool_name=$lt_[]$1],
    [2], [_libtool_name=$lt_[]$1],
    [_libtool_name=lt_dict_fetch([lt_decl_dict], [$1], [value])])[]dnl
m4_ifval([$2], [_$2])[]m4_popdef([_libtool_name])[]dnl
])


# _LT_LIBTOOL_CONFIG_VARS
# -----------------------
# Produce commented declarations of non-tagged libtool config variables
# suitable for insertion in the LIBTOOL CONFIG section of the 'libtool'
# script.  Tagged libtool config variables (even for the LIBTOOL CONFIG
# section) are produced by _LT_LIBTOOL_TAG_VARS.
m4_defun([_LT_LIBTOOL_CONFIG_VARS],
[m4_foreach([_lt_var],
    m4_quote(_lt_decl_filter([tagged?], [no], [], lt_decl_varnames)),
    [m4_n([_LT_LIBTOOL_DECLARE(_lt_var)])])])


# _LT_LIBTOOL_TAG_VARS(TAG)
# -------------------------
# Expand _LT_LIBTOOL_DECLARE for every tagged variable name, passing TAG
# through as the optional second argument.
m4_define([_LT_LIBTOOL_TAG_VARS],
[m4_foreach([_lt_var], m4_quote(lt_decl_tag_varnames),
    [m4_n([_LT_LIBTOOL_DECLARE(_lt_var, [$1])])])])


# _LT_TAGVAR(VARNAME, [TAGNAME])
# ------------------------------
# Expand to VARNAME_TAGNAME when TAGNAME is non-empty, otherwise to
# plain VARNAME.
m4_define([_LT_TAGVAR], [m4_ifval([$2], [$1_$2], [$1])])


# _LT_CONFIG_COMMANDS
# -------------------
# Send accumulated output to $CONFIG_STATUS.  Thanks to the lists of
# variables for single and double quote escaping we saved from calls
# to _LT_DECL, we can put quote escaped variables declarations
# into 'config.status', and then the shell code to quote escape them in
# for loops in 'config.status'.  Finally, any additional code accumulated
# from calls to _LT_CONFIG_LIBTOOL_INIT is expanded.
m4_defun([_LT_CONFIG_COMMANDS],
[AC_PROVIDE_IFELSE([LT_OUTPUT],
        dnl If the libtool generation code has been placed in $CONFIG_LT,
        dnl instead of duplicating it all over again into config.status,
        dnl then we will have config.status run $CONFIG_LT later, so it
        dnl needs to know what name is stored there:
        [AC_CONFIG_COMMANDS([libtool],
            [$SHELL $CONFIG_LT || AS_EXIT(1)], [CONFIG_LT='$CONFIG_LT'])],
    dnl If the libtool generation code is destined for config.status,
    dnl expand the accumulated commands and init code now:
    [AC_CONFIG_COMMANDS([libtool],
        [_LT_OUTPUT_LIBTOOL_COMMANDS], [_LT_OUTPUT_LIBTOOL_COMMANDS_INIT])])
])#_LT_CONFIG_COMMANDS


# _LT_OUTPUT_LIBTOOL_COMMANDS_INIT
# --------------------------------
# Initialize.  Expands to the shell prologue that re-declares the libtool
# config variables, defines func_fallback_echo, and fills in the
# 'lt_<var>' copies of the variables selected by lt_decl_quote_varnames
# and lt_decl_dquote_varnames.  The separator passed to
# lt_decl_all_varnames is a space followed by backslash-newline, so the
# backslash must stay the last character on its line.
m4_define([_LT_OUTPUT_LIBTOOL_COMMANDS_INIT],
[

# The HP-UX ksh and POSIX shell print the target directory to stdout
# if CDPATH is set.
(unset CDPATH) >/dev/null 2>&1 && unset CDPATH

sed_quote_subst='$sed_quote_subst'
double_quote_subst='$double_quote_subst'
delay_variable_subst='$delay_variable_subst'
_LT_CONFIG_STATUS_DECLARATIONS
LTCC='$LTCC'
LTCFLAGS='$LTCFLAGS'
compiler='$compiler_DEFAULT'

# A function that is used when there is no print builtin or printf.
func_fallback_echo ()
{
  eval 'cat <<_LTECHO_EOF
\$[]1
_LTECHO_EOF'
}

# Quote evaled strings.
for var in lt_decl_all_varnames([[ \
]], lt_decl_quote_varnames); do
    case \`eval \\\\\$ECHO \\\\""\\\\\$\$var"\\\\"\` in
    *[[\\\\\\\`\\"\\\$]]*)
      eval "lt_\$var=\\\\\\"\\\`\\\$ECHO \\"\\\$\$var\\" | \\\$SED \\"\\\$sed_quote_subst\\"\\\`\\\\\\"" ## exclude from sc_prohibit_nested_quotes
      ;;
    *)
      eval "lt_\$var=\\\\\\"\\\$\$var\\\\\\""
      ;;
    esac
done

# Double-quote double-evaled strings.
for var in lt_decl_all_varnames([[ \
]], lt_decl_dquote_varnames); do
    case \`eval \\\\\$ECHO \\\\""\\\\\$\$var"\\\\"\` in
    *[[\\\\\\\`\\"\\\$]]*)
      eval "lt_\$var=\\\\\\"\\\`\\\$ECHO \\"\\\$\$var\\" | \\\$SED -e \\"\\\$double_quote_subst\\" -e \\"\\\$sed_quote_subst\\" -e \\"\\\$delay_variable_subst\\"\\\`\\\\\\"" ## exclude from sc_prohibit_nested_quotes
      ;;
    *)
      eval "lt_\$var=\\\\\\"\\\$\$var\\\\\\""
      ;;
    esac
done

_LT_OUTPUT_LIBTOOL_INIT
])

# _LT_GENERATED_FILE_INIT(FILE, [COMMENT])
# ------------------------------------
# Generate a child script FILE with all initialization necessary to
# reuse the environment learned by the parent script, and make the
# file executable.  If COMMENT is supplied, it is inserted after the
# '#!' sequence but before initialization text begins.  After this
# macro, additional text can be appended to FILE to form the body of
# the child script.  The macro ends with non-zero status if the
# file could not be fully written (such as if the disk is full).
# When AS_INIT_GENERATED is defined it is used directly; otherwise the
# fallback definition below writes the script with two here-documents.
m4_ifdef([AS_INIT_GENERATED],
[m4_defun([_LT_GENERATED_FILE_INIT],[AS_INIT_GENERATED($@)])],
[m4_defun([_LT_GENERATED_FILE_INIT],
[m4_require([AS_PREPARE])]dnl
[m4_pushdef([AS_MESSAGE_LOG_FD])]dnl
[lt_write_fail=0
cat >$1 <<_ASEOF || lt_write_fail=1
#! $SHELL
# Generated by $as_me.
$2
SHELL=\${CONFIG_SHELL-$SHELL}
export SHELL
_ASEOF
cat >>$1 <<\_ASEOF || lt_write_fail=1
AS_SHELL_SANITIZE
_AS_PREPARE
exec AS_MESSAGE_FD>&1
_ASEOF
test 0 = "$lt_write_fail" && chmod +x $1[]dnl
m4_popdef([AS_MESSAGE_LOG_FD])])])# _LT_GENERATED_FILE_INIT

# LT_OUTPUT
# ---------
# This macro allows early generation of the libtool script (before
# AC_OUTPUT is called), in case it is used in configure for compilation
# tests.
# Implementation note: LT_OUTPUT writes the child script $CONFIG_LT
# (default ./config.lt) in three here-document steps (option parsing,
# the expanded init code, the libtool generation commands) and then
# runs it with $SHELL, exiting with status 1 if that run fails.
AC_DEFUN([LT_OUTPUT],
[: ${CONFIG_LT=./config.lt}
AC_MSG_NOTICE([creating $CONFIG_LT])
_LT_GENERATED_FILE_INIT(["$CONFIG_LT"],
[# Run this file to recreate a libtool stub with the current configuration.])

cat >>"$CONFIG_LT" <<\_LTEOF
lt_cl_silent=false
exec AS_MESSAGE_LOG_FD>>config.log
{
  echo
  AS_BOX([Running $as_me.])
} >&AS_MESSAGE_LOG_FD

lt_cl_help="\
'$as_me' creates a local libtool stub from the current configuration,
for use in further configure time tests before the real libtool is
generated.

Usage: $[0] [[OPTIONS]]

  -h, --help      print this help, then exit
  -V, --version   print version number, then exit
  -q, --quiet     do not print progress messages
  -d, --debug     don't remove temporary files

Report bugs to <bug-libtool@gnu.org>."

lt_cl_version="\
m4_ifset([AC_PACKAGE_NAME], [AC_PACKAGE_NAME ])config.lt[]dnl
m4_ifset([AC_PACKAGE_VERSION], [ AC_PACKAGE_VERSION])
configured by $[0], generated by m4_PACKAGE_STRING.

Copyright (C) 2011 Free Software Foundation, Inc.
This config.lt script is free software; the Free Software Foundation
gives unlimited permission to copy, distribute and modify it."

while test 0 != $[#]
do
  case $[1] in
    --version | --v* | -V )
      echo "$lt_cl_version"; exit 0 ;;
    --help | --h* | -h )
      echo "$lt_cl_help"; exit 0 ;;
    --debug | --d* | -d )
      debug=: ;;
    --quiet | --q* | --silent | --s* | -q )
      lt_cl_silent=: ;;

    -*) AC_MSG_ERROR([unrecognized option: $[1]
Try '$[0] --help' for more information.]) ;;

    *) AC_MSG_ERROR([unrecognized argument: $[1]
Try '$[0] --help' for more information.]) ;;
  esac
  shift
done

if $lt_cl_silent; then
  exec AS_MESSAGE_FD>/dev/null
fi
_LTEOF

cat >>"$CONFIG_LT" <<_LTEOF
_LT_OUTPUT_LIBTOOL_COMMANDS_INIT
_LTEOF

cat >>"$CONFIG_LT" <<\_LTEOF
AC_MSG_NOTICE([creating $ofile])
_LT_OUTPUT_LIBTOOL_COMMANDS
AS_EXIT(0)
_LTEOF
chmod +x "$CONFIG_LT"

# configure is writing to config.log, but config.lt does its own redirection,
# appending to config.log, which fails on DOS, as config.log is still kept
# open by configure.  Here we exec the FD to /dev/null, effectively closing
# config.log, so it can be properly (re)opened and appended to by config.lt.
lt_cl_success=:
test yes = "$silent" &&
  lt_config_lt_args="$lt_config_lt_args --quiet"
exec AS_MESSAGE_LOG_FD>/dev/null
$SHELL "$CONFIG_LT" $lt_config_lt_args || lt_cl_success=false
exec AS_MESSAGE_LOG_FD>>config.log
$lt_cl_success || AS_EXIT(1)
])# LT_OUTPUT


# _LT_CONFIG(TAG)
# ---------------
# If TAG is the built-in tag, create an initial libtool script with a
# default configuration from the untagged config vars.  Otherwise add code
# to config.status for appending the configuration named by TAG from the
# matching tagged config vars.
# An empty TAG is treated as the built-in tag 'C'.  The here-document
# bodies and their _LT_EOF delimiters must start in column one.
m4_defun([_LT_CONFIG],
[m4_require([_LT_FILEUTILS_DEFAULTS])dnl
_LT_CONFIG_SAVE_COMMANDS([
  m4_define([_LT_TAG], m4_if([$1], [], [C], [$1]))dnl
  m4_if(_LT_TAG, [C], [
    # See if we are running on zsh, and set the options that allow our
    # commands through without removal of \ escapes.
    if test -n "${ZSH_VERSION+set}"; then
      setopt NO_GLOB_SUBST
    fi

    cfgfile=${ofile}T
    trap "$RM \"$cfgfile\"; exit 1" 1 2 15
    $RM "$cfgfile"

    cat <<_LT_EOF >> "$cfgfile"
#! $SHELL
# Generated automatically by $as_me ($PACKAGE) $VERSION
# NOTE: Changes made to this file will be lost: look at ltmain.sh.

# Provide generalized library-building support services.
# Written by Gordon Matzigkeit, 1996

_LT_COPYING
_LT_LIBTOOL_TAGS

# Configured defaults for sys_lib_dlsearch_path munging.
: \${LT_SYS_LIBRARY_PATH="$configure_time_lt_sys_library_path"}

# ### BEGIN LIBTOOL CONFIG
_LT_LIBTOOL_CONFIG_VARS
_LT_LIBTOOL_TAG_VARS
# ### END LIBTOOL CONFIG

_LT_EOF

    cat <<'_LT_EOF' >> "$cfgfile"

# ### BEGIN FUNCTIONS SHARED WITH CONFIGURE

_LT_PREPARE_MUNGE_PATH_LIST
_LT_PREPARE_CC_BASENAME

# ### END FUNCTIONS SHARED WITH CONFIGURE

_LT_EOF

  case $host_os in
  aix3*)
    cat <<\_LT_EOF >> "$cfgfile"
# AIX sometimes has problems with the GCC collect2 program.  For some
# reason, if we set the COLLECT_NAMES environment variable, the problems
# vanish in a puff of smoke.
if test set != "${COLLECT_NAMES+set}"; then
  COLLECT_NAMES=
  export COLLECT_NAMES
fi
_LT_EOF
    ;;
  esac

  _LT_PROG_LTMAIN

  # We use sed instead of cat because bash on DJGPP gets confused if
  # it finds mixed CR/LF and LF-only lines.  Since sed operates in
  # text mode, it properly converts lines to CR/LF.  This bash problem
  # is reportedly fixed, but why not run on old versions too?
  sed '$q' "$ltmain" >> "$cfgfile" \
     || (rm -f "$cfgfile"; exit 1)

  mv -f "$cfgfile" "$ofile" ||
    (rm -f "$ofile" && cp "$cfgfile" "$ofile" && rm -f "$cfgfile")
  chmod +x "$ofile"
],
[cat <<_LT_EOF >> "$ofile"

dnl Unfortunately we have to use $1 here, since _LT_TAG is not expanded
dnl in a comment (ie after a #).
# ### BEGIN LIBTOOL TAG CONFIG: $1
_LT_LIBTOOL_TAG_VARS(_LT_TAG)
# ### END LIBTOOL TAG CONFIG: $1
_LT_EOF
])dnl /m4_if
],
[m4_if([$1], [], [
    PACKAGE='$PACKAGE'
    VERSION='$VERSION'
    RM='$RM'
    ofile='$ofile'], [])
])dnl /_LT_CONFIG_SAVE_COMMANDS
])# _LT_CONFIG


# LT_SUPPORTED_TAG(TAG)
# ---------------------
# Trace this macro to discover what tags are supported by the libtool
# --tag option, using:
#    autoconf --trace 'LT_SUPPORTED_TAG:$1'
AC_DEFUN([LT_SUPPORTED_TAG], [])


# C support is built-in for now
m4_define([_LT_LANG_C_enabled], [])
m4_define([_LT_TAGS], [])


# LT_LANG(LANG)
# -------------
# Enable libtool support for the given language if not already enabled.
# Implementation note: LT_LANG maps the user-visible language name onto
# the internal tag name (C++ -> CXX, Java -> GCJ, Fortran 77 -> F77,
# Fortran -> FC, Windows Resource -> RC).  Any other name is accepted
# only if a macro named _LT_LANG_<name>_CONFIG is defined; otherwise
# m4_fatal is raised.
AC_DEFUN([LT_LANG],
[AC_BEFORE([$0], [LT_OUTPUT])dnl
m4_case([$1],
  [C],                  [_LT_LANG(C)],
  [C++],                [_LT_LANG(CXX)],
  [Go],                 [_LT_LANG(GO)],
  [Java],               [_LT_LANG(GCJ)],
  [Fortran 77],         [_LT_LANG(F77)],
  [Fortran],            [_LT_LANG(FC)],
  [Windows Resource],   [_LT_LANG(RC)],
  [m4_ifdef([_LT_LANG_]$1[_CONFIG],
    [_LT_LANG($1)],
    [m4_fatal([$0: unsupported language: "$1"])])])dnl
])# LT_LANG


# _LT_LANG(LANGNAME)
# ------------------
# Unless _LT_LANG_<LANGNAME>_enabled is already defined: announce the tag
# through LT_SUPPORTED_TAG, append it to _LT_TAGS, define the 'enabled'
# marker, and expand _LT_LANG_<LANGNAME>_CONFIG.
m4_defun([_LT_LANG],
[m4_ifdef([_LT_LANG_]$1[_enabled], [],
  [LT_SUPPORTED_TAG([$1])dnl
  m4_append([_LT_TAGS], [$1 ])dnl
  m4_define([_LT_LANG_]$1[_enabled], [])dnl
  _LT_LANG_$1_CONFIG($1)])dnl
])# _LT_LANG


m4_ifndef([AC_PROG_GO], [
############################################################
# NOTE: This macro has been submitted for inclusion into   #
#  GNU Autoconf as AC_PROG_GO.  When it is available in    #
#  a released version of Autoconf we should remove this    #
#  macro and use it instead.                               #
############################################################
m4_defun([AC_PROG_GO],
[AC_LANG_PUSH(Go)dnl
AC_ARG_VAR([GOC],     [Go compiler command])dnl
AC_ARG_VAR([GOFLAGS], [Go compiler flags])dnl
_AC_ARG_VAR_LDFLAGS()dnl
AC_CHECK_TOOL(GOC, gccgo)
if test -z "$GOC"; then
  if test -n "$ac_tool_prefix"; then
    AC_CHECK_PROG(GOC, [${ac_tool_prefix}gccgo], [${ac_tool_prefix}gccgo])
  fi
fi
if test -z "$GOC"; then
  AC_CHECK_PROG(GOC, gccgo, gccgo, false)
fi
])#m4_defun
])#m4_ifndef


# _LT_LANG_DEFAULT_CONFIG
# -----------------------
# For each optional language: if its compiler-detection macro has
# already been provided, enable the language now; otherwise redefine
# that macro so that it expands LT_LANG(<tag>) after its original body.
m4_defun([_LT_LANG_DEFAULT_CONFIG],
[AC_PROVIDE_IFELSE([AC_PROG_CXX],
  [LT_LANG(CXX)],
  [m4_define([AC_PROG_CXX], defn([AC_PROG_CXX])[LT_LANG(CXX)])])

AC_PROVIDE_IFELSE([AC_PROG_F77],
  [LT_LANG(F77)],
  [m4_define([AC_PROG_F77], defn([AC_PROG_F77])[LT_LANG(F77)])])

AC_PROVIDE_IFELSE([AC_PROG_FC],
  [LT_LANG(FC)],
  [m4_define([AC_PROG_FC], defn([AC_PROG_FC])[LT_LANG(FC)])])

dnl The call to [A][M_PROG_GCJ] is quoted like that to stop aclocal
dnl pulling things in needlessly.
AC_PROVIDE_IFELSE([AC_PROG_GCJ],
  [LT_LANG(GCJ)],
  [AC_PROVIDE_IFELSE([A][M_PROG_GCJ],
    [LT_LANG(GCJ)],
    [AC_PROVIDE_IFELSE([LT_PROG_GCJ],
      [LT_LANG(GCJ)],
      [m4_ifdef([AC_PROG_GCJ],
        [m4_define([AC_PROG_GCJ], defn([AC_PROG_GCJ])[LT_LANG(GCJ)])])
       m4_ifdef([A][M_PROG_GCJ],
        [m4_define([A][M_PROG_GCJ], defn([A][M_PROG_GCJ])[LT_LANG(GCJ)])])
       m4_ifdef([LT_PROG_GCJ],
        [m4_define([LT_PROG_GCJ], defn([LT_PROG_GCJ])[LT_LANG(GCJ)])])])])])

AC_PROVIDE_IFELSE([AC_PROG_GO],
  [LT_LANG(GO)],
  [m4_define([AC_PROG_GO], defn([AC_PROG_GO])[LT_LANG(GO)])])

AC_PROVIDE_IFELSE([LT_PROG_RC],
  [LT_LANG(RC)],
  [m4_define([LT_PROG_RC], defn([LT_PROG_RC])[LT_LANG(RC)])])
])# _LT_LANG_DEFAULT_CONFIG

# Obsolete macros:
AU_DEFUN([AC_LIBTOOL_CXX], [LT_LANG(C++)])
AU_DEFUN([AC_LIBTOOL_F77], [LT_LANG(Fortran 77)])
AU_DEFUN([AC_LIBTOOL_FC], [LT_LANG(Fortran)])
AU_DEFUN([AC_LIBTOOL_GCJ], [LT_LANG(Java)])
AU_DEFUN([AC_LIBTOOL_RC], [LT_LANG(Windows Resource)])
dnl aclocal-1.4 backwards compatibility:
dnl AC_DEFUN([AC_LIBTOOL_CXX], [])
dnl AC_DEFUN([AC_LIBTOOL_F77], [])
dnl AC_DEFUN([AC_LIBTOOL_FC], [])
dnl AC_DEFUN([AC_LIBTOOL_GCJ], [])
dnl AC_DEFUN([AC_LIBTOOL_RC], [])


# _LT_TAG_COMPILER
# ----------------
# Declare the compiler-related config variables and default LTCC,
# LTCFLAGS and 'compiler' from CC and CFLAGS.
m4_defun([_LT_TAG_COMPILER],
[AC_REQUIRE([AC_PROG_CC])dnl

_LT_DECL([LTCC], [CC], [1], [A C compiler])dnl
_LT_DECL([LTCFLAGS], [CFLAGS], [1], [LTCC compiler flags])dnl
_LT_TAGDECL([CC], [compiler], [1], [A language specific compiler])dnl
_LT_TAGDECL([with_gcc], [GCC], [0], [Is the compiler the GNU compiler?])dnl

# If no C compiler was specified, use CC.
LTCC=${LTCC-"$CC"}

# If no C compiler flags were specified, use CFLAGS.
LTCFLAGS=${LTCFLAGS-"$CFLAGS"}

# Allow CC to be a program name with arguments.
compiler=$CC
])# _LT_TAG_COMPILER


# _LT_COMPILER_BOILERPLATE
# ------------------------
# Check for compiler boilerplate output or warnings with
# the simple compiler test code.
m4_defun([_LT_COMPILER_BOILERPLATE],
[m4_require([_LT_DECL_SED])dnl
ac_outfile=conftest.$ac_objext
echo "$lt_simple_compile_test_code" >conftest.$ac_ext
eval "$ac_compile" 2>&1 >/dev/null | $SED '/^$/d; /^ *+/d' >conftest.err
_lt_compiler_boilerplate=`cat conftest.err`
$RM conftest*
])# _LT_COMPILER_BOILERPLATE


# _LT_LINKER_BOILERPLATE
# ----------------------
# Check for linker boilerplate output or warnings with
# the simple link test code.
m4_defun([_LT_LINKER_BOILERPLATE],
[m4_require([_LT_DECL_SED])dnl
ac_outfile=conftest.$ac_objext
echo "$lt_simple_link_test_code" >conftest.$ac_ext
eval "$ac_link" 2>&1 >/dev/null | $SED '/^$/d; /^ *+/d' >conftest.err
_lt_linker_boilerplate=`cat conftest.err`
$RM -r conftest*
])# _LT_LINKER_BOILERPLATE

# _LT_REQUIRED_DARWIN_CHECKS
# -------------------------
# On rhapsody*/darwin* hosts only: locate the Mach-O helper tools, probe
# the linker for -single_module, -exported_symbols_list and -force_load,
# and derive the _lt_dar_* shell variables used by
# _LT_DARWIN_LINKER_FEATURES.  Expanded at most once (m4_defun_once).
m4_defun_once([_LT_REQUIRED_DARWIN_CHECKS],[
  case $host_os in
    rhapsody* | darwin*)
    AC_CHECK_TOOL([DSYMUTIL], [dsymutil], [:])
    AC_CHECK_TOOL([NMEDIT], [nmedit], [:])
    AC_CHECK_TOOL([LIPO], [lipo], [:])
    AC_CHECK_TOOL([OTOOL], [otool], [:])
    AC_CHECK_TOOL([OTOOL64], [otool64], [:])
    _LT_DECL([], [DSYMUTIL], [1],
      [Tool to manipulate archived DWARF debug symbol files on Mac OS X])
    _LT_DECL([], [NMEDIT], [1],
      [Tool to change global to local symbols on Mac OS X])
    _LT_DECL([], [LIPO], [1],
      [Tool to manipulate fat objects and archives on Mac OS X])
    _LT_DECL([], [OTOOL], [1],
      [ldd/readelf like tool for Mach-O binaries on Mac OS X])
    _LT_DECL([], [OTOOL64], [1],
      [ldd/readelf like tool for 64 bit Mach-O binaries on Mac OS X 10.4])

    AC_CACHE_CHECK([for -single_module linker flag],[lt_cv_apple_cc_single_mod],
      [lt_cv_apple_cc_single_mod=no
      if test -z "$LT_MULTI_MODULE"; then
        # By default we will add the -single_module flag. You can override
        # by either setting the environment variable LT_MULTI_MODULE
        # non-empty at configure time, or by adding -multi_module to the
        # link flags.
        rm -rf libconftest.dylib*
        echo "int foo(void){return 1;}" > conftest.c
        echo "$LTCC $LTCFLAGS $LDFLAGS -o libconftest.dylib \
-dynamiclib -Wl,-single_module conftest.c" >&AS_MESSAGE_LOG_FD
        $LTCC $LTCFLAGS $LDFLAGS -o libconftest.dylib \
          -dynamiclib -Wl,-single_module conftest.c 2>conftest.err
        _lt_result=$?
        # If there is a non-empty error log, and "single_module"
        # appears in it, assume the flag caused a linker warning
        if test -s conftest.err && $GREP single_module conftest.err; then
          cat conftest.err >&AS_MESSAGE_LOG_FD
        # Otherwise, if the output was created with a 0 exit code from
        # the compiler, it worked.
        elif test -f libconftest.dylib && test 0 = "$_lt_result"; then
          lt_cv_apple_cc_single_mod=yes
        else
          cat conftest.err >&AS_MESSAGE_LOG_FD
        fi
        rm -rf libconftest.dylib*
        rm -f conftest.*
      fi])

    AC_CACHE_CHECK([for -exported_symbols_list linker flag],
      [lt_cv_ld_exported_symbols_list],
      [lt_cv_ld_exported_symbols_list=no
      save_LDFLAGS=$LDFLAGS
      echo "_main" > conftest.sym
      LDFLAGS="$LDFLAGS -Wl,-exported_symbols_list,conftest.sym"
      AC_LINK_IFELSE([AC_LANG_PROGRAM([],[])],
        [lt_cv_ld_exported_symbols_list=yes],
        [lt_cv_ld_exported_symbols_list=no])
      LDFLAGS=$save_LDFLAGS
    ])

    AC_CACHE_CHECK([for -force_load linker flag],[lt_cv_ld_force_load],
      [lt_cv_ld_force_load=no
      cat > conftest.c << _LT_EOF
int forced_loaded() { return 2;}
_LT_EOF
      echo "$LTCC $LTCFLAGS -c -o conftest.o conftest.c" >&AS_MESSAGE_LOG_FD
      $LTCC $LTCFLAGS -c -o conftest.o conftest.c 2>&AS_MESSAGE_LOG_FD
      echo "$AR cru libconftest.a conftest.o" >&AS_MESSAGE_LOG_FD
      $AR cru libconftest.a conftest.o 2>&AS_MESSAGE_LOG_FD
      echo "$RANLIB libconftest.a" >&AS_MESSAGE_LOG_FD
      $RANLIB libconftest.a 2>&AS_MESSAGE_LOG_FD
      cat > conftest.c << _LT_EOF
int main() { return 0;}
_LT_EOF
      echo "$LTCC $LTCFLAGS $LDFLAGS -o conftest conftest.c -Wl,-force_load,./libconftest.a" >&AS_MESSAGE_LOG_FD
      $LTCC $LTCFLAGS $LDFLAGS -o conftest conftest.c -Wl,-force_load,./libconftest.a 2>conftest.err
      _lt_result=$?
      if test -s conftest.err && $GREP force_load conftest.err; then
        cat conftest.err >&AS_MESSAGE_LOG_FD
      elif test -f conftest && test 0 = "$_lt_result" && $GREP forced_load conftest >/dev/null 2>&1; then
        lt_cv_ld_force_load=yes
      else
        cat conftest.err >&AS_MESSAGE_LOG_FD
      fi
      rm -f conftest.err libconftest.a conftest conftest.c
      rm -rf conftest.dSYM
    ])
    case $host_os in
    rhapsody* | darwin1.[[012]])
      _lt_dar_allow_undefined='$wl-undefined ${wl}suppress' ;;
    darwin1.*)
      _lt_dar_allow_undefined='$wl-flat_namespace $wl-undefined ${wl}suppress' ;;
    darwin*) # darwin 5.x on
      # if running on 10.5 or later, the deployment target defaults
      # to the OS version, if on x86, and 10.4, the deployment
      # target defaults to 10.4. Don't you love it?
      case ${MACOSX_DEPLOYMENT_TARGET-10.0},$host in
        10.0,*86*-darwin8*|10.0,*-darwin[[91]]*)
          _lt_dar_allow_undefined='$wl-undefined ${wl}dynamic_lookup' ;;
        10.[[012]][[,.]]*)
          _lt_dar_allow_undefined='$wl-flat_namespace $wl-undefined ${wl}suppress' ;;
        10.*)
          _lt_dar_allow_undefined='$wl-undefined ${wl}dynamic_lookup' ;;
        *)
          # Deployment targets that do not start with "10." (11.0 and
          # later) previously matched no pattern and left the flag empty.
          _lt_dar_allow_undefined='$wl-undefined ${wl}dynamic_lookup' ;;
      esac
    ;;
  esac
    if test yes = "$lt_cv_apple_cc_single_mod"; then
      _lt_dar_single_mod='$single_module'
    fi
    if test yes = "$lt_cv_ld_exported_symbols_list"; then
      _lt_dar_export_syms=' $wl-exported_symbols_list,$output_objdir/$libname-symbols.expsym'
    else
      _lt_dar_export_syms='~$NMEDIT -s $output_objdir/$libname-symbols.expsym $lib'
    fi
    if test : != "$DSYMUTIL" && test no = "$lt_cv_ld_force_load"; then
      _lt_dsymutil='~$DSYMUTIL $lib || :'
    else
      _lt_dsymutil=
    fi
    ;;
  esac
])


# _LT_DARWIN_LINKER_FEATURES([TAG])
# ---------------------------------
# Checks for linker and compiler features on darwin
# Sets the per-TAG link variables from the results of
# _LT_REQUIRED_DARWIN_CHECKS.  Shared libraries are enabled when
# $cc_basename matches ifort* or nagfor*, or when $GCC is 'yes';
# otherwise ld_shlibs is set to 'no'.
m4_defun([_LT_DARWIN_LINKER_FEATURES],
[
  m4_require([_LT_REQUIRED_DARWIN_CHECKS])
  _LT_TAGVAR(archive_cmds_need_lc, $1)=no
  _LT_TAGVAR(hardcode_direct, $1)=no
  _LT_TAGVAR(hardcode_automatic, $1)=yes
  _LT_TAGVAR(hardcode_shlibpath_var, $1)=unsupported
  if test yes = "$lt_cv_ld_force_load"; then
    _LT_TAGVAR(whole_archive_flag_spec, $1)='`for conv in $convenience\"\"; do test -n \"$conv\" && new_convenience=\"$new_convenience $wl-force_load,$conv\"; done; func_echo_all \"$new_convenience\"`'
    m4_case([$1], [F77], [_LT_TAGVAR(compiler_needs_object, $1)=yes],
                  [FC],  [_LT_TAGVAR(compiler_needs_object, $1)=yes])
  else
    _LT_TAGVAR(whole_archive_flag_spec, $1)=''
  fi
  _LT_TAGVAR(link_all_deplibs, $1)=yes
  _LT_TAGVAR(allow_undefined_flag, $1)=$_lt_dar_allow_undefined
  case $cc_basename in
     ifort*|nagfor*) _lt_dar_can_shared=yes ;;
     *) _lt_dar_can_shared=$GCC ;;
  esac
  if test yes = "$_lt_dar_can_shared"; then
    output_verbose_link_cmd=func_echo_all
    _LT_TAGVAR(archive_cmds, $1)="\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring $_lt_dar_single_mod$_lt_dsymutil"
    _LT_TAGVAR(module_cmds, $1)="\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags$_lt_dsymutil"
    _LT_TAGVAR(archive_expsym_cmds, $1)="sed 's|^|_|' < \$export_symbols > \$output_objdir/\$libname-symbols.expsym~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring $_lt_dar_single_mod$_lt_dar_export_syms$_lt_dsymutil"
    _LT_TAGVAR(module_expsym_cmds, $1)="sed -e 's|^|_|' < \$export_symbols > \$output_objdir/\$libname-symbols.expsym~\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags$_lt_dar_export_syms$_lt_dsymutil"
    m4_if([$1], [CXX],
[   if test yes != "$lt_cv_apple_cc_single_mod"; then
      _LT_TAGVAR(archive_cmds, $1)="\$CC -r -keep_private_externs -nostdlib -o \$lib-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$lib-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring$_lt_dsymutil"
      _LT_TAGVAR(archive_expsym_cmds, $1)="sed 's|^|_|' < \$export_symbols > \$output_objdir/\$libname-symbols.expsym~\$CC -r -keep_private_externs -nostdlib -o \$lib-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$lib-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring$_lt_dar_export_syms$_lt_dsymutil"
    fi
],[])
  else
  _LT_TAGVAR(ld_shlibs, $1)=no
  fi
])

# _LT_SYS_MODULE_PATH_AIX([TAGNAME])
# ----------------------------------
# Links a minimal program and checks the executable
# for the system default hardcoded library path. In most cases,
# this is /usr/lib:/lib, but when the MPI compilers are used
# the location of the communication and MPI libs are included too.
# If we don't find anything, use the default library path according
# to the aix ld manual.
# Store the results from the different compilers for each TAGNAME.
# Allow to override them for all tags through lt_cv_aix_libpath.
# The sed script is kept one command per line so that the closing
# braces are not on the same line as the 'p' command.
m4_defun([_LT_SYS_MODULE_PATH_AIX],
[m4_require([_LT_DECL_SED])dnl
if test set = "${lt_cv_aix_libpath+set}"; then
  aix_libpath=$lt_cv_aix_libpath
else
  AC_CACHE_VAL([_LT_TAGVAR([lt_cv_aix_libpath_], [$1])],
  [AC_LINK_IFELSE([AC_LANG_PROGRAM],[
  lt_aix_libpath_sed='[
      /Import File Strings/,/^$/ {
          /^0/ {
              s/^0  *\([^ ]*\) *$/\1/
              p
          }
      }]'
  _LT_TAGVAR([lt_cv_aix_libpath_], [$1])=`dump -H conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
  # Check for a 64-bit object if we didn't find anything.
  if test -z "$_LT_TAGVAR([lt_cv_aix_libpath_], [$1])"; then
    _LT_TAGVAR([lt_cv_aix_libpath_], [$1])=`dump -HX64 conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
  fi],[])
  if test -z "$_LT_TAGVAR([lt_cv_aix_libpath_], [$1])"; then
    _LT_TAGVAR([lt_cv_aix_libpath_], [$1])=/usr/lib:/lib
  fi
  ])
  aix_libpath=$_LT_TAGVAR([lt_cv_aix_libpath_], [$1])
fi
])# _LT_SYS_MODULE_PATH_AIX


# _LT_SHELL_INIT(ARG)
# -------------------
# Divert ARG, followed by a newline, into the M4SH-INIT diversion.
m4_define([_LT_SHELL_INIT],
[m4_divert_text([M4SH-INIT], [$1
])])# _LT_SHELL_INIT



# _LT_PROG_ECHO_BACKSLASH
# -----------------------
# Find how we can fake an echo command that does not interpret backslash.
# In particular, with Autoconf 2.60 or later we add some code to the start # of the generated configure script that will find a shell with a builtin # printf (that we can use as an echo command). m4_defun([_LT_PROG_ECHO_BACKSLASH], [ECHO='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO$ECHO AC_MSG_CHECKING([how to print strings]) # Test print first, because it will be a builtin if present. if test "X`( print -r -- -n ) 2>/dev/null`" = X-n && \ test "X`print -r -- $ECHO 2>/dev/null`" = "X$ECHO"; then ECHO='print -r --' elif test "X`printf %s $ECHO 2>/dev/null`" = "X$ECHO"; then ECHO='printf %s\n' else # Use this function as a fallback that always works. func_fallback_echo () { eval 'cat <<_LTECHO_EOF $[]1 _LTECHO_EOF' } ECHO='func_fallback_echo' fi # func_echo_all arg... # Invoke $ECHO with all args, space-separated. func_echo_all () { $ECHO "$*" } case $ECHO in printf*) AC_MSG_RESULT([printf]) ;; print*) AC_MSG_RESULT([print -r]) ;; *) AC_MSG_RESULT([cat]) ;; esac m4_ifdef([_AS_DETECT_SUGGESTED], [_AS_DETECT_SUGGESTED([ test -n "${ZSH_VERSION+set}${BASH_VERSION+set}" || ( ECHO='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO$ECHO PATH=/empty FPATH=/empty; export PATH FPATH test "X`printf %s $ECHO`" = "X$ECHO" \ || test "X`print -r -- $ECHO`" = "X$ECHO" )])]) _LT_DECL([], [SHELL], [1], [Shell to use when invoking shell scripts]) _LT_DECL([], [ECHO], [1], [An echo program that protects backslashes]) ])# _LT_PROG_ECHO_BACKSLASH # _LT_WITH_SYSROOT # ---------------- AC_DEFUN([_LT_WITH_SYSROOT], [AC_MSG_CHECKING([for sysroot]) AC_ARG_WITH([sysroot], [AS_HELP_STRING([--with-sysroot@<:@=DIR@:>@], [Search for dependent libraries within DIR (or the compiler's sysroot if not specified).])], [], [with_sysroot=no]) dnl lt_sysroot will always 
be passed unquoted. We quote it here dnl in case the user passed a directory name. lt_sysroot= case $with_sysroot in #( yes) if test yes = "$GCC"; then lt_sysroot=`$CC --print-sysroot 2>/dev/null` fi ;; #( /*) lt_sysroot=`echo "$with_sysroot" | sed -e "$sed_quote_subst"` ;; #( no|'') ;; #( *) AC_MSG_RESULT([$with_sysroot]) AC_MSG_ERROR([The sysroot must be an absolute path.]) ;; esac AC_MSG_RESULT([${lt_sysroot:-no}]) _LT_DECL([], [lt_sysroot], [0], [The root where to search for ]dnl [dependent libraries, and where our libraries should be installed.])]) # _LT_ENABLE_LOCK # --------------- m4_defun([_LT_ENABLE_LOCK], [AC_ARG_ENABLE([libtool-lock], [AS_HELP_STRING([--disable-libtool-lock], [avoid locking (might break parallel builds)])]) test no = "$enable_libtool_lock" || enable_libtool_lock=yes # Some flags need to be propagated to the compiler or linker for good # libtool support. case $host in ia64-*-hpux*) # Find out what ABI is being produced by ac_compile, and set mode # options accordingly. echo 'int i;' > conftest.$ac_ext if AC_TRY_EVAL(ac_compile); then case `/usr/bin/file conftest.$ac_objext` in *ELF-32*) HPUX_IA64_MODE=32 ;; *ELF-64*) HPUX_IA64_MODE=64 ;; esac fi rm -rf conftest* ;; *-*-irix6*) # Find out what ABI is being produced by ac_compile, and set linker # options accordingly. echo '[#]line '$LINENO' "configure"' > conftest.$ac_ext if AC_TRY_EVAL(ac_compile); then if test yes = "$lt_cv_prog_gnu_ld"; then case `/usr/bin/file conftest.$ac_objext` in *32-bit*) LD="${LD-ld} -melf32bsmip" ;; *N32*) LD="${LD-ld} -melf32bmipn32" ;; *64-bit*) LD="${LD-ld} -melf64bmip" ;; esac else case `/usr/bin/file conftest.$ac_objext` in *32-bit*) LD="${LD-ld} -32" ;; *N32*) LD="${LD-ld} -n32" ;; *64-bit*) LD="${LD-ld} -64" ;; esac fi fi rm -rf conftest* ;; mips64*-*linux*) # Find out what ABI is being produced by ac_compile, and set linker # options accordingly. 
echo '[#]line '$LINENO' "configure"' > conftest.$ac_ext if AC_TRY_EVAL(ac_compile); then emul=elf case `/usr/bin/file conftest.$ac_objext` in *32-bit*) emul="${emul}32" ;; *64-bit*) emul="${emul}64" ;; esac case `/usr/bin/file conftest.$ac_objext` in *MSB*) emul="${emul}btsmip" ;; *LSB*) emul="${emul}ltsmip" ;; esac case `/usr/bin/file conftest.$ac_objext` in *N32*) emul="${emul}n32" ;; esac LD="${LD-ld} -m $emul" fi rm -rf conftest* ;; x86_64-*kfreebsd*-gnu|x86_64-*linux*|powerpc*-*linux*| \ s390*-*linux*|s390*-*tpf*|sparc*-*linux*) # Find out what ABI is being produced by ac_compile, and set linker # options accordingly. Note that the listed cases only cover the # situations where additional linker options are needed (such as when # doing 32-bit compilation for a host where ld defaults to 64-bit, or # vice versa); the common cases where no linker options are needed do # not appear in the list. echo 'int i;' > conftest.$ac_ext if AC_TRY_EVAL(ac_compile); then case `/usr/bin/file conftest.o` in *32-bit*) case $host in x86_64-*kfreebsd*-gnu) LD="${LD-ld} -m elf_i386_fbsd" ;; x86_64-*linux*) case `/usr/bin/file conftest.o` in *x86-64*) LD="${LD-ld} -m elf32_x86_64" ;; *) LD="${LD-ld} -m elf_i386" ;; esac ;; powerpc64le-*linux*) LD="${LD-ld} -m elf32lppclinux" ;; powerpc64-*linux*) LD="${LD-ld} -m elf32ppclinux" ;; s390x-*linux*) LD="${LD-ld} -m elf_s390" ;; sparc64-*linux*) LD="${LD-ld} -m elf32_sparc" ;; esac ;; *64-bit*) case $host in x86_64-*kfreebsd*-gnu) LD="${LD-ld} -m elf_x86_64_fbsd" ;; x86_64-*linux*) LD="${LD-ld} -m elf_x86_64" ;; powerpcle-*linux*) LD="${LD-ld} -m elf64lppc" ;; powerpc-*linux*) LD="${LD-ld} -m elf64ppc" ;; s390*-*linux*|s390*-*tpf*) LD="${LD-ld} -m elf64_s390" ;; sparc*-*linux*) LD="${LD-ld} -m elf64_sparc" ;; esac ;; esac fi rm -rf conftest* ;; *-*-sco3.2v5*) # On SCO OpenServer 5, we need -belf to get full-featured binaries. 
SAVE_CFLAGS=$CFLAGS CFLAGS="$CFLAGS -belf" AC_CACHE_CHECK([whether the C compiler needs -belf], lt_cv_cc_needs_belf, [AC_LANG_PUSH(C) AC_LINK_IFELSE([AC_LANG_PROGRAM([[]],[[]])],[lt_cv_cc_needs_belf=yes],[lt_cv_cc_needs_belf=no]) AC_LANG_POP]) if test yes != "$lt_cv_cc_needs_belf"; then # this is probably gcc 2.8.0, egcs 1.0 or newer; no need for -belf CFLAGS=$SAVE_CFLAGS fi ;; *-*solaris*) # Find out what ABI is being produced by ac_compile, and set linker # options accordingly. echo 'int i;' > conftest.$ac_ext if AC_TRY_EVAL(ac_compile); then case `/usr/bin/file conftest.o` in *64-bit*) case $lt_cv_prog_gnu_ld in yes*) case $host in i?86-*-solaris*|x86_64-*-solaris*) LD="${LD-ld} -m elf_x86_64" ;; sparc*-*-solaris*) LD="${LD-ld} -m elf64_sparc" ;; esac # GNU ld 2.21 introduced _sol2 emulations. Use them if available. if ${LD-ld} -V | grep _sol2 >/dev/null 2>&1; then LD=${LD-ld}_sol2 fi ;; *) if ${LD-ld} -64 -r -o conftest2.o conftest.o >/dev/null 2>&1; then LD="${LD-ld} -64" fi ;; esac ;; esac fi rm -rf conftest* ;; esac need_locks=$enable_libtool_lock ])# _LT_ENABLE_LOCK # _LT_PROG_AR # ----------- m4_defun([_LT_PROG_AR], [AC_CHECK_TOOLS(AR, [ar], false) : ${AR=ar} : ${AR_FLAGS=cru} _LT_DECL([], [AR], [1], [The archiver]) _LT_DECL([], [AR_FLAGS], [1], [Flags to create an archive]) AC_CACHE_CHECK([for archiver @FILE support], [lt_cv_ar_at_file], [lt_cv_ar_at_file=no AC_COMPILE_IFELSE([AC_LANG_PROGRAM], [echo conftest.$ac_objext > conftest.lst lt_ar_try='$AR $AR_FLAGS libconftest.a @conftest.lst >&AS_MESSAGE_LOG_FD' AC_TRY_EVAL([lt_ar_try]) if test 0 -eq "$ac_status"; then # Ensure the archiver fails upon bogus file names. 
rm -f conftest.$ac_objext libconftest.a
        AC_TRY_EVAL([lt_ar_try])
        # The listed object is gone now, so a failure here shows that the
        # archiver really reads the listing file.
        if test 0 -ne "$ac_status"; then
          lt_cv_ar_at_file=@
        fi
      fi
      rm -f conftest.* libconftest.a
     ])
  ])

if test no = "$lt_cv_ar_at_file"; then
  archiver_list_spec=
else
  archiver_list_spec=$lt_cv_ar_at_file
fi
_LT_DECL([], [archiver_list_spec], [1],
  [How to feed a file listing to the archiver])
])# _LT_PROG_AR


# _LT_CMD_OLD_ARCHIVE
# -------------------
# Set up STRIP and RANLIB (both fall back to the no-op ":"), then compose
# the command strings used for old-style static archives.  Individual
# commands inside old_archive_cmds and old_postinstall_cmds are joined
# with "~".  lock_old_archive_extraction is yes on darwin hosts and no
# everywhere else.
m4_defun([_LT_CMD_OLD_ARCHIVE],
[_LT_PROG_AR

AC_CHECK_TOOL(STRIP, strip, :)
test -z "$STRIP" && STRIP=:
_LT_DECL([], [STRIP], [1], [A symbol stripping program])

AC_CHECK_TOOL(RANLIB, ranlib, :)
test -z "$RANLIB" && RANLIB=:
_LT_DECL([], [RANLIB], [1],
    [Commands used to install an old-style archive])

# Determine commands to create old-style static archives.
old_archive_cmds='$AR $AR_FLAGS $oldlib$oldobjs'
old_postinstall_cmds='chmod 644 $oldlib'
old_postuninstall_cmds=

if test -n "$RANLIB"; then
  case $host_os in
  bitrig* | openbsd*)
    old_postinstall_cmds="$old_postinstall_cmds~\$RANLIB -t \$tool_oldlib"
    ;;
  *)
    old_postinstall_cmds="$old_postinstall_cmds~\$RANLIB \$tool_oldlib"
    ;;
  esac
  old_archive_cmds="$old_archive_cmds~\$RANLIB \$tool_oldlib"
fi

case $host_os in
  darwin*)
    lock_old_archive_extraction=yes ;;
  *)
    lock_old_archive_extraction=no ;;
esac
_LT_DECL([], [old_postinstall_cmds], [2])
_LT_DECL([], [old_postuninstall_cmds], [2])
_LT_TAGDECL([], [old_archive_cmds], [2],
    [Commands used to build an old-style archive])
_LT_DECL([], [lock_old_archive_extraction], [0],
    [Whether to use a lock for old archive extraction])
])# _LT_CMD_OLD_ARCHIVE


# _LT_COMPILER_OPTION(MESSAGE, VARIABLE-NAME, FLAGS,
#		[OUTPUT-FILE], [ACTION-SUCCESS], [ACTION-FAILURE])
# ----------------------------------------------------------------
# Check whether the given compiler option works
#
# VARIABLE-NAME is the cache variable; it starts as "no" and becomes "yes"
# only when the compile succeeds, produces a non-empty OUTPUT-FILE
# (conftest.$ac_objext when OUTPUT-FILE is omitted) and emits no
# diagnostics beyond the compiler's usual boilerplate.  ACTION-SUCCESS or
# ACTION-FAILURE is then run; each defaults to the no-op ":".
AC_DEFUN([_LT_COMPILER_OPTION],
[m4_require([_LT_FILEUTILS_DEFAULTS])dnl
m4_require([_LT_DECL_SED])dnl
AC_CACHE_CHECK([$1], [$2],
  [$2=no
   m4_if([$4], , [ac_outfile=conftest.$ac_objext], [ac_outfile=$4])
   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
   lt_compiler_flag="$3"  ## exclude from sc_useless_quotes_in_assignment
   # Insert the option either (1) after the last *FLAGS variable, or
   # (2) before a word containing "conftest.", or (3) at the end.
   # Note that $ac_compile itself does not contain backslashes and begins
   # with a dollar sign (not a hyphen), so the echo should work correctly.
   # The option is referenced via a variable to avoid confusing sed.
   lt_compile=`echo "$ac_compile" | $SED \
   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
   -e 's: [[^ ]]*conftest\.: $lt_compiler_flag&:; t' \
   -e 's:$: $lt_compiler_flag:'`
   # Log the command, run it with diagnostics captured in conftest.err,
   # then log those diagnostics and the exit status.
   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&AS_MESSAGE_LOG_FD)
   (eval "$lt_compile" 2>conftest.err)
   ac_status=$?
   cat conftest.err >&AS_MESSAGE_LOG_FD
   echo "$as_me:$LINENO: \$? = $ac_status" >&AS_MESSAGE_LOG_FD
   if (exit $ac_status) && test -s "$ac_outfile"; then
     # The compiler can only warn and ignore the option if not recognized
     # So say no if there are warnings other than the usual output.
     # conftest.exp is the expected boilerplate without blank lines and
     # conftest.er2 the actual diagnostics without blank lines or lines
     # that start with optional spaces followed by a plus sign.
     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' >conftest.exp
     $SED '/^$/d; /^ *+/d' conftest.err >conftest.er2
     if test ! -s conftest.er2 || diff conftest.exp conftest.er2 >/dev/null; then
       $2=yes
     fi
   fi
   $RM conftest*
])

if test yes = "[$]$2"; then
    m4_if([$5], , :, [$5])
else
    m4_if([$6], , :, [$6])
fi
])# _LT_COMPILER_OPTION

# Old name:
AU_ALIAS([AC_LIBTOOL_COMPILER_OPTION], [_LT_COMPILER_OPTION])
dnl aclocal-1.4 backwards compatibility:
dnl AC_DEFUN([AC_LIBTOOL_COMPILER_OPTION], [])


# _LT_LINKER_OPTION(MESSAGE, VARIABLE-NAME, FLAGS,
#                  [ACTION-SUCCESS], [ACTION-FAILURE])
# ----------------------------------------------------
# Check whether the given linker option works
#
# FLAGS are appended to LDFLAGS for the duration of the test link; the
# previous LDFLAGS value is kept in save_LDFLAGS.
AC_DEFUN([_LT_LINKER_OPTION],
[m4_require([_LT_FILEUTILS_DEFAULTS])dnl
m4_require([_LT_DECL_SED])dnl
AC_CACHE_CHECK([$1], [$2],
  [$2=no
   save_LDFLAGS=$LDFLAGS
   LDFLAGS="$LDFLAGS $3"
   echo "$lt_simple_link_test_code" > conftest.$ac_ext
   if (eval $ac_link 2>conftest.err) && test -s conftest$ac_exeext; then
     # The linker can only warn and ignore the option if not recognized
     # So say no if there are warnings
     if test -s conftest.err; then
       # Append any errors to the config.log.
cat conftest.err 1>&AS_MESSAGE_LOG_FD
       # Accept the option only when the diagnostics match the linker's
       # usual boilerplate once blank lines are dropped.
       $ECHO "$_lt_linker_boilerplate" | $SED '/^$/d' > conftest.exp
       $SED '/^$/d; /^ *+/d' conftest.err >conftest.er2
       if diff conftest.exp conftest.er2 >/dev/null; then
         $2=yes
       fi
     else
       $2=yes
     fi
   fi
   $RM -r conftest*
   LDFLAGS=$save_LDFLAGS
])

if test yes = "[$]$2"; then
    m4_if([$4], , :, [$4])
else
    m4_if([$5], , :, [$5])
fi
])# _LT_LINKER_OPTION

# Old name:
AU_ALIAS([AC_LIBTOOL_LINKER_OPTION], [_LT_LINKER_OPTION])
dnl aclocal-1.4 backwards compatibility:
dnl AC_DEFUN([AC_LIBTOOL_LINKER_OPTION], [])


# LT_CMD_MAX_LEN
#---------------
# Determine the maximum command line length (cached in
# lt_cv_sys_max_cmd_len and exported as max_cmd_len).  Build systems
# listed in the case statement get a fixed or system-reported value; all
# others ask getconf for ARG_MAX and, failing that, grow a test string
# until echoing it no longer works.
AC_DEFUN([LT_CMD_MAX_LEN],
[AC_REQUIRE([AC_CANONICAL_HOST])dnl
# find the maximum length of command line arguments
AC_MSG_CHECKING([the maximum length of command line arguments])
AC_CACHE_VAL([lt_cv_sys_max_cmd_len], [dnl
  i=0
  teststring=ABCD

  case $build_os in
  msdosdjgpp*)
    # On DJGPP, this test can blow up pretty badly due to problems in libc
    # (any single argument exceeding 2000 bytes causes a buffer overrun
    # during glob expansion). Even if it were fixed, the result of this
    # check would be larger than it should be.
    lt_cv_sys_max_cmd_len=12288;    # 12K is about right
    ;;

  gnu*)
    # Under GNU Hurd, this test is not required because there is
    # no limit to the length of command line arguments.
    # Libtool will interpret -1 as no limit whatsoever
    lt_cv_sys_max_cmd_len=-1;
    ;;

  cygwin* | mingw* | cegcc*)
    # On Win9x/ME, this test blows up -- it succeeds, but takes
    # about 5 minutes as the teststring grows exponentially.
    # Worse, since 9x/ME are not pre-emptively multitasking,
    # you end up with a "frozen" computer, even though with patience
    # the test eventually succeeds (with a max line length of 256k).
    # Instead, let's just punt: use the minimum linelength reported by
    # all of the supported platforms: 8192 (on NT/2K/XP).
    lt_cv_sys_max_cmd_len=8192;
    ;;

  mint*)
    # On MiNT this can take a long time and run out of memory.
    lt_cv_sys_max_cmd_len=8192;
    ;;

  amigaos*)
    # On AmigaOS with pdksh, this test takes hours, literally.
    # So we just punt and use a minimum line length of 8192.
    lt_cv_sys_max_cmd_len=8192;
    ;;

  bitrig* | darwin* | dragonfly* | freebsd* | netbsd* | openbsd*)
    # This has been around since 386BSD, at least. Likely further.
    if test -x /sbin/sysctl; then
      lt_cv_sys_max_cmd_len=`/sbin/sysctl -n kern.argmax`
    elif test -x /usr/sbin/sysctl; then
      lt_cv_sys_max_cmd_len=`/usr/sbin/sysctl -n kern.argmax`
    else
      lt_cv_sys_max_cmd_len=65536	# usable default for all BSDs
    fi
    # And add a safety zone: keep three quarters of the reported limit
    lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \/ 4`
    lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \* 3`
    ;;

  interix*)
    # We know the value 262144 and hardcode it with a safety zone (like BSD)
    lt_cv_sys_max_cmd_len=196608
    ;;

  os2*)
    # The test takes a long time on OS/2.
    lt_cv_sys_max_cmd_len=8192
    ;;

  osf*)
    # Dr. Hans Ekkehard Plesser reports seeing a kernel panic running configure
    # due to this test when exec_disable_arg_limit is 1 on Tru64. It is not
    # nice to cause kernel panics so lets avoid the loop below.
    # First set a reasonable default.
    lt_cv_sys_max_cmd_len=16384
    #
    if test -x /sbin/sysconfig; then
      case `/sbin/sysconfig -q proc exec_disable_arg_limit` in
        *1*) lt_cv_sys_max_cmd_len=-1 ;;
      esac
    fi
    ;;
  sco3.2v5*)
    lt_cv_sys_max_cmd_len=102400
    ;;
  sysv5* | sco5v6* | sysv4.2uw2*)
    # Take the last field of the ARG_MAX line in the stune file when
    # there is one, otherwise fall back to 32768.
    kargmax=`grep ARG_MAX /etc/conf/cf.d/stune 2>/dev/null`
    if test -n "$kargmax"; then
      lt_cv_sys_max_cmd_len=`echo $kargmax | sed 's/.*[[ ]]//'`
    else
      lt_cv_sys_max_cmd_len=32768
    fi
    ;;
  *)
    lt_cv_sys_max_cmd_len=`(getconf ARG_MAX) 2> /dev/null`
    if test -n "$lt_cv_sys_max_cmd_len" && \
       test undefined != "$lt_cv_sys_max_cmd_len"; then
      # Same safety zone as above: three quarters of the reported limit
      lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \/ 4`
      lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \* 3`
    else
      # Make teststring a little bigger before we do anything with it.
      # a 1K string should be a reasonable start.
for i in 1 2 3 4 5 6 7 8; do
        teststring=$teststring$teststring
      done
      SHELL=${SHELL-${CONFIG_SHELL-/bin/sh}}
      # If test is not a shell built-in, we'll probably end up computing a
      # maximum length that is only half of the actual maximum length, but
      # we can't tell.
      while { test X`env echo "$teststring$teststring" 2>/dev/null` \
                 = "X$teststring$teststring"; } >/dev/null 2>&1 &&
              test 17 != "$i" # 1/2 MB should be enough
      do
        i=`expr $i + 1`
        teststring=$teststring$teststring
      done
      # Only check the string length outside the loop.
      lt_cv_sys_max_cmd_len=`expr "X$teststring" : ".*" 2>&1`
      teststring=
      # Add a significant safety factor because C++ compilers can tack on
      # massive amounts of additional arguments before passing them to the
      # linker. It appears as though 1/2 is a usable value.
      lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \/ 2`
    fi
    ;;
  esac
])
if test -n "$lt_cv_sys_max_cmd_len"; then
  AC_MSG_RESULT($lt_cv_sys_max_cmd_len)
else
  AC_MSG_RESULT(none)
fi
max_cmd_len=$lt_cv_sys_max_cmd_len
_LT_DECL([], [max_cmd_len], [0],
    [What is the maximum length of a command?])
])# LT_CMD_MAX_LEN

# Old name:
AU_ALIAS([AC_LIBTOOL_SYS_MAX_CMD_LEN], [LT_CMD_MAX_LEN])
dnl aclocal-1.4 backwards compatibility:
dnl AC_DEFUN([AC_LIBTOOL_SYS_MAX_CMD_LEN], [])


# _LT_HEADER_DLFCN
# ----------------
# Check for dlfcn.h using the default includes.
m4_defun([_LT_HEADER_DLFCN],
[AC_CHECK_HEADERS([dlfcn.h], [], [], [AC_INCLUDES_DEFAULT])dnl
])# _LT_HEADER_DLFCN


# _LT_TRY_DLOPEN_SELF (ACTION-IF-TRUE, ACTION-IF-TRUE-W-USCORE,
#                      ACTION-IF-FALSE, ACTION-IF-CROSS-COMPILING)
# ----------------------------------------------------------------
# Build and run a small program that dlopens itself and looks up its own
# function "fnord".  The program's exit status selects the action:
#   ACTION-IF-TRUE             "fnord" was found as is
#   ACTION-IF-TRUE-W-USCORE    only "_fnord" was found
#   ACTION-IF-FALSE            neither was found, or the link failed
#   ACTION-IF-CROSS-COMPILING  cross compiling, so nothing is run
# The program includes <dlfcn.h> only when HAVE_DLFCN_H is set, and
# <stdio.h> unconditionally for puts.
m4_defun([_LT_TRY_DLOPEN_SELF],
[m4_require([_LT_HEADER_DLFCN])dnl
if test yes = "$cross_compiling"; then :
  [$4]
else
  lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
  lt_status=$lt_dlunknown
  cat > conftest.$ac_ext <<_LT_EOF
[#line $LINENO "configure"
#include "confdefs.h"

#if HAVE_DLFCN_H
#include <dlfcn.h>
#endif

#include <stdio.h>

#ifdef RTLD_GLOBAL
#  define LT_DLGLOBAL		RTLD_GLOBAL
#else
#  ifdef DL_GLOBAL
#    define LT_DLGLOBAL		DL_GLOBAL
#  else
#    define LT_DLGLOBAL		0
#  endif
#endif

/* We may have to define LT_DLLAZY_OR_NOW in the command line if we
   find out it does not work in some platform. */
#ifndef LT_DLLAZY_OR_NOW
#  ifdef RTLD_LAZY
#    define LT_DLLAZY_OR_NOW		RTLD_LAZY
#  else
#    ifdef DL_LAZY
#      define LT_DLLAZY_OR_NOW		DL_LAZY
#    else
#      ifdef RTLD_NOW
#        define LT_DLLAZY_OR_NOW	RTLD_NOW
#      else
#        ifdef DL_NOW
#          define LT_DLLAZY_OR_NOW	DL_NOW
#        else
#          define LT_DLLAZY_OR_NOW	0
#        endif
#      endif
#    endif
#  endif
#endif

/* When -fvisibility=hidden is used, assume the code has been annotated
   correspondingly for the symbols needed.  */
#if defined __GNUC__ && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3))
int fnord () __attribute__((visibility("default")));
#endif

int fnord () { return 42; }
int main ()
{
  void *self = dlopen (0, LT_DLGLOBAL|LT_DLLAZY_OR_NOW);
  int status = $lt_dlunknown;

  if (self)
    {
      if (dlsym (self,"fnord"))       status = $lt_dlno_uscore;
      else
        {
	  if (dlsym( self,"_fnord"))  status = $lt_dlneed_uscore;
          else puts (dlerror ());
	}
      /* dlclose (self); */
    }
  else
    puts (dlerror ());

  return status;
}]
_LT_EOF
  if AC_TRY_EVAL(ac_link) && test -s "conftest$ac_exeext" 2>/dev/null; then
    # Run the probe; its exit status tells which symbol form was found.
    (./conftest; exit; ) >&AS_MESSAGE_LOG_FD 2>/dev/null
    lt_status=$?
    case x$lt_status in
      x$lt_dlno_uscore) $1 ;;
      x$lt_dlneed_uscore) $2 ;;
      x$lt_dlunknown|x*) $3 ;;
    esac
  else :
    # compilation failed
    $3
  fi
fi
rm -fr conftest*
])# _LT_TRY_DLOPEN_SELF


# LT_SYS_DLOPEN_SELF
# ------------------
# Work out which run-time loading interface the host offers and record
# the outcome in enable_dlopen, enable_dlopen_self and
# enable_dlopen_self_static.  Nothing is probed unless enable_dlopen is
# "yes" on entry; all three results are then "unknown".
AC_DEFUN([LT_SYS_DLOPEN_SELF],
[m4_require([_LT_HEADER_DLFCN])dnl
if test yes != "$enable_dlopen"; then
  enable_dlopen=unknown
  enable_dlopen_self=unknown
  enable_dlopen_self_static=unknown
else
  lt_cv_dlopen=no
  lt_cv_dlopen_libs=

  case $host_os in
  beos*)
    lt_cv_dlopen=load_add_on
    lt_cv_dlopen_libs=
    lt_cv_dlopen_self=yes
    ;;

  mingw* | pw32* | cegcc*)
    lt_cv_dlopen=LoadLibrary
    lt_cv_dlopen_libs=
    ;;

  cygwin*)
    lt_cv_dlopen=dlopen
    lt_cv_dlopen_libs=
    ;;

  darwin*)
    # if libdl is installed we need to link against it
    AC_CHECK_LIB([dl], [dlopen],
		[lt_cv_dlopen=dlopen lt_cv_dlopen_libs=-ldl],[
    lt_cv_dlopen=dyld
    lt_cv_dlopen_libs=
    lt_cv_dlopen_self=yes
    ])
    ;;

  tpf*)
    # Don't try to run any link tests for TPF. We know it's impossible
    # because TPF is a cross-compiler, and we know how we open DSOs.
lt_cv_dlopen=dlopen
    lt_cv_dlopen_libs=
    lt_cv_dlopen_self=no
    ;;

  *)
    # Try the candidates in order and stop at the first hit: shl_load
    # without extra libraries, shl_load in dld, dlopen without extra
    # libraries, dlopen in dl, dlopen in svld, and finally dld_link in dld.
    AC_CHECK_FUNC([shl_load],
	  [lt_cv_dlopen=shl_load],
      [AC_CHECK_LIB([dld], [shl_load],
	    [lt_cv_dlopen=shl_load lt_cv_dlopen_libs=-ldld],
	[AC_CHECK_FUNC([dlopen],
	      [lt_cv_dlopen=dlopen],
	  [AC_CHECK_LIB([dl], [dlopen],
		[lt_cv_dlopen=dlopen lt_cv_dlopen_libs=-ldl],
	    [AC_CHECK_LIB([svld], [dlopen],
		  [lt_cv_dlopen=dlopen lt_cv_dlopen_libs=-lsvld],
	      [AC_CHECK_LIB([dld], [dld_link],
		    [lt_cv_dlopen=dld_link lt_cv_dlopen_libs=-ldld])
	      ])
	    ])
	  ])
	])
      ])
    ;;
  esac

  if test no = "$lt_cv_dlopen"; then
    enable_dlopen=no
  else
    enable_dlopen=yes
  fi

  case $lt_cv_dlopen in
  dlopen)
    # The self-dlopen probes run with adjusted CPPFLAGS, LDFLAGS and LIBS;
    # the previous values are saved here and restored at the end of this
    # branch.
    save_CPPFLAGS=$CPPFLAGS
    test yes = "$ac_cv_header_dlfcn_h" && CPPFLAGS="$CPPFLAGS -DHAVE_DLFCN_H"

    save_LDFLAGS=$LDFLAGS
    wl=$lt_prog_compiler_wl eval LDFLAGS=\"\$LDFLAGS $export_dynamic_flag_spec\"

    save_LIBS=$LIBS
    LIBS="$lt_cv_dlopen_libs $LIBS"

    AC_CACHE_CHECK([whether a program can dlopen itself],
	  lt_cv_dlopen_self, [dnl
	  _LT_TRY_DLOPEN_SELF(
	    lt_cv_dlopen_self=yes, lt_cv_dlopen_self=yes,
	    lt_cv_dlopen_self=no, lt_cv_dlopen_self=cross)
    ])

    if test yes = "$lt_cv_dlopen_self"; then
      # Repeat the probe with the static-link flag added to LDFLAGS.
      wl=$lt_prog_compiler_wl eval LDFLAGS=\"\$LDFLAGS $lt_prog_compiler_static\"
      AC_CACHE_CHECK([whether a statically linked program can dlopen itself],
	  lt_cv_dlopen_self_static, [dnl
	  _LT_TRY_DLOPEN_SELF(
	    lt_cv_dlopen_self_static=yes, lt_cv_dlopen_self_static=yes,
	    lt_cv_dlopen_self_static=no,  lt_cv_dlopen_self_static=cross)
      ])
    fi

    CPPFLAGS=$save_CPPFLAGS
    LDFLAGS=$save_LDFLAGS
    LIBS=$save_LIBS
    ;;
  esac

  # Any cache value other than yes or no, such as cross or an unset
  # variable, is reported as unknown.
  case $lt_cv_dlopen_self in
  yes|no) enable_dlopen_self=$lt_cv_dlopen_self ;;
  *) enable_dlopen_self=unknown ;;
  esac

  case $lt_cv_dlopen_self_static in
  yes|no) enable_dlopen_self_static=$lt_cv_dlopen_self_static ;;
  *) enable_dlopen_self_static=unknown ;;
  esac
fi
_LT_DECL([dlopen_support], [enable_dlopen], [0],
	 [Whether dlopen is supported])
_LT_DECL([dlopen_self], [enable_dlopen_self], [0],
	 [Whether dlopen of programs is supported])
_LT_DECL([dlopen_self_static], [enable_dlopen_self_static], [0],
[Whether dlopen of statically linked programs is supported])
])# LT_SYS_DLOPEN_SELF

# Old name:
AU_ALIAS([AC_LIBTOOL_DLOPEN_SELF], [LT_SYS_DLOPEN_SELF])
dnl aclocal-1.4 backwards compatibility:
dnl AC_DEFUN([AC_LIBTOOL_DLOPEN_SELF], [])


# _LT_COMPILER_C_O([TAGNAME])
# ---------------------------
# Check to see if options -c and -o are simultaneously supported by compiler.
# This macro does not hard code the compiler like AC_PROG_CC_C_O.
#
# The test runs inside a scratch directory named conftest and asks the
# compiler to write its object to out/conftest2.$ac_objext.  The cache
# variable becomes "yes" only when that file is non-empty and the
# diagnostics do not go beyond the compiler's usual boilerplate.
m4_defun([_LT_COMPILER_C_O],
[m4_require([_LT_DECL_SED])dnl
m4_require([_LT_FILEUTILS_DEFAULTS])dnl
m4_require([_LT_TAG_COMPILER])dnl
AC_CACHE_CHECK([if $compiler supports -c -o file.$ac_objext],
  [_LT_TAGVAR(lt_cv_prog_compiler_c_o, $1)],
  [_LT_TAGVAR(lt_cv_prog_compiler_c_o, $1)=no
   $RM -r conftest 2>/dev/null
   mkdir conftest
   cd conftest
   mkdir out
   echo "$lt_simple_compile_test_code" > conftest.$ac_ext

   lt_compiler_flag="-o out/conftest2.$ac_objext"
   # Insert the option either (1) after the last *FLAGS variable, or
   # (2) before a word containing "conftest.", or (3) at the end.
   # Note that $ac_compile itself does not contain backslashes and begins
   # with a dollar sign (not a hyphen), so the echo should work correctly.
   lt_compile=`echo "$ac_compile" | $SED \
   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
   -e 's: [[^ ]]*conftest\.: $lt_compiler_flag&:; t' \
   -e 's:$: $lt_compiler_flag:'`
   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&AS_MESSAGE_LOG_FD)
   (eval "$lt_compile" 2>out/conftest.err)
   ac_status=$?
   cat out/conftest.err >&AS_MESSAGE_LOG_FD
   echo "$as_me:$LINENO: \$? = $ac_status" >&AS_MESSAGE_LOG_FD
   if (exit $ac_status) && test -s out/conftest2.$ac_objext
   then
     # The compiler can only warn and ignore the option if not recognized
     # So say no if there are warnings
     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' > out/conftest.exp
     $SED '/^$/d; /^ *+/d' out/conftest.err >out/conftest.er2
     if test ! -s out/conftest.er2 || diff out/conftest.exp out/conftest.er2 >/dev/null; then
       _LT_TAGVAR(lt_cv_prog_compiler_c_o, $1)=yes
     fi
   fi
   chmod u+w . 2>&AS_MESSAGE_LOG_FD
   $RM conftest*
   # SGI C++ compiler will create directory out/ii_files/ for
   # template instantiation
   test -d out/ii_files && $RM out/ii_files/* && rmdir out/ii_files
   $RM out/* && rmdir out
   # Leave the scratch directory before removing it.
   cd ..
   $RM -r conftest
   $RM conftest*
])
_LT_TAGDECL([compiler_c_o], [lt_cv_prog_compiler_c_o], [1],
	[Does compiler simultaneously support -c and -o options?])
])# _LT_COMPILER_C_O


# _LT_COMPILER_FILE_LOCKS([TAGNAME])
# ----------------------------------
# Check to see if we can do hard links to lock some files if needed
#
# The hard-link test only runs when the compiler lacks -c -o support and
# need_locks is not already "no".  need_locks ends up as "warn" when hard
# links do not behave as required, is left untouched when they do, and is
# forced to "no" when the test is not run at all.
m4_defun([_LT_COMPILER_FILE_LOCKS],
[m4_require([_LT_ENABLE_LOCK])dnl
m4_require([_LT_FILEUTILS_DEFAULTS])dnl
_LT_COMPILER_C_O([$1])

hard_links=nottested
if test no = "$_LT_TAGVAR(lt_cv_prog_compiler_c_o, $1)" && test no != "$need_locks"; then
  # do not overwrite the value of need_locks provided by the user
  AC_MSG_CHECKING([if we can lock with hard links])
  hard_links=yes
  $RM conftest*
  # Linking from a source that does not exist has to fail.
  ln conftest.a conftest.b 2>/dev/null && hard_links=no
  touch conftest.a
  # Linking from an existing source has to succeed.
  ln conftest.a conftest.b 2>&5 || hard_links=no
  # Linking onto a destination that already exists has to fail.
  ln conftest.a conftest.b 2>/dev/null && hard_links=no
  AC_MSG_RESULT([$hard_links])
  if test no = "$hard_links"; then
    AC_MSG_WARN(['$CC' does not support '-c -o', so 'make -j' may be unsafe])
    need_locks=warn
  fi
else
  need_locks=no
fi
_LT_DECL([], [need_locks], [1], [Must we lock files when doing compilation?])
])# _LT_COMPILER_FILE_LOCKS


# _LT_CHECK_OBJDIR
# ----------------
# Choose the directory name for temporary libtool files: ".libs" when a
# directory of that name can be created, "_libs" otherwise.
m4_defun([_LT_CHECK_OBJDIR],
[AC_CACHE_CHECK([for objdir], [lt_cv_objdir],
[rm -f .libs 2>/dev/null
mkdir .libs 2>/dev/null
if test -d .libs; then
  lt_cv_objdir=.libs
else
  # MS-DOS does not allow filenames that begin with a dot.
lt_cv_objdir=_libs
fi
rmdir .libs 2>/dev/null])
objdir=$lt_cv_objdir
_LT_DECL([], [objdir], [0],
         [The name of the directory that contains temporary libtool files])dnl
m4_pattern_allow([LT_OBJDIR])dnl
AC_DEFINE_UNQUOTED([LT_OBJDIR], "$lt_cv_objdir/",
  [Define to the sub-directory where libtool stores uninstalled libraries.])
])# _LT_CHECK_OBJDIR


# _LT_LINKER_HARDCODE_LIBPATH([TAGNAME])
# --------------------------------------
# Check hardcoding attributes.
#
# Sets the tag's hardcode_action to one of "relink", "immediate" or
# "unsupported", and then sets enable_fast_install to "no" or "needless"
# when fast installation is unsupported or unnecessary.
m4_defun([_LT_LINKER_HARDCODE_LIBPATH],
[AC_MSG_CHECKING([how to hardcode library paths into programs])
_LT_TAGVAR(hardcode_action, $1)=
if test -n "$_LT_TAGVAR(hardcode_libdir_flag_spec, $1)" ||
   test -n "$_LT_TAGVAR(runpath_var, $1)" ||
   test yes = "$_LT_TAGVAR(hardcode_automatic, $1)"; then

  # We can hardcode non-existent directories.
  if test no != "$_LT_TAGVAR(hardcode_direct, $1)" &&
     # If the only mechanism to avoid hardcoding is shlibpath_var, we
     # have to relink, otherwise we might link with an installed library
     # when we should be linking with a yet-to-be-installed one
     ## test no != "$_LT_TAGVAR(hardcode_shlibpath_var, $1)" &&
     test no != "$_LT_TAGVAR(hardcode_minus_L, $1)"; then
    # Linking always hardcodes the temporary library directory.
    _LT_TAGVAR(hardcode_action, $1)=relink
  else
    # We can link without hardcoding, and we can hardcode nonexisting dirs.
    _LT_TAGVAR(hardcode_action, $1)=immediate
  fi
else
  # We cannot hardcode anything, or else we can only hardcode existing
  # directories.
  _LT_TAGVAR(hardcode_action, $1)=unsupported
fi
AC_MSG_RESULT([$_LT_TAGVAR(hardcode_action, $1)])

if test relink = "$_LT_TAGVAR(hardcode_action, $1)" ||
   test yes = "$_LT_TAGVAR(inherit_rpath, $1)"; then
  # Fast installation is not supported
  enable_fast_install=no
elif test yes = "$shlibpath_overrides_runpath" ||
     test no = "$enable_shared"; then
  # Fast installation is not necessary
  enable_fast_install=needless
fi
_LT_TAGDECL([], [hardcode_action], [0],
    [How to hardcode a shared library path into an executable])
])# _LT_LINKER_HARDCODE_LIBPATH


# _LT_CMD_STRIPLIB
# ----------------
# Set striplib and old_striplib to the commands that strip shared and
# old-style libraries.  Both stay empty unless STRIP identifies itself as
# GNU strip or the host is darwin with a non-empty STRIP.
m4_defun([_LT_CMD_STRIPLIB],
[m4_require([_LT_DECL_EGREP])
striplib=
old_striplib=
AC_MSG_CHECKING([whether stripping libraries is possible])
if test -n "$STRIP" && $STRIP -V 2>&1 | $GREP "GNU strip" >/dev/null; then
  test -z "$old_striplib" && old_striplib="$STRIP --strip-debug"
  test -z "$striplib" && striplib="$STRIP --strip-unneeded"
  AC_MSG_RESULT([yes])
else
# FIXME - insert some real tests, host_os isn't really good enough
  case $host_os in
  darwin*)
    if test -n "$STRIP"; then
      striplib="$STRIP -x"
      old_striplib="$STRIP -S"
      AC_MSG_RESULT([yes])
    else
      AC_MSG_RESULT([no])
    fi
    ;;
  *)
    AC_MSG_RESULT([no])
    ;;
  esac
fi
_LT_DECL([], [old_striplib], [1], [Commands to strip libraries])
_LT_DECL([], [striplib], [1])
])# _LT_CMD_STRIPLIB


# _LT_PREPARE_MUNGE_PATH_LIST
# ---------------------------
# Make sure func_munge_path_list() is defined correctly.
m4_defun([_LT_PREPARE_MUNGE_PATH_LIST],
[[# func_munge_path_list VARIABLE PATH
# -----------------------------------
# VARIABLE is name of variable containing _space_ separated list of
# directories to be munged by the contents of PATH, which is string
# having a format:
# "DIR[:DIR]:"
#       string "DIR[ DIR]" will be prepended to VARIABLE
# ":DIR[:DIR]"
#       string "DIR[ DIR]" will be appended to VARIABLE
# "DIRP[:DIRP]::[DIRA:]DIRA"
#       string "DIRP[ DIRP]" will be prepended to VARIABLE and string
#       "DIRA[ DIRA]" will be appended to VARIABLE
# "DIR[:DIR]"
#       VARIABLE will be replaced by "DIR[ DIR]"
func_munge_path_list ()
{
    case x@S|@2 in
    x)
        ;;
    *:)
        eval @S|@1=\"`$ECHO @S|@2 | $SED 's/:/ /g'` \@S|@@S|@1\"
        ;;
    x:*)
        eval @S|@1=\"\@S|@@S|@1 `$ECHO @S|@2 | $SED 's/:/ /g'`\"
        ;;
    *::*)
        eval @S|@1=\"\@S|@@S|@1\ `$ECHO @S|@2 | $SED -e 's/.*:://' -e 's/:/ /g'`\"
        eval @S|@1=\"`$ECHO @S|@2 | $SED -e 's/::.*//' -e 's/:/ /g'`\ \@S|@@S|@1\"
        ;;
    *)
        eval @S|@1=\"`$ECHO @S|@2 | $SED 's/:/ /g'`\"
        ;;
    esac
}
]])# _LT_PREPARE_MUNGE_PATH_LIST


# _LT_SYS_DYNAMIC_LINKER([TAG])
# -----------------------------
# PORTME Fill in your ld.so characteristics
m4_defun([_LT_SYS_DYNAMIC_LINKER],
[AC_REQUIRE([AC_CANONICAL_HOST])dnl
m4_require([_LT_DECL_EGREP])dnl
m4_require([_LT_FILEUTILS_DEFAULTS])dnl
m4_require([_LT_DECL_OBJDUMP])dnl
m4_require([_LT_DECL_SED])dnl
m4_require([_LT_CHECK_SHELL_FEATURES])dnl
m4_require([_LT_PREPARE_MUNGE_PATH_LIST])dnl
AC_MSG_CHECKING([dynamic linker characteristics])
m4_if([$1],
	[], [
if test yes = "$GCC"; then
  case $host_os in
    darwin*) lt_awk_arg='/^libraries:/,/LR/' ;;
    *) lt_awk_arg='/^libraries:/' ;;
  esac
  case $host_os in
    mingw* | cegcc*) lt_sed_strip_eq='s|=\([[A-Za-z]]:\)|\1|g' ;;
    *) lt_sed_strip_eq='s|=/|/|g' ;;
  esac
  lt_search_path_spec=`$CC -print-search-dirs | awk $lt_awk_arg | $SED -e "s/^libraries://" -e $lt_sed_strip_eq`
  case $lt_search_path_spec in
  *\;*)
    # if the path contains ";" then we assume it to be the separator
    # otherwise default to the standard path separator (i.e.
":") - it is # assumed that no part of a normal pathname contains ";" but that should # okay in the real world where ";" in dirpaths is itself problematic. lt_search_path_spec=`$ECHO "$lt_search_path_spec" | $SED 's/;/ /g'` ;; *) lt_search_path_spec=`$ECHO "$lt_search_path_spec" | $SED "s/$PATH_SEPARATOR/ /g"` ;; esac # Ok, now we have the path, separated by spaces, we can step through it # and add multilib dir if necessary... lt_tmp_lt_search_path_spec= lt_multi_os_dir=/`$CC $CPPFLAGS $CFLAGS $LDFLAGS -print-multi-os-directory 2>/dev/null` # ...but if some path component already ends with the multilib dir we assume # that all is fine and trust -print-search-dirs as is (GCC 4.2? or newer). case "$lt_multi_os_dir; $lt_search_path_spec " in "/; "* | "/.; "* | "/./; "* | *"$lt_multi_os_dir "* | *"$lt_multi_os_dir/ "*) lt_multi_os_dir= ;; esac for lt_sys_path in $lt_search_path_spec; do if test -d "$lt_sys_path$lt_multi_os_dir"; then lt_tmp_lt_search_path_spec="$lt_tmp_lt_search_path_spec $lt_sys_path$lt_multi_os_dir" elif test -n "$lt_multi_os_dir"; then test -d "$lt_sys_path" && \ lt_tmp_lt_search_path_spec="$lt_tmp_lt_search_path_spec $lt_sys_path" fi done lt_search_path_spec=`$ECHO "$lt_tmp_lt_search_path_spec" | awk ' BEGIN {RS = " "; FS = "/|\n";} { lt_foo = ""; lt_count = 0; for (lt_i = NF; lt_i > 0; lt_i--) { if ($lt_i != "" && $lt_i != ".") { if ($lt_i == "..") { lt_count++; } else { if (lt_count == 0) { lt_foo = "/" $lt_i lt_foo; } else { lt_count--; } } } } if (lt_foo != "") { lt_freq[[lt_foo]]++; } if (lt_freq[[lt_foo]] == 1) { print lt_foo; } }'` # AWK program above erroneously prepends '/' to C:/dos/paths # for these hosts. 
case $host_os in mingw* | cegcc*) lt_search_path_spec=`$ECHO "$lt_search_path_spec" |\ $SED 's|/\([[A-Za-z]]:\)|\1|g'` ;; esac sys_lib_search_path_spec=`$ECHO "$lt_search_path_spec" | $lt_NL2SP` else sys_lib_search_path_spec="/lib /usr/lib /usr/local/lib" fi]) library_names_spec= libname_spec='lib$name' soname_spec= shrext_cmds=.so postinstall_cmds= postuninstall_cmds= finish_cmds= finish_eval= shlibpath_var= shlibpath_overrides_runpath=unknown version_type=none dynamic_linker="$host_os ld.so" sys_lib_dlsearch_path_spec="/lib /usr/lib" need_lib_prefix=unknown hardcode_into_libs=no # when you set need_version to no, make sure it does not cause -set_version # flags to be left without arguments need_version=unknown AC_ARG_VAR([LT_SYS_LIBRARY_PATH], [User-defined run-time library search path.]) case $host_os in aix3*) version_type=linux # correct to gnu/linux during the next big refactor library_names_spec='$libname$release$shared_ext$versuffix $libname.a' shlibpath_var=LIBPATH # AIX 3 has no versioning support, so we append a major version to the name. soname_spec='$libname$release$shared_ext$major' ;; aix[[4-9]]*) version_type=linux # correct to gnu/linux during the next big refactor need_lib_prefix=no need_version=no hardcode_into_libs=yes if test ia64 = "$host_cpu"; then # AIX 5 supports IA64 library_names_spec='$libname$release$shared_ext$major $libname$release$shared_ext$versuffix $libname$shared_ext' shlibpath_var=LD_LIBRARY_PATH else # With GCC up to 2.95.x, collect2 would create an import file # for dependence libraries. The import file would start with # the line '#! .'. This would cause the generated library to # depend on '.', always an invalid library. This was fixed in # development snapshots of GCC prior to 3.0. 
case $host_os in aix4 | aix4.[[01]] | aix4.[[01]].*) if { echo '#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 97)' echo ' yes ' echo '#endif'; } | $CC -E - | $GREP yes > /dev/null; then : else can_build_shared=no fi ;; esac # Using Import Files as archive members, it is possible to support # filename-based versioning of shared library archives on AIX. While # this would work for both with and without runtime linking, it will # prevent static linking of such archives. So we do filename-based # shared library versioning with .so extension only, which is used # when both runtime linking and shared linking is enabled. # Unfortunately, runtime linking may impact performance, so we do # not want this to be the default eventually. Also, we use the # versioned .so libs for executables only if there is the -brtl # linker flag in LDFLAGS as well, or --with-aix-soname=svr4 only. # To allow for filename-based versioning support, we need to create # libNAME.so.V as an archive file, containing: # *) an Import File, referring to the versioned filename of the # archive as well as the shared archive member, telling the # bitwidth (32 or 64) of that shared object, and providing the # list of exported symbols of that shared object, eventually # decorated with the 'weak' keyword # *) the shared object with the F_LOADONLY flag set, to really avoid # it being seen by the linker. # At run time we better use the real file rather than another symlink, # but for link time we create the symlink libNAME.so -> libNAME.so.V case $with_aix_soname,$aix_use_runtimelinking in # AIX (on Power*) has no versioning support, so currently we cannot hardcode correct # soname into executable. Probably we can add versioning support to # collect2, so additional links can be useful in future. 
aix,yes) # traditional libtool dynamic_linker='AIX unversionable lib.so' # If using run time linking (on AIX 4.2 or later) use lib.so # instead of lib.a to let people know that these are not # typical AIX shared libraries. library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' ;; aix,no) # traditional AIX only dynamic_linker='AIX lib.a[(]lib.so.V[)]' # We preserve .a as extension for shared libraries through AIX4.2 # and later when we are not doing run time linking. library_names_spec='$libname$release.a $libname.a' soname_spec='$libname$release$shared_ext$major' ;; svr4,*) # full svr4 only dynamic_linker="AIX lib.so.V[(]$shared_archive_member_spec.o[)]" library_names_spec='$libname$release$shared_ext$major $libname$shared_ext' # We do not specify a path in Import Files, so LIBPATH fires. shlibpath_overrides_runpath=yes ;; *,yes) # both, prefer svr4 dynamic_linker="AIX lib.so.V[(]$shared_archive_member_spec.o[)], lib.a[(]lib.so.V[)]" library_names_spec='$libname$release$shared_ext$major $libname$shared_ext' # unpreferred sharedlib libNAME.a needs extra handling postinstall_cmds='test -n "$linkname" || linkname="$realname"~func_stripname "" ".so" "$linkname"~$install_shared_prog "$dir/$func_stripname_result.$libext" "$destdir/$func_stripname_result.$libext"~test -z "$tstripme" || test -z "$striplib" || $striplib "$destdir/$func_stripname_result.$libext"' postuninstall_cmds='for n in $library_names $old_library; do :; done~func_stripname "" ".so" "$n"~test "$func_stripname_result" = "$n" || func_append rmfiles " $odir/$func_stripname_result.$libext"' # We do not specify a path in Import Files, so LIBPATH fires. 
shlibpath_overrides_runpath=yes ;; *,no) # both, prefer aix dynamic_linker="AIX lib.a[(]lib.so.V[)], lib.so.V[(]$shared_archive_member_spec.o[)]" library_names_spec='$libname$release.a $libname.a' soname_spec='$libname$release$shared_ext$major' # unpreferred sharedlib libNAME.so.V and symlink libNAME.so need extra handling postinstall_cmds='test -z "$dlname" || $install_shared_prog $dir/$dlname $destdir/$dlname~test -z "$tstripme" || test -z "$striplib" || $striplib $destdir/$dlname~test -n "$linkname" || linkname=$realname~func_stripname "" ".a" "$linkname"~(cd "$destdir" && $LN_S -f $dlname $func_stripname_result.so)' postuninstall_cmds='test -z "$dlname" || func_append rmfiles " $odir/$dlname"~for n in $old_library $library_names; do :; done~func_stripname "" ".a" "$n"~func_append rmfiles " $odir/$func_stripname_result.so"' ;; esac shlibpath_var=LIBPATH fi ;; amigaos*) case $host_cpu in powerpc) # Since July 2007 AmigaOS4 officially supports .so libraries. # When compiling the executable, add -use-dynld -Lsobjs: to the compileline. library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' ;; m68k) library_names_spec='$libname.ixlibrary $libname.a' # Create ${libname}_ixlibrary.a entries in /sys/libs. 
finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`func_echo_all "$lib" | $SED '\''s%^.*/\([[^/]]*\)\.ixlibrary$%\1%'\''`; $RM /sys/libs/${libname}_ixlibrary.a; $show "cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a"; cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a || exit 1; done' ;; esac ;; beos*) library_names_spec='$libname$shared_ext' dynamic_linker="$host_os ld.so" shlibpath_var=LIBRARY_PATH ;; bsdi[[45]]*) version_type=linux # correct to gnu/linux during the next big refactor need_version=no library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' soname_spec='$libname$release$shared_ext$major' finish_cmds='PATH="\$PATH:/sbin" ldconfig $libdir' shlibpath_var=LD_LIBRARY_PATH sys_lib_search_path_spec="/shlib /usr/lib /usr/X11/lib /usr/contrib/lib /lib /usr/local/lib" sys_lib_dlsearch_path_spec="/shlib /usr/lib /usr/local/lib" # the default ld.so.conf also contains /usr/contrib/lib and # /usr/X11R6/lib (/usr/X11 is a link to /usr/X11R6), but let us allow # libtool to hard-code these into programs ;; cygwin* | mingw* | pw32* | cegcc*) version_type=windows shrext_cmds=.dll need_version=no need_lib_prefix=no case $GCC,$cc_basename in yes,*) # gcc library_names_spec='$libname.dll.a' # DLL is installed to $(libdir)/../bin by postinstall_cmds postinstall_cmds='base_file=`basename \$file`~ dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\$base_file'\''i; echo \$dlname'\''`~ dldir=$destdir/`dirname \$dlpath`~ test -d \$dldir || mkdir -p \$dldir~ $install_prog $dir/$dlname \$dldir/$dlname~ chmod a+x \$dldir/$dlname~ if test -n '\''$stripme'\'' && test -n '\''$striplib'\''; then eval '\''$striplib \$dldir/$dlname'\'' || exit \$?; fi' postuninstall_cmds='dldll=`$SHELL 2>&1 -c '\''. 
$file; echo \$dlname'\''`~ dlpath=$dir/\$dldll~ $RM \$dlpath' shlibpath_overrides_runpath=yes case $host_os in cygwin*) # Cygwin DLLs use 'cyg' prefix rather than 'lib' soname_spec='`echo $libname | sed -e 's/^lib/cyg/'``echo $release | $SED -e 's/[[.]]/-/g'`$versuffix$shared_ext' m4_if([$1], [],[ sys_lib_search_path_spec="$sys_lib_search_path_spec /usr/lib/w32api"]) ;; mingw* | cegcc*) # MinGW DLLs use traditional 'lib' prefix soname_spec='$libname`echo $release | $SED -e 's/[[.]]/-/g'`$versuffix$shared_ext' ;; pw32*) # pw32 DLLs use 'pw' prefix rather than 'lib' library_names_spec='`echo $libname | sed -e 's/^lib/pw/'``echo $release | $SED -e 's/[[.]]/-/g'`$versuffix$shared_ext' ;; esac dynamic_linker='Win32 ld.exe' ;; *,cl*) # Native MSVC libname_spec='$name' soname_spec='$libname`echo $release | $SED -e 's/[[.]]/-/g'`$versuffix$shared_ext' library_names_spec='$libname.dll.lib' case $build_os in mingw*) sys_lib_search_path_spec= lt_save_ifs=$IFS IFS=';' for lt_path in $LIB do IFS=$lt_save_ifs # Let DOS variable expansion print the short 8.3 style file name. lt_path=`cd "$lt_path" 2>/dev/null && cmd //C "for %i in (".") do @echo %~si"` sys_lib_search_path_spec="$sys_lib_search_path_spec $lt_path" done IFS=$lt_save_ifs # Convert to MSYS style. sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | sed -e 's|\\\\|/|g' -e 's| \\([[a-zA-Z]]\\):| /\\1|g' -e 's|^ ||'` ;; cygwin*) # Convert to unix form, then to dos form, then back to unix form # but this time dos style (no spaces!) so that the unix form looks # like /cygdrive/c/PROGRA~1:/cygdr... sys_lib_search_path_spec=`cygpath --path --unix "$LIB"` sys_lib_search_path_spec=`cygpath --path --dos "$sys_lib_search_path_spec" 2>/dev/null` sys_lib_search_path_spec=`cygpath --path --unix "$sys_lib_search_path_spec" | $SED -e "s/$PATH_SEPARATOR/ /g"` ;; *) sys_lib_search_path_spec=$LIB if $ECHO "$sys_lib_search_path_spec" | [$GREP ';[c-zC-Z]:/' >/dev/null]; then # It is most probably a Windows format PATH. 
sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | $SED -e 's/;/ /g'` else sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | $SED -e "s/$PATH_SEPARATOR/ /g"` fi # FIXME: find the short name or the path components, as spaces are # common. (e.g. "Program Files" -> "PROGRA~1") ;; esac # DLL is installed to $(libdir)/../bin by postinstall_cmds postinstall_cmds='base_file=`basename \$file`~ dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\$base_file'\''i; echo \$dlname'\''`~ dldir=$destdir/`dirname \$dlpath`~ test -d \$dldir || mkdir -p \$dldir~ $install_prog $dir/$dlname \$dldir/$dlname' postuninstall_cmds='dldll=`$SHELL 2>&1 -c '\''. $file; echo \$dlname'\''`~ dlpath=$dir/\$dldll~ $RM \$dlpath' shlibpath_overrides_runpath=yes dynamic_linker='Win32 link.exe' ;; *) # Assume MSVC wrapper library_names_spec='$libname`echo $release | $SED -e 's/[[.]]/-/g'`$versuffix$shared_ext $libname.lib' dynamic_linker='Win32 ld.exe' ;; esac # FIXME: first we should search . and the directory the executable is in shlibpath_var=PATH ;; darwin* | rhapsody*) dynamic_linker="$host_os dyld" version_type=darwin need_lib_prefix=no need_version=no library_names_spec='$libname$release$major$shared_ext $libname$shared_ext' soname_spec='$libname$release$major$shared_ext' shlibpath_overrides_runpath=yes shlibpath_var=DYLD_LIBRARY_PATH shrext_cmds='`test .$module = .yes && echo .so || echo .dylib`' m4_if([$1], [],[ sys_lib_search_path_spec="$sys_lib_search_path_spec /usr/local/lib"]) sys_lib_dlsearch_path_spec='/usr/local/lib /lib /usr/lib' ;; dgux*) version_type=linux # correct to gnu/linux during the next big refactor need_lib_prefix=no need_version=no library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' soname_spec='$libname$release$shared_ext$major' shlibpath_var=LD_LIBRARY_PATH ;; freebsd* | dragonfly*) # DragonFly does not have aout. When/if they implement a new # versioning mechanism, adjust this. 
if test -x /usr/bin/objformat; then objformat=`/usr/bin/objformat` else case $host_os in freebsd[[23]].*) objformat=aout ;; *) objformat=elf ;; esac fi version_type=freebsd-$objformat case $version_type in freebsd-elf*) library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' soname_spec='$libname$release$shared_ext$major' need_version=no need_lib_prefix=no ;; freebsd-*) library_names_spec='$libname$release$shared_ext$versuffix $libname$shared_ext$versuffix' need_version=yes ;; esac shlibpath_var=LD_LIBRARY_PATH case $host_os in freebsd2.*) shlibpath_overrides_runpath=yes ;; freebsd3.[[01]]* | freebsdelf3.[[01]]*) shlibpath_overrides_runpath=yes hardcode_into_libs=yes ;; freebsd3.[[2-9]]* | freebsdelf3.[[2-9]]* | \ freebsd4.[[0-5]] | freebsdelf4.[[0-5]] | freebsd4.1.1 | freebsdelf4.1.1) shlibpath_overrides_runpath=no hardcode_into_libs=yes ;; *) # from 4.6 on, and DragonFly shlibpath_overrides_runpath=yes hardcode_into_libs=yes ;; esac ;; haiku*) version_type=linux # correct to gnu/linux during the next big refactor need_lib_prefix=no need_version=no dynamic_linker="$host_os runtime_loader" library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' soname_spec='$libname$release$shared_ext$major' shlibpath_var=LIBRARY_PATH shlibpath_overrides_runpath=no sys_lib_dlsearch_path_spec='/boot/home/config/lib /boot/common/lib /boot/system/lib' hardcode_into_libs=yes ;; hpux9* | hpux10* | hpux11*) # Give a soname corresponding to the major version so that dld.sl refuses to # link against other versions. version_type=sunos need_lib_prefix=no need_version=no case $host_cpu in ia64*) shrext_cmds='.so' hardcode_into_libs=yes dynamic_linker="$host_os dld.so" shlibpath_var=LD_LIBRARY_PATH shlibpath_overrides_runpath=yes # Unless +noenvvar is specified. 
library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' soname_spec='$libname$release$shared_ext$major' if test 32 = "$HPUX_IA64_MODE"; then sys_lib_search_path_spec="/usr/lib/hpux32 /usr/local/lib/hpux32 /usr/local/lib" sys_lib_dlsearch_path_spec=/usr/lib/hpux32 else sys_lib_search_path_spec="/usr/lib/hpux64 /usr/local/lib/hpux64" sys_lib_dlsearch_path_spec=/usr/lib/hpux64 fi ;; hppa*64*) shrext_cmds='.sl' hardcode_into_libs=yes dynamic_linker="$host_os dld.sl" shlibpath_var=LD_LIBRARY_PATH # How should we handle SHLIB_PATH shlibpath_overrides_runpath=yes # Unless +noenvvar is specified. library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' soname_spec='$libname$release$shared_ext$major' sys_lib_search_path_spec="/usr/lib/pa20_64 /usr/ccs/lib/pa20_64" sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec ;; *) shrext_cmds='.sl' dynamic_linker="$host_os dld.sl" shlibpath_var=SHLIB_PATH shlibpath_overrides_runpath=no # +s is required to enable SHLIB_PATH library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' soname_spec='$libname$release$shared_ext$major' ;; esac # HP-UX runs *really* slowly unless shared libraries are mode 555, ... 
postinstall_cmds='chmod 555 $lib' # or fails outright, so override atomically: install_override_mode=555 ;; interix[[3-9]]*) version_type=linux # correct to gnu/linux during the next big refactor need_lib_prefix=no need_version=no library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' soname_spec='$libname$release$shared_ext$major' dynamic_linker='Interix 3.x ld.so.1 (PE, like ELF)' shlibpath_var=LD_LIBRARY_PATH shlibpath_overrides_runpath=no hardcode_into_libs=yes ;; irix5* | irix6* | nonstopux*) case $host_os in nonstopux*) version_type=nonstopux ;; *) if test yes = "$lt_cv_prog_gnu_ld"; then version_type=linux # correct to gnu/linux during the next big refactor else version_type=irix fi ;; esac need_lib_prefix=no need_version=no soname_spec='$libname$release$shared_ext$major' library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$release$shared_ext $libname$shared_ext' case $host_os in irix5* | nonstopux*) libsuff= shlibsuff= ;; *) case $LD in # libtool.m4 will add one of these switches to LD *-32|*"-32 "|*-melf32bsmip|*"-melf32bsmip ") libsuff= shlibsuff= libmagic=32-bit;; *-n32|*"-n32 "|*-melf32bmipn32|*"-melf32bmipn32 ") libsuff=32 shlibsuff=N32 libmagic=N32;; *-64|*"-64 "|*-melf64bmip|*"-melf64bmip ") libsuff=64 shlibsuff=64 libmagic=64-bit;; *) libsuff= shlibsuff= libmagic=never-match;; esac ;; esac shlibpath_var=LD_LIBRARY${shlibsuff}_PATH shlibpath_overrides_runpath=no sys_lib_search_path_spec="/usr/lib$libsuff /lib$libsuff /usr/local/lib$libsuff" sys_lib_dlsearch_path_spec="/usr/lib$libsuff /lib$libsuff" hardcode_into_libs=yes ;; # No shared lib support for Linux oldld, aout, or coff. linux*oldld* | linux*aout* | linux*coff*) dynamic_linker=no ;; linux*android*) version_type=none # Android doesn't support versioned libraries. 
need_lib_prefix=no need_version=no library_names_spec='$libname$release$shared_ext' soname_spec='$libname$release$shared_ext' finish_cmds= shlibpath_var=LD_LIBRARY_PATH shlibpath_overrides_runpath=yes # This implies no fast_install, which is unacceptable. # Some rework will be needed to allow for fast_install # before this can be enabled. hardcode_into_libs=yes dynamic_linker='Android linker' # Don't embed -rpath directories since the linker doesn't support them. _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir' ;; # This must be glibc/ELF. linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*) version_type=linux # correct to gnu/linux during the next big refactor need_lib_prefix=no need_version=no library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' soname_spec='$libname$release$shared_ext$major' finish_cmds='PATH="\$PATH:/sbin" ldconfig -n $libdir' shlibpath_var=LD_LIBRARY_PATH shlibpath_overrides_runpath=no # Some binutils ld are patched to set DT_RUNPATH AC_CACHE_VAL([lt_cv_shlibpath_overrides_runpath], [lt_cv_shlibpath_overrides_runpath=no save_LDFLAGS=$LDFLAGS save_libdir=$libdir eval "libdir=/foo; wl=\"$_LT_TAGVAR(lt_prog_compiler_wl, $1)\"; \ LDFLAGS=\"\$LDFLAGS $_LT_TAGVAR(hardcode_libdir_flag_spec, $1)\"" AC_LINK_IFELSE([AC_LANG_PROGRAM([],[])], [AS_IF([ ($OBJDUMP -p conftest$ac_exeext) 2>/dev/null | grep "RUNPATH.*$libdir" >/dev/null], [lt_cv_shlibpath_overrides_runpath=yes])]) LDFLAGS=$save_LDFLAGS libdir=$save_libdir ]) shlibpath_overrides_runpath=$lt_cv_shlibpath_overrides_runpath # This implies no fast_install, which is unacceptable. # Some rework will be needed to allow for fast_install # before this can be enabled. hardcode_into_libs=yes # Ideally, we could use ldconfig to report *all* directores which are # searched for libraries, however this is still not possible. 
Aside from not # being certain /sbin/ldconfig is available, command # 'ldconfig -N -X -v | grep ^/' on 64bit Fedora does not report /usr/lib64, # even though it is searched at run-time. Try to do the best guess by # appending ld.so.conf contents (and includes) to the search path. if test -f /etc/ld.so.conf; then lt_ld_extra=`awk '/^include / { system(sprintf("cd /etc; cat %s 2>/dev/null", \[$]2)); skip = 1; } { if (!skip) print \[$]0; skip = 0; }' < /etc/ld.so.conf | $SED -e 's/#.*//;/^[ ]*hwcap[ ]/d;s/[:, ]/ /g;s/=[^=]*$//;s/=[^= ]* / /g;s/"//g;/^$/d' | tr '\n' ' '` sys_lib_dlsearch_path_spec="/lib /usr/lib $lt_ld_extra" fi # We used to test for /lib/ld.so.1 and disable shared libraries on # powerpc, because MkLinux only supported shared libraries with the # GNU dynamic linker. Since this was broken with cross compilers, # most powerpc-linux boxes support dynamic linking these days and # people can always --disable-shared, the test was removed, and we # assume the GNU/Linux dynamic linker is in use. 
dynamic_linker='GNU/Linux ld.so' ;; netbsd*) version_type=sunos need_lib_prefix=no need_version=no if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then library_names_spec='$libname$release$shared_ext$versuffix $libname$shared_ext$versuffix' finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir' dynamic_linker='NetBSD (a.out) ld.so' else library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' soname_spec='$libname$release$shared_ext$major' dynamic_linker='NetBSD ld.elf_so' fi shlibpath_var=LD_LIBRARY_PATH shlibpath_overrides_runpath=yes hardcode_into_libs=yes ;; newsos6) version_type=linux # correct to gnu/linux during the next big refactor library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' shlibpath_var=LD_LIBRARY_PATH shlibpath_overrides_runpath=yes ;; *nto* | *qnx*) version_type=qnx need_lib_prefix=no need_version=no library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' soname_spec='$libname$release$shared_ext$major' shlibpath_var=LD_LIBRARY_PATH shlibpath_overrides_runpath=no hardcode_into_libs=yes dynamic_linker='ldqnx.so' ;; openbsd* | bitrig*) version_type=sunos sys_lib_dlsearch_path_spec=/usr/lib need_lib_prefix=no if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`"; then need_version=no else need_version=yes fi library_names_spec='$libname$release$shared_ext$versuffix $libname$shared_ext$versuffix' finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir' shlibpath_var=LD_LIBRARY_PATH shlibpath_overrides_runpath=yes ;; os2*) libname_spec='$name' version_type=windows shrext_cmds=.dll need_version=no need_lib_prefix=no # OS/2 can only load a DLL with a base name of 8 characters or less. soname_spec='`test -n "$os2dllname" && libname="$os2dllname"; v=$($ECHO $release$versuffix | tr -d .-); n=$($ECHO $libname | cut -b -$((8 - ${#v})) | tr . 
_); $ECHO $n$v`$shared_ext' library_names_spec='${libname}_dll.$libext' dynamic_linker='OS/2 ld.exe' shlibpath_var=BEGINLIBPATH sys_lib_search_path_spec="/lib /usr/lib /usr/local/lib" sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec postinstall_cmds='base_file=`basename \$file`~ dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\$base_file'\''i; $ECHO \$dlname'\''`~ dldir=$destdir/`dirname \$dlpath`~ test -d \$dldir || mkdir -p \$dldir~ $install_prog $dir/$dlname \$dldir/$dlname~ chmod a+x \$dldir/$dlname~ if test -n '\''$stripme'\'' && test -n '\''$striplib'\''; then eval '\''$striplib \$dldir/$dlname'\'' || exit \$?; fi' postuninstall_cmds='dldll=`$SHELL 2>&1 -c '\''. $file; $ECHO \$dlname'\''`~ dlpath=$dir/\$dldll~ $RM \$dlpath' ;; osf3* | osf4* | osf5*) version_type=osf need_lib_prefix=no need_version=no soname_spec='$libname$release$shared_ext$major' library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' shlibpath_var=LD_LIBRARY_PATH sys_lib_search_path_spec="/usr/shlib /usr/ccs/lib /usr/lib/cmplrs/cc /usr/lib /usr/local/lib /var/shlib" sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec ;; rdos*) dynamic_linker=no ;; solaris*) version_type=linux # correct to gnu/linux during the next big refactor need_lib_prefix=no need_version=no library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' soname_spec='$libname$release$shared_ext$major' shlibpath_var=LD_LIBRARY_PATH shlibpath_overrides_runpath=yes hardcode_into_libs=yes # ldd complains unless libraries are executable postinstall_cmds='chmod +x $lib' ;; sunos4*) version_type=sunos library_names_spec='$libname$release$shared_ext$versuffix $libname$shared_ext$versuffix' finish_cmds='PATH="\$PATH:/usr/etc" ldconfig $libdir' shlibpath_var=LD_LIBRARY_PATH shlibpath_overrides_runpath=yes if test yes = "$with_gnu_ld"; then need_lib_prefix=no fi need_version=yes ;; sysv4 | sysv4.3*) version_type=linux # correct to 
gnu/linux during the next big refactor library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' soname_spec='$libname$release$shared_ext$major' shlibpath_var=LD_LIBRARY_PATH case $host_vendor in sni) shlibpath_overrides_runpath=no need_lib_prefix=no runpath_var=LD_RUN_PATH ;; siemens) need_lib_prefix=no ;; motorola) need_lib_prefix=no need_version=no shlibpath_overrides_runpath=no sys_lib_search_path_spec='/lib /usr/lib /usr/ccs/lib' ;; esac ;; sysv4*MP*) if test -d /usr/nec; then version_type=linux # correct to gnu/linux during the next big refactor library_names_spec='$libname$shared_ext.$versuffix $libname$shared_ext.$major $libname$shared_ext' soname_spec='$libname$shared_ext.$major' shlibpath_var=LD_LIBRARY_PATH fi ;; sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX* | sysv4*uw2*) version_type=sco need_lib_prefix=no need_version=no library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext $libname$shared_ext' soname_spec='$libname$release$shared_ext$major' shlibpath_var=LD_LIBRARY_PATH shlibpath_overrides_runpath=yes hardcode_into_libs=yes if test yes = "$with_gnu_ld"; then sys_lib_search_path_spec='/usr/local/lib /usr/gnu/lib /usr/ccs/lib /usr/lib /lib' else sys_lib_search_path_spec='/usr/ccs/lib /usr/lib' case $host_os in sco3.2v5*) sys_lib_search_path_spec="$sys_lib_search_path_spec /lib" ;; esac fi sys_lib_dlsearch_path_spec='/usr/lib' ;; tpf*) # TPF is a cross-target only. Preferred cross-host = GNU/Linux. 
version_type=linux # correct to gnu/linux during the next big refactor need_lib_prefix=no need_version=no library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' shlibpath_var=LD_LIBRARY_PATH shlibpath_overrides_runpath=no hardcode_into_libs=yes ;; uts4*) version_type=linux # correct to gnu/linux during the next big refactor library_names_spec='$libname$release$shared_ext$versuffix $libname$release$shared_ext$major $libname$shared_ext' soname_spec='$libname$release$shared_ext$major' shlibpath_var=LD_LIBRARY_PATH ;; *) dynamic_linker=no ;; esac AC_MSG_RESULT([$dynamic_linker]) test no = "$dynamic_linker" && can_build_shared=no variables_saved_for_relink="PATH $shlibpath_var $runpath_var" if test yes = "$GCC"; then variables_saved_for_relink="$variables_saved_for_relink GCC_EXEC_PREFIX COMPILER_PATH LIBRARY_PATH" fi if test set = "${lt_cv_sys_lib_search_path_spec+set}"; then sys_lib_search_path_spec=$lt_cv_sys_lib_search_path_spec fi if test set = "${lt_cv_sys_lib_dlsearch_path_spec+set}"; then sys_lib_dlsearch_path_spec=$lt_cv_sys_lib_dlsearch_path_spec fi # remember unaugmented sys_lib_dlsearch_path content for libtool script decls... configure_time_dlsearch_path=$sys_lib_dlsearch_path_spec # ... 
but it needs LT_SYS_LIBRARY_PATH munging for other configure-time code func_munge_path_list sys_lib_dlsearch_path_spec "$LT_SYS_LIBRARY_PATH" # to be used as default LT_SYS_LIBRARY_PATH value in generated libtool configure_time_lt_sys_library_path=$LT_SYS_LIBRARY_PATH _LT_DECL([], [variables_saved_for_relink], [1], [Variables whose values should be saved in libtool wrapper scripts and restored at link time]) _LT_DECL([], [need_lib_prefix], [0], [Do we need the "lib" prefix for modules?]) _LT_DECL([], [need_version], [0], [Do we need a version for libraries?]) _LT_DECL([], [version_type], [0], [Library versioning type]) _LT_DECL([], [runpath_var], [0], [Shared library runtime path variable]) _LT_DECL([], [shlibpath_var], [0],[Shared library path variable]) _LT_DECL([], [shlibpath_overrides_runpath], [0], [Is shlibpath searched before the hard-coded library search path?]) _LT_DECL([], [libname_spec], [1], [Format of library name prefix]) _LT_DECL([], [library_names_spec], [1], [[List of archive names. First name is the real one, the rest are links. 
The last name is the one that the linker finds with -lNAME]]) _LT_DECL([], [soname_spec], [1], [[The coded name of the library, if different from the real name]]) _LT_DECL([], [install_override_mode], [1], [Permission mode override for installation of shared libraries]) _LT_DECL([], [postinstall_cmds], [2], [Command to use after installation of a shared archive]) _LT_DECL([], [postuninstall_cmds], [2], [Command to use after uninstallation of a shared archive]) _LT_DECL([], [finish_cmds], [2], [Commands used to finish a libtool library installation in a directory]) _LT_DECL([], [finish_eval], [1], [[As "finish_cmds", except a single script fragment to be evaled but not shown]]) _LT_DECL([], [hardcode_into_libs], [0], [Whether we should hardcode library paths into libraries]) _LT_DECL([], [sys_lib_search_path_spec], [2], [Compile-time system search path for libraries]) _LT_DECL([sys_lib_dlsearch_path_spec], [configure_time_dlsearch_path], [2], [Detected run-time system search path for libraries]) _LT_DECL([], [configure_time_lt_sys_library_path], [2], [Explicit LT_SYS_LIBRARY_PATH set during ./configure time]) ])# _LT_SYS_DYNAMIC_LINKER # _LT_PATH_TOOL_PREFIX(TOOL) # -------------------------- # find a file program that can recognize shared library AC_DEFUN([_LT_PATH_TOOL_PREFIX], [m4_require([_LT_DECL_EGREP])dnl AC_MSG_CHECKING([for $1]) AC_CACHE_VAL(lt_cv_path_MAGIC_CMD, [case $MAGIC_CMD in [[\\/*] | ?:[\\/]*]) lt_cv_path_MAGIC_CMD=$MAGIC_CMD # Let the user override the test with a path. ;; *) lt_save_MAGIC_CMD=$MAGIC_CMD lt_save_ifs=$IFS; IFS=$PATH_SEPARATOR dnl $ac_dummy forces splitting on constant user-supplied paths. dnl POSIX.2 word splitting is done only on the output of word expansions, dnl not every word. This closes a longstanding sh security hole. ac_dummy="m4_if([$2], , $PATH, [$2])" for ac_dir in $ac_dummy; do IFS=$lt_save_ifs test -z "$ac_dir" && ac_dir=. 
if test -f "$ac_dir/$1"; then lt_cv_path_MAGIC_CMD=$ac_dir/"$1" if test -n "$file_magic_test_file"; then case $deplibs_check_method in "file_magic "*) file_magic_regex=`expr "$deplibs_check_method" : "file_magic \(.*\)"` MAGIC_CMD=$lt_cv_path_MAGIC_CMD if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null | $EGREP "$file_magic_regex" > /dev/null; then : else cat <<_LT_EOF 1>&2 *** Warning: the command libtool uses to detect shared libraries, *** $file_magic_cmd, produces output that libtool cannot recognize. *** The result is that libtool may fail to recognize shared libraries *** as such. This will affect the creation of libtool libraries that *** depend on shared libraries, but programs linked with such libtool *** libraries will work regardless of this problem. Nevertheless, you *** may want to report the problem to your system manager and/or to *** bug-libtool@gnu.org _LT_EOF fi ;; esac fi break fi done IFS=$lt_save_ifs MAGIC_CMD=$lt_save_MAGIC_CMD ;; esac]) MAGIC_CMD=$lt_cv_path_MAGIC_CMD if test -n "$MAGIC_CMD"; then AC_MSG_RESULT($MAGIC_CMD) else AC_MSG_RESULT(no) fi _LT_DECL([], [MAGIC_CMD], [0], [Used to examine libraries when file_magic_cmd begins with "file"])dnl ])# _LT_PATH_TOOL_PREFIX # Old name: AU_ALIAS([AC_PATH_TOOL_PREFIX], [_LT_PATH_TOOL_PREFIX]) dnl aclocal-1.4 backwards compatibility: dnl AC_DEFUN([AC_PATH_TOOL_PREFIX], []) # _LT_PATH_MAGIC # -------------- # find a file program that can recognize a shared library m4_defun([_LT_PATH_MAGIC], [_LT_PATH_TOOL_PREFIX(${ac_tool_prefix}file, /usr/bin$PATH_SEPARATOR$PATH) if test -z "$lt_cv_path_MAGIC_CMD"; then if test -n "$ac_tool_prefix"; then _LT_PATH_TOOL_PREFIX(file, /usr/bin$PATH_SEPARATOR$PATH) else MAGIC_CMD=: fi fi ])# _LT_PATH_MAGIC # LT_PATH_LD # ---------- # find the pathname to the GNU or non-GNU linker AC_DEFUN([LT_PATH_LD], [AC_REQUIRE([AC_PROG_CC])dnl AC_REQUIRE([AC_CANONICAL_HOST])dnl AC_REQUIRE([AC_CANONICAL_BUILD])dnl m4_require([_LT_DECL_SED])dnl 
m4_require([_LT_DECL_EGREP])dnl m4_require([_LT_PROG_ECHO_BACKSLASH])dnl AC_ARG_WITH([gnu-ld], [AS_HELP_STRING([--with-gnu-ld], [assume the C compiler uses GNU ld @<:@default=no@:>@])], [test no = "$withval" || with_gnu_ld=yes], [with_gnu_ld=no])dnl ac_prog=ld if test yes = "$GCC"; then # Check if gcc -print-prog-name=ld gives a path. AC_MSG_CHECKING([for ld used by $CC]) case $host in *-*-mingw*) # gcc leaves a trailing carriage return, which upsets mingw ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;; *) ac_prog=`($CC -print-prog-name=ld) 2>&5` ;; esac case $ac_prog in # Accept absolute paths. [[\\/]]* | ?:[[\\/]]*) re_direlt='/[[^/]][[^/]]*/\.\./' # Canonicalize the pathname of ld ac_prog=`$ECHO "$ac_prog"| $SED 's%\\\\%/%g'` while $ECHO "$ac_prog" | $GREP "$re_direlt" > /dev/null 2>&1; do ac_prog=`$ECHO $ac_prog| $SED "s%$re_direlt%/%"` done test -z "$LD" && LD=$ac_prog ;; "") # If it fails, then pretend we aren't using GCC. ac_prog=ld ;; *) # If it is relative, then search for the first ld in PATH. with_gnu_ld=unknown ;; esac elif test yes = "$with_gnu_ld"; then AC_MSG_CHECKING([for GNU ld]) else AC_MSG_CHECKING([for non-GNU ld]) fi AC_CACHE_VAL(lt_cv_path_LD, [if test -z "$LD"; then lt_save_ifs=$IFS; IFS=$PATH_SEPARATOR for ac_dir in $PATH; do IFS=$lt_save_ifs test -z "$ac_dir" && ac_dir=. if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then lt_cv_path_LD=$ac_dir/$ac_prog # Check to see if the program is GNU ld. I'd rather use --version, # but apparently some variants of GNU ld only accept -v. # Break only if it was the GNU/non-GNU ld that we prefer. 
case `"$lt_cv_path_LD" -v 2>&1 &1 conftest.i cat conftest.i conftest.i >conftest2.i : ${lt_DD:=$DD} AC_PATH_PROGS_FEATURE_CHECK([lt_DD], [dd], [if "$ac_path_lt_DD" bs=32 count=1 conftest.out 2>/dev/null; then cmp -s conftest.i conftest.out \ && ac_cv_path_lt_DD="$ac_path_lt_DD" ac_path_lt_DD_found=: fi]) rm -f conftest.i conftest2.i conftest.out]) ])# _LT_PATH_DD # _LT_CMD_TRUNCATE # ---------------- # find command to truncate a binary pipe m4_defun([_LT_CMD_TRUNCATE], [m4_require([_LT_PATH_DD]) AC_CACHE_CHECK([how to truncate binary pipes], [lt_cv_truncate_bin], [printf 0123456789abcdef0123456789abcdef >conftest.i cat conftest.i conftest.i >conftest2.i lt_cv_truncate_bin= if "$ac_cv_path_lt_DD" bs=32 count=1 conftest.out 2>/dev/null; then cmp -s conftest.i conftest.out \ && lt_cv_truncate_bin="$ac_cv_path_lt_DD bs=4096 count=1" fi rm -f conftest.i conftest2.i conftest.out test -z "$lt_cv_truncate_bin" && lt_cv_truncate_bin="$SED -e 4q"]) _LT_DECL([lt_truncate_bin], [lt_cv_truncate_bin], [1], [Command to truncate a binary pipe]) ])# _LT_CMD_TRUNCATE # _LT_CHECK_MAGIC_METHOD # ---------------------- # how to check for library dependencies # -- PORTME fill in with the dynamic library characteristics m4_defun([_LT_CHECK_MAGIC_METHOD], [m4_require([_LT_DECL_EGREP]) m4_require([_LT_DECL_OBJDUMP]) AC_CACHE_CHECK([how to recognize dependent libraries], lt_cv_deplibs_check_method, [lt_cv_file_magic_cmd='$MAGIC_CMD' lt_cv_file_magic_test_file= lt_cv_deplibs_check_method='unknown' # Need to set the preceding variable on all platforms that support # interlibrary dependencies. # 'none' -- dependencies not supported. # 'unknown' -- same as none, but documents that we really don't know. # 'pass_all' -- all dependencies passed with no checks. # 'test_compile' -- check by making test program. # 'file_magic [[regex]]' -- check by looking for files in library path # that responds to the $file_magic_cmd with a given extended regex. 
# If you have 'file' or equivalent on your system and you're not sure # whether 'pass_all' will *always* work, you probably want this one. case $host_os in aix[[4-9]]*) lt_cv_deplibs_check_method=pass_all ;; beos*) lt_cv_deplibs_check_method=pass_all ;; bsdi[[45]]*) lt_cv_deplibs_check_method='file_magic ELF [[0-9]][[0-9]]*-bit [[ML]]SB (shared object|dynamic lib)' lt_cv_file_magic_cmd='/usr/bin/file -L' lt_cv_file_magic_test_file=/shlib/libc.so ;; cygwin*) # func_win32_libid is a shell function defined in ltmain.sh lt_cv_deplibs_check_method='file_magic ^x86 archive import|^x86 DLL' lt_cv_file_magic_cmd='func_win32_libid' ;; mingw* | pw32*) # Base MSYS/MinGW do not provide the 'file' command needed by # func_win32_libid shell function, so use a weaker test based on 'objdump', # unless we find 'file', for example because we are cross-compiling. if ( file / ) >/dev/null 2>&1; then lt_cv_deplibs_check_method='file_magic ^x86 archive import|^x86 DLL' lt_cv_file_magic_cmd='func_win32_libid' else # Keep this pattern in sync with the one in func_win32_libid. lt_cv_deplibs_check_method='file_magic file format (pei*-i386(.*architecture: i386)?|pe-arm-wince|pe-x86-64)' lt_cv_file_magic_cmd='$OBJDUMP -f' fi ;; cegcc*) # use the weaker test based on 'objdump'. See mingw*. lt_cv_deplibs_check_method='file_magic file format pe-arm-.*little(.*architecture: arm)?' lt_cv_file_magic_cmd='$OBJDUMP -f' ;; darwin* | rhapsody*) lt_cv_deplibs_check_method=pass_all ;; freebsd* | dragonfly*) if echo __ELF__ | $CC -E - | $GREP __ELF__ > /dev/null; then case $host_cpu in i*86 ) # Not sure whether the presence of OpenBSD here was a mistake. # Let's accept both of them until this is cleared up. 
lt_cv_deplibs_check_method='file_magic (FreeBSD|OpenBSD|DragonFly)/i[[3-9]]86 (compact )?demand paged shared library' lt_cv_file_magic_cmd=/usr/bin/file lt_cv_file_magic_test_file=`echo /usr/lib/libc.so.*` ;; esac else lt_cv_deplibs_check_method=pass_all fi ;; haiku*) lt_cv_deplibs_check_method=pass_all ;; hpux10.20* | hpux11*) lt_cv_file_magic_cmd=/usr/bin/file case $host_cpu in ia64*) lt_cv_deplibs_check_method='file_magic (s[[0-9]][[0-9]][[0-9]]|ELF-[[0-9]][[0-9]]) shared object file - IA64' lt_cv_file_magic_test_file=/usr/lib/hpux32/libc.so ;; hppa*64*) [lt_cv_deplibs_check_method='file_magic (s[0-9][0-9][0-9]|ELF[ -][0-9][0-9])(-bit)?( [LM]SB)? shared object( file)?[, -]* PA-RISC [0-9]\.[0-9]'] lt_cv_file_magic_test_file=/usr/lib/pa20_64/libc.sl ;; *) lt_cv_deplibs_check_method='file_magic (s[[0-9]][[0-9]][[0-9]]|PA-RISC[[0-9]]\.[[0-9]]) shared library' lt_cv_file_magic_test_file=/usr/lib/libc.sl ;; esac ;; interix[[3-9]]*) # PIC code is broken on Interix 3.x, that's why |\.a not |_pic\.a here lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so|\.a)$' ;; irix5* | irix6* | nonstopux*) case $LD in *-32|*"-32 ") libmagic=32-bit;; *-n32|*"-n32 ") libmagic=N32;; *-64|*"-64 ") libmagic=64-bit;; *) libmagic=never-match;; esac lt_cv_deplibs_check_method=pass_all ;; # This must be glibc/ELF. 
linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*) lt_cv_deplibs_check_method=pass_all ;; netbsd*) if echo __ELF__ | $CC -E - | $GREP __ELF__ > /dev/null; then lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so\.[[0-9]]+\.[[0-9]]+|_pic\.a)$' else lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so|_pic\.a)$' fi ;; newos6*) lt_cv_deplibs_check_method='file_magic ELF [[0-9]][[0-9]]*-bit [[ML]]SB (executable|dynamic lib)' lt_cv_file_magic_cmd=/usr/bin/file lt_cv_file_magic_test_file=/usr/lib/libnls.so ;; *nto* | *qnx*) lt_cv_deplibs_check_method=pass_all ;; openbsd* | bitrig*) if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`"; then lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so\.[[0-9]]+\.[[0-9]]+|\.so|_pic\.a)$' else lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so\.[[0-9]]+\.[[0-9]]+|_pic\.a)$' fi ;; osf3* | osf4* | osf5*) lt_cv_deplibs_check_method=pass_all ;; rdos*) lt_cv_deplibs_check_method=pass_all ;; solaris*) lt_cv_deplibs_check_method=pass_all ;; sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX* | sysv4*uw2*) lt_cv_deplibs_check_method=pass_all ;; sysv4 | sysv4.3*) case $host_vendor in motorola) lt_cv_deplibs_check_method='file_magic ELF [[0-9]][[0-9]]*-bit [[ML]]SB (shared object|dynamic lib) M[[0-9]][[0-9]]* Version [[0-9]]' lt_cv_file_magic_test_file=`echo /usr/lib/libc.so*` ;; ncr) lt_cv_deplibs_check_method=pass_all ;; sequent) lt_cv_file_magic_cmd='/bin/file' lt_cv_deplibs_check_method='file_magic ELF [[0-9]][[0-9]]*-bit [[LM]]SB (shared object|dynamic lib )' ;; sni) lt_cv_file_magic_cmd='/bin/file' lt_cv_deplibs_check_method="file_magic ELF [[0-9]][[0-9]]*-bit [[LM]]SB dynamic lib" lt_cv_file_magic_test_file=/lib/libc.so ;; siemens) lt_cv_deplibs_check_method=pass_all ;; pc) lt_cv_deplibs_check_method=pass_all ;; esac ;; tpf*) lt_cv_deplibs_check_method=pass_all ;; os2*) lt_cv_deplibs_check_method=pass_all ;; esac ]) file_magic_glob= want_nocaseglob=no if test "$build" = "$host"; then case $host_os in mingw* | 
pw32*) if ( shopt | grep nocaseglob ) >/dev/null 2>&1; then want_nocaseglob=yes else file_magic_glob=`echo aAbBcCdDeEfFgGhHiIjJkKlLmMnNoOpPqQrRsStTuUvVwWxXyYzZ | $SED -e "s/\(..\)/s\/[[\1]]\/[[\1]]\/g;/g"` fi ;; esac fi file_magic_cmd=$lt_cv_file_magic_cmd deplibs_check_method=$lt_cv_deplibs_check_method test -z "$deplibs_check_method" && deplibs_check_method=unknown _LT_DECL([], [deplibs_check_method], [1], [Method to check whether dependent libraries are shared objects]) _LT_DECL([], [file_magic_cmd], [1], [Command to use when deplibs_check_method = "file_magic"]) _LT_DECL([], [file_magic_glob], [1], [How to find potential files when deplibs_check_method = "file_magic"]) _LT_DECL([], [want_nocaseglob], [1], [Find potential files using nocaseglob when deplibs_check_method = "file_magic"]) ])# _LT_CHECK_MAGIC_METHOD # LT_PATH_NM # ---------- # find the pathname to a BSD- or MS-compatible name lister AC_DEFUN([LT_PATH_NM], [AC_REQUIRE([AC_PROG_CC])dnl AC_CACHE_CHECK([for BSD- or MS-compatible name lister (nm)], lt_cv_path_NM, [if test -n "$NM"; then # Let the user override the test. lt_cv_path_NM=$NM else lt_nm_to_check=${ac_tool_prefix}nm if test -n "$ac_tool_prefix" && test "$build" = "$host"; then lt_nm_to_check="$lt_nm_to_check nm" fi for lt_tmp_nm in $lt_nm_to_check; do lt_save_ifs=$IFS; IFS=$PATH_SEPARATOR for ac_dir in $PATH /usr/ccs/bin/elf /usr/ccs/bin /usr/ucb /bin; do IFS=$lt_save_ifs test -z "$ac_dir" && ac_dir=. tmp_nm=$ac_dir/$lt_tmp_nm if test -f "$tmp_nm" || test -f "$tmp_nm$ac_exeext"; then # Check to see if the nm accepts a BSD-compat flag. 
# Adding the 'sed 1q' prevents false positives on HP-UX, which says: # nm: unknown option "B" ignored # Tru64's nm complains that /dev/null is an invalid object file # MSYS converts /dev/null to NUL, MinGW nm treats NUL as empty case $build_os in mingw*) lt_bad_file=conftest.nm/nofile ;; *) lt_bad_file=/dev/null ;; esac case `"$tmp_nm" -B $lt_bad_file 2>&1 | sed '1q'` in *$lt_bad_file* | *'Invalid file or object type'*) lt_cv_path_NM="$tmp_nm -B" break 2 ;; *) case `"$tmp_nm" -p /dev/null 2>&1 | sed '1q'` in */dev/null*) lt_cv_path_NM="$tmp_nm -p" break 2 ;; *) lt_cv_path_NM=${lt_cv_path_NM="$tmp_nm"} # keep the first match, but continue # so that we can try to find one that supports BSD flags ;; esac ;; esac fi done IFS=$lt_save_ifs done : ${lt_cv_path_NM=no} fi]) if test no != "$lt_cv_path_NM"; then NM=$lt_cv_path_NM else # Didn't find any BSD compatible name lister, look for dumpbin. if test -n "$DUMPBIN"; then : # Let the user override the test. else AC_CHECK_TOOLS(DUMPBIN, [dumpbin "link -dump"], :) case `$DUMPBIN -symbols -headers /dev/null 2>&1 | sed '1q'` in *COFF*) DUMPBIN="$DUMPBIN -symbols -headers" ;; *) DUMPBIN=: ;; esac fi AC_SUBST([DUMPBIN]) if test : != "$DUMPBIN"; then NM=$DUMPBIN fi fi test -z "$NM" && NM=nm AC_SUBST([NM]) _LT_DECL([], [NM], [1], [A BSD- or MS-compatible name lister])dnl AC_CACHE_CHECK([the name lister ($NM) interface], [lt_cv_nm_interface], [lt_cv_nm_interface="BSD nm" echo "int some_variable = 0;" > conftest.$ac_ext (eval echo "\"\$as_me:$LINENO: $ac_compile\"" >&AS_MESSAGE_LOG_FD) (eval "$ac_compile" 2>conftest.err) cat conftest.err >&AS_MESSAGE_LOG_FD (eval echo "\"\$as_me:$LINENO: $NM \\\"conftest.$ac_objext\\\"\"" >&AS_MESSAGE_LOG_FD) (eval "$NM \"conftest.$ac_objext\"" 2>conftest.err > conftest.out) cat conftest.err >&AS_MESSAGE_LOG_FD (eval echo "\"\$as_me:$LINENO: output\"" >&AS_MESSAGE_LOG_FD) cat conftest.out >&AS_MESSAGE_LOG_FD if $GREP 'External.*some_variable' conftest.out > /dev/null; then lt_cv_nm_interface="MS 
dumpbin" fi rm -f conftest*]) ])# LT_PATH_NM # Old names: AU_ALIAS([AM_PROG_NM], [LT_PATH_NM]) AU_ALIAS([AC_PROG_NM], [LT_PATH_NM]) dnl aclocal-1.4 backwards compatibility: dnl AC_DEFUN([AM_PROG_NM], []) dnl AC_DEFUN([AC_PROG_NM], []) # _LT_CHECK_SHAREDLIB_FROM_LINKLIB # -------------------------------- # how to determine the name of the shared library # associated with a specific link library. # -- PORTME fill in with the dynamic library characteristics m4_defun([_LT_CHECK_SHAREDLIB_FROM_LINKLIB], [m4_require([_LT_DECL_EGREP]) m4_require([_LT_DECL_OBJDUMP]) m4_require([_LT_DECL_DLLTOOL]) AC_CACHE_CHECK([how to associate runtime and link libraries], lt_cv_sharedlib_from_linklib_cmd, [lt_cv_sharedlib_from_linklib_cmd='unknown' case $host_os in cygwin* | mingw* | pw32* | cegcc*) # two different shell functions defined in ltmain.sh; # decide which one to use based on capabilities of $DLLTOOL case `$DLLTOOL --help 2>&1` in *--identify-strict*) lt_cv_sharedlib_from_linklib_cmd=func_cygming_dll_for_implib ;; *) lt_cv_sharedlib_from_linklib_cmd=func_cygming_dll_for_implib_fallback ;; esac ;; *) # fallback: assume linklib IS sharedlib lt_cv_sharedlib_from_linklib_cmd=$ECHO ;; esac ]) sharedlib_from_linklib_cmd=$lt_cv_sharedlib_from_linklib_cmd test -z "$sharedlib_from_linklib_cmd" && sharedlib_from_linklib_cmd=$ECHO _LT_DECL([], [sharedlib_from_linklib_cmd], [1], [Command to associate shared and link libraries]) ])# _LT_CHECK_SHAREDLIB_FROM_LINKLIB # _LT_PATH_MANIFEST_TOOL # ---------------------- # locate the manifest tool m4_defun([_LT_PATH_MANIFEST_TOOL], [AC_CHECK_TOOL(MANIFEST_TOOL, mt, :) test -z "$MANIFEST_TOOL" && MANIFEST_TOOL=mt AC_CACHE_CHECK([if $MANIFEST_TOOL is a manifest tool], [lt_cv_path_mainfest_tool], [lt_cv_path_mainfest_tool=no echo "$as_me:$LINENO: $MANIFEST_TOOL '-?'" >&AS_MESSAGE_LOG_FD $MANIFEST_TOOL '-?' 
2>conftest.err > conftest.out cat conftest.err >&AS_MESSAGE_LOG_FD if $GREP 'Manifest Tool' conftest.out > /dev/null; then lt_cv_path_mainfest_tool=yes fi rm -f conftest*]) if test yes != "$lt_cv_path_mainfest_tool"; then MANIFEST_TOOL=: fi _LT_DECL([], [MANIFEST_TOOL], [1], [Manifest tool])dnl ])# _LT_PATH_MANIFEST_TOOL # _LT_DLL_DEF_P([FILE]) # --------------------- # True iff FILE is a Windows DLL '.def' file. # Keep in sync with func_dll_def_p in the libtool script AC_DEFUN([_LT_DLL_DEF_P], [dnl test DEF = "`$SED -n dnl -e '\''s/^[[ ]]*//'\'' dnl Strip leading whitespace -e '\''/^\(;.*\)*$/d'\'' dnl Delete empty lines and comments -e '\''s/^\(EXPORTS\|LIBRARY\)\([[ ]].*\)*$/DEF/p'\'' dnl -e q dnl Only consider the first "real" line $1`" dnl ])# _LT_DLL_DEF_P # LT_LIB_M # -------- # check for math library AC_DEFUN([LT_LIB_M], [AC_REQUIRE([AC_CANONICAL_HOST])dnl LIBM= case $host in *-*-beos* | *-*-cegcc* | *-*-cygwin* | *-*-haiku* | *-*-pw32* | *-*-darwin*) # These system don't have libm, or don't need it ;; *-ncr-sysv4.3*) AC_CHECK_LIB(mw, _mwvalidcheckl, LIBM=-lmw) AC_CHECK_LIB(m, cos, LIBM="$LIBM -lm") ;; *) AC_CHECK_LIB(m, cos, LIBM=-lm) ;; esac AC_SUBST([LIBM]) ])# LT_LIB_M # Old name: AU_ALIAS([AC_CHECK_LIBM], [LT_LIB_M]) dnl aclocal-1.4 backwards compatibility: dnl AC_DEFUN([AC_CHECK_LIBM], []) # _LT_COMPILER_NO_RTTI([TAGNAME]) # ------------------------------- m4_defun([_LT_COMPILER_NO_RTTI], [m4_require([_LT_TAG_COMPILER])dnl _LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)= if test yes = "$GCC"; then case $cc_basename in nvcc*) _LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)=' -Xcompiler -fno-builtin' ;; *) _LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)=' -fno-builtin' ;; esac _LT_COMPILER_OPTION([if $compiler supports -fno-rtti -fno-exceptions], lt_cv_prog_compiler_rtti_exceptions, [-fno-rtti -fno-exceptions], [], [_LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)="$_LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1) -fno-rtti -fno-exceptions"]) fi 
_LT_TAGDECL([no_builtin_flag], [lt_prog_compiler_no_builtin_flag], [1], [Compiler flag to turn off builtin functions]) ])# _LT_COMPILER_NO_RTTI # _LT_CMD_GLOBAL_SYMBOLS # ---------------------- m4_defun([_LT_CMD_GLOBAL_SYMBOLS], [AC_REQUIRE([AC_CANONICAL_HOST])dnl AC_REQUIRE([AC_PROG_CC])dnl AC_REQUIRE([AC_PROG_AWK])dnl AC_REQUIRE([LT_PATH_NM])dnl AC_REQUIRE([LT_PATH_LD])dnl m4_require([_LT_DECL_SED])dnl m4_require([_LT_DECL_EGREP])dnl m4_require([_LT_TAG_COMPILER])dnl # Check for command to grab the raw symbol name followed by C symbol from nm. AC_MSG_CHECKING([command to parse $NM output from $compiler object]) AC_CACHE_VAL([lt_cv_sys_global_symbol_pipe], [ # These are sane defaults that work on at least a few old systems. # [They come from Ultrix. What could be older than Ultrix?!! ;)] # Character class describing NM global symbol codes. symcode='[[BCDEGRST]]' # Regexp to match symbols that can be accessed directly from C. sympat='\([[_A-Za-z]][[_A-Za-z0-9]]*\)' # Define system-specific variables. case $host_os in aix*) symcode='[[BCDT]]' ;; cygwin* | mingw* | pw32* | cegcc*) symcode='[[ABCDGISTW]]' ;; hpux*) if test ia64 = "$host_cpu"; then symcode='[[ABCDEGRST]]' fi ;; irix* | nonstopux*) symcode='[[BCDEGRST]]' ;; osf*) symcode='[[BCDEGQRST]]' ;; solaris*) symcode='[[BDRT]]' ;; sco3.2v5*) symcode='[[DT]]' ;; sysv4.2uw2*) symcode='[[DT]]' ;; sysv5* | sco5v6* | unixware* | OpenUNIX*) symcode='[[ABDT]]' ;; sysv4) symcode='[[DFNSTU]]' ;; esac # If we're using GNU nm, then use its standard symbol codes. case `$NM -V 2>&1` in *GNU* | *'with BFD'*) symcode='[[ABCDGIRSTW]]' ;; esac if test "$lt_cv_nm_interface" = "MS dumpbin"; then # Gets list of data symbols to import. lt_cv_sys_global_symbol_to_import="sed -n -e 's/^I .* \(.*\)$/\1/p'" # Adjust the below global symbol transforms to fixup imported variables. 
lt_cdecl_hook=" -e 's/^I .* \(.*\)$/extern __declspec(dllimport) char \1;/p'" lt_c_name_hook=" -e 's/^I .* \(.*\)$/ {\"\1\", (void *) 0},/p'" lt_c_name_lib_hook="\ -e 's/^I .* \(lib.*\)$/ {\"\1\", (void *) 0},/p'\ -e 's/^I .* \(.*\)$/ {\"lib\1\", (void *) 0},/p'" else # Disable hooks by default. lt_cv_sys_global_symbol_to_import= lt_cdecl_hook= lt_c_name_hook= lt_c_name_lib_hook= fi # Transform an extracted symbol line into a proper C declaration. # Some systems (esp. on ia64) link data and code symbols differently, # so use this general approach. lt_cv_sys_global_symbol_to_cdecl="sed -n"\ $lt_cdecl_hook\ " -e 's/^T .* \(.*\)$/extern int \1();/p'"\ " -e 's/^$symcode$symcode* .* \(.*\)$/extern char \1;/p'" # Transform an extracted symbol line into symbol name and symbol address lt_cv_sys_global_symbol_to_c_name_address="sed -n"\ $lt_c_name_hook\ " -e 's/^: \(.*\) .*$/ {\"\1\", (void *) 0},/p'"\ " -e 's/^$symcode$symcode* .* \(.*\)$/ {\"\1\", (void *) \&\1},/p'" # Transform an extracted symbol line into symbol name with lib prefix and # symbol address. lt_cv_sys_global_symbol_to_c_name_address_lib_prefix="sed -n"\ $lt_c_name_lib_hook\ " -e 's/^: \(.*\) .*$/ {\"\1\", (void *) 0},/p'"\ " -e 's/^$symcode$symcode* .* \(lib.*\)$/ {\"\1\", (void *) \&\1},/p'"\ " -e 's/^$symcode$symcode* .* \(.*\)$/ {\"lib\1\", (void *) \&\1},/p'" # Handle CRLF in mingw tool chain opt_cr= case $build_os in mingw*) opt_cr=`$ECHO 'x\{0,1\}' | tr x '\015'` # option cr in regexp ;; esac # Try without a prefix underscore, then with it. for ac_symprfx in "" "_"; do # Transform symcode, sympat, and symprfx into a raw symbol and a C symbol. symxfrm="\\1 $ac_symprfx\\2 \\2" # Write the raw and C identifiers. if test "$lt_cv_nm_interface" = "MS dumpbin"; then # Fake it for dumpbin and say T for any non-static function, # D for any global variable and I for any imported variable. # Also find C++ and __fastcall symbols from MSVC++, # which start with @ or ?. 
lt_cv_sys_global_symbol_pipe="$AWK ['"\ " {last_section=section; section=\$ 3};"\ " /^COFF SYMBOL TABLE/{for(i in hide) delete hide[i]};"\ " /Section length .*#relocs.*(pick any)/{hide[last_section]=1};"\ " /^ *Symbol name *: /{split(\$ 0,sn,\":\"); si=substr(sn[2],2)};"\ " /^ *Type *: code/{print \"T\",si,substr(si,length(prfx))};"\ " /^ *Type *: data/{print \"I\",si,substr(si,length(prfx))};"\ " \$ 0!~/External *\|/{next};"\ " / 0+ UNDEF /{next}; / UNDEF \([^|]\)*()/{next};"\ " {if(hide[section]) next};"\ " {f=\"D\"}; \$ 0~/\(\).*\|/{f=\"T\"};"\ " {split(\$ 0,a,/\||\r/); split(a[2],s)};"\ " s[1]~/^[@?]/{print f,s[1],s[1]; next};"\ " s[1]~prfx {split(s[1],t,\"@\"); print f,t[1],substr(t[1],length(prfx))}"\ " ' prfx=^$ac_symprfx]" else lt_cv_sys_global_symbol_pipe="sed -n -e 's/^.*[[ ]]\($symcode$symcode*\)[[ ]][[ ]]*$ac_symprfx$sympat$opt_cr$/$symxfrm/p'" fi lt_cv_sys_global_symbol_pipe="$lt_cv_sys_global_symbol_pipe | sed '/ __gnu_lto/d'" # Check to see that the pipe works correctly. pipe_works=no rm -f conftest* cat > conftest.$ac_ext <<_LT_EOF #ifdef __cplusplus extern "C" { #endif char nm_test_var; void nm_test_func(void); void nm_test_func(void){} #ifdef __cplusplus } #endif int main(){nm_test_var='a';nm_test_func();return(0);} _LT_EOF if AC_TRY_EVAL(ac_compile); then # Now try to grab the symbols. nlist=conftest.nm if AC_TRY_EVAL(NM conftest.$ac_objext \| "$lt_cv_sys_global_symbol_pipe" \> $nlist) && test -s "$nlist"; then # Try sorting and uniquifying the output. if sort "$nlist" | uniq > "$nlist"T; then mv -f "$nlist"T "$nlist" else rm -f "$nlist"T fi # Make sure that we snagged all the symbols we need. if $GREP ' nm_test_var$' "$nlist" >/dev/null; then if $GREP ' nm_test_func$' "$nlist" >/dev/null; then cat <<_LT_EOF > conftest.$ac_ext /* Keep this code in sync between libtool.m4, ltmain, lt_system.h, and tests. 
*/ #if defined _WIN32 || defined __CYGWIN__ || defined _WIN32_WCE /* DATA imports from DLLs on WIN32 can't be const, because runtime relocations are performed -- see ld's documentation on pseudo-relocs. */ # define LT@&t@_DLSYM_CONST #elif defined __osf__ /* This system does not cope well with relocations in const data. */ # define LT@&t@_DLSYM_CONST #else # define LT@&t@_DLSYM_CONST const #endif #ifdef __cplusplus extern "C" { #endif _LT_EOF # Now generate the symbol file. eval "$lt_cv_sys_global_symbol_to_cdecl"' < "$nlist" | $GREP -v main >> conftest.$ac_ext' cat <<_LT_EOF >> conftest.$ac_ext /* The mapping between symbol names and symbols. */ LT@&t@_DLSYM_CONST struct { const char *name; void *address; } lt__PROGRAM__LTX_preloaded_symbols[[]] = { { "@PROGRAM@", (void *) 0 }, _LT_EOF $SED "s/^$symcode$symcode* .* \(.*\)$/ {\"\1\", (void *) \&\1},/" < "$nlist" | $GREP -v main >> conftest.$ac_ext cat <<\_LT_EOF >> conftest.$ac_ext {0, (void *) 0} }; /* This works around a problem in FreeBSD linker */ #ifdef FREEBSD_WORKAROUND static const void *lt_preloaded_setup() { return lt__PROGRAM__LTX_preloaded_symbols; } #endif #ifdef __cplusplus } #endif _LT_EOF # Now try linking the two files. mv conftest.$ac_objext conftstm.$ac_objext lt_globsym_save_LIBS=$LIBS lt_globsym_save_CFLAGS=$CFLAGS LIBS=conftstm.$ac_objext CFLAGS="$CFLAGS$_LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)" if AC_TRY_EVAL(ac_link) && test -s conftest$ac_exeext; then pipe_works=yes fi LIBS=$lt_globsym_save_LIBS CFLAGS=$lt_globsym_save_CFLAGS else echo "cannot find nm_test_func in $nlist" >&AS_MESSAGE_LOG_FD fi else echo "cannot find nm_test_var in $nlist" >&AS_MESSAGE_LOG_FD fi else echo "cannot run $lt_cv_sys_global_symbol_pipe" >&AS_MESSAGE_LOG_FD fi else echo "$progname: failed program was:" >&AS_MESSAGE_LOG_FD cat conftest.$ac_ext >&5 fi rm -rf conftest* conftst* # Do not use the global_symbol_pipe unless it works. 
if test yes = "$pipe_works"; then break else lt_cv_sys_global_symbol_pipe= fi done ]) if test -z "$lt_cv_sys_global_symbol_pipe"; then lt_cv_sys_global_symbol_to_cdecl= fi if test -z "$lt_cv_sys_global_symbol_pipe$lt_cv_sys_global_symbol_to_cdecl"; then AC_MSG_RESULT(failed) else AC_MSG_RESULT(ok) fi # Response file support. if test "$lt_cv_nm_interface" = "MS dumpbin"; then nm_file_list_spec='@' elif $NM --help 2>/dev/null | grep '[[@]]FILE' >/dev/null; then nm_file_list_spec='@' fi _LT_DECL([global_symbol_pipe], [lt_cv_sys_global_symbol_pipe], [1], [Take the output of nm and produce a listing of raw symbols and C names]) _LT_DECL([global_symbol_to_cdecl], [lt_cv_sys_global_symbol_to_cdecl], [1], [Transform the output of nm in a proper C declaration]) _LT_DECL([global_symbol_to_import], [lt_cv_sys_global_symbol_to_import], [1], [Transform the output of nm into a list of symbols to manually relocate]) _LT_DECL([global_symbol_to_c_name_address], [lt_cv_sys_global_symbol_to_c_name_address], [1], [Transform the output of nm in a C name address pair]) _LT_DECL([global_symbol_to_c_name_address_lib_prefix], [lt_cv_sys_global_symbol_to_c_name_address_lib_prefix], [1], [Transform the output of nm in a C name address pair when lib prefix is needed]) _LT_DECL([nm_interface], [lt_cv_nm_interface], [1], [The name lister interface]) _LT_DECL([], [nm_file_list_spec], [1], [Specify filename containing input files for $NM]) ]) # _LT_CMD_GLOBAL_SYMBOLS # _LT_COMPILER_PIC([TAGNAME]) # --------------------------- m4_defun([_LT_COMPILER_PIC], [m4_require([_LT_TAG_COMPILER])dnl _LT_TAGVAR(lt_prog_compiler_wl, $1)= _LT_TAGVAR(lt_prog_compiler_pic, $1)= _LT_TAGVAR(lt_prog_compiler_static, $1)= m4_if([$1], [CXX], [ # C++ specific cases for pic, static, wl, etc. if test yes = "$GXX"; then _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_static, $1)='-static' case $host_os in aix*) # All AIX code is PIC. 
if test ia64 = "$host_cpu"; then # AIX 5 now supports IA64 processor _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' fi _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC' ;; amigaos*) case $host_cpu in powerpc) # see comment about AmigaOS4 .so support _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC' ;; m68k) # FIXME: we need at least 68020 code to build shared libraries, but # adding the '-m68020' flag to GCC prevents building anything better, # like '-m68040'. _LT_TAGVAR(lt_prog_compiler_pic, $1)='-m68020 -resident32 -malways-restore-a4' ;; esac ;; beos* | irix5* | irix6* | nonstopux* | osf3* | osf4* | osf5*) # PIC is the default for these OSes. ;; mingw* | cygwin* | os2* | pw32* | cegcc*) # This hack is so that the source file can tell whether it is being # built for inclusion in a dll (and should export symbols for example). # Although the cygwin gcc ignores -fPIC, still need this for old-style # (--disable-auto-import) libraries m4_if([$1], [GCJ], [], [_LT_TAGVAR(lt_prog_compiler_pic, $1)='-DDLL_EXPORT']) case $host_os in os2*) _LT_TAGVAR(lt_prog_compiler_static, $1)='$wl-static' ;; esac ;; darwin* | rhapsody*) # PIC is the default on this platform # Common symbols not allowed in MH_DYLIB files _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fno-common' ;; *djgpp*) # DJGPP does not support shared libraries at all _LT_TAGVAR(lt_prog_compiler_pic, $1)= ;; haiku*) # PIC is the default for Haiku. # The "-static" flag exists, but is broken. _LT_TAGVAR(lt_prog_compiler_static, $1)= ;; interix[[3-9]]*) # Interix 3.x gcc -fpic/-fPIC options generate broken code. # Instead, we relocate shared libraries at runtime. ;; sysv4*MP*) if test -d /usr/nec; then _LT_TAGVAR(lt_prog_compiler_pic, $1)=-Kconform_pic fi ;; hpux*) # PIC is the default for 64-bit PA HP-UX, but not for 32-bit # PA HP-UX. On IA64 HP-UX, PIC is the default but the pic flag # sets the default TLS model and affects inlining. 
case $host_cpu in hppa*64*) ;; *) _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC' ;; esac ;; *qnx* | *nto*) # QNX uses GNU C++, but need to define -shared option too, otherwise # it will coredump. _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC -shared' ;; *) _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC' ;; esac else case $host_os in aix[[4-9]]*) # All AIX code is PIC. if test ia64 = "$host_cpu"; then # AIX 5 now supports IA64 processor _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' else _LT_TAGVAR(lt_prog_compiler_static, $1)='-bnso -bI:/lib/syscalls.exp' fi ;; chorus*) case $cc_basename in cxch68*) # Green Hills C++ Compiler # _LT_TAGVAR(lt_prog_compiler_static, $1)="--no_auto_instantiation -u __main -u __premain -u _abort -r $COOL_DIR/lib/libOrb.a $MVME_DIR/lib/CC/libC.a $MVME_DIR/lib/classix/libcx.s.a" ;; esac ;; mingw* | cygwin* | os2* | pw32* | cegcc*) # This hack is so that the source file can tell whether it is being # built for inclusion in a dll (and should export symbols for example). m4_if([$1], [GCJ], [], [_LT_TAGVAR(lt_prog_compiler_pic, $1)='-DDLL_EXPORT']) ;; dgux*) case $cc_basename in ec++*) _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC' ;; ghcx*) # Green Hills C++ Compiler _LT_TAGVAR(lt_prog_compiler_pic, $1)='-pic' ;; *) ;; esac ;; freebsd* | dragonfly*) # FreeBSD uses GNU C++ ;; hpux9* | hpux10* | hpux11*) case $cc_basename in CC*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_static, $1)='$wl-a ${wl}archive' if test ia64 != "$host_cpu"; then _LT_TAGVAR(lt_prog_compiler_pic, $1)='+Z' fi ;; aCC*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_static, $1)='$wl-a ${wl}archive' case $host_cpu in hppa*64*|ia64*) # +Z the default ;; *) _LT_TAGVAR(lt_prog_compiler_pic, $1)='+Z' ;; esac ;; *) ;; esac ;; interix*) # This is c89, which is MS Visual C++ (no shared libs) # Anyone wants to do a port? 
;; irix5* | irix6* | nonstopux*) case $cc_basename in CC*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared' # CC pic flag -KPIC is the default. ;; *) ;; esac ;; linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*) case $cc_basename in KCC*) # KAI C++ Compiler _LT_TAGVAR(lt_prog_compiler_wl, $1)='--backend -Wl,' _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC' ;; ecpc* ) # old Intel C++ for x86_64, which still supported -KPIC. _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-static' ;; icpc* ) # Intel C++, used to be incompatible with GCC. # ICC 10 doesn't accept -KPIC any more. _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-static' ;; pgCC* | pgcpp*) # Portland Group C++ compiler _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fpic' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' ;; cxx*) # Compaq C++ # Make sure the PIC flag is empty. It appears that all Alpha # Linux and Compaq Tru64 Unix objects are PIC. _LT_TAGVAR(lt_prog_compiler_pic, $1)= _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared' ;; xlc* | xlC* | bgxl[[cC]]* | mpixl[[cC]]*) # IBM XL 8.0, 9.0 on PPC and BlueGene _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_pic, $1)='-qpic' _LT_TAGVAR(lt_prog_compiler_static, $1)='-qstaticlink' ;; *) case `$CC -V 2>&1 | sed 5q` in *Sun\ C*) # Sun C++ 5.9 _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Qoption ld ' ;; esac ;; esac ;; lynxos*) ;; m88k*) ;; mvs*) case $cc_basename in cxx*) _LT_TAGVAR(lt_prog_compiler_pic, $1)='-W c,exportall' ;; *) ;; esac ;; netbsd*) ;; *qnx* | *nto*) # QNX uses GNU C++, but need to define -shared option too, otherwise # it will coredump. 
_LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC -shared' ;; osf3* | osf4* | osf5*) case $cc_basename in KCC*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='--backend -Wl,' ;; RCC*) # Rational C++ 2.4.1 _LT_TAGVAR(lt_prog_compiler_pic, $1)='-pic' ;; cxx*) # Digital/Compaq C++ _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' # Make sure the PIC flag is empty. It appears that all Alpha # Linux and Compaq Tru64 Unix objects are PIC. _LT_TAGVAR(lt_prog_compiler_pic, $1)= _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared' ;; *) ;; esac ;; psos*) ;; solaris*) case $cc_basename in CC* | sunCC*) # Sun C++ 4.2, 5.x and Centerline C++ _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Qoption ld ' ;; gcx*) # Green Hills C++ Compiler _LT_TAGVAR(lt_prog_compiler_pic, $1)='-PIC' ;; *) ;; esac ;; sunos4*) case $cc_basename in CC*) # Sun C++ 4.x _LT_TAGVAR(lt_prog_compiler_pic, $1)='-pic' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' ;; lcc*) # Lucid _LT_TAGVAR(lt_prog_compiler_pic, $1)='-pic' ;; *) ;; esac ;; sysv5* | unixware* | sco3.2v5* | sco5v6* | OpenUNIX*) case $cc_basename in CC*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' ;; esac ;; tandem*) case $cc_basename in NCC*) # NonStop-UX NCC 3.20 _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC' ;; *) ;; esac ;; vxworks*) ;; *) _LT_TAGVAR(lt_prog_compiler_can_build_shared, $1)=no ;; esac fi ], [ if test yes = "$GCC"; then _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_static, $1)='-static' case $host_os in aix*) # All AIX code is PIC. 
if test ia64 = "$host_cpu"; then # AIX 5 now supports IA64 processor _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' fi _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC' ;; amigaos*) case $host_cpu in powerpc) # see comment about AmigaOS4 .so support _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC' ;; m68k) # FIXME: we need at least 68020 code to build shared libraries, but # adding the '-m68020' flag to GCC prevents building anything better, # like '-m68040'. _LT_TAGVAR(lt_prog_compiler_pic, $1)='-m68020 -resident32 -malways-restore-a4' ;; esac ;; beos* | irix5* | irix6* | nonstopux* | osf3* | osf4* | osf5*) # PIC is the default for these OSes. ;; mingw* | cygwin* | pw32* | os2* | cegcc*) # This hack is so that the source file can tell whether it is being # built for inclusion in a dll (and should export symbols for example). # Although the cygwin gcc ignores -fPIC, still need this for old-style # (--disable-auto-import) libraries m4_if([$1], [GCJ], [], [_LT_TAGVAR(lt_prog_compiler_pic, $1)='-DDLL_EXPORT']) case $host_os in os2*) _LT_TAGVAR(lt_prog_compiler_static, $1)='$wl-static' ;; esac ;; darwin* | rhapsody*) # PIC is the default on this platform # Common symbols not allowed in MH_DYLIB files _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fno-common' ;; haiku*) # PIC is the default for Haiku. # The "-static" flag exists, but is broken. _LT_TAGVAR(lt_prog_compiler_static, $1)= ;; hpux*) # PIC is the default for 64-bit PA HP-UX, but not for 32-bit # PA HP-UX. On IA64 HP-UX, PIC is the default but the pic flag # sets the default TLS model and affects inlining. case $host_cpu in hppa*64*) # +Z the default ;; *) _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC' ;; esac ;; interix[[3-9]]*) # Interix 3.x gcc -fpic/-fPIC options generate broken code. # Instead, we relocate shared libraries at runtime. ;; msdosdjgpp*) # Just because we use GCC doesn't mean we suddenly get shared libraries # on systems that don't support them. 
_LT_TAGVAR(lt_prog_compiler_can_build_shared, $1)=no enable_shared=no ;; *nto* | *qnx*) # QNX uses GNU C++, but need to define -shared option too, otherwise # it will coredump. _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC -shared' ;; sysv4*MP*) if test -d /usr/nec; then _LT_TAGVAR(lt_prog_compiler_pic, $1)=-Kconform_pic fi ;; *) _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC' ;; esac case $cc_basename in nvcc*) # Cuda Compiler Driver 2.2 _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Xlinker ' if test -n "$_LT_TAGVAR(lt_prog_compiler_pic, $1)"; then _LT_TAGVAR(lt_prog_compiler_pic, $1)="-Xcompiler $_LT_TAGVAR(lt_prog_compiler_pic, $1)" fi ;; esac else # PORTME Check for flag to pass linker flags through the system compiler. case $host_os in aix*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' if test ia64 = "$host_cpu"; then # AIX 5 now supports IA64 processor _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' else _LT_TAGVAR(lt_prog_compiler_static, $1)='-bnso -bI:/lib/syscalls.exp' fi ;; darwin* | rhapsody*) # PIC is the default on this platform # Common symbols not allowed in MH_DYLIB files _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fno-common' case $cc_basename in nagfor*) # NAG Fortran compiler _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,-Wl,,' _LT_TAGVAR(lt_prog_compiler_pic, $1)='-PIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' ;; esac ;; mingw* | cygwin* | pw32* | os2* | cegcc*) # This hack is so that the source file can tell whether it is being # built for inclusion in a dll (and should export symbols for example). m4_if([$1], [GCJ], [], [_LT_TAGVAR(lt_prog_compiler_pic, $1)='-DDLL_EXPORT']) case $host_os in os2*) _LT_TAGVAR(lt_prog_compiler_static, $1)='$wl-static' ;; esac ;; hpux9* | hpux10* | hpux11*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' # PIC is the default for IA64 HP-UX and 64-bit HP-UX, but # not for PA HP-UX. 
case $host_cpu in hppa*64*|ia64*) # +Z the default ;; *) _LT_TAGVAR(lt_prog_compiler_pic, $1)='+Z' ;; esac # Is there a better lt_prog_compiler_static that works with the bundled CC? _LT_TAGVAR(lt_prog_compiler_static, $1)='$wl-a ${wl}archive' ;; irix5* | irix6* | nonstopux*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' # PIC (with -KPIC) is the default. _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared' ;; linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*) case $cc_basename in # old Intel for x86_64, which still supported -KPIC. ecc*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-static' ;; # icc used to be incompatible with GCC. # ICC 10 doesn't accept -KPIC any more. icc* | ifort*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-static' ;; # Lahey Fortran 8.1. lf95*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_pic, $1)='--shared' _LT_TAGVAR(lt_prog_compiler_static, $1)='--static' ;; nagfor*) # NAG Fortran compiler _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,-Wl,,' _LT_TAGVAR(lt_prog_compiler_pic, $1)='-PIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' ;; tcc*) # Fabrice Bellard et al's Tiny C Compiler _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-static' ;; pgcc* | pgf77* | pgf90* | pgf95* | pgfortran*) # Portland Group compilers (*not* the Pentium gcc compiler, # which looks to be a dead project) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fpic' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' ;; ccc*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' # All Alpha code is PIC. 
_LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared' ;; xl* | bgxl* | bgf* | mpixl*) # IBM XL C 8.0/Fortran 10.1, 11.1 on PPC and BlueGene _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_pic, $1)='-qpic' _LT_TAGVAR(lt_prog_compiler_static, $1)='-qstaticlink' ;; *) case `$CC -V 2>&1 | sed 5q` in *Sun\ Ceres\ Fortran* | *Sun*Fortran*\ [[1-7]].* | *Sun*Fortran*\ 8.[[0-3]]*) # Sun Fortran 8.3 passes all unrecognized flags to the linker _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' _LT_TAGVAR(lt_prog_compiler_wl, $1)='' ;; *Sun\ F* | *Sun*Fortran*) _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Qoption ld ' ;; *Sun\ C*) # Sun C 5.9 _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' ;; *Intel*\ [[CF]]*Compiler*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-static' ;; *Portland\ Group*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fpic' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' ;; esac ;; esac ;; newsos6) _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' ;; *nto* | *qnx*) # QNX uses GNU C++, but need to define -shared option too, otherwise # it will coredump. _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC -shared' ;; osf3* | osf4* | osf5*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' # All OSF/1 code is PIC. 
_LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared' ;; rdos*) _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared' ;; solaris*) _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' case $cc_basename in f77* | f90* | f95* | sunf77* | sunf90* | sunf95*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Qoption ld ';; *) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,';; esac ;; sunos4*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Qoption ld ' _LT_TAGVAR(lt_prog_compiler_pic, $1)='-PIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' ;; sysv4 | sysv4.2uw2* | sysv4.3*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' ;; sysv4*MP*) if test -d /usr/nec; then _LT_TAGVAR(lt_prog_compiler_pic, $1)='-Kconform_pic' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' fi ;; sysv5* | unixware* | sco3.2v5* | sco5v6* | OpenUNIX*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' ;; unicos*) _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' _LT_TAGVAR(lt_prog_compiler_can_build_shared, $1)=no ;; uts4*) _LT_TAGVAR(lt_prog_compiler_pic, $1)='-pic' _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' ;; *) _LT_TAGVAR(lt_prog_compiler_can_build_shared, $1)=no ;; esac fi ]) case $host_os in # For platforms that do not support PIC, -DPIC is meaningless: *djgpp*) _LT_TAGVAR(lt_prog_compiler_pic, $1)= ;; *) _LT_TAGVAR(lt_prog_compiler_pic, $1)="$_LT_TAGVAR(lt_prog_compiler_pic, $1)@&t@m4_if([$1],[],[ -DPIC],[m4_if([$1],[CXX],[ -DPIC],[])])" ;; esac AC_CACHE_CHECK([for $compiler option to produce PIC], [_LT_TAGVAR(lt_cv_prog_compiler_pic, $1)], [_LT_TAGVAR(lt_cv_prog_compiler_pic, $1)=$_LT_TAGVAR(lt_prog_compiler_pic, $1)]) _LT_TAGVAR(lt_prog_compiler_pic, $1)=$_LT_TAGVAR(lt_cv_prog_compiler_pic, $1) # # Check to make sure the PIC flag actually works. 
# if test -n "$_LT_TAGVAR(lt_prog_compiler_pic, $1)"; then _LT_COMPILER_OPTION([if $compiler PIC flag $_LT_TAGVAR(lt_prog_compiler_pic, $1) works], [_LT_TAGVAR(lt_cv_prog_compiler_pic_works, $1)], [$_LT_TAGVAR(lt_prog_compiler_pic, $1)@&t@m4_if([$1],[],[ -DPIC],[m4_if([$1],[CXX],[ -DPIC],[])])], [], [case $_LT_TAGVAR(lt_prog_compiler_pic, $1) in "" | " "*) ;; *) _LT_TAGVAR(lt_prog_compiler_pic, $1)=" $_LT_TAGVAR(lt_prog_compiler_pic, $1)" ;; esac], [_LT_TAGVAR(lt_prog_compiler_pic, $1)= _LT_TAGVAR(lt_prog_compiler_can_build_shared, $1)=no]) fi _LT_TAGDECL([pic_flag], [lt_prog_compiler_pic], [1], [Additional compiler flags for building library objects]) _LT_TAGDECL([wl], [lt_prog_compiler_wl], [1], [How to pass a linker flag through the compiler]) # # Check to make sure the static flag actually works. # wl=$_LT_TAGVAR(lt_prog_compiler_wl, $1) eval lt_tmp_static_flag=\"$_LT_TAGVAR(lt_prog_compiler_static, $1)\" _LT_LINKER_OPTION([if $compiler static flag $lt_tmp_static_flag works], _LT_TAGVAR(lt_cv_prog_compiler_static_works, $1), $lt_tmp_static_flag, [], [_LT_TAGVAR(lt_prog_compiler_static, $1)=]) _LT_TAGDECL([link_static_flag], [lt_prog_compiler_static], [1], [Compiler flag to prevent dynamic linking]) ])# _LT_COMPILER_PIC # _LT_LINKER_SHLIBS([TAGNAME]) # ---------------------------- # See if the linker supports building shared libraries. 
m4_defun([_LT_LINKER_SHLIBS], [AC_REQUIRE([LT_PATH_LD])dnl AC_REQUIRE([LT_PATH_NM])dnl m4_require([_LT_PATH_MANIFEST_TOOL])dnl m4_require([_LT_FILEUTILS_DEFAULTS])dnl m4_require([_LT_DECL_EGREP])dnl m4_require([_LT_DECL_SED])dnl m4_require([_LT_CMD_GLOBAL_SYMBOLS])dnl m4_require([_LT_TAG_COMPILER])dnl AC_MSG_CHECKING([whether the $compiler linker ($LD) supports shared libraries]) m4_if([$1], [CXX], [ _LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols' _LT_TAGVAR(exclude_expsyms, $1)=['_GLOBAL_OFFSET_TABLE_|_GLOBAL__F[ID]_.*'] case $host_os in aix[[4-9]]*) # If we're using GNU nm, then we don't want the "-C" option. # -C means demangle to GNU nm, but means don't demangle to AIX nm. # Without the "-l" option, or with the "-B" option, AIX nm treats # weak defined symbols like other global defined symbols, whereas # GNU nm marks them as "W". # While the 'weak' keyword is ignored in the Export File, we need # it in the Import File for the 'aix-soname' feature, so we have # to replace the "-B" option with "-P" for AIX nm. 
if $NM -V 2>&1 | $GREP 'GNU' > /dev/null; then _LT_TAGVAR(export_symbols_cmds, $1)='$NM -Bpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W")) && ([substr](\$ 3,1,1) != ".")) { if (\$ 2 == "W") { print \$ 3 " weak" } else { print \$ 3 } } }'\'' | sort -u > $export_symbols' else _LT_TAGVAR(export_symbols_cmds, $1)='`func_echo_all $NM | $SED -e '\''s/B\([[^B]]*\)$/P\1/'\''` -PCpgl $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W") || (\$ 2 == "V") || (\$ 2 == "Z")) && ([substr](\$ 1,1,1) != ".")) { if ((\$ 2 == "W") || (\$ 2 == "V") || (\$ 2 == "Z")) { print \$ 1 " weak" } else { print \$ 1 } } }'\'' | sort -u > $export_symbols' fi ;; pw32*) _LT_TAGVAR(export_symbols_cmds, $1)=$ltdll_cmds ;; cygwin* | mingw* | cegcc*) case $cc_basename in cl*) _LT_TAGVAR(exclude_expsyms, $1)='_NULL_IMPORT_DESCRIPTOR|_IMPORT_DESCRIPTOR_.*' ;; *) _LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED -e '\''/^[[BCDGRS]][[ ]]/s/.*[[ ]]\([[^ ]]*\)/\1 DATA/;s/^.*[[ ]]__nm__\([[^ ]]*\)[[ ]][[^ ]]*/\1 DATA/;/^I[[ ]]/d;/^[[AITW]][[ ]]/s/.* //'\'' | sort | uniq > $export_symbols' _LT_TAGVAR(exclude_expsyms, $1)=['[_]+GLOBAL_OFFSET_TABLE_|[_]+GLOBAL__[FID]_.*|[_]+head_[A-Za-z0-9_]+_dll|[A-Za-z0-9_]+_dll_iname'] ;; esac ;; *) _LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols' ;; esac ], [ runpath_var= _LT_TAGVAR(allow_undefined_flag, $1)= _LT_TAGVAR(always_export_symbols, $1)=no _LT_TAGVAR(archive_cmds, $1)= _LT_TAGVAR(archive_expsym_cmds, $1)= _LT_TAGVAR(compiler_needs_object, $1)=no _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=no _LT_TAGVAR(export_dynamic_flag_spec, $1)= _LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols' _LT_TAGVAR(hardcode_automatic, $1)=no 
_LT_TAGVAR(hardcode_direct, $1)=no _LT_TAGVAR(hardcode_direct_absolute, $1)=no _LT_TAGVAR(hardcode_libdir_flag_spec, $1)= _LT_TAGVAR(hardcode_libdir_separator, $1)= _LT_TAGVAR(hardcode_minus_L, $1)=no _LT_TAGVAR(hardcode_shlibpath_var, $1)=unsupported _LT_TAGVAR(inherit_rpath, $1)=no _LT_TAGVAR(link_all_deplibs, $1)=unknown _LT_TAGVAR(module_cmds, $1)= _LT_TAGVAR(module_expsym_cmds, $1)= _LT_TAGVAR(old_archive_from_new_cmds, $1)= _LT_TAGVAR(old_archive_from_expsyms_cmds, $1)= _LT_TAGVAR(thread_safe_flag_spec, $1)= _LT_TAGVAR(whole_archive_flag_spec, $1)= # include_expsyms should be a list of space-separated symbols to be *always* # included in the symbol list _LT_TAGVAR(include_expsyms, $1)= # exclude_expsyms can be an extended regexp of symbols to exclude # it will be wrapped by ' (' and ')$', so one must not match beginning or # end of line. Example: 'a|bc|.*d.*' will exclude the symbols 'a' and 'bc', # as well as any symbol that contains 'd'. _LT_TAGVAR(exclude_expsyms, $1)=['_GLOBAL_OFFSET_TABLE_|_GLOBAL__F[ID]_.*'] # Although _GLOBAL_OFFSET_TABLE_ is a valid symbol C name, most a.out # platforms (ab)use it in PIC code, but their linkers get confused if # the symbol is explicitly referenced. Since portable code cannot # rely on this symbol name, it's probably fine to never include it in # preloaded symbol tables. # Exclude shared library initialization/finalization symbols. dnl Note also adjust exclude_expsyms for C++ above. extract_expsyms_cmds= case $host_os in cygwin* | mingw* | pw32* | cegcc*) # FIXME: the MSVC++ port hasn't been tested in a loooong time # When not using gcc, we currently assume that we are using # Microsoft Visual C++. 
if test yes != "$GCC"; then with_gnu_ld=no fi ;; interix*) # we just hope/assume this is gcc and not c89 (= MSVC++) with_gnu_ld=yes ;; openbsd* | bitrig*) with_gnu_ld=no ;; esac _LT_TAGVAR(ld_shlibs, $1)=yes # On some targets, GNU ld is compatible enough with the native linker # that we're better off using the native interface for both. lt_use_gnu_ld_interface=no if test yes = "$with_gnu_ld"; then case $host_os in aix*) # The AIX port of GNU ld has always aspired to compatibility # with the native linker. However, as the warning in the GNU ld # block says, versions before 2.19.5* couldn't really create working # shared libraries, regardless of the interface used. case `$LD -v 2>&1` in *\ \(GNU\ Binutils\)\ 2.19.5*) ;; *\ \(GNU\ Binutils\)\ 2.[[2-9]]*) ;; *\ \(GNU\ Binutils\)\ [[3-9]]*) ;; *) lt_use_gnu_ld_interface=yes ;; esac ;; *) lt_use_gnu_ld_interface=yes ;; esac fi if test yes = "$lt_use_gnu_ld_interface"; then # If archive_cmds runs LD, not CC, wlarc should be empty wlarc='$wl' # Set some defaults for GNU ld with shared library support. These # are reset later if shared libraries are not supported. Putting them # here allows them to be overridden if necessary. runpath_var=LD_RUN_PATH _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir' _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl--export-dynamic' # ancient GNU ld didn't support --whole-archive et. al. if $LD --help 2>&1 | $GREP 'no-whole-archive' > /dev/null; then _LT_TAGVAR(whole_archive_flag_spec, $1)=$wlarc'--whole-archive$convenience '$wlarc'--no-whole-archive' else _LT_TAGVAR(whole_archive_flag_spec, $1)= fi supports_anon_versioning=no case `$LD -v | $SED -e 's/([^)]\+)\s\+//' 2>&1` in *GNU\ gold*) supports_anon_versioning=yes ;; *\ [[01]].* | *\ 2.[[0-9]].* | *\ 2.10.*) ;; # catch versions < 2.11 *\ 2.11.93.0.2\ *) supports_anon_versioning=yes ;; # RH7.3 ... *\ 2.11.92.0.12\ *) supports_anon_versioning=yes ;; # Mandrake 8.2 ... 
*\ 2.11.*) ;; # other 2.11 versions *) supports_anon_versioning=yes ;; esac # See if GNU ld supports shared libraries. case $host_os in aix[[3-9]]*) # On AIX/PPC, the GNU linker is very broken if test ia64 != "$host_cpu"; then _LT_TAGVAR(ld_shlibs, $1)=no cat <<_LT_EOF 1>&2 *** Warning: the GNU linker, at least up to release 2.19, is reported *** to be unable to reliably create shared libraries on AIX. *** Therefore, libtool is disabling shared libraries support. If you *** really care for shared libraries, you may want to install binutils *** 2.20 or above, or modify your PATH so that a non-GNU linker is found. *** You will then need to restart the configuration process. _LT_EOF fi ;; amigaos*) case $host_cpu in powerpc) # see comment about AmigaOS4 .so support _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='' ;; m68k) _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/a2ixlibrary.data~$ECHO "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$ECHO "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$ECHO "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$ECHO "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir' _LT_TAGVAR(hardcode_minus_L, $1)=yes ;; esac ;; beos*) if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then _LT_TAGVAR(allow_undefined_flag, $1)=unsupported # Joseph Beckenbach says some releases of gcc # support --undefined. This deserves some investigation. FIXME _LT_TAGVAR(archive_cmds, $1)='$CC -nostart $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib' else _LT_TAGVAR(ld_shlibs, $1)=no fi ;; cygwin* | mingw* | pw32* | cegcc*) # _LT_TAGVAR(hardcode_libdir_flag_spec, $1) is actually meaningless, # as there is no search path for DLLs. 
_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir' _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl--export-all-symbols' _LT_TAGVAR(allow_undefined_flag, $1)=unsupported _LT_TAGVAR(always_export_symbols, $1)=no _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes _LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED -e '\''/^[[BCDGRS]][[ ]]/s/.*[[ ]]\([[^ ]]*\)/\1 DATA/;s/^.*[[ ]]__nm__\([[^ ]]*\)[[ ]][[^ ]]*/\1 DATA/;/^I[[ ]]/d;/^[[AITW]][[ ]]/s/.* //'\'' | sort | uniq > $export_symbols' _LT_TAGVAR(exclude_expsyms, $1)=['[_]+GLOBAL_OFFSET_TABLE_|[_]+GLOBAL__[FID]_.*|[_]+head_[A-Za-z0-9_]+_dll|[A-Za-z0-9_]+_dll_iname'] if $LD --help 2>&1 | $GREP 'auto-import' > /dev/null; then _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -o $output_objdir/$soname $wl--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib' # If the export-symbols file already is a .def file, use it as # is; otherwise, prepend EXPORTS... _LT_TAGVAR(archive_expsym_cmds, $1)='if _LT_DLL_DEF_P([$export_symbols]); then cp $export_symbols $output_objdir/$soname.def; else echo EXPORTS > $output_objdir/$soname.def; cat $export_symbols >> $output_objdir/$soname.def; fi~ $CC -shared $output_objdir/$soname.def $libobjs $deplibs $compiler_flags -o $output_objdir/$soname $wl--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib' else _LT_TAGVAR(ld_shlibs, $1)=no fi ;; haiku*) _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib' _LT_TAGVAR(link_all_deplibs, $1)=yes ;; os2*) _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir' _LT_TAGVAR(hardcode_minus_L, $1)=yes _LT_TAGVAR(allow_undefined_flag, $1)=unsupported shrext_cmds=.dll _LT_TAGVAR(archive_cmds, $1)='$ECHO "LIBRARY ${soname%$shared_ext} INITINSTANCE TERMINSTANCE" > $output_objdir/$libname.def~ $ECHO "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~ $ECHO "DATA MULTIPLE NONSHARED" >> $output_objdir/$libname.def~ 
$ECHO EXPORTS >> $output_objdir/$libname.def~ emxexp $libobjs | $SED /"_DLL_InitTerm"/d >> $output_objdir/$libname.def~ $CC -Zdll -Zcrtdll -o $output_objdir/$soname $libobjs $deplibs $compiler_flags $output_objdir/$libname.def~ emximp -o $lib $output_objdir/$libname.def' _LT_TAGVAR(archive_expsym_cmds, $1)='$ECHO "LIBRARY ${soname%$shared_ext} INITINSTANCE TERMINSTANCE" > $output_objdir/$libname.def~ $ECHO "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~ $ECHO "DATA MULTIPLE NONSHARED" >> $output_objdir/$libname.def~ $ECHO EXPORTS >> $output_objdir/$libname.def~ prefix_cmds="$SED"~ if test EXPORTS = "`$SED 1q $export_symbols`"; then prefix_cmds="$prefix_cmds -e 1d"; fi~ prefix_cmds="$prefix_cmds -e \"s/^\(.*\)$/_\1/g\""~ cat $export_symbols | $prefix_cmds >> $output_objdir/$libname.def~ $CC -Zdll -Zcrtdll -o $output_objdir/$soname $libobjs $deplibs $compiler_flags $output_objdir/$libname.def~ emximp -o $lib $output_objdir/$libname.def' _LT_TAGVAR(old_archive_From_new_cmds, $1)='emximp -o $output_objdir/${libname}_dll.a $output_objdir/$libname.def' _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes ;; interix[[3-9]]*) _LT_TAGVAR(hardcode_direct, $1)=no _LT_TAGVAR(hardcode_shlibpath_var, $1)=no _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath,$libdir' _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-E' # Hack: On Interix 3.x, we cannot compile PIC because of a broken gcc. # Instead, shared libraries are loaded at an image base (0x10000000 by # default) and relocated if they conflict, which is a slow very memory # consuming and fragmenting process. To avoid this, we pick a random, # 256 KiB-aligned image base between 0x50000000 and 0x6FFC0000 at link # time. Moving up from 0x10000000 also allows more sbrk(2) space. 
_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-h,$soname $wl--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='sed "s|^|_|" $export_symbols >$output_objdir/$soname.expsym~$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-h,$soname $wl--retain-symbols-file,$output_objdir/$soname.expsym $wl--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib' ;; gnu* | linux* | tpf* | k*bsd*-gnu | kopensolaris*-gnu) tmp_diet=no if test linux-dietlibc = "$host_os"; then case $cc_basename in diet\ *) tmp_diet=yes;; # linux-dietlibc with static linking (!diet-dyn) esac fi if $LD --help 2>&1 | $EGREP ': supported targets:.* elf' > /dev/null \ && test no = "$tmp_diet" then tmp_addflag=' $pic_flag' tmp_sharedflag='-shared' case $cc_basename,$host_cpu in pgcc*) # Portland Group C compiler _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive`for conv in $convenience\"\"; do test -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` $wl--no-whole-archive' tmp_addflag=' $pic_flag' ;; pgf77* | pgf90* | pgf95* | pgfortran*) # Portland Group f77 and f90 compilers _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive`for conv in $convenience\"\"; do test -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` $wl--no-whole-archive' tmp_addflag=' $pic_flag -Mnomain' ;; ecc*,ia64* | icc*,ia64*) # Intel C compiler on ia64 tmp_addflag=' -i_dynamic' ;; efc*,ia64* | ifort*,ia64*) # Intel Fortran compiler on ia64 tmp_addflag=' -i_dynamic -nofor_main' ;; ifc* | ifort*) # Intel Fortran compiler tmp_addflag=' -nofor_main' ;; lf95*) # Lahey Fortran 8.1 _LT_TAGVAR(whole_archive_flag_spec, $1)= tmp_sharedflag='--shared' ;; nagfor*) # NAGFOR 5.3 tmp_sharedflag='-Wl,-shared' ;; xl[[cC]]* | bgxl[[cC]]* | mpixl[[cC]]*) # IBM XL C 8.0 on PPC (deal with xlf below) 
tmp_sharedflag='-qmkshrobj' tmp_addflag= ;; nvcc*) # Cuda Compiler Driver 2.2 _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive`for conv in $convenience\"\"; do test -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` $wl--no-whole-archive' _LT_TAGVAR(compiler_needs_object, $1)=yes ;; esac case `$CC -V 2>&1 | sed 5q` in *Sun\ C*) # Sun C 5.9 _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive`new_convenience=; for conv in $convenience\"\"; do test -z \"$conv\" || new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` $wl--no-whole-archive' _LT_TAGVAR(compiler_needs_object, $1)=yes tmp_sharedflag='-G' ;; *Sun\ F*) # Sun Fortran 8.3 tmp_sharedflag='-G' ;; esac _LT_TAGVAR(archive_cmds, $1)='$CC '"$tmp_sharedflag""$tmp_addflag"' $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib' if test yes = "$supports_anon_versioning"; then _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $output_objdir/$libname.ver~ cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~ echo "local: *; };" >> $output_objdir/$libname.ver~ $CC '"$tmp_sharedflag""$tmp_addflag"' $libobjs $deplibs $compiler_flags $wl-soname $wl$soname $wl-version-script $wl$output_objdir/$libname.ver -o $lib' fi case $cc_basename in tcc*) _LT_TAGVAR(export_dynamic_flag_spec, $1)='-rdynamic' ;; xlf* | bgf* | bgxlf* | mpixlf*) # IBM XL Fortran 10.1 on PPC cannot create shared libs itself _LT_TAGVAR(whole_archive_flag_spec, $1)='--whole-archive$convenience --no-whole-archive' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir' _LT_TAGVAR(archive_cmds, $1)='$LD -shared $libobjs $deplibs $linker_flags -soname $soname -o $lib' if test yes = "$supports_anon_versioning"; then _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $output_objdir/$libname.ver~ cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~ echo "local: *; };" >> 
$output_objdir/$libname.ver~ $LD -shared $libobjs $deplibs $linker_flags -soname $soname -version-script $output_objdir/$libname.ver -o $lib' fi ;; esac else _LT_TAGVAR(ld_shlibs, $1)=no fi ;; netbsd*) if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then _LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib' wlarc= else _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib' fi ;; solaris*) if $LD -v 2>&1 | $GREP 'BFD 2\.8' > /dev/null; then _LT_TAGVAR(ld_shlibs, $1)=no cat <<_LT_EOF 1>&2 *** Warning: The releases 2.8.* of the GNU linker cannot reliably *** create shared libraries on Solaris systems. Therefore, libtool *** is disabling shared libraries support. We urge you to upgrade GNU *** binutils to release 2.9.1 or newer. Another option is to modify *** your PATH or compiler configuration so that the native linker is *** used, and then restart. _LT_EOF elif $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib' else _LT_TAGVAR(ld_shlibs, $1)=no fi ;; sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX*) case `$LD -v 2>&1` in *\ [[01]].* | *\ 2.[[0-9]].* | *\ 2.1[[0-5]].*) _LT_TAGVAR(ld_shlibs, $1)=no cat <<_LT_EOF 1>&2 *** Warning: Releases of the GNU linker prior to 2.16.91.0.3 cannot *** reliably create shared libraries on SCO systems. Therefore, libtool *** is disabling shared libraries support. We urge you to upgrade GNU *** binutils to release 2.16.91.0.3 or newer. 
Another option is to modify *** your PATH or compiler configuration so that the native linker is *** used, and then restart. _LT_EOF ;; *) # For security reasons, it is highly recommended that you always # use absolute paths for naming shared libraries, and exclude the # DT_RUNPATH tag from executables and libraries. But doing so # requires that you compile everything twice, which is a pain. if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir' _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib' else _LT_TAGVAR(ld_shlibs, $1)=no fi ;; esac ;; sunos4*) _LT_TAGVAR(archive_cmds, $1)='$LD -assert pure-text -Bshareable -o $lib $libobjs $deplibs $linker_flags' wlarc= _LT_TAGVAR(hardcode_direct, $1)=yes _LT_TAGVAR(hardcode_shlibpath_var, $1)=no ;; *) if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib' else _LT_TAGVAR(ld_shlibs, $1)=no fi ;; esac if test no = "$_LT_TAGVAR(ld_shlibs, $1)"; then runpath_var= _LT_TAGVAR(hardcode_libdir_flag_spec, $1)= _LT_TAGVAR(export_dynamic_flag_spec, $1)= _LT_TAGVAR(whole_archive_flag_spec, $1)= fi else # PORTME fill in a description of your system's linker (not GNU ld) case $host_os in aix3*) _LT_TAGVAR(allow_undefined_flag, $1)=unsupported _LT_TAGVAR(always_export_symbols, $1)=yes _LT_TAGVAR(archive_expsym_cmds, $1)='$LD -o $output_objdir/$soname $libobjs $deplibs $linker_flags -bE:$export_symbols -T512 -H512 -bM:SRE~$AR $AR_FLAGS $lib 
$output_objdir/$soname' # Note: this linker hardcodes the directories in LIBPATH if there # are no directories specified by -L. _LT_TAGVAR(hardcode_minus_L, $1)=yes if test yes = "$GCC" && test -z "$lt_prog_compiler_static"; then # Neither direct hardcoding nor static linking is supported with a # broken collect2. _LT_TAGVAR(hardcode_direct, $1)=unsupported fi ;; aix[[4-9]]*) if test ia64 = "$host_cpu"; then # On IA64, the linker does run time linking by default, so we don't # have to do anything special. aix_use_runtimelinking=no exp_sym_flag='-Bexport' no_entry_flag= else # If we're using GNU nm, then we don't want the "-C" option. # -C means demangle to GNU nm, but means don't demangle to AIX nm. # Without the "-l" option, or with the "-B" option, AIX nm treats # weak defined symbols like other global defined symbols, whereas # GNU nm marks them as "W". # While the 'weak' keyword is ignored in the Export File, we need # it in the Import File for the 'aix-soname' feature, so we have # to replace the "-B" option with "-P" for AIX nm. if $NM -V 2>&1 | $GREP 'GNU' > /dev/null; then _LT_TAGVAR(export_symbols_cmds, $1)='$NM -Bpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W")) && ([substr](\$ 3,1,1) != ".")) { if (\$ 2 == "W") { print \$ 3 " weak" } else { print \$ 3 } } }'\'' | sort -u > $export_symbols' else _LT_TAGVAR(export_symbols_cmds, $1)='`func_echo_all $NM | $SED -e '\''s/B\([[^B]]*\)$/P\1/'\''` -PCpgl $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W") || (\$ 2 == "V") || (\$ 2 == "Z")) && ([substr](\$ 1,1,1) != ".")) { if ((\$ 2 == "W") || (\$ 2 == "V") || (\$ 2 == "Z")) { print \$ 1 " weak" } else { print \$ 1 } } }'\'' | sort -u > $export_symbols' fi aix_use_runtimelinking=no # Test if we are trying to use run time linking or normal # AIX style linking. If -brtl is somewhere in LDFLAGS, we # have runtime linking enabled, and use it for executables. 
# For shared libraries, we enable/disable runtime linking # depending on the kind of the shared library created - # when "with_aix_soname,aix_use_runtimelinking" is: # "aix,no" lib.a(lib.so.V) shared, rtl:no, for executables # "aix,yes" lib.so shared, rtl:yes, for executables # lib.a static archive # "both,no" lib.so.V(shr.o) shared, rtl:yes # lib.a(lib.so.V) shared, rtl:no, for executables # "both,yes" lib.so.V(shr.o) shared, rtl:yes, for executables # lib.a(lib.so.V) shared, rtl:no # "svr4,*" lib.so.V(shr.o) shared, rtl:yes, for executables # lib.a static archive case $host_os in aix4.[[23]]|aix4.[[23]].*|aix[[5-9]]*) for ld_flag in $LDFLAGS; do if (test x-brtl = "x$ld_flag" || test x-Wl,-brtl = "x$ld_flag"); then aix_use_runtimelinking=yes break fi done if test svr4,no = "$with_aix_soname,$aix_use_runtimelinking"; then # With aix-soname=svr4, we create the lib.so.V shared archives only, # so we don't have lib.a shared libs to link our executables. # We have to force runtime linking in this case. aix_use_runtimelinking=yes LDFLAGS="$LDFLAGS -Wl,-brtl" fi ;; esac exp_sym_flag='-bexport' no_entry_flag='-bnoentry' fi # When large executables or shared objects are built, AIX ld can # have problems creating the table of contents. If linking a library # or program results in "error TOC overflow" add -mminimal-toc to # CXXFLAGS/CFLAGS for g++/gcc. In the cases where that is not # enough to fix the problem, add -Wl,-bbigtoc to LDFLAGS. _LT_TAGVAR(archive_cmds, $1)='' _LT_TAGVAR(hardcode_direct, $1)=yes _LT_TAGVAR(hardcode_direct_absolute, $1)=yes _LT_TAGVAR(hardcode_libdir_separator, $1)=':' _LT_TAGVAR(link_all_deplibs, $1)=yes _LT_TAGVAR(file_list_spec, $1)='$wl-f,' case $with_aix_soname,$aix_use_runtimelinking in aix,*) ;; # traditional, no import file svr4,* | *,yes) # use import file # The Import File defines what to hardcode. 
_LT_TAGVAR(hardcode_direct, $1)=no _LT_TAGVAR(hardcode_direct_absolute, $1)=no ;; esac if test yes = "$GCC"; then case $host_os in aix4.[[012]]|aix4.[[012]].*) # We only want to do this on AIX 4.2 and lower, the check # below for broken collect2 doesn't work under 4.3+ collect2name=`$CC -print-prog-name=collect2` if test -f "$collect2name" && strings "$collect2name" | $GREP resolve_lib_name >/dev/null then # We have reworked collect2 : else # We have old collect2 _LT_TAGVAR(hardcode_direct, $1)=unsupported # It fails to find uninstalled libraries when the uninstalled # path is not listed in the libpath. Setting hardcode_minus_L # to unsupported forces relinking _LT_TAGVAR(hardcode_minus_L, $1)=yes _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir' _LT_TAGVAR(hardcode_libdir_separator, $1)= fi ;; esac shared_flag='-shared' if test yes = "$aix_use_runtimelinking"; then shared_flag="$shared_flag "'$wl-G' fi # Need to ensure runtime linking is disabled for the traditional # shared library, or the linker may eventually find shared libraries # /with/ Import File - we do not want to mix them. shared_flag_aix='-shared' shared_flag_svr4='-shared $wl-G' else # not using gcc if test ia64 = "$host_cpu"; then # VisualAge C++, Version 5.5 for AIX 5L for IA-64, Beta 3 Release # chokes on -Wl,-G. The following line is correct: shared_flag='-G' else if test yes = "$aix_use_runtimelinking"; then shared_flag='$wl-G' else shared_flag='$wl-bM:SRE' fi shared_flag_aix='$wl-bM:SRE' shared_flag_svr4='$wl-G' fi fi _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-bexpall' # It seems that -bexpall does not export symbols beginning with # underscore (_), so it is better to generate a list of symbols to export. _LT_TAGVAR(always_export_symbols, $1)=yes if test aix,yes = "$with_aix_soname,$aix_use_runtimelinking"; then # Warning - without using the other runtime loading flags (-brtl), # -berok will link without error, but may produce a broken library. 
_LT_TAGVAR(allow_undefined_flag, $1)='-berok' # Determine the default libpath from the value encoded in an # empty executable. _LT_SYS_MODULE_PATH_AIX([$1]) _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-blibpath:$libdir:'"$aix_libpath" _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $deplibs $wl'$no_entry_flag' $compiler_flags `if test -n "$allow_undefined_flag"; then func_echo_all "$wl$allow_undefined_flag"; else :; fi` $wl'$exp_sym_flag:\$export_symbols' '$shared_flag else if test ia64 = "$host_cpu"; then _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-R $libdir:/usr/lib:/lib' _LT_TAGVAR(allow_undefined_flag, $1)="-z nodefs" _LT_TAGVAR(archive_expsym_cmds, $1)="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs '"\$wl$no_entry_flag"' $compiler_flags $wl$allow_undefined_flag '"\$wl$exp_sym_flag:\$export_symbols" else # Determine the default libpath from the value encoded in an # empty executable. _LT_SYS_MODULE_PATH_AIX([$1]) _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-blibpath:$libdir:'"$aix_libpath" # Warning - without using the other run time loading flags, # -berok will link without error, but may produce a broken library. _LT_TAGVAR(no_undefined_flag, $1)=' $wl-bernotok' _LT_TAGVAR(allow_undefined_flag, $1)=' $wl-berok' if test yes = "$with_gnu_ld"; then # We only use this code for GNU lds that support --whole-archive. 
_LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive$convenience $wl--no-whole-archive' else # Exported symbols can be pulled into shared objects from archives _LT_TAGVAR(whole_archive_flag_spec, $1)='$convenience' fi _LT_TAGVAR(archive_cmds_need_lc, $1)=yes _LT_TAGVAR(archive_expsym_cmds, $1)='$RM -r $output_objdir/$realname.d~$MKDIR $output_objdir/$realname.d' # -brtl affects multiple linker settings, -berok does not and is overridden later compiler_flags_filtered='`func_echo_all "$compiler_flags " | $SED -e "s%-brtl\\([[, ]]\\)%-berok\\1%g"`' if test svr4 != "$with_aix_soname"; then # This is similar to how AIX traditionally builds its shared libraries. _LT_TAGVAR(archive_expsym_cmds, $1)="$_LT_TAGVAR(archive_expsym_cmds, $1)"'~$CC '$shared_flag_aix' -o $output_objdir/$realname.d/$soname $libobjs $deplibs $wl-bnoentry '$compiler_flags_filtered'$wl-bE:$export_symbols$allow_undefined_flag~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$realname.d/$soname' fi if test aix != "$with_aix_soname"; then _LT_TAGVAR(archive_expsym_cmds, $1)="$_LT_TAGVAR(archive_expsym_cmds, $1)"'~$CC '$shared_flag_svr4' -o $output_objdir/$realname.d/$shared_archive_member_spec.o $libobjs $deplibs $wl-bnoentry '$compiler_flags_filtered'$wl-bE:$export_symbols$allow_undefined_flag~$STRIP -e $output_objdir/$realname.d/$shared_archive_member_spec.o~( func_echo_all "#! 
$soname($shared_archive_member_spec.o)"; if test shr_64 = "$shared_archive_member_spec"; then func_echo_all "# 64"; else func_echo_all "# 32"; fi; cat $export_symbols ) > $output_objdir/$realname.d/$shared_archive_member_spec.imp~$AR $AR_FLAGS $output_objdir/$soname $output_objdir/$realname.d/$shared_archive_member_spec.o $output_objdir/$realname.d/$shared_archive_member_spec.imp' else # used by -dlpreopen to get the symbols _LT_TAGVAR(archive_expsym_cmds, $1)="$_LT_TAGVAR(archive_expsym_cmds, $1)"'~$MV $output_objdir/$realname.d/$soname $output_objdir' fi _LT_TAGVAR(archive_expsym_cmds, $1)="$_LT_TAGVAR(archive_expsym_cmds, $1)"'~$RM -r $output_objdir/$realname.d' fi fi ;; amigaos*) case $host_cpu in powerpc) # see comment about AmigaOS4 .so support _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='' ;; m68k) _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/a2ixlibrary.data~$ECHO "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$ECHO "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$ECHO "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$ECHO "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir' _LT_TAGVAR(hardcode_minus_L, $1)=yes ;; esac ;; bsdi[[45]]*) _LT_TAGVAR(export_dynamic_flag_spec, $1)=-rdynamic ;; cygwin* | mingw* | pw32* | cegcc*) # When not using gcc, we currently assume that we are using # Microsoft Visual C++. # hardcode_libdir_flag_spec is actually meaningless, as there is # no search path for DLLs. case $cc_basename in cl*) # Native MSVC _LT_TAGVAR(hardcode_libdir_flag_spec, $1)=' ' _LT_TAGVAR(allow_undefined_flag, $1)=unsupported _LT_TAGVAR(always_export_symbols, $1)=yes _LT_TAGVAR(file_list_spec, $1)='@' # Tell ltmain to make .lib files, not .a files. 
libext=lib # Tell ltmain to make .dll files, not .so files. shrext_cmds=.dll # FIXME: Setting linknames here is a bad hack. _LT_TAGVAR(archive_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $compiler_flags $deplibs -Wl,-DLL,-IMPLIB:"$tool_output_objdir$libname.dll.lib"~linknames=' _LT_TAGVAR(archive_expsym_cmds, $1)='if _LT_DLL_DEF_P([$export_symbols]); then cp "$export_symbols" "$output_objdir/$soname.def"; echo "$tool_output_objdir$soname.def" > "$output_objdir/$soname.exp"; else $SED -e '\''s/^/-link -EXPORT:/'\'' < $export_symbols > $output_objdir/$soname.exp; fi~ $CC -o $tool_output_objdir$soname $libobjs $compiler_flags $deplibs "@$tool_output_objdir$soname.exp" -Wl,-DLL,-IMPLIB:"$tool_output_objdir$libname.dll.lib"~ linknames=' # The linker will not automatically build a static lib if we build a DLL. # _LT_TAGVAR(old_archive_from_new_cmds, $1)='true' _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes _LT_TAGVAR(exclude_expsyms, $1)='_NULL_IMPORT_DESCRIPTOR|_IMPORT_DESCRIPTOR_.*' _LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED -e '\''/^[[BCDGRS]][[ ]]/s/.*[[ ]]\([[^ ]]*\)/\1,DATA/'\'' | $SED -e '\''/^[[AITW]][[ ]]/s/.*[[ ]]//'\'' | sort | uniq > $export_symbols' # Don't use ranlib _LT_TAGVAR(old_postinstall_cmds, $1)='chmod 644 $oldlib' _LT_TAGVAR(postlink_cmds, $1)='lt_outputfile="@OUTPUT@"~ lt_tool_outputfile="@TOOL_OUTPUT@"~ case $lt_outputfile in *.exe|*.EXE) ;; *) lt_outputfile=$lt_outputfile.exe lt_tool_outputfile=$lt_tool_outputfile.exe ;; esac~ if test : != "$MANIFEST_TOOL" && test -f "$lt_outputfile.manifest"; then $MANIFEST_TOOL -manifest "$lt_tool_outputfile.manifest" -outputresource:"$lt_tool_outputfile" || exit 1; $RM "$lt_outputfile.manifest"; fi' ;; *) # Assume MSVC wrapper _LT_TAGVAR(hardcode_libdir_flag_spec, $1)=' ' _LT_TAGVAR(allow_undefined_flag, $1)=unsupported # Tell ltmain to make .lib files, not .a files. libext=lib # Tell ltmain to make .dll files, not .so files. 
shrext_cmds=.dll # FIXME: Setting linknames here is a bad hack. _LT_TAGVAR(archive_cmds, $1)='$CC -o $lib $libobjs $compiler_flags `func_echo_all "$deplibs" | $SED '\''s/ -lc$//'\''` -link -dll~linknames=' # The linker will automatically build a .lib file if we build a DLL. _LT_TAGVAR(old_archive_from_new_cmds, $1)='true' # FIXME: Should let the user specify the lib program. _LT_TAGVAR(old_archive_cmds, $1)='lib -OUT:$oldlib$oldobjs$old_deplibs' _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes ;; esac ;; darwin* | rhapsody*) _LT_DARWIN_LINKER_FEATURES($1) ;; dgux*) _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir' _LT_TAGVAR(hardcode_shlibpath_var, $1)=no ;; # FreeBSD 2.2.[012] allows us to include c++rt0.o to get C++ constructor # support. Future versions do this automatically, but an explicit c++rt0.o # does not break anything, and helps significantly (at the cost of a little # extra space). freebsd2.2*) _LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags /usr/lib/c++rt0.o' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir' _LT_TAGVAR(hardcode_direct, $1)=yes _LT_TAGVAR(hardcode_shlibpath_var, $1)=no ;; # Unfortunately, older versions of FreeBSD 2 do not have this feature. freebsd2.*) _LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags' _LT_TAGVAR(hardcode_direct, $1)=yes _LT_TAGVAR(hardcode_minus_L, $1)=yes _LT_TAGVAR(hardcode_shlibpath_var, $1)=no ;; # FreeBSD 3 and greater uses gcc -shared to do shared libraries. 
freebsd* | dragonfly*) _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir' _LT_TAGVAR(hardcode_direct, $1)=yes _LT_TAGVAR(hardcode_shlibpath_var, $1)=no ;; hpux9*) if test yes = "$GCC"; then _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -shared $pic_flag $wl+b $wl$install_libdir -o $output_objdir/$soname $libobjs $deplibs $compiler_flags~test "x$output_objdir/$soname" = "x$lib" || mv $output_objdir/$soname $lib' else _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$LD -b +b $install_libdir -o $output_objdir/$soname $libobjs $deplibs $linker_flags~test "x$output_objdir/$soname" = "x$lib" || mv $output_objdir/$soname $lib' fi _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl+b $wl$libdir' _LT_TAGVAR(hardcode_libdir_separator, $1)=: _LT_TAGVAR(hardcode_direct, $1)=yes # hardcode_minus_L: Not really in the search PATH, # but as the default location of the library. _LT_TAGVAR(hardcode_minus_L, $1)=yes _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-E' ;; hpux10*) if test yes,no = "$GCC,$with_gnu_ld"; then _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $wl+h $wl$soname $wl+b $wl$install_libdir -o $lib $libobjs $deplibs $compiler_flags' else _LT_TAGVAR(archive_cmds, $1)='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags' fi if test no = "$with_gnu_ld"; then _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl+b $wl$libdir' _LT_TAGVAR(hardcode_libdir_separator, $1)=: _LT_TAGVAR(hardcode_direct, $1)=yes _LT_TAGVAR(hardcode_direct_absolute, $1)=yes _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-E' # hardcode_minus_L: Not really in the search PATH, # but as the default location of the library. 
_LT_TAGVAR(hardcode_minus_L, $1)=yes fi ;; hpux11*) if test yes,no = "$GCC,$with_gnu_ld"; then case $host_cpu in hppa*64*) _LT_TAGVAR(archive_cmds, $1)='$CC -shared $wl+h $wl$soname -o $lib $libobjs $deplibs $compiler_flags' ;; ia64*) _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $wl+h $wl$soname $wl+nodefaultrpath -o $lib $libobjs $deplibs $compiler_flags' ;; *) _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $wl+h $wl$soname $wl+b $wl$install_libdir -o $lib $libobjs $deplibs $compiler_flags' ;; esac else case $host_cpu in hppa*64*) _LT_TAGVAR(archive_cmds, $1)='$CC -b $wl+h $wl$soname -o $lib $libobjs $deplibs $compiler_flags' ;; ia64*) _LT_TAGVAR(archive_cmds, $1)='$CC -b $wl+h $wl$soname $wl+nodefaultrpath -o $lib $libobjs $deplibs $compiler_flags' ;; *) m4_if($1, [], [ # Older versions of the 11.00 compiler do not understand -b yet # (HP92453-01 A.11.01.20 doesn't, HP92453-01 B.11.X.35175-35176.GP does) _LT_LINKER_OPTION([if $CC understands -b], _LT_TAGVAR(lt_cv_prog_compiler__b, $1), [-b], [_LT_TAGVAR(archive_cmds, $1)='$CC -b $wl+h $wl$soname $wl+b $wl$install_libdir -o $lib $libobjs $deplibs $compiler_flags'], [_LT_TAGVAR(archive_cmds, $1)='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags'])], [_LT_TAGVAR(archive_cmds, $1)='$CC -b $wl+h $wl$soname $wl+b $wl$install_libdir -o $lib $libobjs $deplibs $compiler_flags']) ;; esac fi if test no = "$with_gnu_ld"; then _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl+b $wl$libdir' _LT_TAGVAR(hardcode_libdir_separator, $1)=: case $host_cpu in hppa*64*|ia64*) _LT_TAGVAR(hardcode_direct, $1)=no _LT_TAGVAR(hardcode_shlibpath_var, $1)=no ;; *) _LT_TAGVAR(hardcode_direct, $1)=yes _LT_TAGVAR(hardcode_direct_absolute, $1)=yes _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-E' # hardcode_minus_L: Not really in the search PATH, # but as the default location of the library. 
_LT_TAGVAR(hardcode_minus_L, $1)=yes ;; esac fi ;; irix5* | irix6* | nonstopux*) if test yes = "$GCC"; then _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib' # Try to use the -exported_symbol ld option, if it does not # work, assume that -exports_file does not work either and # implicitly export all symbols. # This should be the same for all languages, so no per-tag cache variable. AC_CACHE_CHECK([whether the $host_os linker accepts -exported_symbol], [lt_cv_irix_exported_symbol], [save_LDFLAGS=$LDFLAGS LDFLAGS="$LDFLAGS -shared $wl-exported_symbol ${wl}foo $wl-update_registry $wl/dev/null" AC_LINK_IFELSE( [AC_LANG_SOURCE( [AC_LANG_CASE([C], [[int foo (void) { return 0; }]], [C++], [[int foo (void) { return 0; }]], [Fortran 77], [[ subroutine foo end]], [Fortran], [[ subroutine foo end]])])], [lt_cv_irix_exported_symbol=yes], [lt_cv_irix_exported_symbol=no]) LDFLAGS=$save_LDFLAGS]) if test yes = "$lt_cv_irix_exported_symbol"; then _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations $wl-exports_file $wl$export_symbols -o $lib' fi else _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -exports_file $export_symbols -o $lib' fi _LT_TAGVAR(archive_cmds_need_lc, $1)='no' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir' 
_LT_TAGVAR(hardcode_libdir_separator, $1)=: _LT_TAGVAR(inherit_rpath, $1)=yes _LT_TAGVAR(link_all_deplibs, $1)=yes ;; linux*) case $cc_basename in tcc*) # Fabrice Bellard et al's Tiny C Compiler _LT_TAGVAR(ld_shlibs, $1)=yes _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags' ;; esac ;; netbsd*) if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then _LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags' # a.out else _LT_TAGVAR(archive_cmds, $1)='$LD -shared -o $lib $libobjs $deplibs $linker_flags' # ELF fi _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir' _LT_TAGVAR(hardcode_direct, $1)=yes _LT_TAGVAR(hardcode_shlibpath_var, $1)=no ;; newsos6) _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags' _LT_TAGVAR(hardcode_direct, $1)=yes _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir' _LT_TAGVAR(hardcode_libdir_separator, $1)=: _LT_TAGVAR(hardcode_shlibpath_var, $1)=no ;; *nto* | *qnx*) ;; openbsd* | bitrig*) if test -f /usr/libexec/ld.so; then _LT_TAGVAR(hardcode_direct, $1)=yes _LT_TAGVAR(hardcode_shlibpath_var, $1)=no _LT_TAGVAR(hardcode_direct_absolute, $1)=yes if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`"; then _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags $wl-retain-symbols-file,$export_symbols' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath,$libdir' _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-E' else _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath,$libdir' fi else _LT_TAGVAR(ld_shlibs, $1)=no fi ;; os2*) _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir' _LT_TAGVAR(hardcode_minus_L, $1)=yes _LT_TAGVAR(allow_undefined_flag, $1)=unsupported shrext_cmds=.dll 
_LT_TAGVAR(archive_cmds, $1)='$ECHO "LIBRARY ${soname%$shared_ext} INITINSTANCE TERMINSTANCE" > $output_objdir/$libname.def~ $ECHO "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~ $ECHO "DATA MULTIPLE NONSHARED" >> $output_objdir/$libname.def~ $ECHO EXPORTS >> $output_objdir/$libname.def~ emxexp $libobjs | $SED /"_DLL_InitTerm"/d >> $output_objdir/$libname.def~ $CC -Zdll -Zcrtdll -o $output_objdir/$soname $libobjs $deplibs $compiler_flags $output_objdir/$libname.def~ emximp -o $lib $output_objdir/$libname.def' _LT_TAGVAR(archive_expsym_cmds, $1)='$ECHO "LIBRARY ${soname%$shared_ext} INITINSTANCE TERMINSTANCE" > $output_objdir/$libname.def~ $ECHO "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~ $ECHO "DATA MULTIPLE NONSHARED" >> $output_objdir/$libname.def~ $ECHO EXPORTS >> $output_objdir/$libname.def~ prefix_cmds="$SED"~ if test EXPORTS = "`$SED 1q $export_symbols`"; then prefix_cmds="$prefix_cmds -e 1d"; fi~ prefix_cmds="$prefix_cmds -e \"s/^\(.*\)$/_\1/g\""~ cat $export_symbols | $prefix_cmds >> $output_objdir/$libname.def~ $CC -Zdll -Zcrtdll -o $output_objdir/$soname $libobjs $deplibs $compiler_flags $output_objdir/$libname.def~ emximp -o $lib $output_objdir/$libname.def' _LT_TAGVAR(old_archive_From_new_cmds, $1)='emximp -o $output_objdir/${libname}_dll.a $output_objdir/$libname.def' _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes ;; osf3*) if test yes = "$GCC"; then _LT_TAGVAR(allow_undefined_flag, $1)=' $wl-expect_unresolved $wl\*' _LT_TAGVAR(archive_cmds, $1)='$CC -shared$allow_undefined_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib' else _LT_TAGVAR(allow_undefined_flag, $1)=' -expect_unresolved \*' _LT_TAGVAR(archive_cmds, $1)='$CC -shared$allow_undefined_flag $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` 
-update_registry $output_objdir/so_locations -o $lib' fi _LT_TAGVAR(archive_cmds_need_lc, $1)='no' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir' _LT_TAGVAR(hardcode_libdir_separator, $1)=: ;; osf4* | osf5*) # as osf3* with the addition of -msym flag if test yes = "$GCC"; then _LT_TAGVAR(allow_undefined_flag, $1)=' $wl-expect_unresolved $wl\*' _LT_TAGVAR(archive_cmds, $1)='$CC -shared$allow_undefined_flag $pic_flag $libobjs $deplibs $compiler_flags $wl-msym $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir' else _LT_TAGVAR(allow_undefined_flag, $1)=' -expect_unresolved \*' _LT_TAGVAR(archive_cmds, $1)='$CC -shared$allow_undefined_flag $libobjs $deplibs $compiler_flags -msym -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='for i in `cat $export_symbols`; do printf "%s %s\\n" -exported_symbol "\$i" >> $lib.exp; done; printf "%s\\n" "-hidden">> $lib.exp~ $CC -shared$allow_undefined_flag $wl-input $wl$lib.exp $compiler_flags $libobjs $deplibs -soname $soname `test -n "$verstring" && $ECHO "-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib~$RM $lib.exp' # Both c and cxx compiler support -rpath directly _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-rpath $libdir' fi _LT_TAGVAR(archive_cmds_need_lc, $1)='no' _LT_TAGVAR(hardcode_libdir_separator, $1)=: ;; solaris*) _LT_TAGVAR(no_undefined_flag, $1)=' -z defs' if test yes = "$GCC"; then wlarc='$wl' _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $wl-z ${wl}text $wl-h $wl$soname -o $lib $libobjs $deplibs $compiler_flags' _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~ $CC -shared 
$pic_flag $wl-z ${wl}text $wl-M $wl$lib.exp $wl-h $wl$soname -o $lib $libobjs $deplibs $compiler_flags~$RM $lib.exp' else case `$CC -V 2>&1` in *"Compilers 5.0"*) wlarc='' _LT_TAGVAR(archive_cmds, $1)='$LD -G$allow_undefined_flag -h $soname -o $lib $libobjs $deplibs $linker_flags' _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~ $LD -G$allow_undefined_flag -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$RM $lib.exp' ;; *) wlarc='$wl' _LT_TAGVAR(archive_cmds, $1)='$CC -G$allow_undefined_flag -h $soname -o $lib $libobjs $deplibs $compiler_flags' _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~ $CC -G$allow_undefined_flag -M $lib.exp -h $soname -o $lib $libobjs $deplibs $compiler_flags~$RM $lib.exp' ;; esac fi _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir' _LT_TAGVAR(hardcode_shlibpath_var, $1)=no case $host_os in solaris2.[[0-5]] | solaris2.[[0-5]].*) ;; *) # The compiler driver will combine and reorder linker options, # but understands '-z linker_flag'. GCC discards it without '$wl', # but is careful enough not to reorder. # Supported since Solaris 2.6 (maybe 2.5.1?) if test yes = "$GCC"; then _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl-z ${wl}allextract$convenience $wl-z ${wl}defaultextract' else _LT_TAGVAR(whole_archive_flag_spec, $1)='-z allextract$convenience -z defaultextract' fi ;; esac _LT_TAGVAR(link_all_deplibs, $1)=yes ;; sunos4*) if test sequent = "$host_vendor"; then # Use $CC to link under sequent, because it throws in some extra .o # files that make .init and .fini sections work. 
_LT_TAGVAR(archive_cmds, $1)='$CC -G $wl-h $soname -o $lib $libobjs $deplibs $compiler_flags' else _LT_TAGVAR(archive_cmds, $1)='$LD -assert pure-text -Bstatic -o $lib $libobjs $deplibs $linker_flags' fi _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir' _LT_TAGVAR(hardcode_direct, $1)=yes _LT_TAGVAR(hardcode_minus_L, $1)=yes _LT_TAGVAR(hardcode_shlibpath_var, $1)=no ;; sysv4) case $host_vendor in sni) _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags' _LT_TAGVAR(hardcode_direct, $1)=yes # is this really true??? ;; siemens) ## LD is ld it makes a PLAMLIB ## CC just makes a GrossModule. _LT_TAGVAR(archive_cmds, $1)='$LD -G -o $lib $libobjs $deplibs $linker_flags' _LT_TAGVAR(reload_cmds, $1)='$CC -r -o $output$reload_objs' _LT_TAGVAR(hardcode_direct, $1)=no ;; motorola) _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags' _LT_TAGVAR(hardcode_direct, $1)=no #Motorola manual says yes, but my tests say they lie ;; esac runpath_var='LD_RUN_PATH' _LT_TAGVAR(hardcode_shlibpath_var, $1)=no ;; sysv4.3*) _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags' _LT_TAGVAR(hardcode_shlibpath_var, $1)=no _LT_TAGVAR(export_dynamic_flag_spec, $1)='-Bexport' ;; sysv4*MP*) if test -d /usr/nec; then _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags' _LT_TAGVAR(hardcode_shlibpath_var, $1)=no runpath_var=LD_RUN_PATH hardcode_runpath_var=yes _LT_TAGVAR(ld_shlibs, $1)=yes fi ;; sysv4*uw2* | sysv5OpenUNIX* | sysv5UnixWare7.[[01]].[[10]]* | unixware7* | sco3.2v5.0.[[024]]*) _LT_TAGVAR(no_undefined_flag, $1)='$wl-z,text' _LT_TAGVAR(archive_cmds_need_lc, $1)=no _LT_TAGVAR(hardcode_shlibpath_var, $1)=no runpath_var='LD_RUN_PATH' if test yes = "$GCC"; then _LT_TAGVAR(archive_cmds, $1)='$CC -shared $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $wl-Bexport:$export_symbols $wl-h,$soname -o 
$lib $libobjs $deplibs $compiler_flags' else _LT_TAGVAR(archive_cmds, $1)='$CC -G $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G $wl-Bexport:$export_symbols $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags' fi ;; sysv5* | sco3.2v5* | sco5v6*) # Note: We CANNOT use -z defs as we might desire, because we do not # link with -lc, and that would cause any symbols used from libc to # always be unresolved, which means just about no library would # ever link correctly. If we're not using GNU ld we use -z text # though, which does catch some bad symbols but isn't as heavy-handed # as -z defs. _LT_TAGVAR(no_undefined_flag, $1)='$wl-z,text' _LT_TAGVAR(allow_undefined_flag, $1)='$wl-z,nodefs' _LT_TAGVAR(archive_cmds_need_lc, $1)=no _LT_TAGVAR(hardcode_shlibpath_var, $1)=no _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-R,$libdir' _LT_TAGVAR(hardcode_libdir_separator, $1)=':' _LT_TAGVAR(link_all_deplibs, $1)=yes _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-Bexport' runpath_var='LD_RUN_PATH' if test yes = "$GCC"; then _LT_TAGVAR(archive_cmds, $1)='$CC -shared $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $wl-Bexport:$export_symbols $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags' else _LT_TAGVAR(archive_cmds, $1)='$CC -G $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G $wl-Bexport:$export_symbols $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags' fi ;; uts4*) _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir' _LT_TAGVAR(hardcode_shlibpath_var, $1)=no ;; *) _LT_TAGVAR(ld_shlibs, $1)=no ;; esac if test sni = "$host_vendor"; then case $host in sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*) _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-Blargedynsym' ;; esac fi fi ]) AC_MSG_RESULT([$_LT_TAGVAR(ld_shlibs, $1)]) test 
no = "$_LT_TAGVAR(ld_shlibs, $1)" && can_build_shared=no _LT_TAGVAR(with_gnu_ld, $1)=$with_gnu_ld _LT_DECL([], [libext], [0], [Old archive suffix (normally "a")])dnl _LT_DECL([], [shrext_cmds], [1], [Shared library suffix (normally ".so")])dnl _LT_DECL([], [extract_expsyms_cmds], [2], [The commands to extract the exported symbol list from a shared archive]) # # Do we need to explicitly link libc? # case "x$_LT_TAGVAR(archive_cmds_need_lc, $1)" in x|xyes) # Assume -lc should be added _LT_TAGVAR(archive_cmds_need_lc, $1)=yes if test yes,yes = "$GCC,$enable_shared"; then case $_LT_TAGVAR(archive_cmds, $1) in *'~'*) # FIXME: we may have to deal with multi-command sequences. ;; '$CC '*) # Test whether the compiler implicitly links with -lc since on some # systems, -lgcc has to come before -lc. If gcc already passes -lc # to ld, don't add -lc before -lgcc. AC_CACHE_CHECK([whether -lc should be explicitly linked in], [lt_cv_]_LT_TAGVAR(archive_cmds_need_lc, $1), [$RM conftest* echo "$lt_simple_compile_test_code" > conftest.$ac_ext if AC_TRY_EVAL(ac_compile) 2>conftest.err; then soname=conftest lib=conftest libobjs=conftest.$ac_objext deplibs= wl=$_LT_TAGVAR(lt_prog_compiler_wl, $1) pic_flag=$_LT_TAGVAR(lt_prog_compiler_pic, $1) compiler_flags=-v linker_flags=-v verstring= output_objdir=. 
libname=conftest lt_save_allow_undefined_flag=$_LT_TAGVAR(allow_undefined_flag, $1) _LT_TAGVAR(allow_undefined_flag, $1)= if AC_TRY_EVAL(_LT_TAGVAR(archive_cmds, $1) 2\>\&1 \| $GREP \" -lc \" \>/dev/null 2\>\&1) then lt_cv_[]_LT_TAGVAR(archive_cmds_need_lc, $1)=no else lt_cv_[]_LT_TAGVAR(archive_cmds_need_lc, $1)=yes fi _LT_TAGVAR(allow_undefined_flag, $1)=$lt_save_allow_undefined_flag else cat conftest.err 1>&5 fi $RM conftest* ]) _LT_TAGVAR(archive_cmds_need_lc, $1)=$lt_cv_[]_LT_TAGVAR(archive_cmds_need_lc, $1) ;; esac fi ;; esac _LT_TAGDECL([build_libtool_need_lc], [archive_cmds_need_lc], [0], [Whether or not to add -lc for building shared libraries]) _LT_TAGDECL([allow_libtool_libs_with_static_runtimes], [enable_shared_with_static_runtimes], [0], [Whether or not to disallow shared libs when runtime libs are static]) _LT_TAGDECL([], [export_dynamic_flag_spec], [1], [Compiler flag to allow reflexive dlopens]) _LT_TAGDECL([], [whole_archive_flag_spec], [1], [Compiler flag to generate shared objects directly from archives]) _LT_TAGDECL([], [compiler_needs_object], [1], [Whether the compiler copes with passing no objects directly]) _LT_TAGDECL([], [old_archive_from_new_cmds], [2], [Create an old-style archive from a shared archive]) _LT_TAGDECL([], [old_archive_from_expsyms_cmds], [2], [Create a temporary old-style archive to link instead of a shared archive]) _LT_TAGDECL([], [archive_cmds], [2], [Commands used to build a shared archive]) _LT_TAGDECL([], [archive_expsym_cmds], [2]) _LT_TAGDECL([], [module_cmds], [2], [Commands used to build a loadable module if different from building a shared archive.]) _LT_TAGDECL([], [module_expsym_cmds], [2]) _LT_TAGDECL([], [with_gnu_ld], [1], [Whether we are building with GNU ld or not]) _LT_TAGDECL([], [allow_undefined_flag], [1], [Flag that allows shared libraries with undefined symbols to be built]) _LT_TAGDECL([], [no_undefined_flag], [1], [Flag that enforces no undefined symbols]) _LT_TAGDECL([], 
[hardcode_libdir_flag_spec], [1], [Flag to hardcode $libdir into a binary during linking. This must work even if $libdir does not exist]) _LT_TAGDECL([], [hardcode_libdir_separator], [1], [Whether we need a single "-rpath" flag with a separated argument]) _LT_TAGDECL([], [hardcode_direct], [0], [Set to "yes" if using DIR/libNAME$shared_ext during linking hardcodes DIR into the resulting binary]) _LT_TAGDECL([], [hardcode_direct_absolute], [0], [Set to "yes" if using DIR/libNAME$shared_ext during linking hardcodes DIR into the resulting binary and the resulting library dependency is "absolute", i.e impossible to change by setting $shlibpath_var if the library is relocated]) _LT_TAGDECL([], [hardcode_minus_L], [0], [Set to "yes" if using the -LDIR flag during linking hardcodes DIR into the resulting binary]) _LT_TAGDECL([], [hardcode_shlibpath_var], [0], [Set to "yes" if using SHLIBPATH_VAR=DIR during linking hardcodes DIR into the resulting binary]) _LT_TAGDECL([], [hardcode_automatic], [0], [Set to "yes" if building a shared library automatically hardcodes DIR into the library and all subsequent libraries and executables linked against it]) _LT_TAGDECL([], [inherit_rpath], [0], [Set to yes if linker adds runtime paths of dependent libraries to runtime path list]) _LT_TAGDECL([], [link_all_deplibs], [0], [Whether libtool must link a program against all its dependency libraries]) _LT_TAGDECL([], [always_export_symbols], [0], [Set to "yes" if exported symbols are required]) _LT_TAGDECL([], [export_symbols_cmds], [2], [The commands to list exported symbols]) _LT_TAGDECL([], [exclude_expsyms], [1], [Symbols that should not be listed in the preloaded symbols]) _LT_TAGDECL([], [include_expsyms], [1], [Symbols that must always be exported]) _LT_TAGDECL([], [prelink_cmds], [2], [Commands necessary for linking programs (against libraries) with templates]) _LT_TAGDECL([], [postlink_cmds], [2], [Commands necessary for finishing linking programs]) _LT_TAGDECL([], 
[file_list_spec], [1],
    [Specify filename containing input files])
dnl FIXME: Not yet implemented
dnl _LT_TAGDECL([], [thread_safe_flag_spec], [1],
dnl     [Compiler flag to generate thread safe objects])
])# _LT_LINKER_SHLIBS


# _LT_LANG_C_CONFIG([TAG])
# ------------------------
# Ensure that the configuration variables for a C compiler are suitably
# defined.  These variables are subsequently used by _LT_CONFIG to write
# the compiler configuration to 'libtool'.
#
# TAG is passed unchanged to each per-tag helper macro invoked below.
# The value of CC found on entry is saved and restored on exit.  The
# compiler/linker probes and the final _LT_CONFIG call run only when the
# shell variable 'compiler' is non-empty.  Along the way enable_shared is
# forced to 'no' when can_build_shared is 'no', and enable_static is
# forced to 'yes' whenever enable_shared is not 'yes'.
m4_defun([_LT_LANG_C_CONFIG],
[m4_require([_LT_DECL_EGREP])dnl
# Keep the incoming CC so it can be put back at the end of this macro.
lt_save_CC=$CC
# Select C for the checks below.  The matching AC_LANG_POP names the
# language explicitly so that Autoconf can detect an unbalanced stack.
AC_LANG_PUSH([C])

# Source file extension for C test sources.
ac_ext=c

# Object file extension for compiled C test sources.
objext=o
_LT_TAGVAR(objext, $1)=$objext

# Code to be used in simple compile tests
lt_simple_compile_test_code="int some_variable = 0;"

# Code to be used in simple link tests
lt_simple_link_test_code='int main(){return(0);}'

_LT_TAG_COMPILER
# Save the default compiler, since it gets overwritten when the other
# tags are being tested, and _LT_TAGVAR(compiler, []) is a NOP.
compiler_DEFAULT=$CC

# save warnings/boilerplate of simple test code
_LT_COMPILER_BOILERPLATE
_LT_LINKER_BOILERPLATE

## CAVEAT EMPTOR:
## There is no encapsulation within the following macros, do not change
## the running order or otherwise move them around unless you know exactly
## what you are doing...
if test -n "$compiler"; then
  _LT_COMPILER_NO_RTTI($1)
  _LT_COMPILER_PIC($1)
  _LT_COMPILER_C_O($1)
  _LT_COMPILER_FILE_LOCKS($1)
  _LT_LINKER_SHLIBS($1)
  _LT_SYS_DYNAMIC_LINKER($1)
  _LT_LINKER_HARDCODE_LIBPATH($1)
  LT_SYS_DLOPEN_SELF
  _LT_CMD_STRIPLIB

  # Report what library types will actually be built
  AC_MSG_CHECKING([if libtool supports shared libraries])
  AC_MSG_RESULT([$can_build_shared])

  AC_MSG_CHECKING([whether to build shared libraries])
  # A toolchain that cannot build shared libraries overrides the request.
  test no = "$can_build_shared" && enable_shared=no

  # On AIX, shared libraries and static libraries use the same namespace, and
  # are all built from PIC.
  case $host_os in
  aix3*)
    test yes = "$enable_shared" && enable_static=no
    # When RANLIB is set, run it on the library after archiving and
    # again after installation.
    if test -n "$RANLIB"; then
      archive_cmds="$archive_cmds~\$RANLIB \$lib"
      postinstall_cmds='$RANLIB $lib'
    fi
    ;;

  aix[[4-9]]*)
    if test ia64 != "$host_cpu"; then
      case $enable_shared,$with_aix_soname,$aix_use_runtimelinking in
      yes,aix,yes) ;;            # shared object as lib.so file only
      yes,svr4,*) ;;             # shared object as lib.so archive member only
      yes,*) enable_static=no ;; # shared object in lib.a archive as well
      esac
    fi
    ;;
  esac
  AC_MSG_RESULT([$enable_shared])

  AC_MSG_CHECKING([whether to build static libraries])
  # Make sure either enable_shared or enable_static is yes.
  test yes = "$enable_shared" || enable_static=yes
  AC_MSG_RESULT([$enable_static])

  _LT_CONFIG($1)
fi
AC_LANG_POP([C])
# Put back the CC value that was saved on entry.
CC=$lt_save_CC
])# _LT_LANG_C_CONFIG


# _LT_LANG_CXX_CONFIG([TAG])
# --------------------------
# Ensure that the configuration variables for a C++ compiler are suitably
# defined.  These variables are subsequently used by _LT_CONFIG to write
# the compiler configuration to 'libtool'.
m4_defun([_LT_LANG_CXX_CONFIG], [m4_require([_LT_FILEUTILS_DEFAULTS])dnl m4_require([_LT_DECL_EGREP])dnl m4_require([_LT_PATH_MANIFEST_TOOL])dnl if test -n "$CXX" && ( test no != "$CXX" && ( (test g++ = "$CXX" && `g++ -v >/dev/null 2>&1` ) || (test g++ != "$CXX"))); then AC_PROG_CXXCPP else _lt_caught_CXX_error=yes fi AC_LANG_PUSH(C++) _LT_TAGVAR(archive_cmds_need_lc, $1)=no _LT_TAGVAR(allow_undefined_flag, $1)= _LT_TAGVAR(always_export_symbols, $1)=no _LT_TAGVAR(archive_expsym_cmds, $1)= _LT_TAGVAR(compiler_needs_object, $1)=no _LT_TAGVAR(export_dynamic_flag_spec, $1)= _LT_TAGVAR(hardcode_direct, $1)=no _LT_TAGVAR(hardcode_direct_absolute, $1)=no _LT_TAGVAR(hardcode_libdir_flag_spec, $1)= _LT_TAGVAR(hardcode_libdir_separator, $1)= _LT_TAGVAR(hardcode_minus_L, $1)=no _LT_TAGVAR(hardcode_shlibpath_var, $1)=unsupported _LT_TAGVAR(hardcode_automatic, $1)=no _LT_TAGVAR(inherit_rpath, $1)=no _LT_TAGVAR(module_cmds, $1)= _LT_TAGVAR(module_expsym_cmds, $1)= _LT_TAGVAR(link_all_deplibs, $1)=unknown _LT_TAGVAR(old_archive_cmds, $1)=$old_archive_cmds _LT_TAGVAR(reload_flag, $1)=$reload_flag _LT_TAGVAR(reload_cmds, $1)=$reload_cmds _LT_TAGVAR(no_undefined_flag, $1)= _LT_TAGVAR(whole_archive_flag_spec, $1)= _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=no # Source file extension for C++ test sources. ac_ext=cpp # Object file extension for compiled C++ test sources. objext=o _LT_TAGVAR(objext, $1)=$objext # No sense in running all these tests if we already determined that # the CXX compiler isn't working. Some variables (like enable_shared) # are currently assumed to apply to all compilers on this platform, # and will be corrupted by setting them based on a non-working compiler. 
if test yes != "$_lt_caught_CXX_error"; then # Code to be used in simple compile tests lt_simple_compile_test_code="int some_variable = 0;" # Code to be used in simple link tests lt_simple_link_test_code='int main(int, char *[[]]) { return(0); }' # ltmain only uses $CC for tagged configurations so make sure $CC is set. _LT_TAG_COMPILER # save warnings/boilerplate of simple test code _LT_COMPILER_BOILERPLATE _LT_LINKER_BOILERPLATE # Allow CC to be a program name with arguments. lt_save_CC=$CC lt_save_CFLAGS=$CFLAGS lt_save_LD=$LD lt_save_GCC=$GCC GCC=$GXX lt_save_with_gnu_ld=$with_gnu_ld lt_save_path_LD=$lt_cv_path_LD if test -n "${lt_cv_prog_gnu_ldcxx+set}"; then lt_cv_prog_gnu_ld=$lt_cv_prog_gnu_ldcxx else $as_unset lt_cv_prog_gnu_ld fi if test -n "${lt_cv_path_LDCXX+set}"; then lt_cv_path_LD=$lt_cv_path_LDCXX else $as_unset lt_cv_path_LD fi test -z "${LDCXX+set}" || LD=$LDCXX CC=${CXX-"c++"} CFLAGS=$CXXFLAGS compiler=$CC _LT_TAGVAR(compiler, $1)=$CC _LT_CC_BASENAME([$compiler]) if test -n "$compiler"; then # We don't want -fno-exception when compiling C++ code, so set the # no_builtin_flag separately if test yes = "$GXX"; then _LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)=' -fno-builtin' else _LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)= fi if test yes = "$GXX"; then # Set up default GNU C++ configuration LT_PATH_LD # Check if GNU C++ uses GNU ld as the underlying linker, since the # archiving commands below assume that GNU ld is being used. 
if test yes = "$with_gnu_ld"; then _LT_TAGVAR(archive_cmds, $1)='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir' _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl--export-dynamic' # If archive_cmds runs LD, not CC, wlarc should be empty # XXX I think wlarc can be eliminated in ltcf-cxx, but I need to # investigate it a little bit more. (MM) wlarc='$wl' # ancient GNU ld didn't support --whole-archive et. al. if eval "`$CC -print-prog-name=ld` --help 2>&1" | $GREP 'no-whole-archive' > /dev/null; then _LT_TAGVAR(whole_archive_flag_spec, $1)=$wlarc'--whole-archive$convenience '$wlarc'--no-whole-archive' else _LT_TAGVAR(whole_archive_flag_spec, $1)= fi else with_gnu_ld=no wlarc= # A generic and very simple default shared library creation # command for GNU C++ for the case where it uses the native # linker, instead of GNU ld. If possible, this setting should # overridden to take advantage of the native linker features on # the platform it is being used on. _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $lib' fi # Commands to make compiler produce verbose output that lists # what "hidden" libraries, object files and flags are used when # linking a shared library. 
output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"' else GXX=no with_gnu_ld=no wlarc= fi # PORTME: fill in a description of your system's C++ link characteristics AC_MSG_CHECKING([whether the $compiler linker ($LD) supports shared libraries]) _LT_TAGVAR(ld_shlibs, $1)=yes case $host_os in aix3*) # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; aix[[4-9]]*) if test ia64 = "$host_cpu"; then # On IA64, the linker does run time linking by default, so we don't # have to do anything special. aix_use_runtimelinking=no exp_sym_flag='-Bexport' no_entry_flag= else aix_use_runtimelinking=no # Test if we are trying to use run time linking or normal # AIX style linking. If -brtl is somewhere in LDFLAGS, we # have runtime linking enabled, and use it for executables. # For shared libraries, we enable/disable runtime linking # depending on the kind of the shared library created - # when "with_aix_soname,aix_use_runtimelinking" is: # "aix,no" lib.a(lib.so.V) shared, rtl:no, for executables # "aix,yes" lib.so shared, rtl:yes, for executables # lib.a static archive # "both,no" lib.so.V(shr.o) shared, rtl:yes # lib.a(lib.so.V) shared, rtl:no, for executables # "both,yes" lib.so.V(shr.o) shared, rtl:yes, for executables # lib.a(lib.so.V) shared, rtl:no # "svr4,*" lib.so.V(shr.o) shared, rtl:yes, for executables # lib.a static archive case $host_os in aix4.[[23]]|aix4.[[23]].*|aix[[5-9]]*) for ld_flag in $LDFLAGS; do case $ld_flag in *-brtl*) aix_use_runtimelinking=yes break ;; esac done if test svr4,no = "$with_aix_soname,$aix_use_runtimelinking"; then # With aix-soname=svr4, we create the lib.so.V shared archives only, # so we don't have lib.a shared libs to link our executables. # We have to force runtime linking in this case. 
aix_use_runtimelinking=yes LDFLAGS="$LDFLAGS -Wl,-brtl" fi ;; esac exp_sym_flag='-bexport' no_entry_flag='-bnoentry' fi # When large executables or shared objects are built, AIX ld can # have problems creating the table of contents. If linking a library # or program results in "error TOC overflow" add -mminimal-toc to # CXXFLAGS/CFLAGS for g++/gcc. In the cases where that is not # enough to fix the problem, add -Wl,-bbigtoc to LDFLAGS. _LT_TAGVAR(archive_cmds, $1)='' _LT_TAGVAR(hardcode_direct, $1)=yes _LT_TAGVAR(hardcode_direct_absolute, $1)=yes _LT_TAGVAR(hardcode_libdir_separator, $1)=':' _LT_TAGVAR(link_all_deplibs, $1)=yes _LT_TAGVAR(file_list_spec, $1)='$wl-f,' case $with_aix_soname,$aix_use_runtimelinking in aix,*) ;; # no import file svr4,* | *,yes) # use import file # The Import File defines what to hardcode. _LT_TAGVAR(hardcode_direct, $1)=no _LT_TAGVAR(hardcode_direct_absolute, $1)=no ;; esac if test yes = "$GXX"; then case $host_os in aix4.[[012]]|aix4.[[012]].*) # We only want to do this on AIX 4.2 and lower, the check # below for broken collect2 doesn't work under 4.3+ collect2name=`$CC -print-prog-name=collect2` if test -f "$collect2name" && strings "$collect2name" | $GREP resolve_lib_name >/dev/null then # We have reworked collect2 : else # We have old collect2 _LT_TAGVAR(hardcode_direct, $1)=unsupported # It fails to find uninstalled libraries when the uninstalled # path is not listed in the libpath. Setting hardcode_minus_L # to unsupported forces relinking _LT_TAGVAR(hardcode_minus_L, $1)=yes _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir' _LT_TAGVAR(hardcode_libdir_separator, $1)= fi esac shared_flag='-shared' if test yes = "$aix_use_runtimelinking"; then shared_flag=$shared_flag' $wl-G' fi # Need to ensure runtime linking is disabled for the traditional # shared library, or the linker may eventually find shared libraries # /with/ Import File - we do not want to mix them. 
shared_flag_aix='-shared' shared_flag_svr4='-shared $wl-G' else # not using gcc if test ia64 = "$host_cpu"; then # VisualAge C++, Version 5.5 for AIX 5L for IA-64, Beta 3 Release # chokes on -Wl,-G. The following line is correct: shared_flag='-G' else if test yes = "$aix_use_runtimelinking"; then shared_flag='$wl-G' else shared_flag='$wl-bM:SRE' fi shared_flag_aix='$wl-bM:SRE' shared_flag_svr4='$wl-G' fi fi _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-bexpall' # It seems that -bexpall does not export symbols beginning with # underscore (_), so it is better to generate a list of symbols to # export. _LT_TAGVAR(always_export_symbols, $1)=yes if test aix,yes = "$with_aix_soname,$aix_use_runtimelinking"; then # Warning - without using the other runtime loading flags (-brtl), # -berok will link without error, but may produce a broken library. # The "-G" linker flag allows undefined symbols. _LT_TAGVAR(no_undefined_flag, $1)='-bernotok' # Determine the default libpath from the value encoded in an empty # executable. _LT_SYS_MODULE_PATH_AIX([$1]) _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-blibpath:$libdir:'"$aix_libpath" _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $deplibs $wl'$no_entry_flag' $compiler_flags `if test -n "$allow_undefined_flag"; then func_echo_all "$wl$allow_undefined_flag"; else :; fi` $wl'$exp_sym_flag:\$export_symbols' '$shared_flag else if test ia64 = "$host_cpu"; then _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-R $libdir:/usr/lib:/lib' _LT_TAGVAR(allow_undefined_flag, $1)="-z nodefs" _LT_TAGVAR(archive_expsym_cmds, $1)="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs '"\$wl$no_entry_flag"' $compiler_flags $wl$allow_undefined_flag '"\$wl$exp_sym_flag:\$export_symbols" else # Determine the default libpath from the value encoded in an # empty executable. 
_LT_SYS_MODULE_PATH_AIX([$1]) _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-blibpath:$libdir:'"$aix_libpath" # Warning - without using the other run time loading flags, # -berok will link without error, but may produce a broken library. _LT_TAGVAR(no_undefined_flag, $1)=' $wl-bernotok' _LT_TAGVAR(allow_undefined_flag, $1)=' $wl-berok' if test yes = "$with_gnu_ld"; then # We only use this code for GNU lds that support --whole-archive. _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive$convenience $wl--no-whole-archive' else # Exported symbols can be pulled into shared objects from archives _LT_TAGVAR(whole_archive_flag_spec, $1)='$convenience' fi _LT_TAGVAR(archive_cmds_need_lc, $1)=yes _LT_TAGVAR(archive_expsym_cmds, $1)='$RM -r $output_objdir/$realname.d~$MKDIR $output_objdir/$realname.d' # -brtl affects multiple linker settings, -berok does not and is overridden later compiler_flags_filtered='`func_echo_all "$compiler_flags " | $SED -e "s%-brtl\\([[, ]]\\)%-berok\\1%g"`' if test svr4 != "$with_aix_soname"; then # This is similar to how AIX traditionally builds its shared # libraries. Need -bnortl late, we may have -brtl in LDFLAGS. _LT_TAGVAR(archive_expsym_cmds, $1)="$_LT_TAGVAR(archive_expsym_cmds, $1)"'~$CC '$shared_flag_aix' -o $output_objdir/$realname.d/$soname $libobjs $deplibs $wl-bnoentry '$compiler_flags_filtered'$wl-bE:$export_symbols$allow_undefined_flag~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$realname.d/$soname' fi if test aix != "$with_aix_soname"; then _LT_TAGVAR(archive_expsym_cmds, $1)="$_LT_TAGVAR(archive_expsym_cmds, $1)"'~$CC '$shared_flag_svr4' -o $output_objdir/$realname.d/$shared_archive_member_spec.o $libobjs $deplibs $wl-bnoentry '$compiler_flags_filtered'$wl-bE:$export_symbols$allow_undefined_flag~$STRIP -e $output_objdir/$realname.d/$shared_archive_member_spec.o~( func_echo_all "#! 
$soname($shared_archive_member_spec.o)"; if test shr_64 = "$shared_archive_member_spec"; then func_echo_all "# 64"; else func_echo_all "# 32"; fi; cat $export_symbols ) > $output_objdir/$realname.d/$shared_archive_member_spec.imp~$AR $AR_FLAGS $output_objdir/$soname $output_objdir/$realname.d/$shared_archive_member_spec.o $output_objdir/$realname.d/$shared_archive_member_spec.imp' else # used by -dlpreopen to get the symbols _LT_TAGVAR(archive_expsym_cmds, $1)="$_LT_TAGVAR(archive_expsym_cmds, $1)"'~$MV $output_objdir/$realname.d/$soname $output_objdir' fi _LT_TAGVAR(archive_expsym_cmds, $1)="$_LT_TAGVAR(archive_expsym_cmds, $1)"'~$RM -r $output_objdir/$realname.d' fi fi ;; beos*) if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then _LT_TAGVAR(allow_undefined_flag, $1)=unsupported # Joseph Beckenbach says some releases of gcc # support --undefined. This deserves some investigation. FIXME _LT_TAGVAR(archive_cmds, $1)='$CC -nostart $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib' else _LT_TAGVAR(ld_shlibs, $1)=no fi ;; chorus*) case $cc_basename in *) # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; esac ;; cygwin* | mingw* | pw32* | cegcc*) case $GXX,$cc_basename in ,cl* | no,cl*) # Native MSVC # hardcode_libdir_flag_spec is actually meaningless, as there is # no search path for DLLs. _LT_TAGVAR(hardcode_libdir_flag_spec, $1)=' ' _LT_TAGVAR(allow_undefined_flag, $1)=unsupported _LT_TAGVAR(always_export_symbols, $1)=yes _LT_TAGVAR(file_list_spec, $1)='@' # Tell ltmain to make .lib files, not .a files. libext=lib # Tell ltmain to make .dll files, not .so files. shrext_cmds=.dll # FIXME: Setting linknames here is a bad hack. 
_LT_TAGVAR(archive_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $compiler_flags $deplibs -Wl,-DLL,-IMPLIB:"$tool_output_objdir$libname.dll.lib"~linknames=' _LT_TAGVAR(archive_expsym_cmds, $1)='if _LT_DLL_DEF_P([$export_symbols]); then cp "$export_symbols" "$output_objdir/$soname.def"; echo "$tool_output_objdir$soname.def" > "$output_objdir/$soname.exp"; else $SED -e '\''s/^/-link -EXPORT:/'\'' < $export_symbols > $output_objdir/$soname.exp; fi~ $CC -o $tool_output_objdir$soname $libobjs $compiler_flags $deplibs "@$tool_output_objdir$soname.exp" -Wl,-DLL,-IMPLIB:"$tool_output_objdir$libname.dll.lib"~ linknames=' # The linker will not automatically build a static lib if we build a DLL. # _LT_TAGVAR(old_archive_from_new_cmds, $1)='true' _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes # Don't use ranlib _LT_TAGVAR(old_postinstall_cmds, $1)='chmod 644 $oldlib' _LT_TAGVAR(postlink_cmds, $1)='lt_outputfile="@OUTPUT@"~ lt_tool_outputfile="@TOOL_OUTPUT@"~ case $lt_outputfile in *.exe|*.EXE) ;; *) lt_outputfile=$lt_outputfile.exe lt_tool_outputfile=$lt_tool_outputfile.exe ;; esac~ func_to_tool_file "$lt_outputfile"~ if test : != "$MANIFEST_TOOL" && test -f "$lt_outputfile.manifest"; then $MANIFEST_TOOL -manifest "$lt_tool_outputfile.manifest" -outputresource:"$lt_tool_outputfile" || exit 1; $RM "$lt_outputfile.manifest"; fi' ;; *) # g++ # _LT_TAGVAR(hardcode_libdir_flag_spec, $1) is actually meaningless, # as there is no search path for DLLs. 
_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir' _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl--export-all-symbols' _LT_TAGVAR(allow_undefined_flag, $1)=unsupported _LT_TAGVAR(always_export_symbols, $1)=no _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes if $LD --help 2>&1 | $GREP 'auto-import' > /dev/null; then _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname $wl--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib' # If the export-symbols file already is a .def file, use it as # is; otherwise, prepend EXPORTS... _LT_TAGVAR(archive_expsym_cmds, $1)='if _LT_DLL_DEF_P([$export_symbols]); then cp $export_symbols $output_objdir/$soname.def; else echo EXPORTS > $output_objdir/$soname.def; cat $export_symbols >> $output_objdir/$soname.def; fi~ $CC -shared -nostdlib $output_objdir/$soname.def $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname $wl--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib' else _LT_TAGVAR(ld_shlibs, $1)=no fi ;; esac ;; darwin* | rhapsody*) _LT_DARWIN_LINKER_FEATURES($1) ;; os2*) _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir' _LT_TAGVAR(hardcode_minus_L, $1)=yes _LT_TAGVAR(allow_undefined_flag, $1)=unsupported shrext_cmds=.dll _LT_TAGVAR(archive_cmds, $1)='$ECHO "LIBRARY ${soname%$shared_ext} INITINSTANCE TERMINSTANCE" > $output_objdir/$libname.def~ $ECHO "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~ $ECHO "DATA MULTIPLE NONSHARED" >> $output_objdir/$libname.def~ $ECHO EXPORTS >> $output_objdir/$libname.def~ emxexp $libobjs | $SED /"_DLL_InitTerm"/d >> $output_objdir/$libname.def~ $CC -Zdll -Zcrtdll -o $output_objdir/$soname $libobjs $deplibs $compiler_flags $output_objdir/$libname.def~ emximp -o $lib $output_objdir/$libname.def' _LT_TAGVAR(archive_expsym_cmds, $1)='$ECHO "LIBRARY ${soname%$shared_ext} INITINSTANCE TERMINSTANCE" > $output_objdir/$libname.def~ $ECHO 
"DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~ $ECHO "DATA MULTIPLE NONSHARED" >> $output_objdir/$libname.def~ $ECHO EXPORTS >> $output_objdir/$libname.def~ prefix_cmds="$SED"~ if test EXPORTS = "`$SED 1q $export_symbols`"; then prefix_cmds="$prefix_cmds -e 1d"; fi~ prefix_cmds="$prefix_cmds -e \"s/^\(.*\)$/_\1/g\""~ cat $export_symbols | $prefix_cmds >> $output_objdir/$libname.def~ $CC -Zdll -Zcrtdll -o $output_objdir/$soname $libobjs $deplibs $compiler_flags $output_objdir/$libname.def~ emximp -o $lib $output_objdir/$libname.def' _LT_TAGVAR(old_archive_From_new_cmds, $1)='emximp -o $output_objdir/${libname}_dll.a $output_objdir/$libname.def' _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes ;; dgux*) case $cc_basename in ec++*) # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; ghcx*) # Green Hills C++ Compiler # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; *) # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; esac ;; freebsd2.*) # C++ shared libraries reported to be fairly broken before # switch to ELF _LT_TAGVAR(ld_shlibs, $1)=no ;; freebsd-elf*) _LT_TAGVAR(archive_cmds_need_lc, $1)=no ;; freebsd* | dragonfly*) # FreeBSD 3 and later use GNU C++ and GNU ld with standard ELF # conventions _LT_TAGVAR(ld_shlibs, $1)=yes ;; haiku*) _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib' _LT_TAGVAR(link_all_deplibs, $1)=yes ;; hpux9*) _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl+b $wl$libdir' _LT_TAGVAR(hardcode_libdir_separator, $1)=: _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-E' _LT_TAGVAR(hardcode_direct, $1)=yes _LT_TAGVAR(hardcode_minus_L, $1)=yes # Not in the search PATH, # but as the default # location of the library. 
case $cc_basename in CC*) # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; aCC*) _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -b $wl+b $wl$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test "x$output_objdir/$soname" = "x$lib" || mv $output_objdir/$soname $lib' # Commands to make compiler produce verbose output that lists # what "hidden" libraries, object files and flags are used when # linking a shared library. # # There doesn't appear to be a way to prevent this compiler from # explicitly linking system object files so we need to strip them # from the output so that they don't get included in the library # dependencies. output_verbose_link_cmd='templist=`($CC -b $CFLAGS -v conftest.$objext 2>&1) | $EGREP "\-L"`; list= ; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"' ;; *) if test yes = "$GXX"; then _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -shared -nostdlib $pic_flag $wl+b $wl$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test "x$output_objdir/$soname" = "x$lib" || mv $output_objdir/$soname $lib' else # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no fi ;; esac ;; hpux10*|hpux11*) if test no = "$with_gnu_ld"; then _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl+b $wl$libdir' _LT_TAGVAR(hardcode_libdir_separator, $1)=: case $host_cpu in hppa*64*|ia64*) ;; *) _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-E' ;; esac fi case $host_cpu in hppa*64*|ia64*) _LT_TAGVAR(hardcode_direct, $1)=no _LT_TAGVAR(hardcode_shlibpath_var, $1)=no ;; *) _LT_TAGVAR(hardcode_direct, $1)=yes _LT_TAGVAR(hardcode_direct_absolute, $1)=yes _LT_TAGVAR(hardcode_minus_L, $1)=yes # Not in the search PATH, # but as the default # location of the library. 
;; esac case $cc_basename in CC*) # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; aCC*) case $host_cpu in hppa*64*) _LT_TAGVAR(archive_cmds, $1)='$CC -b $wl+h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' ;; ia64*) _LT_TAGVAR(archive_cmds, $1)='$CC -b $wl+h $wl$soname $wl+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' ;; *) _LT_TAGVAR(archive_cmds, $1)='$CC -b $wl+h $wl$soname $wl+b $wl$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' ;; esac # Commands to make compiler produce verbose output that lists # what "hidden" libraries, object files and flags are used when # linking a shared library. # # There doesn't appear to be a way to prevent this compiler from # explicitly linking system object files so we need to strip them # from the output so that they don't get included in the library # dependencies. output_verbose_link_cmd='templist=`($CC -b $CFLAGS -v conftest.$objext 2>&1) | $GREP "\-L"`; list= ; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"' ;; *) if test yes = "$GXX"; then if test no = "$with_gnu_ld"; then case $host_cpu in hppa*64*) _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib -fPIC $wl+h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' ;; ia64*) _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $pic_flag $wl+h $wl$soname $wl+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' ;; *) _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $pic_flag $wl+h $wl$soname $wl+b $wl$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' ;; esac fi else # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no fi ;; esac ;; interix[[3-9]]*) _LT_TAGVAR(hardcode_direct, $1)=no 
_LT_TAGVAR(hardcode_shlibpath_var, $1)=no _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath,$libdir' _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-E' # Hack: On Interix 3.x, we cannot compile PIC because of a broken gcc. # Instead, shared libraries are loaded at an image base (0x10000000 by # default) and relocated if they conflict, which is a slow very memory # consuming and fragmenting process. To avoid this, we pick a random, # 256 KiB-aligned image base between 0x50000000 and 0x6FFC0000 at link # time. Moving up from 0x10000000 also allows more sbrk(2) space. _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-h,$soname $wl--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='sed "s|^|_|" $export_symbols >$output_objdir/$soname.expsym~$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-h,$soname $wl--retain-symbols-file,$output_objdir/$soname.expsym $wl--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib' ;; irix5* | irix6*) case $cc_basename in CC*) # SGI C++ _LT_TAGVAR(archive_cmds, $1)='$CC -shared -all -multigot $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib' # Archives containing C++ object files must be created using # "CC -ar", where "CC" is the IRIX C++ compiler. This is # necessary to make sure instantiated templates are included # in the archive. 
_LT_TAGVAR(old_archive_cmds, $1)='$CC -ar -WR,-u -o $oldlib $oldobjs' ;; *) if test yes = "$GXX"; then if test no = "$with_gnu_ld"; then _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib' else _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` -o $lib' fi fi _LT_TAGVAR(link_all_deplibs, $1)=yes ;; esac _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir' _LT_TAGVAR(hardcode_libdir_separator, $1)=: _LT_TAGVAR(inherit_rpath, $1)=yes ;; linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*) case $cc_basename in KCC*) # Kuck and Associates, Inc. (KAI) C++ Compiler # KCC will only create a shared library if the output file # ends with ".so" (or ".sl" for HP-UX), so rename the library # to its proper name (with version) after linking. _LT_TAGVAR(archive_cmds, $1)='tempext=`echo $shared_ext | $SED -e '\''s/\([[^()0-9A-Za-z{}]]\)/\\\\\1/g'\''`; templib=`echo $lib | $SED -e "s/\$tempext\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib; mv \$templib $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='tempext=`echo $shared_ext | $SED -e '\''s/\([[^()0-9A-Za-z{}]]\)/\\\\\1/g'\''`; templib=`echo $lib | $SED -e "s/\$tempext\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib $wl-retain-symbols-file,$export_symbols; mv \$templib $lib' # Commands to make compiler produce verbose output that lists # what "hidden" libraries, object files and flags are used when # linking a shared library. 
# # There doesn't appear to be a way to prevent this compiler from # explicitly linking system object files so we need to strip them # from the output so that they don't get included in the library # dependencies. output_verbose_link_cmd='templist=`$CC $CFLAGS -v conftest.$objext -o libconftest$shared_ext 2>&1 | $GREP "ld"`; rm -f libconftest$shared_ext; list= ; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath,$libdir' _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl--export-dynamic' # Archives containing C++ object files must be created using # "CC -Bstatic", where "CC" is the KAI C++ compiler. _LT_TAGVAR(old_archive_cmds, $1)='$CC -Bstatic -o $oldlib $oldobjs' ;; icpc* | ecpc* ) # Intel C++ with_gnu_ld=yes # version 8.0 and above of icpc choke on multiply defined symbols # if we add $predep_objects and $postdep_objects, however 7.1 and # earlier do not add the objects themselves. 
case `$CC -V 2>&1` in *"Version 7."*) _LT_TAGVAR(archive_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib' ;; *) # Version 8.0 or newer tmp_idyn= case $host_cpu in ia64*) tmp_idyn=' -i_dynamic';; esac _LT_TAGVAR(archive_cmds, $1)='$CC -shared'"$tmp_idyn"' $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared'"$tmp_idyn"' $libobjs $deplibs $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib' ;; esac _LT_TAGVAR(archive_cmds_need_lc, $1)=no _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath,$libdir' _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl--export-dynamic' _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive$convenience $wl--no-whole-archive' ;; pgCC* | pgcpp*) # Portland Group C++ compiler case `$CC -V` in *pgCC\ [[1-5]].* | *pgcpp\ [[1-5]].*) _LT_TAGVAR(prelink_cmds, $1)='tpldir=Template.dir~ rm -rf $tpldir~ $CC --prelink_objects --instantiation_dir $tpldir $objs $libobjs $compile_deplibs~ compile_command="$compile_command `find $tpldir -name \*.o | sort | $NL2SP`"' _LT_TAGVAR(old_archive_cmds, $1)='tpldir=Template.dir~ rm -rf $tpldir~ $CC --prelink_objects --instantiation_dir $tpldir $oldobjs$old_deplibs~ $AR $AR_FLAGS $oldlib$oldobjs$old_deplibs `find $tpldir -name \*.o | sort | $NL2SP`~ $RANLIB $oldlib' _LT_TAGVAR(archive_cmds, $1)='tpldir=Template.dir~ rm -rf $tpldir~ $CC --prelink_objects --instantiation_dir $tpldir $predep_objects $libobjs $deplibs $convenience $postdep_objects~ $CC -shared $pic_flag $predep_objects $libobjs $deplibs `find $tpldir -name \*.o | sort | $NL2SP` $postdep_objects $compiler_flags $wl-soname $wl$soname -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='tpldir=Template.dir~ 
rm -rf $tpldir~ $CC --prelink_objects --instantiation_dir $tpldir $predep_objects $libobjs $deplibs $convenience $postdep_objects~ $CC -shared $pic_flag $predep_objects $libobjs $deplibs `find $tpldir -name \*.o | sort | $NL2SP` $postdep_objects $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib' ;; *) # Version 6 and above use weak symbols _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib' ;; esac _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl--rpath $wl$libdir' _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl--export-dynamic' _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive`for conv in $convenience\"\"; do test -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` $wl--no-whole-archive' ;; cxx*) # Compaq C++ _LT_TAGVAR(archive_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname -o $lib $wl-retain-symbols-file $wl$export_symbols' runpath_var=LD_RUN_PATH _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-rpath $libdir' _LT_TAGVAR(hardcode_libdir_separator, $1)=: # Commands to make compiler produce verbose output that lists # what "hidden" libraries, object files and flags are used when # linking a shared library. # # There doesn't appear to be a way to prevent this compiler from # explicitly linking system object files so we need to strip them # from the output so that they don't get included in the library # dependencies. 
output_verbose_link_cmd='templist=`$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP "ld"`; templist=`func_echo_all "$templist" | $SED "s/\(^.*ld.*\)\( .*ld .*$\)/\1/"`; list= ; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "X$list" | $Xsed' ;; xl* | mpixl* | bgxl*) # IBM XL 8.0 on PPC, with GNU ld _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir' _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl--export-dynamic' _LT_TAGVAR(archive_cmds, $1)='$CC -qmkshrobj $libobjs $deplibs $compiler_flags $wl-soname $wl$soname -o $lib' if test yes = "$supports_anon_versioning"; then _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $output_objdir/$libname.ver~ cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~ echo "local: *; };" >> $output_objdir/$libname.ver~ $CC -qmkshrobj $libobjs $deplibs $compiler_flags $wl-soname $wl$soname $wl-version-script $wl$output_objdir/$libname.ver -o $lib' fi ;; *) case `$CC -V 2>&1 | sed 5q` in *Sun\ C*) # Sun C++ 5.9 _LT_TAGVAR(no_undefined_flag, $1)=' -zdefs' _LT_TAGVAR(archive_cmds, $1)='$CC -G$allow_undefined_flag -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G$allow_undefined_flag -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-retain-symbols-file $wl$export_symbols' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir' _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive`new_convenience=; for conv in $convenience\"\"; do test -z \"$conv\" || new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` $wl--no-whole-archive' _LT_TAGVAR(compiler_needs_object, $1)=yes # Not sure whether something based on # $CC $CFLAGS -v conftest.$objext -o libconftest$shared_ext 2>&1 # would be better. 
output_verbose_link_cmd='func_echo_all' # Archives containing C++ object files must be created using # "CC -xar", where "CC" is the Sun C++ compiler. This is # necessary to make sure instantiated templates are included # in the archive. _LT_TAGVAR(old_archive_cmds, $1)='$CC -xar -o $oldlib $oldobjs' ;; esac ;; esac ;; lynxos*) # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; m88k*) # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; mvs*) case $cc_basename in cxx*) # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; *) # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; esac ;; netbsd*) if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then _LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable -o $lib $predep_objects $libobjs $deplibs $postdep_objects $linker_flags' wlarc= _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir' _LT_TAGVAR(hardcode_direct, $1)=yes _LT_TAGVAR(hardcode_shlibpath_var, $1)=no fi # Workaround some broken pre-1.5 toolchains output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP conftest.$objext | $SED -e "s:-lgcc -lc -lgcc::"' ;; *nto* | *qnx*) _LT_TAGVAR(ld_shlibs, $1)=yes ;; openbsd* | bitrig*) if test -f /usr/libexec/ld.so; then _LT_TAGVAR(hardcode_direct, $1)=yes _LT_TAGVAR(hardcode_shlibpath_var, $1)=no _LT_TAGVAR(hardcode_direct_absolute, $1)=yes _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $lib' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath,$libdir' if test -z "`echo __ELF__ | $CC -E - | grep __ELF__`"; then _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-retain-symbols-file,$export_symbols -o $lib' _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-E' _LT_TAGVAR(whole_archive_flag_spec, $1)=$wlarc'--whole-archive$convenience '$wlarc'--no-whole-archive' fi 
output_verbose_link_cmd=func_echo_all else _LT_TAGVAR(ld_shlibs, $1)=no fi ;; osf3* | osf4* | osf5*) case $cc_basename in KCC*) # Kuck and Associates, Inc. (KAI) C++ Compiler # KCC will only create a shared library if the output file # ends with ".so" (or ".sl" for HP-UX), so rename the library # to its proper name (with version) after linking. _LT_TAGVAR(archive_cmds, $1)='tempext=`echo $shared_ext | $SED -e '\''s/\([[^()0-9A-Za-z{}]]\)/\\\\\1/g'\''`; templib=`echo "$lib" | $SED -e "s/\$tempext\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib; mv \$templib $lib' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath,$libdir' _LT_TAGVAR(hardcode_libdir_separator, $1)=: # Archives containing C++ object files must be created using # the KAI C++ compiler. case $host in osf3*) _LT_TAGVAR(old_archive_cmds, $1)='$CC -Bstatic -o $oldlib $oldobjs' ;; *) _LT_TAGVAR(old_archive_cmds, $1)='$CC -o $oldlib $oldobjs' ;; esac ;; RCC*) # Rational C++ 2.4.1 # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; cxx*) case $host in osf3*) _LT_TAGVAR(allow_undefined_flag, $1)=' $wl-expect_unresolved $wl\*' _LT_TAGVAR(archive_cmds, $1)='$CC -shared$allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $soname `test -n "$verstring" && func_echo_all "$wl-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir' ;; *) _LT_TAGVAR(allow_undefined_flag, $1)=' -expect_unresolved \*' _LT_TAGVAR(archive_cmds, $1)='$CC -shared$allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -msym -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='for i in `cat $export_symbols`; do printf "%s %s\\n" -exported_symbol "\$i" >> $lib.exp; done~ 
echo "-hidden">> $lib.exp~ $CC -shared$allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -msym -soname $soname $wl-input $wl$lib.exp `test -n "$verstring" && $ECHO "-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib~ $RM $lib.exp' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-rpath $libdir' ;; esac _LT_TAGVAR(hardcode_libdir_separator, $1)=: # Commands to make compiler produce verbose output that lists # what "hidden" libraries, object files and flags are used when # linking a shared library. # # There doesn't appear to be a way to prevent this compiler from # explicitly linking system object files so we need to strip them # from the output so that they don't get included in the library # dependencies. output_verbose_link_cmd='templist=`$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP "ld" | $GREP -v "ld:"`; templist=`func_echo_all "$templist" | $SED "s/\(^.*ld.*\)\( .*ld.*$\)/\1/"`; list= ; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"' ;; *) if test yes,no = "$GXX,$with_gnu_ld"; then _LT_TAGVAR(allow_undefined_flag, $1)=' $wl-expect_unresolved $wl\*' case $host in osf3*) _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib' ;; *) _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-msym $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib' ;; esac _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir' _LT_TAGVAR(hardcode_libdir_separator, $1)=: # Commands to make 
compiler produce verbose output that lists # what "hidden" libraries, object files and flags are used when # linking a shared library. output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"' else # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no fi ;; esac ;; psos*) # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; sunos4*) case $cc_basename in CC*) # Sun C++ 4.x # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; lcc*) # Lucid # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; *) # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; esac ;; solaris*) case $cc_basename in CC* | sunCC*) # Sun C++ 4.2, 5.x and Centerline C++ _LT_TAGVAR(archive_cmds_need_lc,$1)=yes _LT_TAGVAR(no_undefined_flag, $1)=' -zdefs' _LT_TAGVAR(archive_cmds, $1)='$CC -G$allow_undefined_flag -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~ $CC -G$allow_undefined_flag $wl-M $wl$lib.exp -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir' _LT_TAGVAR(hardcode_shlibpath_var, $1)=no case $host_os in solaris2.[[0-5]] | solaris2.[[0-5]].*) ;; *) # The compiler driver will combine and reorder linker options, # but understands '-z linker_flag'. # Supported since Solaris 2.6 (maybe 2.5.1?) _LT_TAGVAR(whole_archive_flag_spec, $1)='-z allextract$convenience -z defaultextract' ;; esac _LT_TAGVAR(link_all_deplibs, $1)=yes output_verbose_link_cmd='func_echo_all' # Archives containing C++ object files must be created using # "CC -xar", where "CC" is the Sun C++ compiler. 
This is # necessary to make sure instantiated templates are included # in the archive. _LT_TAGVAR(old_archive_cmds, $1)='$CC -xar -o $oldlib $oldobjs' ;; gcx*) # Green Hills C++ Compiler _LT_TAGVAR(archive_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-h $wl$soname -o $lib' # The C++ compiler must be used to create the archive. _LT_TAGVAR(old_archive_cmds, $1)='$CC $LDFLAGS -archive -o $oldlib $oldobjs' ;; *) # GNU C++ compiler with Solaris linker if test yes,no = "$GXX,$with_gnu_ld"; then _LT_TAGVAR(no_undefined_flag, $1)=' $wl-z ${wl}defs' if $CC --version | $GREP -v '^2\.7' > /dev/null; then _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-h $wl$soname -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~ $CC -shared $pic_flag -nostdlib $wl-M $wl$lib.exp $wl-h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp' # Commands to make compiler produce verbose output that lists # what "hidden" libraries, object files and flags are used when # linking a shared library. output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"' else # g++ 2.7 appears to require '-G' NOT '-shared' on this # platform. 
_LT_TAGVAR(archive_cmds, $1)='$CC -G -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-h $wl$soname -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~ $CC -G -nostdlib $wl-M $wl$lib.exp $wl-h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp' # Commands to make compiler produce verbose output that lists # what "hidden" libraries, object files and flags are used when # linking a shared library. output_verbose_link_cmd='$CC -G $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"' fi _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-R $wl$libdir' case $host_os in solaris2.[[0-5]] | solaris2.[[0-5]].*) ;; *) _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl-z ${wl}allextract$convenience $wl-z ${wl}defaultextract' ;; esac fi ;; esac ;; sysv4*uw2* | sysv5OpenUNIX* | sysv5UnixWare7.[[01]].[[10]]* | unixware7* | sco3.2v5.0.[[024]]*) _LT_TAGVAR(no_undefined_flag, $1)='$wl-z,text' _LT_TAGVAR(archive_cmds_need_lc, $1)=no _LT_TAGVAR(hardcode_shlibpath_var, $1)=no runpath_var='LD_RUN_PATH' case $cc_basename in CC*) _LT_TAGVAR(archive_cmds, $1)='$CC -G $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G $wl-Bexport:$export_symbols $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags' ;; *) _LT_TAGVAR(archive_cmds, $1)='$CC -shared $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $wl-Bexport:$export_symbols $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags' ;; esac ;; sysv5* | sco3.2v5* | sco5v6*) # Note: We CANNOT use -z defs as we might desire, because we do not # link with -lc, and that would cause any symbols used from libc to # always be unresolved, which means just about no library would # ever link correctly. 
If we're not using GNU ld we use -z text # though, which does catch some bad symbols but isn't as heavy-handed # as -z defs. _LT_TAGVAR(no_undefined_flag, $1)='$wl-z,text' _LT_TAGVAR(allow_undefined_flag, $1)='$wl-z,nodefs' _LT_TAGVAR(archive_cmds_need_lc, $1)=no _LT_TAGVAR(hardcode_shlibpath_var, $1)=no _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-R,$libdir' _LT_TAGVAR(hardcode_libdir_separator, $1)=':' _LT_TAGVAR(link_all_deplibs, $1)=yes _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl-Bexport' runpath_var='LD_RUN_PATH' case $cc_basename in CC*) _LT_TAGVAR(archive_cmds, $1)='$CC -G $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G $wl-Bexport:$export_symbols $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags' _LT_TAGVAR(old_archive_cmds, $1)='$CC -Tprelink_objects $oldobjs~ '"$_LT_TAGVAR(old_archive_cmds, $1)" _LT_TAGVAR(reload_cmds, $1)='$CC -Tprelink_objects $reload_objs~ '"$_LT_TAGVAR(reload_cmds, $1)" ;; *) _LT_TAGVAR(archive_cmds, $1)='$CC -shared $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags' _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $wl-Bexport:$export_symbols $wl-h,$soname -o $lib $libobjs $deplibs $compiler_flags' ;; esac ;; tandem*) case $cc_basename in NCC*) # NonStop-UX NCC 3.20 # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; *) # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; esac ;; vxworks*) # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; *) # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no ;; esac AC_MSG_RESULT([$_LT_TAGVAR(ld_shlibs, $1)]) test no = "$_LT_TAGVAR(ld_shlibs, $1)" && can_build_shared=no _LT_TAGVAR(GCC, $1)=$GXX _LT_TAGVAR(LD, $1)=$LD ## CAVEAT EMPTOR: ## There is no encapsulation within the following macros, do not change ## the running order or otherwise move them around unless you know exactly ## what you are doing... 
_LT_SYS_HIDDEN_LIBDEPS($1) _LT_COMPILER_PIC($1) _LT_COMPILER_C_O($1) _LT_COMPILER_FILE_LOCKS($1) _LT_LINKER_SHLIBS($1) _LT_SYS_DYNAMIC_LINKER($1) _LT_LINKER_HARDCODE_LIBPATH($1) _LT_CONFIG($1) fi # test -n "$compiler" CC=$lt_save_CC CFLAGS=$lt_save_CFLAGS LDCXX=$LD LD=$lt_save_LD GCC=$lt_save_GCC with_gnu_ld=$lt_save_with_gnu_ld lt_cv_path_LDCXX=$lt_cv_path_LD lt_cv_path_LD=$lt_save_path_LD lt_cv_prog_gnu_ldcxx=$lt_cv_prog_gnu_ld lt_cv_prog_gnu_ld=$lt_save_with_gnu_ld fi # test yes != "$_lt_caught_CXX_error" AC_LANG_POP ])# _LT_LANG_CXX_CONFIG # _LT_FUNC_STRIPNAME_CNF # ---------------------- # func_stripname_cnf prefix suffix name # strip PREFIX and SUFFIX off of NAME. # PREFIX and SUFFIX must not contain globbing or regex special # characters, hashes, percent signs, but SUFFIX may contain a leading # dot (in which case that matches only a dot). # # This function is identical to the (non-XSI) version of func_stripname, # except this one can be used by m4 code that may be executed by configure, # rather than the libtool script. m4_defun([_LT_FUNC_STRIPNAME_CNF],[dnl AC_REQUIRE([_LT_DECL_SED]) AC_REQUIRE([_LT_PROG_ECHO_BACKSLASH]) func_stripname_cnf () { case @S|@2 in .*) func_stripname_result=`$ECHO "@S|@3" | $SED "s%^@S|@1%%; s%\\\\@S|@2\$%%"`;; *) func_stripname_result=`$ECHO "@S|@3" | $SED "s%^@S|@1%%; s%@S|@2\$%%"`;; esac } # func_stripname_cnf ])# _LT_FUNC_STRIPNAME_CNF # _LT_SYS_HIDDEN_LIBDEPS([TAGNAME]) # --------------------------------- # Figure out "hidden" library dependencies from verbose # compiler output when linking a shared library. # Parse the compiler output and extract the necessary # objects, libraries and library flags. 
m4_defun([_LT_SYS_HIDDEN_LIBDEPS],
[m4_require([_LT_FILEUTILS_DEFAULTS])dnl
AC_REQUIRE([_LT_FUNC_STRIPNAME_CNF])dnl
dnl Overview: compile a tiny per-language library source, run the
dnl command held in output_verbose_link_cmd, and sort the words it prints
dnl into predep_objects, postdep_objects, postdeps and
dnl compiler_lib_search_path for the tag given as the first argument.
# Dependencies to place before and after the object being linked:
_LT_TAGVAR(predep_objects, $1)=
_LT_TAGVAR(postdep_objects, $1)=
_LT_TAGVAR(predeps, $1)=
_LT_TAGVAR(postdeps, $1)=
_LT_TAGVAR(compiler_lib_search_path, $1)=

dnl we can't use the lt_simple_compile_test_code here,
dnl because it contains code intended for an executable,
dnl not a library.  It's possible we should let each
dnl tag define a new lt_????_link_test_code variable,
dnl but it's only used here...
m4_if([$1], [], [cat > conftest.$ac_ext <<_LT_EOF
int a;
void foo (void) { a = 0; }
_LT_EOF
], [$1], [CXX], [cat > conftest.$ac_ext <<_LT_EOF
class Foo
{
public:
  Foo (void) { a = 0; }
private:
  int a;
};
_LT_EOF
], [$1], [F77], [cat > conftest.$ac_ext <<_LT_EOF
      subroutine foo
      implicit none
      integer*4 a
      a=0
      return
      end
_LT_EOF
], [$1], [FC], [cat > conftest.$ac_ext <<_LT_EOF
      subroutine foo
      implicit none
      integer a
      a=0
      return
      end
_LT_EOF
], [$1], [GCJ], [cat > conftest.$ac_ext <<_LT_EOF
public class foo {
  private int a;
  public void bar (void) {
    a = 0;
  }
};
_LT_EOF
], [$1], [GO], [cat > conftest.$ac_ext <<_LT_EOF
package foo
func foo() {
}
_LT_EOF
])

dnl Link-time optimisation options are switched off for this probe only;
dnl the saved CFLAGS value is restored after the probe.
_lt_libdeps_save_CFLAGS=$CFLAGS
case "$CC $CFLAGS " in #(
*\ -flto*\ *) CFLAGS="$CFLAGS -fno-lto" ;;
*\ -fwhopr*\ *) CFLAGS="$CFLAGS -fno-whopr" ;;
*\ -fuse-linker-plugin*\ *) CFLAGS="$CFLAGS -fno-use-linker-plugin" ;;
esac

dnl Parse the compiler output and extract the necessary
dnl objects, libraries and library flags.
if AC_TRY_EVAL(ac_compile); then
  # Parse the compiler output and extract the necessary
  # objects, libraries and library flags.

  # Sentinel used to keep track of whether or not we are before
  # the conftest object file.
  pre_test_object_deps_done=no

  for p in `eval "$output_verbose_link_cmd"`; do
    case $prev$p in

    -L* | -R* | -l*)
       # Some compilers place space between "-{L,R}" and the path.
       # Remove the space.
       # Both sides carry the x prefix: comparing x-L against a bare $p
       # could never match, so a detached -L or -R was never rejoined
       # with the path that follows it.
       if test x-L = "x$p" ||
          test x-R = "x$p"; then
         prev=$p
         continue
       fi

       # Expand the sysroot to ease extracting the directories later.
       if test -z "$prev"; then
         case $p in
         -L*) func_stripname_cnf '-L' '' "$p"; prev=-L; p=$func_stripname_result ;;
         -R*) func_stripname_cnf '-R' '' "$p"; prev=-R; p=$func_stripname_result ;;
         -l*) func_stripname_cnf '-l' '' "$p"; prev=-l; p=$func_stripname_result ;;
         esac
       fi
       case $p in
       =*) func_stripname_cnf '=' '' "$p"; p=$lt_sysroot$func_stripname_result ;;
       esac
       if test no = "$pre_test_object_deps_done"; then
         case $prev in
         -L | -R)
           # Internal compiler library paths should come after those
           # provided the user.  The postdeps already come after the
           # user supplied libs so there is no need to process them.
           if test -z "$_LT_TAGVAR(compiler_lib_search_path, $1)"; then
             _LT_TAGVAR(compiler_lib_search_path, $1)=$prev$p
           else
             _LT_TAGVAR(compiler_lib_search_path, $1)="${_LT_TAGVAR(compiler_lib_search_path, $1)} $prev$p"
           fi
           ;;
         # The "-l" case would never come before the object being
         # linked, so don't bother handling this case.
         esac
       else
         if test -z "$_LT_TAGVAR(postdeps, $1)"; then
           _LT_TAGVAR(postdeps, $1)=$prev$p
         else
           _LT_TAGVAR(postdeps, $1)="${_LT_TAGVAR(postdeps, $1)} $prev$p"
         fi
       fi
       prev=
       ;;

    *.lto.$objext) ;; # Ignore GCC LTO objects
    *.$objext)
       # This assumes that the test object file only shows up
       # once in the compiler output.
       if test "$p" = "conftest.$objext"; then
         pre_test_object_deps_done=yes
         continue
       fi

       if test no = "$pre_test_object_deps_done"; then
         if test -z "$_LT_TAGVAR(predep_objects, $1)"; then
           _LT_TAGVAR(predep_objects, $1)=$p
         else
           _LT_TAGVAR(predep_objects, $1)="$_LT_TAGVAR(predep_objects, $1) $p"
         fi
       else
         if test -z "$_LT_TAGVAR(postdep_objects, $1)"; then
           _LT_TAGVAR(postdep_objects, $1)=$p
         else
           _LT_TAGVAR(postdep_objects, $1)="$_LT_TAGVAR(postdep_objects, $1) $p"
         fi
       fi
       ;;

    *) ;; # Ignore the rest.

    esac
  done

  # Clean up.
  rm -f a.out a.exe
else
  echo "libtool.m4: error: problem compiling $1 test program"
fi

# Remove the probe object (the name was previously misspelled, so the
# file was left behind).
$RM -f conftest.$objext
CFLAGS=$_lt_libdeps_save_CFLAGS

# PORTME: override above test on systems where it is broken
m4_if([$1], [CXX],
[case $host_os in
interix[[3-9]]*)
  # Interix 3.5 installs completely hosed .la files for C++, so rather than
  # hack all around it, let's just trust "g++" to DTRT.
  _LT_TAGVAR(predep_objects,$1)=
  _LT_TAGVAR(postdep_objects,$1)=
  _LT_TAGVAR(postdeps,$1)=
  ;;
esac
])

case " $_LT_TAGVAR(postdeps, $1) " in
*" -lc "*) _LT_TAGVAR(archive_cmds_need_lc, $1)=no ;;
esac
 _LT_TAGVAR(compiler_lib_search_dirs, $1)=
if test -n "${_LT_TAGVAR(compiler_lib_search_path, $1)}"; then
 _LT_TAGVAR(compiler_lib_search_dirs, $1)=`echo " ${_LT_TAGVAR(compiler_lib_search_path, $1)}" | $SED -e 's! -L! !g' -e 's!^ !!'`
fi
_LT_TAGDECL([], [compiler_lib_search_dirs], [1],
    [The directories searched by this compiler when creating a shared library])
_LT_TAGDECL([], [predep_objects], [1],
    [Dependencies to place before and after the objects being linked to
    create a shared library])
_LT_TAGDECL([], [postdep_objects], [1])
_LT_TAGDECL([], [predeps], [1])
_LT_TAGDECL([], [postdeps], [1])
_LT_TAGDECL([], [compiler_lib_search_path], [1],
    [The library search path used internally by the compiler when linking
    a shared library])
])# _LT_SYS_HIDDEN_LIBDEPS


# _LT_LANG_F77_CONFIG([TAG])
# --------------------------
# Ensure that the configuration variables for a Fortran 77 compiler are
# suitably defined.  These variables are subsequently used by _LT_CONFIG
# to write the compiler configuration to 'libtool'.
m4_defun([_LT_LANG_F77_CONFIG],
[AC_LANG_PUSH(Fortran 77)
dnl Overview: reset the per-tag link variables, then, unless the F77
dnl compiler is unset or "no", point CC/CFLAGS/GCC at the Fortran 77
dnl compiler for the duration of the probes and restore them afterwards.
if test -z "$F77" || test no = "$F77"; then
  _lt_disable_F77=yes
fi

_LT_TAGVAR(archive_cmds_need_lc, $1)=no
_LT_TAGVAR(allow_undefined_flag, $1)=
_LT_TAGVAR(always_export_symbols, $1)=no
_LT_TAGVAR(archive_expsym_cmds, $1)=
_LT_TAGVAR(export_dynamic_flag_spec, $1)=
_LT_TAGVAR(hardcode_direct, $1)=no
_LT_TAGVAR(hardcode_direct_absolute, $1)=no
_LT_TAGVAR(hardcode_libdir_flag_spec, $1)=
_LT_TAGVAR(hardcode_libdir_separator, $1)=
_LT_TAGVAR(hardcode_minus_L, $1)=no
_LT_TAGVAR(hardcode_automatic, $1)=no
_LT_TAGVAR(inherit_rpath, $1)=no
_LT_TAGVAR(module_cmds, $1)=
_LT_TAGVAR(module_expsym_cmds, $1)=
_LT_TAGVAR(link_all_deplibs, $1)=unknown
_LT_TAGVAR(old_archive_cmds, $1)=$old_archive_cmds
_LT_TAGVAR(reload_flag, $1)=$reload_flag
_LT_TAGVAR(reload_cmds, $1)=$reload_cmds
_LT_TAGVAR(no_undefined_flag, $1)=
_LT_TAGVAR(whole_archive_flag_spec, $1)=
_LT_TAGVAR(enable_shared_with_static_runtimes, $1)=no

# Source file extension for f77 test sources.
ac_ext=f

# Object file extension for compiled f77 test sources.
objext=o
_LT_TAGVAR(objext, $1)=$objext

# No sense in running all these tests if we already determined that
# the F77 compiler isn't working.  Some variables (like enable_shared)
# are currently assumed to apply to all compilers on this platform,
# and will be corrupted by setting them based on a non-working compiler.
if test yes != "$_lt_disable_F77"; then
  # Code to be used in simple compile tests
  lt_simple_compile_test_code="\
      subroutine t
      return
      end
"

  # Code to be used in simple link tests
  lt_simple_link_test_code="\
      program t
      end
"

  # ltmain only uses $CC for tagged configurations so make sure $CC is set.
  _LT_TAG_COMPILER

  # save warnings/boilerplate of simple test code
  _LT_COMPILER_BOILERPLATE
  _LT_LINKER_BOILERPLATE

  # Allow CC to be a program name with arguments.
  lt_save_CC=$CC
  lt_save_GCC=$GCC
  lt_save_CFLAGS=$CFLAGS
  CC=${F77-"f77"}
  CFLAGS=$FFLAGS
  compiler=$CC
  _LT_TAGVAR(compiler, $1)=$CC
  _LT_CC_BASENAME([$compiler])
  GCC=$G77
  if test -n "$compiler"; then
    AC_MSG_CHECKING([if libtool supports shared libraries])
    AC_MSG_RESULT([$can_build_shared])

    AC_MSG_CHECKING([whether to build shared libraries])
    test no = "$can_build_shared" && enable_shared=no

    # On AIX, shared libraries and static libraries use the same namespace, and
    # are all built from PIC.
    case $host_os in
      aix3*)
        test yes = "$enable_shared" && enable_static=no
        if test -n "$RANLIB"; then
          archive_cmds="$archive_cmds~\$RANLIB \$lib"
          postinstall_cmds='$RANLIB $lib'
        fi
        ;;
      aix[[4-9]]*)
        if test ia64 != "$host_cpu"; then
          case $enable_shared,$with_aix_soname,$aix_use_runtimelinking in
          yes,aix,yes) ;;               # shared object as lib.so file only
          yes,svr4,*) ;;                # shared object as lib.so archive member only
          yes,*) enable_static=no ;;    # shared object in lib.a archive as well
          esac
        fi
        ;;
    esac
    AC_MSG_RESULT([$enable_shared])

    AC_MSG_CHECKING([whether to build static libraries])
    # Make sure either enable_shared or enable_static is yes.
    test yes = "$enable_shared" || enable_static=yes
    AC_MSG_RESULT([$enable_static])

    _LT_TAGVAR(GCC, $1)=$G77
    _LT_TAGVAR(LD, $1)=$LD

    ## CAVEAT EMPTOR:
    ## There is no encapsulation within the following macros, do not change
    ## the running order or otherwise move them around unless you know exactly
    ## what you are doing...
    _LT_COMPILER_PIC($1)
    _LT_COMPILER_C_O($1)
    _LT_COMPILER_FILE_LOCKS($1)
    _LT_LINKER_SHLIBS($1)
    _LT_SYS_DYNAMIC_LINKER($1)
    _LT_LINKER_HARDCODE_LIBPATH($1)

    _LT_CONFIG($1)
  fi # test -n "$compiler"

  GCC=$lt_save_GCC
  CC=$lt_save_CC
  CFLAGS=$lt_save_CFLAGS
fi # test yes != "$_lt_disable_F77"

AC_LANG_POP
])# _LT_LANG_F77_CONFIG


# _LT_LANG_FC_CONFIG([TAG])
# -------------------------
# Ensure that the configuration variables for a Fortran compiler are
# suitably defined.
# These variables are subsequently used by _LT_CONFIG
# to write the compiler configuration to 'libtool'.
m4_defun([_LT_LANG_FC_CONFIG],
[AC_LANG_PUSH(Fortran)
dnl Overview: reset the per-tag link variables, then, unless the FC
dnl compiler is unset or "no", point CC/CFLAGS/GCC at the Fortran
dnl compiler for the duration of the probes and restore them afterwards.

if test -z "$FC" || test no = "$FC"; then
  _lt_disable_FC=yes
fi

_LT_TAGVAR(archive_cmds_need_lc, $1)=no
_LT_TAGVAR(allow_undefined_flag, $1)=
_LT_TAGVAR(always_export_symbols, $1)=no
_LT_TAGVAR(archive_expsym_cmds, $1)=
_LT_TAGVAR(export_dynamic_flag_spec, $1)=
_LT_TAGVAR(hardcode_direct, $1)=no
_LT_TAGVAR(hardcode_direct_absolute, $1)=no
_LT_TAGVAR(hardcode_libdir_flag_spec, $1)=
_LT_TAGVAR(hardcode_libdir_separator, $1)=
_LT_TAGVAR(hardcode_minus_L, $1)=no
_LT_TAGVAR(hardcode_automatic, $1)=no
_LT_TAGVAR(inherit_rpath, $1)=no
_LT_TAGVAR(module_cmds, $1)=
_LT_TAGVAR(module_expsym_cmds, $1)=
_LT_TAGVAR(link_all_deplibs, $1)=unknown
_LT_TAGVAR(old_archive_cmds, $1)=$old_archive_cmds
_LT_TAGVAR(reload_flag, $1)=$reload_flag
_LT_TAGVAR(reload_cmds, $1)=$reload_cmds
_LT_TAGVAR(no_undefined_flag, $1)=
_LT_TAGVAR(whole_archive_flag_spec, $1)=
_LT_TAGVAR(enable_shared_with_static_runtimes, $1)=no

# Source file extension for fc test sources.
ac_ext=${ac_fc_srcext-f}

# Object file extension for compiled fc test sources.
objext=o
_LT_TAGVAR(objext, $1)=$objext

# No sense in running all these tests if we already determined that
# the FC compiler isn't working.  Some variables (like enable_shared)
# are currently assumed to apply to all compilers on this platform,
# and will be corrupted by setting them based on a non-working compiler.
if test yes != "$_lt_disable_FC"; then
  # Code to be used in simple compile tests
  lt_simple_compile_test_code="\
      subroutine t
      return
      end
"

  # Code to be used in simple link tests
  lt_simple_link_test_code="\
      program t
      end
"

  # ltmain only uses $CC for tagged configurations so make sure $CC is set.
  _LT_TAG_COMPILER

  # save warnings/boilerplate of simple test code
  _LT_COMPILER_BOILERPLATE
  _LT_LINKER_BOILERPLATE

  # Allow CC to be a program name with arguments.
  lt_save_CC=$CC
  lt_save_GCC=$GCC
  lt_save_CFLAGS=$CFLAGS
  CC=${FC-"f95"}
  CFLAGS=$FCFLAGS
  compiler=$CC
  GCC=$ac_cv_fc_compiler_gnu

  _LT_TAGVAR(compiler, $1)=$CC
  _LT_CC_BASENAME([$compiler])

  if test -n "$compiler"; then
    AC_MSG_CHECKING([if libtool supports shared libraries])
    AC_MSG_RESULT([$can_build_shared])

    AC_MSG_CHECKING([whether to build shared libraries])
    test no = "$can_build_shared" && enable_shared=no

    # On AIX, shared libraries and static libraries use the same namespace, and
    # are all built from PIC.
    case $host_os in
      aix3*)
        test yes = "$enable_shared" && enable_static=no
        if test -n "$RANLIB"; then
          archive_cmds="$archive_cmds~\$RANLIB \$lib"
          postinstall_cmds='$RANLIB $lib'
        fi
        ;;
      aix[[4-9]]*)
        if test ia64 != "$host_cpu"; then
          case $enable_shared,$with_aix_soname,$aix_use_runtimelinking in
          yes,aix,yes) ;;               # shared object as lib.so file only
          yes,svr4,*) ;;                # shared object as lib.so archive member only
          yes,*) enable_static=no ;;    # shared object in lib.a archive as well
          esac
        fi
        ;;
    esac
    AC_MSG_RESULT([$enable_shared])

    AC_MSG_CHECKING([whether to build static libraries])
    # Make sure either enable_shared or enable_static is yes.
    test yes = "$enable_shared" || enable_static=yes
    AC_MSG_RESULT([$enable_static])

    _LT_TAGVAR(GCC, $1)=$ac_cv_fc_compiler_gnu
    _LT_TAGVAR(LD, $1)=$LD

    ## CAVEAT EMPTOR:
    ## There is no encapsulation within the following macros, do not change
    ## the running order or otherwise move them around unless you know exactly
    ## what you are doing...
    _LT_SYS_HIDDEN_LIBDEPS($1)
    _LT_COMPILER_PIC($1)
    _LT_COMPILER_C_O($1)
    _LT_COMPILER_FILE_LOCKS($1)
    _LT_LINKER_SHLIBS($1)
    _LT_SYS_DYNAMIC_LINKER($1)
    _LT_LINKER_HARDCODE_LIBPATH($1)

    _LT_CONFIG($1)
  fi # test -n "$compiler"

  GCC=$lt_save_GCC
  CC=$lt_save_CC
  CFLAGS=$lt_save_CFLAGS
fi # test yes != "$_lt_disable_FC"

AC_LANG_POP
])# _LT_LANG_FC_CONFIG


# _LT_LANG_GCJ_CONFIG([TAG])
# --------------------------
# Ensure that the configuration variables for the GNU Java Compiler compiler
# are suitably defined.
# These variables are subsequently used by _LT_CONFIG
# to write the compiler configuration to 'libtool'.
m4_defun([_LT_LANG_GCJ_CONFIG],
[AC_REQUIRE([LT_PROG_GCJ])dnl
AC_LANG_SAVE
dnl Overview: point CC/CFLAGS/GCC at the GCJ compiler while the probe
dnl macros run, then restore the saved values.

# Source file extension for Java test sources.
ac_ext=java

# Object file extension for compiled Java test sources.
objext=o
_LT_TAGVAR(objext, $1)=$objext

# Code to be used in simple compile tests
lt_simple_compile_test_code="class foo {}"

# Code to be used in simple link tests
lt_simple_link_test_code='public class conftest { public static void main(String[[]] argv) {}; }'

# ltmain only uses $CC for tagged configurations so make sure $CC is set.
_LT_TAG_COMPILER

# save warnings/boilerplate of simple test code
_LT_COMPILER_BOILERPLATE
_LT_LINKER_BOILERPLATE

# Allow CC to be a program name with arguments.
lt_save_CC=$CC
lt_save_CFLAGS=$CFLAGS
lt_save_GCC=$GCC
GCC=yes
CC=${GCJ-"gcj"}
CFLAGS=$GCJFLAGS
compiler=$CC
_LT_TAGVAR(compiler, $1)=$CC
_LT_TAGVAR(LD, $1)=$LD
_LT_CC_BASENAME([$compiler])

# GCJ did not exist at the time GCC didn't implicitly link libc in.
_LT_TAGVAR(archive_cmds_need_lc, $1)=no

_LT_TAGVAR(old_archive_cmds, $1)=$old_archive_cmds
_LT_TAGVAR(reload_flag, $1)=$reload_flag
_LT_TAGVAR(reload_cmds, $1)=$reload_cmds

## CAVEAT EMPTOR:
## There is no encapsulation within the following macros, do not change
## the running order or otherwise move them around unless you know exactly
## what you are doing...
if test -n "$compiler"; then
  _LT_COMPILER_NO_RTTI($1)
  _LT_COMPILER_PIC($1)
  _LT_COMPILER_C_O($1)
  _LT_COMPILER_FILE_LOCKS($1)
  _LT_LINKER_SHLIBS($1)
  _LT_LINKER_HARDCODE_LIBPATH($1)

  _LT_CONFIG($1)
fi

AC_LANG_RESTORE

GCC=$lt_save_GCC
CC=$lt_save_CC
CFLAGS=$lt_save_CFLAGS
])# _LT_LANG_GCJ_CONFIG


# _LT_LANG_GO_CONFIG([TAG])
# -------------------------
# Ensure that the configuration variables for the GNU Go compiler
# are suitably defined.  These variables are subsequently used by _LT_CONFIG
# to write the compiler configuration to 'libtool'.
m4_defun([_LT_LANG_GO_CONFIG],
[AC_REQUIRE([LT_PROG_GO])dnl
AC_LANG_SAVE
dnl Overview: point CC/CFLAGS/GCC at the Go compiler while the probe
dnl macros run, then restore the saved values.

# Source file extension for Go test sources.
ac_ext=go

# Object file extension for compiled Go test sources.
objext=o
_LT_TAGVAR(objext, $1)=$objext

# Code to be used in simple compile tests
lt_simple_compile_test_code="package main; func main() { }"

# Code to be used in simple link tests
lt_simple_link_test_code='package main; func main() { }'

# ltmain only uses $CC for tagged configurations so make sure $CC is set.
_LT_TAG_COMPILER

# save warnings/boilerplate of simple test code
_LT_COMPILER_BOILERPLATE
_LT_LINKER_BOILERPLATE

# Allow CC to be a program name with arguments.
lt_save_CC=$CC
lt_save_CFLAGS=$CFLAGS
lt_save_GCC=$GCC
GCC=yes
CC=${GOC-"gccgo"}
CFLAGS=$GOFLAGS
compiler=$CC
_LT_TAGVAR(compiler, $1)=$CC
_LT_TAGVAR(LD, $1)=$LD
_LT_CC_BASENAME([$compiler])

# Go did not exist at the time GCC didn't implicitly link libc in.
_LT_TAGVAR(archive_cmds_need_lc, $1)=no

_LT_TAGVAR(old_archive_cmds, $1)=$old_archive_cmds
_LT_TAGVAR(reload_flag, $1)=$reload_flag
_LT_TAGVAR(reload_cmds, $1)=$reload_cmds

## CAVEAT EMPTOR:
## There is no encapsulation within the following macros, do not change
## the running order or otherwise move them around unless you know exactly
## what you are doing...
if test -n "$compiler"; then
  _LT_COMPILER_NO_RTTI($1)
  _LT_COMPILER_PIC($1)
  _LT_COMPILER_C_O($1)
  _LT_COMPILER_FILE_LOCKS($1)
  _LT_LINKER_SHLIBS($1)
  _LT_LINKER_HARDCODE_LIBPATH($1)

  _LT_CONFIG($1)
fi

AC_LANG_RESTORE

GCC=$lt_save_GCC
CC=$lt_save_CC
CFLAGS=$lt_save_CFLAGS
])# _LT_LANG_GO_CONFIG


# _LT_LANG_RC_CONFIG([TAG])
# -------------------------
# Ensure that the configuration variables for the Windows resource compiler
# are suitably defined.  These variables are subsequently used by _LT_CONFIG
# to write the compiler configuration to 'libtool'.
m4_defun([_LT_LANG_RC_CONFIG],
[AC_REQUIRE([LT_PROG_RC])dnl
AC_LANG_SAVE
dnl Overview: point CC at the resource compiler and clear CFLAGS and GCC
dnl while the tag configuration is written, then restore the saved values.

# Source file extension for RC test sources.
ac_ext=rc

# Object file extension for compiled RC test sources.
objext=o
_LT_TAGVAR(objext, $1)=$objext

# Code to be used in simple compile tests
lt_simple_compile_test_code='sample MENU { MENUITEM "&Soup", 100, CHECKED }'

# Code to be used in simple link tests
lt_simple_link_test_code=$lt_simple_compile_test_code

# ltmain only uses $CC for tagged configurations so make sure $CC is set.
_LT_TAG_COMPILER

# save warnings/boilerplate of simple test code
_LT_COMPILER_BOILERPLATE
_LT_LINKER_BOILERPLATE

# Allow CC to be a program name with arguments.
lt_save_CC=$CC
lt_save_CFLAGS=$CFLAGS
lt_save_GCC=$GCC
GCC=
CC=${RC-"windres"}
CFLAGS=
compiler=$CC
_LT_TAGVAR(compiler, $1)=$CC
_LT_CC_BASENAME([$compiler])
_LT_TAGVAR(lt_cv_prog_compiler_c_o, $1)=yes

if test -n "$compiler"; then
  :
  _LT_CONFIG($1)
fi

GCC=$lt_save_GCC
AC_LANG_RESTORE
CC=$lt_save_CC
CFLAGS=$lt_save_CFLAGS
])# _LT_LANG_RC_CONFIG


# LT_PROG_GCJ
# -----------
AC_DEFUN([LT_PROG_GCJ],
[m4_ifdef([AC_PROG_GCJ], [AC_PROG_GCJ],
  [m4_ifdef([A][M_PROG_GCJ], [A][M_PROG_GCJ],
    [AC_CHECK_TOOL(GCJ, gcj,)
      test set = "${GCJFLAGS+set}" || GCJFLAGS="-g -O2"
      AC_SUBST(GCJFLAGS)])])[]dnl
])

# Old name:
AU_ALIAS([LT_AC_PROG_GCJ], [LT_PROG_GCJ])
dnl aclocal-1.4 backwards compatibility:
dnl AC_DEFUN([LT_AC_PROG_GCJ], [])


# LT_PROG_GO
# ----------
AC_DEFUN([LT_PROG_GO],
[AC_CHECK_TOOL(GOC, gccgo,)
])


# LT_PROG_RC
# ----------
AC_DEFUN([LT_PROG_RC],
[AC_CHECK_TOOL(RC, windres,)
])

# Old name:
AU_ALIAS([LT_AC_PROG_RC], [LT_PROG_RC])
dnl aclocal-1.4 backwards compatibility:
dnl AC_DEFUN([LT_AC_PROG_RC], [])


# _LT_DECL_EGREP
# --------------
# If we don't have a new enough Autoconf to choose the best grep
# available, choose the one first in the user's PATH.
m4_defun([_LT_DECL_EGREP],
[AC_REQUIRE([AC_PROG_EGREP])dnl
AC_REQUIRE([AC_PROG_FGREP])dnl
test -z "$GREP" && GREP=grep
_LT_DECL([], [GREP], [1], [A grep program that handles long lines])
_LT_DECL([], [EGREP], [1], [An ERE matcher])
_LT_DECL([], [FGREP], [1], [A literal string matcher])
dnl Non-bleeding-edge autoconf doesn't subst GREP, so do it here too
AC_SUBST([GREP])
])


# _LT_DECL_OBJDUMP
# ----------------
# If we don't have a new enough Autoconf to choose the best objdump
# available, choose the one first in the user's PATH.
m4_defun([_LT_DECL_OBJDUMP],
[AC_CHECK_TOOL(OBJDUMP, objdump, false)
test -z "$OBJDUMP" && OBJDUMP=objdump
_LT_DECL([], [OBJDUMP], [1], [An object symbol dumper])
AC_SUBST([OBJDUMP])
])

# _LT_DECL_DLLTOOL
# ----------------
# Ensure DLLTOOL variable is set.
m4_defun([_LT_DECL_DLLTOOL],
[AC_CHECK_TOOL(DLLTOOL, dlltool, false)
test -z "$DLLTOOL" && DLLTOOL=dlltool
_LT_DECL([], [DLLTOOL], [1], [DLL creation program])
AC_SUBST([DLLTOOL])
])

# _LT_DECL_SED
# ------------
# Check for a fully-functional sed program, that truncates
# as few characters as possible.  Prefer GNU sed if found.
m4_defun([_LT_DECL_SED],
[AC_PROG_SED
test -z "$SED" && SED=sed
Xsed="$SED -e 1s/^X//"
_LT_DECL([], [SED], [1], [A sed program that does not truncate output])
_LT_DECL([], [Xsed], ["\$SED -e 1s/^X//"],
    [Sed that helps us avoid accidentally triggering echo(1) options like -n])
])# _LT_DECL_SED

m4_ifndef([AC_PROG_SED], [
############################################################
# NOTE: This macro has been submitted for inclusion into   #
#  GNU Autoconf as AC_PROG_SED.  When it is available in   #
#  a released version of Autoconf we should remove this    #
#  macro and use it instead.                               #
############################################################

m4_defun([AC_PROG_SED],
[AC_MSG_CHECKING([for a sed that does not truncate output])
AC_CACHE_VAL(lt_cv_path_SED,
[# Loop through the user's path and test for sed and gsed.
# Then use that list of sed's as ones to test for truncation.
as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
for as_dir in $PATH
do
  IFS=$as_save_IFS
  test -z "$as_dir" && as_dir=.
  for lt_ac_prog in sed gsed; do
    for ac_exec_ext in '' $ac_executable_extensions; do
      if $as_executable_p "$as_dir/$lt_ac_prog$ac_exec_ext"; then
        lt_ac_sed_list="$lt_ac_sed_list $as_dir/$lt_ac_prog$ac_exec_ext"
      fi
    done
  done
done
IFS=$as_save_IFS
lt_ac_max=0
lt_ac_count=0
# Add /usr/xpg4/bin/sed as it is typically found on Solaris
# along with /bin/sed that truncates output.
for lt_ac_sed in $lt_ac_sed_list /usr/xpg4/bin/sed; do
  test ! -f "$lt_ac_sed" && continue
  cat /dev/null > conftest.in
  lt_ac_count=0
  echo $ECHO_N "0123456789$ECHO_C" >conftest.in
  # Check for GNU sed and select it if it is found.
  if "$lt_ac_sed" --version 2>&1 < /dev/null | grep 'GNU' > /dev/null; then
    lt_cv_path_SED=$lt_ac_sed
    break
  fi
  # Each pass doubles conftest.in; the candidate that survives the most
  # passes without altering its input is kept.
  while true; do
    cat conftest.in conftest.in >conftest.tmp
    mv conftest.tmp conftest.in
    cp conftest.in conftest.nl
    echo >>conftest.nl
    $lt_ac_sed -e 's/a$//' < conftest.nl >conftest.out || break
    cmp -s conftest.out conftest.nl || break
    # 10000 chars as input seems more than enough
    test 10 -lt "$lt_ac_count" && break
    lt_ac_count=`expr $lt_ac_count + 1`
    if test "$lt_ac_count" -gt "$lt_ac_max"; then
      lt_ac_max=$lt_ac_count
      lt_cv_path_SED=$lt_ac_sed
    fi
  done
done
])
SED=$lt_cv_path_SED
AC_SUBST([SED])
AC_MSG_RESULT([$SED])
])#AC_PROG_SED
])#m4_ifndef

# Old name:
AU_ALIAS([LT_AC_PROG_SED], [AC_PROG_SED])
dnl aclocal-1.4 backwards compatibility:
dnl AC_DEFUN([LT_AC_PROG_SED], [])


# _LT_CHECK_SHELL_FEATURES
# ------------------------
# Find out whether the shell is Bourne or XSI compatible,
# or has some other useful features.
m4_defun([_LT_CHECK_SHELL_FEATURES],
[if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then
  lt_unset=unset
else
  lt_unset=false
fi
_LT_DECL([], [lt_unset], [0], [whether the shell understands "unset"])dnl

# test EBCDIC or ASCII
case `echo X|tr X '\101'` in
 A) # ASCII based system
    # \n is not interpreted correctly by Solaris 8 /usr/ucb/tr
  lt_SP2NL='tr \040 \012'
  lt_NL2SP='tr \015\012 \040\040'
  ;;
 *) # EBCDIC based system
  lt_SP2NL='tr \100 \n'
  lt_NL2SP='tr \r\n \100\100'
  ;;
esac
_LT_DECL([SP2NL], [lt_SP2NL], [1], [turn spaces into newlines])dnl
_LT_DECL([NL2SP], [lt_NL2SP], [1], [turn newlines into spaces])dnl
])# _LT_CHECK_SHELL_FEATURES


# _LT_PATH_CONVERSION_FUNCTIONS
# -----------------------------
# Determine what file name conversion functions should be used by
# func_to_host_file (and, implicitly, by func_to_host_path).  These are needed
# for certain cross-compile configurations and native mingw.
m4_defun([_LT_PATH_CONVERSION_FUNCTIONS],
[AC_REQUIRE([AC_CANONICAL_HOST])dnl
AC_REQUIRE([AC_CANONICAL_BUILD])dnl
dnl The converter is chosen from the host and build triplets; every
dnl combination not listed below falls back to func_convert_file_noop.
AC_MSG_CHECKING([how to convert $build file names to $host format])
AC_CACHE_VAL(lt_cv_to_host_file_cmd,
[case $host in
  *-*-mingw* )
    case $build in
      *-*-mingw* ) # actually msys
        lt_cv_to_host_file_cmd=func_convert_file_msys_to_w32
        ;;
      *-*-cygwin* )
        lt_cv_to_host_file_cmd=func_convert_file_cygwin_to_w32
        ;;
      * ) # otherwise, assume *nix
        lt_cv_to_host_file_cmd=func_convert_file_nix_to_w32
        ;;
    esac
    ;;
  *-*-cygwin* )
    case $build in
      *-*-mingw* ) # actually msys
        lt_cv_to_host_file_cmd=func_convert_file_msys_to_cygwin
        ;;
      *-*-cygwin* )
        lt_cv_to_host_file_cmd=func_convert_file_noop
        ;;
      * ) # otherwise, assume *nix
        lt_cv_to_host_file_cmd=func_convert_file_nix_to_cygwin
        ;;
    esac
    ;;
  * ) # unhandled hosts (and "normal" native builds)
    lt_cv_to_host_file_cmd=func_convert_file_noop
    ;;
esac
])
to_host_file_cmd=$lt_cv_to_host_file_cmd
AC_MSG_RESULT([$lt_cv_to_host_file_cmd])
_LT_DECL([to_host_file_cmd], [lt_cv_to_host_file_cmd],
         [0], [convert $build file names to $host format])dnl

AC_MSG_CHECKING([how to convert $build file names to toolchain format])
AC_CACHE_VAL(lt_cv_to_tool_file_cmd,
[#assume ordinary cross tools, or native build.
lt_cv_to_tool_file_cmd=func_convert_file_noop
case $host in
  *-*-mingw* )
    case $build in
      *-*-mingw* ) # actually msys
        lt_cv_to_tool_file_cmd=func_convert_file_msys_to_w32
        ;;
    esac
    ;;
esac
])
to_tool_file_cmd=$lt_cv_to_tool_file_cmd
AC_MSG_RESULT([$lt_cv_to_tool_file_cmd])
_LT_DECL([to_tool_file_cmd], [lt_cv_to_tool_file_cmd],
         [0], [convert $build files to toolchain format])dnl
])# _LT_PATH_CONVERSION_FUNCTIONS
elpa-2016.05.001/m4/ax_check_gnu_make.m40000644000312500001440000000541112664056454014234 00000000000000# ===========================================================================
#   http://www.gnu.org/software/autoconf-archive/ax_check_gnu_make.html
# ===========================================================================
#
# SYNOPSIS
#
#   AX_CHECK_GNU_MAKE()
#
# DESCRIPTION
#
#   This macro searches for a GNU version of make. If a match is found, the
#   makefile variable `ifGNUmake' is set to the empty string, otherwise it
#   is set to "#". This is useful for including a special features in a
#   Makefile, which cannot be handled by other versions of make. The
#   variable _cv_gnu_make_command is set to the command to invoke GNU make
#   if it exists, the empty string otherwise.
#
#   Here is an example of its use:
#
#   Makefile.in might contain:
#
#     # A failsafe way of putting a dependency rule into a makefile
#     $(DEPEND):
#             $(CC) -MM $(srcdir)/*.c > $(DEPEND)
#
#     @ifGNUmake@ ifeq ($(DEPEND),$(wildcard $(DEPEND)))
#     @ifGNUmake@ include $(DEPEND)
#     @ifGNUmake@ endif
#
#   Then configure.in would normally contain:
#
#     AX_CHECK_GNU_MAKE()
#     AC_OUTPUT(Makefile)
#
#   Then perhaps to cause gnu make to override any other make, we could do
#   something like this (note that GNU make always looks for GNUmakefile
#   first):
#
#     if !
test x$_cv_gnu_make_command = x ; then # mv Makefile GNUmakefile # echo .DEFAULT: > Makefile ; # echo \ $_cv_gnu_make_command \$@ >> Makefile; # fi # # Then, if any (well almost any) other make is called, and GNU make also # exists, then the other make wraps the GNU make. # # LICENSE # # Copyright (c) 2008 John Darrington # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. This file is offered as-is, without any # warranty. #serial 7 AC_DEFUN([AX_CHECK_GNU_MAKE], [ AC_CACHE_CHECK( for GNU make,_cv_gnu_make_command, _cv_gnu_make_command='' ; dnl Search all the common names for GNU make for a in "$MAKE" make gmake gnumake ; do if test -z "$a" ; then continue ; fi ; if ( sh -c "$a --version" 2> /dev/null | grep GNU 2>&1 > /dev/null ) ; then _cv_gnu_make_command=$a ; break; fi done ; ) ; dnl If there was a GNU version, then set @ifGNUmake@ to the empty string, '#' otherwise if test "x$_cv_gnu_make_command" != "x" ; then ifGNUmake='' ; else ifGNUmake='#' ; AC_MSG_RESULT("Not found"); fi AC_SUBST(ifGNUmake) ] ) elpa-2016.05.001/test-driver0000755000312500001440000001104012717533405012223 00000000000000#! /bin/sh # test-driver - basic testsuite driver script. scriptversion=2013-07-13.22; # UTC # Copyright (C) 2011-2014 Free Software Foundation, Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
# As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under # the same distribution terms that you use for the rest of that program. # This file is maintained in Automake, please report # bugs to or send patches to # . # Make unconditional expansion of undefined variables an error. This # helps a lot in preventing typo-related bugs. set -u usage_error () { echo "$0: $*" >&2 print_usage >&2 exit 2 } print_usage () { cat <$log_file 2>&1 estatus=$? if test $enable_hard_errors = no && test $estatus -eq 99; then tweaked_estatus=1 else tweaked_estatus=$estatus fi case $tweaked_estatus:$expect_failure in 0:yes) col=$red res=XPASS recheck=yes gcopy=yes;; 0:*) col=$grn res=PASS recheck=no gcopy=no;; 77:*) col=$blu res=SKIP recheck=no gcopy=yes;; 99:*) col=$mgn res=ERROR recheck=yes gcopy=yes;; *:yes) col=$lgn res=XFAIL recheck=no gcopy=yes;; *:*) col=$red res=FAIL recheck=yes gcopy=yes;; esac # Report the test outcome and exit status in the logs, so that one can # know whether the test passed or failed simply by looking at the '.log' # file, without the need of also peaking into the corresponding '.trs' # file (automake bug#11814). echo "$res $test_name (exit status: $estatus)" >>$log_file # Report outcome to console. echo "${col}${res}${std}: $test_name" # Register the test result, and other relevant metadata. echo ":test-result: $res" > $trs_file echo ":global-test-result: $res" >> $trs_file echo ":recheck: $recheck" >> $trs_file echo ":copy-in-global-log: $gcopy" >> $trs_file # Local Variables: # mode: shell-script # sh-indentation: 2 # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "scriptversion=" # time-stamp-format: "%:y-%02m-%02d.%02H" # time-stamp-time-zone: "UTC" # time-stamp-end: "; # UTC" # End: elpa-2016.05.001/depcomp0000755000312500001440000005601612717533405011416 00000000000000#! 
/bin/sh # depcomp - compile a program generating dependencies as side-effects scriptversion=2013-05-30.07; # UTC # Copyright (C) 1999-2014 Free Software Foundation, Inc. # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2, or (at your option) # any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program. If not, see . # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under # the same distribution terms that you use for the rest of that program. # Originally written by Alexandre Oliva . case $1 in '') echo "$0: No command. Try '$0 --help' for more information." 1>&2 exit 1; ;; -h | --h*) cat <<\EOF Usage: depcomp [--help] [--version] PROGRAM [ARGS] Run PROGRAMS ARGS to compile a file, generating dependencies as side-effects. Environment variables: depmode Dependency tracking mode. source Source file read by 'PROGRAMS ARGS'. object Object file output by 'PROGRAMS ARGS'. DEPDIR directory where to store dependencies. depfile Dependency file to output. tmpdepfile Temporary file to use when outputting dependencies. libtool Whether libtool is used (yes/no). Report bugs to . EOF exit $? ;; -v | --v*) echo "depcomp $scriptversion" exit $? ;; esac # Get the directory component of the given path, and save it in the # global variables '$dir'. Note that this directory component will # be either empty or ending with a '/' character. This is deliberate. 
set_dir_from () { case $1 in */*) dir=`echo "$1" | sed -e 's|/[^/]*$|/|'`;; *) dir=;; esac } # Get the suffix-stripped basename of the given path, and save it the # global variable '$base'. set_base_from () { base=`echo "$1" | sed -e 's|^.*/||' -e 's/\.[^.]*$//'` } # If no dependency file was actually created by the compiler invocation, # we still have to create a dummy depfile, to avoid errors with the # Makefile "include basename.Plo" scheme. make_dummy_depfile () { echo "#dummy" > "$depfile" } # Factor out some common post-processing of the generated depfile. # Requires the auxiliary global variable '$tmpdepfile' to be set. aix_post_process_depfile () { # If the compiler actually managed to produce a dependency file, # post-process it. if test -f "$tmpdepfile"; then # Each line is of the form 'foo.o: dependency.h'. # Do two passes, one to just change these to # $object: dependency.h # and one to simply output # dependency.h: # which is needed to avoid the deleted-header problem. { sed -e "s,^.*\.[$lower]*:,$object:," < "$tmpdepfile" sed -e "s,^.*\.[$lower]*:[$tab ]*,," -e 's,$,:,' < "$tmpdepfile" } > "$depfile" rm -f "$tmpdepfile" else make_dummy_depfile fi } # A tabulation character. tab=' ' # A newline character. nl=' ' # Character ranges might be problematic outside the C locale. # These definitions help. upper=ABCDEFGHIJKLMNOPQRSTUVWXYZ lower=abcdefghijklmnopqrstuvwxyz digits=0123456789 alpha=${upper}${lower} if test -z "$depmode" || test -z "$source" || test -z "$object"; then echo "depcomp: Variables source, object and depmode must be set" 1>&2 exit 1 fi # Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po. depfile=${depfile-`echo "$object" | sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`} tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`} rm -f "$tmpdepfile" # Avoid interferences from the environment. gccflag= dashmflag= # Some modes work just like other modes, but use different flags. 
We # parameterize here, but still list the modes in the big case below, # to make depend.m4 easier to write. Note that we *cannot* use a case # here, because this file can only contain one case statement. if test "$depmode" = hp; then # HP compiler uses -M and no extra arg. gccflag=-M depmode=gcc fi if test "$depmode" = dashXmstdout; then # This is just like dashmstdout with a different argument. dashmflag=-xM depmode=dashmstdout fi cygpath_u="cygpath -u -f -" if test "$depmode" = msvcmsys; then # This is just like msvisualcpp but w/o cygpath translation. # Just convert the backslash-escaped backslashes to single forward # slashes to satisfy depend.m4 cygpath_u='sed s,\\\\,/,g' depmode=msvisualcpp fi if test "$depmode" = msvc7msys; then # This is just like msvc7 but w/o cygpath translation. # Just convert the backslash-escaped backslashes to single forward # slashes to satisfy depend.m4 cygpath_u='sed s,\\\\,/,g' depmode=msvc7 fi if test "$depmode" = xlc; then # IBM C/C++ Compilers xlc/xlC can output gcc-like dependency information. gccflag=-qmakedep=gcc,-MF depmode=gcc fi case "$depmode" in gcc3) ## gcc 3 implements dependency tracking that does exactly what ## we want. Yay! Note: for some reason libtool 1.4 doesn't like ## it if -MD -MP comes after the -MF stuff. Hmm. ## Unfortunately, FreeBSD c89 acceptance of flags depends upon ## the command line argument order; so add the flags where they ## appear in depend2.am. Note that the slowdown incurred here ## affects only configure: in makefiles, %FASTDEP% shortcuts this. for arg do case $arg in -c) set fnord "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" "$arg" ;; *) set fnord "$@" "$arg" ;; esac shift # fnord shift # $arg done "$@" stat=$? if test $stat -ne 0; then rm -f "$tmpdepfile" exit $stat fi mv "$tmpdepfile" "$depfile" ;; gcc) ## Note that this doesn't just cater to obsosete pre-3.x GCC compilers. ## but also to in-use compilers like IMB xlc/xlC and the HP C compiler. 
## (see the conditional assignment to $gccflag above). ## There are various ways to get dependency output from gcc. Here's ## why we pick this rather obscure method: ## - Don't want to use -MD because we'd like the dependencies to end ## up in a subdir. Having to rename by hand is ugly. ## (We might end up doing this anyway to support other compilers.) ## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like ## -MM, not -M (despite what the docs say). Also, it might not be ## supported by the other compilers which use the 'gcc' depmode. ## - Using -M directly means running the compiler twice (even worse ## than renaming). if test -z "$gccflag"; then gccflag=-MD, fi "$@" -Wp,"$gccflag$tmpdepfile" stat=$? if test $stat -ne 0; then rm -f "$tmpdepfile" exit $stat fi rm -f "$depfile" echo "$object : \\" > "$depfile" # The second -e expression handles DOS-style file names with drive # letters. sed -e 's/^[^:]*: / /' \ -e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile" ## This next piece of magic avoids the "deleted header file" problem. ## The problem is that when a header file which appears in a .P file ## is deleted, the dependency causes make to die (because there is ## typically no way to rebuild the header). We avoid this by adding ## dummy dependencies for each header file. Too bad gcc doesn't do ## this for us directly. ## Some versions of gcc put a space before the ':'. On the theory ## that the space means something, we add a space to the output as ## well. hp depmode also adds that space, but also prefixes the VPATH ## to the object. Take care to not repeat it in the output. ## Some versions of the HPUX 10.20 sed can't process this invocation ## correctly. Breaking it into two sed invocations is a workaround. tr ' ' "$nl" < "$tmpdepfile" \ | sed -e 's/^\\$//' -e '/^$/d' -e "s|.*$object$||" -e '/:$/d' \ | sed -e 's/$/ :/' >> "$depfile" rm -f "$tmpdepfile" ;; hp) # This case exists only to let depend.m4 do its work. 
It works by # looking at the text of this script. This case will never be run, # since it is checked for above. exit 1 ;; sgi) if test "$libtool" = yes; then "$@" "-Wp,-MDupdate,$tmpdepfile" else "$@" -MDupdate "$tmpdepfile" fi stat=$? if test $stat -ne 0; then rm -f "$tmpdepfile" exit $stat fi rm -f "$depfile" if test -f "$tmpdepfile"; then # yes, the sourcefile depend on other files echo "$object : \\" > "$depfile" # Clip off the initial element (the dependent). Don't try to be # clever and replace this with sed code, as IRIX sed won't handle # lines with more than a fixed number of characters (4096 in # IRIX 6.2 sed, 8192 in IRIX 6.5). We also remove comment lines; # the IRIX cc adds comments like '#:fec' to the end of the # dependency line. tr ' ' "$nl" < "$tmpdepfile" \ | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' \ | tr "$nl" ' ' >> "$depfile" echo >> "$depfile" # The second pass generates a dummy entry for each header file. tr ' ' "$nl" < "$tmpdepfile" \ | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \ >> "$depfile" else make_dummy_depfile fi rm -f "$tmpdepfile" ;; xlc) # This case exists only to let depend.m4 do its work. It works by # looking at the text of this script. This case will never be run, # since it is checked for above. exit 1 ;; aix) # The C for AIX Compiler uses -M and outputs the dependencies # in a .u file. In older versions, this file always lives in the # current directory. Also, the AIX compiler puts '$object:' at the # start of each line; $object doesn't have directory information. # Version 6 uses the directory in both cases. set_dir_from "$object" set_base_from "$object" if test "$libtool" = yes; then tmpdepfile1=$dir$base.u tmpdepfile2=$base.u tmpdepfile3=$dir.libs/$base.u "$@" -Wc,-M else tmpdepfile1=$dir$base.u tmpdepfile2=$dir$base.u tmpdepfile3=$dir$base.u "$@" -M fi stat=$? 
if test $stat -ne 0; then rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" exit $stat fi for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" do test -f "$tmpdepfile" && break done aix_post_process_depfile ;; tcc) # tcc (Tiny C Compiler) understand '-MD -MF file' since version 0.9.26 # FIXME: That version still under development at the moment of writing. # Make that this statement remains true also for stable, released # versions. # It will wrap lines (doesn't matter whether long or short) with a # trailing '\', as in: # # foo.o : \ # foo.c \ # foo.h \ # # It will put a trailing '\' even on the last line, and will use leading # spaces rather than leading tabs (at least since its commit 0394caf7 # "Emit spaces for -MD"). "$@" -MD -MF "$tmpdepfile" stat=$? if test $stat -ne 0; then rm -f "$tmpdepfile" exit $stat fi rm -f "$depfile" # Each non-empty line is of the form 'foo.o : \' or ' dep.h \'. # We have to change lines of the first kind to '$object: \'. sed -e "s|.*:|$object :|" < "$tmpdepfile" > "$depfile" # And for each line of the second kind, we have to emit a 'dep.h:' # dummy dependency, to avoid the deleted-header problem. sed -n -e 's|^ *\(.*\) *\\$|\1:|p' < "$tmpdepfile" >> "$depfile" rm -f "$tmpdepfile" ;; ## The order of this option in the case statement is important, since the ## shell code in configure will try each of these formats in the order ## listed in this file. A plain '-MD' option would be understood by many ## compilers, so we must ensure this comes after the gcc and icc options. pgcc) # Portland's C compiler understands '-MD'. # Will always output deps to 'file.d' where file is the root name of the # source file under compilation, even if file resides in a subdirectory. # The object file name does not affect the name of the '.d' file. # pgcc 10.2 will output # foo.o: sub/foo.c sub/foo.h # and will wrap long lines using '\' : # foo.o: sub/foo.c ... \ # sub/foo.h ... \ # ... 
set_dir_from "$object" # Use the source, not the object, to determine the base name, since # that's sadly what pgcc will do too. set_base_from "$source" tmpdepfile=$base.d # For projects that build the same source file twice into different object # files, the pgcc approach of using the *source* file root name can cause # problems in parallel builds. Use a locking strategy to avoid stomping on # the same $tmpdepfile. lockdir=$base.d-lock trap " echo '$0: caught signal, cleaning up...' >&2 rmdir '$lockdir' exit 1 " 1 2 13 15 numtries=100 i=$numtries while test $i -gt 0; do # mkdir is a portable test-and-set. if mkdir "$lockdir" 2>/dev/null; then # This process acquired the lock. "$@" -MD stat=$? # Release the lock. rmdir "$lockdir" break else # If the lock is being held by a different process, wait # until the winning process is done or we timeout. while test -d "$lockdir" && test $i -gt 0; do sleep 1 i=`expr $i - 1` done fi i=`expr $i - 1` done trap - 1 2 13 15 if test $i -le 0; then echo "$0: failed to acquire lock after $numtries attempts" >&2 echo "$0: check lockdir '$lockdir'" >&2 exit 1 fi if test $stat -ne 0; then rm -f "$tmpdepfile" exit $stat fi rm -f "$depfile" # Each line is of the form `foo.o: dependent.h', # or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'. # Do two passes, one to just change these to # `$object: dependent.h' and one to simply `dependent.h:'. sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile" # Some versions of the HPUX 10.20 sed can't process this invocation # correctly. Breaking it into two sed invocations is a workaround. sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" \ | sed -e 's/$/ :/' >> "$depfile" rm -f "$tmpdepfile" ;; hp2) # The "hp" stanza above does not work with aCC (C++) and HP's ia64 # compilers, which have integrated preprocessors. The correct option # to use with these is +Maked; it writes dependencies to a file named # 'foo.d', which lands next to the object file, wherever that # happens to be. 
# Much of this is similar to the tru64 case; see comments there. set_dir_from "$object" set_base_from "$object" if test "$libtool" = yes; then tmpdepfile1=$dir$base.d tmpdepfile2=$dir.libs/$base.d "$@" -Wc,+Maked else tmpdepfile1=$dir$base.d tmpdepfile2=$dir$base.d "$@" +Maked fi stat=$? if test $stat -ne 0; then rm -f "$tmpdepfile1" "$tmpdepfile2" exit $stat fi for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" do test -f "$tmpdepfile" && break done if test -f "$tmpdepfile"; then sed -e "s,^.*\.[$lower]*:,$object:," "$tmpdepfile" > "$depfile" # Add 'dependent.h:' lines. sed -ne '2,${ s/^ *// s/ \\*$// s/$/:/ p }' "$tmpdepfile" >> "$depfile" else make_dummy_depfile fi rm -f "$tmpdepfile" "$tmpdepfile2" ;; tru64) # The Tru64 compiler uses -MD to generate dependencies as a side # effect. 'cc -MD -o foo.o ...' puts the dependencies into 'foo.o.d'. # At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put # dependencies in 'foo.d' instead, so we check for that too. # Subdirectories are respected. set_dir_from "$object" set_base_from "$object" if test "$libtool" = yes; then # Libtool generates 2 separate objects for the 2 libraries. These # two compilations output dependencies in $dir.libs/$base.o.d and # in $dir$base.o.d. We have to check for both files, because # one of the two compilations can be disabled. We should prefer # $dir$base.o.d over $dir.libs/$base.o.d because the latter is # automatically cleaned when .libs/ is deleted, while ignoring # the former would cause a distcleancheck panic. tmpdepfile1=$dir$base.o.d # libtool 1.5 tmpdepfile2=$dir.libs/$base.o.d # Likewise. tmpdepfile3=$dir.libs/$base.d # Compaq CCC V6.2-504 "$@" -Wc,-MD else tmpdepfile1=$dir$base.d tmpdepfile2=$dir$base.d tmpdepfile3=$dir$base.d "$@" -MD fi stat=$? 
if test $stat -ne 0; then rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" exit $stat fi for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" do test -f "$tmpdepfile" && break done # Same post-processing that is required for AIX mode. aix_post_process_depfile ;; msvc7) if test "$libtool" = yes; then showIncludes=-Wc,-showIncludes else showIncludes=-showIncludes fi "$@" $showIncludes > "$tmpdepfile" stat=$? grep -v '^Note: including file: ' "$tmpdepfile" if test $stat -ne 0; then rm -f "$tmpdepfile" exit $stat fi rm -f "$depfile" echo "$object : \\" > "$depfile" # The first sed program below extracts the file names and escapes # backslashes for cygpath. The second sed program outputs the file # name when reading, but also accumulates all include files in the # hold buffer in order to output them again at the end. This only # works with sed implementations that can handle large buffers. sed < "$tmpdepfile" -n ' /^Note: including file: *\(.*\)/ { s//\1/ s/\\/\\\\/g p }' | $cygpath_u | sort -u | sed -n ' s/ /\\ /g s/\(.*\)/'"$tab"'\1 \\/p s/.\(.*\) \\/\1:/ H $ { s/.*/'"$tab"'/ G p }' >> "$depfile" echo >> "$depfile" # make sure the fragment doesn't end with a backslash rm -f "$tmpdepfile" ;; msvc7msys) # This case exists only to let depend.m4 do its work. It works by # looking at the text of this script. This case will never be run, # since it is checked for above. exit 1 ;; #nosideeffect) # This comment above is used by automake to tell side-effect # dependency tracking mechanisms from slower ones. dashmstdout) # Important note: in order to support this mode, a compiler *must* # always write the preprocessed file to stdout, regardless of -o. "$@" || exit $? # Remove the call to Libtool. if test "$libtool" = yes; then while test "X$1" != 'X--mode=compile'; do shift done shift fi # Remove '-o $object'. 
IFS=" " for arg do case $arg in -o) shift ;; $object) shift ;; *) set fnord "$@" "$arg" shift # fnord shift # $arg ;; esac done test -z "$dashmflag" && dashmflag=-M # Require at least two characters before searching for ':' # in the target name. This is to cope with DOS-style filenames: # a dependency such as 'c:/foo/bar' could be seen as target 'c' otherwise. "$@" $dashmflag | sed "s|^[$tab ]*[^:$tab ][^:][^:]*:[$tab ]*|$object: |" > "$tmpdepfile" rm -f "$depfile" cat < "$tmpdepfile" > "$depfile" # Some versions of the HPUX 10.20 sed can't process this sed invocation # correctly. Breaking it into two sed invocations is a workaround. tr ' ' "$nl" < "$tmpdepfile" \ | sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' \ | sed -e 's/$/ :/' >> "$depfile" rm -f "$tmpdepfile" ;; dashXmstdout) # This case only exists to satisfy depend.m4. It is never actually # run, as this mode is specially recognized in the preamble. exit 1 ;; makedepend) "$@" || exit $? # Remove any Libtool call if test "$libtool" = yes; then while test "X$1" != 'X--mode=compile'; do shift done shift fi # X makedepend shift cleared=no eat=no for arg do case $cleared in no) set ""; shift cleared=yes ;; esac if test $eat = yes; then eat=no continue fi case "$arg" in -D*|-I*) set fnord "$@" "$arg"; shift ;; # Strip any option that makedepend may not understand. Remove # the object too, otherwise makedepend will parse it as a source file. -arch) eat=yes ;; -*|$object) ;; *) set fnord "$@" "$arg"; shift ;; esac done obj_suffix=`echo "$object" | sed 's/^.*\././'` touch "$tmpdepfile" ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@" rm -f "$depfile" # makedepend may prepend the VPATH from the source file name to the object. # No need to regex-escape $object, excess matching of '.' is harmless. sed "s|^.*\($object *:\)|\1|" "$tmpdepfile" > "$depfile" # Some versions of the HPUX 10.20 sed can't process the last invocation # correctly. Breaking it into two sed invocations is a workaround. 
sed '1,2d' "$tmpdepfile" \ | tr ' ' "$nl" \ | sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' \ | sed -e 's/$/ :/' >> "$depfile" rm -f "$tmpdepfile" "$tmpdepfile".bak ;; cpp) # Important note: in order to support this mode, a compiler *must* # always write the preprocessed file to stdout. "$@" || exit $? # Remove the call to Libtool. if test "$libtool" = yes; then while test "X$1" != 'X--mode=compile'; do shift done shift fi # Remove '-o $object'. IFS=" " for arg do case $arg in -o) shift ;; $object) shift ;; *) set fnord "$@" "$arg" shift # fnord shift # $arg ;; esac done "$@" -E \ | sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \ -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \ | sed '$ s: \\$::' > "$tmpdepfile" rm -f "$depfile" echo "$object : \\" > "$depfile" cat < "$tmpdepfile" >> "$depfile" sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile" rm -f "$tmpdepfile" ;; msvisualcpp) # Important note: in order to support this mode, a compiler *must* # always write the preprocessed file to stdout. "$@" || exit $? # Remove the call to Libtool. if test "$libtool" = yes; then while test "X$1" != 'X--mode=compile'; do shift done shift fi IFS=" " for arg do case "$arg" in -o) shift ;; $object) shift ;; "-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI") set fnord "$@" shift shift ;; *) set fnord "$@" "$arg" shift shift ;; esac done "$@" -E 2>/dev/null | sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::\1:p' | $cygpath_u | sort -u > "$tmpdepfile" rm -f "$depfile" echo "$object : \\" > "$depfile" sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::'"$tab"'\1 \\:p' >> "$depfile" echo "$tab" >> "$depfile" sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::\1\::p' >> "$depfile" rm -f "$tmpdepfile" ;; msvcmsys) # This case exists only to let depend.m4 do its work. It works by # looking at the text of this script. This case will never be run, # since it is checked for above. 
exit 1 ;; none) exec "$@" ;; *) echo "Unknown depmode $depmode" 1>&2 exit 1 ;; esac exit 0 # Local Variables: # mode: shell-script # sh-indentation: 2 # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "scriptversion=" # time-stamp-format: "%:y-%02m-%02d.%02H" # time-stamp-time-zone: "UTC" # time-stamp-end: "; # UTC" # End: elpa-2016.05.001/Makefile.in0000644000312500001440000035377712717533406012125 00000000000000# Makefile.in generated by automake 1.15 from Makefile.am. # @configure_input@ # Copyright (C) 1994-2014 Free Software Foundation, Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY, to the extent permitted by law; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR A # PARTICULAR PURPOSE. @SET_MAKE@ VPATH = @srcdir@ am__is_gnu_make = { \ if test -z '$(MAKELEVEL)'; then \ false; \ elif test -n '$(MAKE_HOST)'; then \ true; \ elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ true; \ else \ false; \ fi; \ } am__make_running_with_option = \ case $${target_option-} in \ ?) 
;; \ *) echo "am__make_running_with_option: internal error: invalid" \ "target option '$${target_option-}' specified" >&2; \ exit 1;; \ esac; \ has_opt=no; \ sane_makeflags=$$MAKEFLAGS; \ if $(am__is_gnu_make); then \ sane_makeflags=$$MFLAGS; \ else \ case $$MAKEFLAGS in \ *\\[\ \ ]*) \ bs=\\; \ sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ esac; \ fi; \ skip_next=no; \ strip_trailopt () \ { \ flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ }; \ for flg in $$sane_makeflags; do \ test $$skip_next = yes && { skip_next=no; continue; }; \ case $$flg in \ *=*|--*) continue;; \ -*I) strip_trailopt 'I'; skip_next=yes;; \ -*I?*) strip_trailopt 'I';; \ -*O) strip_trailopt 'O'; skip_next=yes;; \ -*O?*) strip_trailopt 'O';; \ -*l) strip_trailopt 'l'; skip_next=yes;; \ -*l?*) strip_trailopt 'l';; \ -[dEDm]) skip_next=yes;; \ -[JT]) skip_next=yes;; \ esac; \ case $$flg in \ *$$target_option*) has_opt=yes; break;; \ esac; \ done; \ test $$has_opt = yes am__make_dryrun = (target_option=n; $(am__make_running_with_option)) am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) pkgdatadir = $(datadir)/@PACKAGE@ pkgincludedir = $(includedir)/@PACKAGE@ pkglibdir = $(libdir)/@PACKAGE@ pkglibexecdir = $(libexecdir)/@PACKAGE@ am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd install_sh_DATA = $(install_sh) -c -m 644 install_sh_PROGRAM = $(install_sh) -c install_sh_SCRIPT = $(install_sh) -c INSTALL_HEADER = $(INSTALL_DATA) transform = $(program_transform_name) NORMAL_INSTALL = : PRE_INSTALL = : POST_INSTALL = : NORMAL_UNINSTALL = : PRE_UNINSTALL = : POST_UNINSTALL = : build_triplet = @build@ host_triplet = @host@ @HAVE_DETAILED_TIMINGS_TRUE@am__append_1 = \ @HAVE_DETAILED_TIMINGS_TRUE@ src/timer.F90 \ @HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/ftimings.F90 \ @HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/ftimings_type.F90 \ @HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/ftimings_value.F90 \ @HAVE_DETAILED_TIMINGS_TRUE@ 
src/ftimings/highwater_mark.c \ @HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/resident_set_size.c \ @HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/time.c \ @HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/virtual_memory.c \ @HAVE_DETAILED_TIMINGS_TRUE@ src/ftimings/papi.c @WITH_MPI_FALSE@am__append_2 = src/mod_time_c.F90 @HAVE_DETAILED_TIMINGS_FALSE@@WITH_MPI_FALSE@am__append_3 = src/ftimings/time.c @WITH_REAL_GENERIC_KERNEL_TRUE@am__append_4 = src/elpa2_kernels/elpa2_kernels_real.F90 @WITH_COMPLEX_GENERIC_KERNEL_TRUE@am__append_5 = src/elpa2_kernels/elpa2_kernels_complex.F90 @WITH_REAL_GENERIC_SIMPLE_KERNEL_TRUE@am__append_6 = src/elpa2_kernels/elpa2_kernels_real_simple.F90 @WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_TRUE@am__append_7 = src/elpa2_kernels/elpa2_kernels_complex_simple.F90 @WITH_REAL_BGP_KERNEL_TRUE@am__append_8 = src/elpa2_kernels/elpa2_kernels_real_bgp.f90 @WITH_REAL_BGQ_KERNEL_TRUE@am__append_9 = src/elpa2_kernels/elpa2_kernels_real_bgq.f90 @WITH_REAL_SSE_ASSEMBLY_KERNEL_TRUE@am__append_10 = src/elpa2_kernels/elpa2_kernels_asm_x86_64.s @WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_TRUE@@WITH_REAL_SSE_ASSEMBLY_KERNEL_FALSE@am__append_11 = src/elpa2_kernels/elpa2_kernels_asm_x86_64.s @WITH_REAL_SSE_BLOCK2_KERNEL_TRUE@am__append_12 = src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c @WITH_REAL_AVX_BLOCK2_KERNEL_TRUE@am__append_13 = src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c @WITH_REAL_SSE_BLOCK4_KERNEL_TRUE@am__append_14 = src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c @WITH_REAL_AVX_BLOCK4_KERNEL_TRUE@am__append_15 = src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c @WITH_REAL_SSE_BLOCK6_KERNEL_TRUE@am__append_16 = src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c @WITH_REAL_AVX_BLOCK6_KERNEL_TRUE@am__append_17 = src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c @WITH_COMPLEX_SSE_BLOCK1_KERNEL_TRUE@am__append_18 = src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.c @WITH_COMPLEX_AVX_BLOCK1_KERNEL_TRUE@am__append_19 = src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.c 
# Kernel sources contributed to the library source list; each line is
# guarded by the configure conditional named in its @..._TRUE@ prefix.
@WITH_COMPLEX_SSE_BLOCK2_KERNEL_TRUE@am__append_20 = src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.c
@WITH_COMPLEX_AVX_BLOCK2_KERNEL_TRUE@am__append_21 = src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.c

# Program listed under the bin_ prefix.
bin_PROGRAMS = elpa2_print_kernels@SUFFIX@$(EXEEXT)

# Programs listed under the noinst_ prefix; $(am__EXEEXT_1) carries the
# C-version tests.
noinst_PROGRAMS = \
  elpa1_test_real@SUFFIX@$(EXEEXT) \
  elpa1_test_complex@SUFFIX@$(EXEEXT) \
  elpa2_test_real@SUFFIX@$(EXEEXT) \
  elpa2_test_complex@SUFFIX@$(EXEEXT) \
  elpa2_test_real_default_kernel@SUFFIX@$(EXEEXT) \
  elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@$(EXEEXT) \
  elpa2_test_complex_default_kernel@SUFFIX@$(EXEEXT) \
  elpa2_test_real_choose_kernel_with_api@SUFFIX@$(EXEEXT) \
  elpa2_test_complex_choose_kernel_with_api@SUFFIX@$(EXEEXT) \
  elpa1_test_real_with_c@SUFFIX@$(EXEEXT) \
  $(am__EXEEXT_1)

# C-version test programs and their wrapper scripts, guarded by
# @WITH_OPENMP_FALSE@.
@WITH_OPENMP_FALSE@am__append_22 = \
@WITH_OPENMP_FALSE@  elpa1_test_real_c_version@SUFFIX@ \
@WITH_OPENMP_FALSE@  elpa1_test_complex_c_version@SUFFIX@ \
@WITH_OPENMP_FALSE@  elpa2_test_real_c_version@SUFFIX@ \
@WITH_OPENMP_FALSE@  elpa2_test_complex_c_version@SUFFIX@
@WITH_OPENMP_FALSE@am__append_23 = \
@WITH_OPENMP_FALSE@  elpa1_test_real_c_version@SUFFIX@.sh \
@WITH_OPENMP_FALSE@  elpa1_test_complex_c_version@SUFFIX@.sh \
@WITH_OPENMP_FALSE@  elpa2_test_real_c_version@SUFFIX@.sh \
@WITH_OPENMP_FALSE@  elpa2_test_complex_c_version@SUFFIX@.sh

TESTS = $(am__EXEEXT_2)
subdir = .
# ---------------------------------------------------------------------
# Files the configure machinery depends on.
# ---------------------------------------------------------------------
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = \
  $(top_srcdir)/m4/ax_check_gnu_make.m4 \
  $(top_srcdir)/m4/ax_prog_cc_mpi.m4 \
  $(top_srcdir)/m4/ax_prog_doxygen.m4 \
  $(top_srcdir)/m4/libtool.m4 \
  $(top_srcdir)/m4/ltoptions.m4 \
  $(top_srcdir)/m4/ltsugar.m4 \
  $(top_srcdir)/m4/ltversion.m4 \
  $(top_srcdir)/m4/lt~obsolete.m4 \
  $(top_srcdir)/fdep/fortran_dependencies.m4 \
  $(top_srcdir)/m4/ax_elpa_openmp.m4 \
  $(top_srcdir)/m4/ax_prog_fc_mpi.m4 \
  $(top_srcdir)/m4/ax_elpa_specific_kernels.m4 \
  $(top_srcdir)/configure.ac
am__configure_deps = \
  $(am__aclocal_m4_deps) \
  $(CONFIGURE_DEPENDENCIES) \
  $(ACLOCAL_M4)
DIST_COMMON = \
  $(srcdir)/Makefile.am \
  $(top_srcdir)/configure \
  $(am__configure_deps) \
  $(dist_doc_DATA) \
  $(dist_files_DATA) \
  $(nobase_elpa_include_HEADERS) \
  $(am__DIST_COMMON)
am__CONFIG_DISTCLEAN_FILES = \
  config.status \
  config.cache \
  config.log \
  configure.lineno \
  config.status.lineno
mkinstalldirs = $(install_sh) -d
CONFIG_HEADER = config.h
CONFIG_CLEAN_FILES = Doxyfile ${PKG_CONFIG_FILE}
CONFIG_CLEAN_VPATH_FILES =

# ---------------------------------------------------------------------
# Shell fragments for path handling. They operate on the shell
# variables $p, $f, $list, $files and $dir set by the expanding recipe.
# ---------------------------------------------------------------------
am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
am__vpath_adj = case $$p in \
    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
    *) f=$$p;; \
  esac;
# Sets $f to $p with everything up to the last slash removed.
am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
# Batch size used by am__nobase_list below.
am__install_max = 40
am__nobase_strip_setup = \
  srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
am__nobase_strip = \
  for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
# Groups the entries of $list by directory, printing one
# "dir file..." record per batch of at most $(am__install_max) files.
am__nobase_list = $(am__nobase_strip_setup); \
  for p in $$list; do echo "$$p $$p"; done | \
  sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
  $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
    if (++n[$$2] == $(am__install_max)) \
      { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
    END { for (dir in files) print dir, files[dir] }'
# Joins input lines into groups (8 per line, then 5 of those per line).
am__base_list = \
  sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
  sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
# Removes $files from $dir; does nothing when $files is empty or when
# $dir is neither a directory, a file, nor readable.
am__uninstall_files_from_dir = { \
  test -z "$$files" \
    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
         $(am__cd) "$$dir" && rm -f $$files; }; \
  }
am__installdirs = \
  "$(DESTDIR)$(libdir)" \
  "$(DESTDIR)$(bindir)" \
  "$(DESTDIR)$(man1dir)" \
  "$(DESTDIR)$(man3dir)" \
  "$(DESTDIR)$(docdir)" \
  "$(DESTDIR)$(filesdir)" \
  "$(DESTDIR)$(pkgconfigdir)" \
  "$(DESTDIR)$(elpa_includedir)"

# ---------------------------------------------------------------------
# libelpa: full source list and the matching libtool objects.
# ---------------------------------------------------------------------
LTLIBRARIES = $(lib_LTLIBRARIES)
libelpa@SUFFIX@_la_LIBADD =
am__libelpa@SUFFIX@_la_SOURCES_DIST = \
  src/mod_precision.f90 \
  src/mod_mpi.F90 \
  src/mod_mpi_stubs.F90 \
  src/elpa2_kernels/mod_fortran_interfaces.F90 \
  src/elpa_utilities.F90 \
  src/elpa1_compute.F90 \
  src/elpa1.F90 \
  src/elpa2_utilities.F90 \
  src/mod_pack_unpack_real.F90 \
  src/elpa2_kernels/mod_single_hh_trafo_real.F90 \
  src/mod_compute_hh_trafo_real.F90 \
  src/mod_compute_hh_trafo_complex.F90 \
  src/mod_pack_unpack_complex.F90 \
  src/aligned_mem.F90 \
  src/elpa2_compute.F90 \
  src/elpa2.F90 \
  src/elpa_c_interface.F90 \
  src/elpa_qr/qr_utils.F90 \
  src/elpa_qr/elpa_qrkernels.f90 \
  src/elpa_qr/elpa_pdlarfb.F90 \
  src/elpa_qr/elpa_pdgeqrf.F90 \
  src/timer.F90 \
  src/ftimings/ftimings.F90 \
  src/ftimings/ftimings_type.F90 \
  src/ftimings/ftimings_value.F90 \
  src/ftimings/highwater_mark.c \
  src/ftimings/resident_set_size.c \
  src/ftimings/time.c \
  src/ftimings/virtual_memory.c \
  src/ftimings/papi.c \
  src/mod_time_c.F90 \
  src/elpa2_kernels/elpa2_kernels_real.F90 \
  src/elpa2_kernels/elpa2_kernels_complex.F90 \
  src/elpa2_kernels/elpa2_kernels_real_simple.F90 \
  src/elpa2_kernels/elpa2_kernels_complex_simple.F90 \
  src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \
  src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \
  src/elpa2_kernels/elpa2_kernels_asm_x86_64.s \
  src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c \
  src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c \
  src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c \
  src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c \
  src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c \
  src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c \
  src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.c \
  src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.c \
  src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.c \
  src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.c
am__dirstamp = $(am__leading_dot)dirstamp

# Conditional object groups, one per conditional source group.
@HAVE_DETAILED_TIMINGS_TRUE@am__objects_1 = \
@HAVE_DETAILED_TIMINGS_TRUE@  src/timer.lo \
@HAVE_DETAILED_TIMINGS_TRUE@  src/ftimings/ftimings.lo \
@HAVE_DETAILED_TIMINGS_TRUE@  src/ftimings/ftimings_type.lo \
@HAVE_DETAILED_TIMINGS_TRUE@  src/ftimings/ftimings_value.lo \
@HAVE_DETAILED_TIMINGS_TRUE@  src/ftimings/highwater_mark.lo \
@HAVE_DETAILED_TIMINGS_TRUE@  src/ftimings/resident_set_size.lo \
@HAVE_DETAILED_TIMINGS_TRUE@  src/ftimings/time.lo \
@HAVE_DETAILED_TIMINGS_TRUE@  src/ftimings/virtual_memory.lo \
@HAVE_DETAILED_TIMINGS_TRUE@  src/ftimings/papi.lo
@WITH_MPI_FALSE@am__objects_2 = src/mod_time_c.lo
@HAVE_DETAILED_TIMINGS_FALSE@@WITH_MPI_FALSE@am__objects_3 = src/ftimings/time.lo
@WITH_REAL_GENERIC_KERNEL_TRUE@am__objects_4 = src/elpa2_kernels/elpa2_kernels_real.lo
@WITH_COMPLEX_GENERIC_KERNEL_TRUE@am__objects_5 = src/elpa2_kernels/elpa2_kernels_complex.lo
@WITH_REAL_GENERIC_SIMPLE_KERNEL_TRUE@am__objects_6 = src/elpa2_kernels/elpa2_kernels_real_simple.lo
@WITH_COMPLEX_GENERIC_SIMPLE_KERNEL_TRUE@am__objects_7 = src/elpa2_kernels/elpa2_kernels_complex_simple.lo
@WITH_REAL_BGP_KERNEL_TRUE@am__objects_8 = src/elpa2_kernels/elpa2_kernels_real_bgp.lo
@WITH_REAL_BGQ_KERNEL_TRUE@am__objects_9 = src/elpa2_kernels/elpa2_kernels_real_bgq.lo
@WITH_REAL_SSE_ASSEMBLY_KERNEL_TRUE@am__objects_10 = src/elpa2_kernels/elpa2_kernels_asm_x86_64.lo
@WITH_COMPLEX_SSE_ASSEMBLY_KERNEL_TRUE@@WITH_REAL_SSE_ASSEMBLY_KERNEL_FALSE@am__objects_11 = src/elpa2_kernels/elpa2_kernels_asm_x86_64.lo
@WITH_REAL_SSE_BLOCK2_KERNEL_TRUE@am__objects_12 = src/elpa2_kernels/elpa2_kernels_real_sse_2hv.lo
@WITH_REAL_AVX_BLOCK2_KERNEL_TRUE@am__objects_13 = src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.lo
@WITH_REAL_SSE_BLOCK4_KERNEL_TRUE@am__objects_14 = src/elpa2_kernels/elpa2_kernels_real_sse_4hv.lo
@WITH_REAL_AVX_BLOCK4_KERNEL_TRUE@am__objects_15 = src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.lo
@WITH_REAL_SSE_BLOCK6_KERNEL_TRUE@am__objects_16 = src/elpa2_kernels/elpa2_kernels_real_sse_6hv.lo
@WITH_REAL_AVX_BLOCK6_KERNEL_TRUE@am__objects_17 = src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.lo
@WITH_COMPLEX_SSE_BLOCK1_KERNEL_TRUE@am__objects_18 = src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.lo
@WITH_COMPLEX_AVX_BLOCK1_KERNEL_TRUE@am__objects_19 = src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.lo
@WITH_COMPLEX_SSE_BLOCK2_KERNEL_TRUE@am__objects_20 = src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.lo
@WITH_COMPLEX_AVX_BLOCK2_KERNEL_TRUE@am__objects_21 = src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.lo

# Unconditional library objects followed by every conditional group.
am_libelpa@SUFFIX@_la_OBJECTS = \
  src/mod_precision.lo \
  src/mod_mpi.lo \
  src/mod_mpi_stubs.lo \
  src/elpa2_kernels/mod_fortran_interfaces.lo \
  src/elpa_utilities.lo \
  src/elpa1_compute.lo \
  src/elpa1.lo \
  src/elpa2_utilities.lo \
  src/mod_pack_unpack_real.lo \
  src/elpa2_kernels/mod_single_hh_trafo_real.lo \
  src/mod_compute_hh_trafo_real.lo \
  src/mod_compute_hh_trafo_complex.lo \
  src/mod_pack_unpack_complex.lo \
  src/aligned_mem.lo \
  src/elpa2_compute.lo \
  src/elpa2.lo \
  src/elpa_c_interface.lo \
  src/elpa_qr/qr_utils.lo \
  src/elpa_qr/elpa_qrkernels.lo \
  src/elpa_qr/elpa_pdlarfb.lo \
  src/elpa_qr/elpa_pdgeqrf.lo \
  $(am__objects_1) $(am__objects_2) $(am__objects_3) \
  $(am__objects_4) $(am__objects_5) $(am__objects_6) \
  $(am__objects_7) $(am__objects_8) $(am__objects_9) \
  $(am__objects_10) $(am__objects_11) $(am__objects_12) \
  $(am__objects_13) $(am__objects_14) $(am__objects_15) \
  $(am__objects_16) $(am__objects_17) $(am__objects_18) \
  $(am__objects_19) $(am__objects_20) $(am__objects_21)
libelpa@SUFFIX@_la_OBJECTS = $(am_libelpa@SUFFIX@_la_OBJECTS)

# ---------------------------------------------------------------------
# Programs.
# ---------------------------------------------------------------------
@WITH_OPENMP_FALSE@am__EXEEXT_1 = \
@WITH_OPENMP_FALSE@  elpa1_test_real_c_version@SUFFIX@$(EXEEXT) \
@WITH_OPENMP_FALSE@  elpa1_test_complex_c_version@SUFFIX@$(EXEEXT) \
@WITH_OPENMP_FALSE@  elpa2_test_real_c_version@SUFFIX@$(EXEEXT) \
@WITH_OPENMP_FALSE@  elpa2_test_complex_c_version@SUFFIX@$(EXEEXT)
PROGRAMS = $(bin_PROGRAMS) $(noinst_PROGRAMS)

am__elpa1_test_complex@SUFFIX@_SOURCES_DIST = \
  test/fortran_test_programs/test_complex.F90 \
  test/shared_sources/util.F90 \
  test/shared_sources/read_input_parameters.F90 \
  test/shared_sources/check_correctnes.F90 \
  test/shared_sources/setup_mpi.F90 \
  test/shared_sources/blacs_infrastructure.F90 \
  test/shared_sources/prepare_matrix.F90 \
  test/shared_sources/mod_output_types.F90 \
  test/shared_sources/redir.c \
  test/shared_sources/redirect.F90

# Objects of the shared test sources, reused by every test program.
am__objects_22 = \
  test/shared_sources/util.$(OBJEXT) \
  test/shared_sources/read_input_parameters.$(OBJEXT) \
  test/shared_sources/check_correctnes.$(OBJEXT) \
  test/shared_sources/setup_mpi.$(OBJEXT) \
  test/shared_sources/blacs_infrastructure.$(OBJEXT) \
  test/shared_sources/prepare_matrix.$(OBJEXT) \
  test/shared_sources/mod_output_types.$(OBJEXT)
@HAVE_REDIRECT_TRUE@am__objects_23 = \
@HAVE_REDIRECT_TRUE@  test/shared_sources/redir.$(OBJEXT) \
@HAVE_REDIRECT_TRUE@  test/shared_sources/redirect.$(OBJEXT)

am_elpa1_test_complex@SUFFIX@_OBJECTS = \
  test/fortran_test_programs/test_complex.$(OBJEXT) \
  $(am__objects_22) \
  $(am__objects_23)
elpa1_test_complex@SUFFIX@_OBJECTS = $(am_elpa1_test_complex@SUFFIX@_OBJECTS)
elpa1_test_complex@SUFFIX@_DEPENDENCIES = $(build_lib)

# libtool verbosity switch: level 0 passes --silent, level 1 nothing.
AM_V_lt = $(am__v_lt_@AM_V@)
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
am__v_lt_0 = --silent
am__v_lt_1 =

am__elpa1_test_complex_c_version@SUFFIX@_SOURCES_DIST = \
  test/c_test_programs/elpa1_test_complex_c_version.c \
  test/shared_sources/util.F90 \
  test/shared_sources/read_input_parameters.F90 \
  test/shared_sources/check_correctnes.F90 \
  test/shared_sources/setup_mpi.F90 \
  test/shared_sources/blacs_infrastructure.F90 \
  test/shared_sources/prepare_matrix.F90 \
  test/shared_sources/mod_output_types.F90 \
  test/shared_sources/redir.c \
  test/shared_sources/redirect.F90
# --- elpa1_test_complex_c_version ------------------------------------
@WITH_OPENMP_FALSE@am_elpa1_test_complex_c_version@SUFFIX@_OBJECTS = \
@WITH_OPENMP_FALSE@  test/c_test_programs/elpa1_test_complex_c_version.$(OBJEXT) \
@WITH_OPENMP_FALSE@  $(am__objects_22) \
@WITH_OPENMP_FALSE@  $(am__objects_23)
elpa1_test_complex_c_version@SUFFIX@_OBJECTS = $(am_elpa1_test_complex_c_version@SUFFIX@_OBJECTS)
@WITH_OPENMP_FALSE@elpa1_test_complex_c_version@SUFFIX@_DEPENDENCIES = $(build_lib)

# --- elpa1_test_real -------------------------------------------------
am__elpa1_test_real@SUFFIX@_SOURCES_DIST = \
  test/fortran_test_programs/test_real.F90 \
  test/shared_sources/util.F90 \
  test/shared_sources/read_input_parameters.F90 \
  test/shared_sources/check_correctnes.F90 \
  test/shared_sources/setup_mpi.F90 \
  test/shared_sources/blacs_infrastructure.F90 \
  test/shared_sources/prepare_matrix.F90 \
  test/shared_sources/mod_output_types.F90 \
  test/shared_sources/redir.c \
  test/shared_sources/redirect.F90
am_elpa1_test_real@SUFFIX@_OBJECTS = \
  test/fortran_test_programs/test_real.$(OBJEXT) \
  $(am__objects_22) \
  $(am__objects_23)
elpa1_test_real@SUFFIX@_OBJECTS = $(am_elpa1_test_real@SUFFIX@_OBJECTS)
elpa1_test_real@SUFFIX@_DEPENDENCIES = $(build_lib)

# --- elpa1_test_real_c_version ---------------------------------------
am__elpa1_test_real_c_version@SUFFIX@_SOURCES_DIST = \
  test/c_test_programs/elpa1_test_real_c_version.c \
  test/shared_sources/util.F90 \
  test/shared_sources/read_input_parameters.F90 \
  test/shared_sources/check_correctnes.F90 \
  test/shared_sources/setup_mpi.F90 \
  test/shared_sources/blacs_infrastructure.F90 \
  test/shared_sources/prepare_matrix.F90 \
  test/shared_sources/mod_output_types.F90 \
  test/shared_sources/redir.c \
  test/shared_sources/redirect.F90
@WITH_OPENMP_FALSE@am_elpa1_test_real_c_version@SUFFIX@_OBJECTS = \
@WITH_OPENMP_FALSE@  test/c_test_programs/elpa1_test_real_c_version.$(OBJEXT) \
@WITH_OPENMP_FALSE@  $(am__objects_22) \
@WITH_OPENMP_FALSE@  $(am__objects_23)
elpa1_test_real_c_version@SUFFIX@_OBJECTS = $(am_elpa1_test_real_c_version@SUFFIX@_OBJECTS)
@WITH_OPENMP_FALSE@elpa1_test_real_c_version@SUFFIX@_DEPENDENCIES = $(build_lib)

# --- elpa1_test_real_with_c ------------------------------------------
am__elpa1_test_real_with_c@SUFFIX@_SOURCES_DIST = \
  test/fortran_test_programs/test_real_with_c.F90 \
  test/shared_sources/mod_from_c.F90 \
  test/shared_sources/call_elpa1.c \
  test/shared_sources/util.F90 \
  test/shared_sources/read_input_parameters.F90 \
  test/shared_sources/check_correctnes.F90 \
  test/shared_sources/setup_mpi.F90 \
  test/shared_sources/blacs_infrastructure.F90 \
  test/shared_sources/prepare_matrix.F90 \
  test/shared_sources/mod_output_types.F90 \
  test/shared_sources/redir.c \
  test/shared_sources/redirect.F90
am_elpa1_test_real_with_c@SUFFIX@_OBJECTS = \
  test/fortran_test_programs/test_real_with_c.$(OBJEXT) \
  test/shared_sources/mod_from_c.$(OBJEXT) \
  test/shared_sources/call_elpa1.$(OBJEXT) \
  $(am__objects_22) \
  $(am__objects_23)
elpa1_test_real_with_c@SUFFIX@_OBJECTS = $(am_elpa1_test_real_with_c@SUFFIX@_OBJECTS)
elpa1_test_real_with_c@SUFFIX@_DEPENDENCIES = $(build_lib)

# --- elpa2_print_kernels ---------------------------------------------
am__elpa2_print_kernels@SUFFIX@_SOURCES_DIST = \
  src/elpa2_print_kernels.F90 \
  test/shared_sources/util.F90 \
  test/shared_sources/read_input_parameters.F90 \
  test/shared_sources/check_correctnes.F90 \
  test/shared_sources/setup_mpi.F90 \
  test/shared_sources/blacs_infrastructure.F90 \
  test/shared_sources/prepare_matrix.F90 \
  test/shared_sources/mod_output_types.F90 \
  test/shared_sources/redir.c \
  test/shared_sources/redirect.F90
am_elpa2_print_kernels@SUFFIX@_OBJECTS = \
  src/elpa2_print_kernels.$(OBJEXT) \
  $(am__objects_22) \
  $(am__objects_23)
elpa2_print_kernels@SUFFIX@_OBJECTS = $(am_elpa2_print_kernels@SUFFIX@_OBJECTS)
elpa2_print_kernels@SUFFIX@_DEPENDENCIES = $(build_lib)

# --- elpa2_test_complex (source list) --------------------------------
am__elpa2_test_complex@SUFFIX@_SOURCES_DIST = \
  test/fortran_test_programs/test_complex2.F90 \
  test/shared_sources/util.F90 \
  test/shared_sources/read_input_parameters.F90 \
  test/shared_sources/check_correctnes.F90 \
  test/shared_sources/setup_mpi.F90 \
  test/shared_sources/blacs_infrastructure.F90 \
  test/shared_sources/prepare_matrix.F90 \
  test/shared_sources/mod_output_types.F90 \
  test/shared_sources/redir.c \
  test/shared_sources/redirect.F90
# --- elpa2_test_complex (objects) ------------------------------------
am_elpa2_test_complex@SUFFIX@_OBJECTS = \
  test/fortran_test_programs/test_complex2.$(OBJEXT) \
  $(am__objects_22) \
  $(am__objects_23)
elpa2_test_complex@SUFFIX@_OBJECTS = $(am_elpa2_test_complex@SUFFIX@_OBJECTS)
elpa2_test_complex@SUFFIX@_DEPENDENCIES = $(build_lib)

# --- elpa2_test_complex_c_version ------------------------------------
am__elpa2_test_complex_c_version@SUFFIX@_SOURCES_DIST = \
  test/c_test_programs/elpa2_test_complex_c_version.c \
  test/shared_sources/util.F90 \
  test/shared_sources/read_input_parameters.F90 \
  test/shared_sources/check_correctnes.F90 \
  test/shared_sources/setup_mpi.F90 \
  test/shared_sources/blacs_infrastructure.F90 \
  test/shared_sources/prepare_matrix.F90 \
  test/shared_sources/mod_output_types.F90 \
  test/shared_sources/redir.c \
  test/shared_sources/redirect.F90
@WITH_OPENMP_FALSE@am_elpa2_test_complex_c_version@SUFFIX@_OBJECTS = \
@WITH_OPENMP_FALSE@  test/c_test_programs/elpa2_test_complex_c_version.$(OBJEXT) \
@WITH_OPENMP_FALSE@  $(am__objects_22) \
@WITH_OPENMP_FALSE@  $(am__objects_23)
elpa2_test_complex_c_version@SUFFIX@_OBJECTS = $(am_elpa2_test_complex_c_version@SUFFIX@_OBJECTS)
@WITH_OPENMP_FALSE@elpa2_test_complex_c_version@SUFFIX@_DEPENDENCIES = $(build_lib)

# --- elpa2_test_complex_choose_kernel_with_api -----------------------
am__elpa2_test_complex_choose_kernel_with_api@SUFFIX@_SOURCES_DIST = \
  test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 \
  test/shared_sources/util.F90 \
  test/shared_sources/read_input_parameters.F90 \
  test/shared_sources/check_correctnes.F90 \
  test/shared_sources/setup_mpi.F90 \
  test/shared_sources/blacs_infrastructure.F90 \
  test/shared_sources/prepare_matrix.F90 \
  test/shared_sources/mod_output_types.F90 \
  test/shared_sources/redir.c \
  test/shared_sources/redirect.F90
am_elpa2_test_complex_choose_kernel_with_api@SUFFIX@_OBJECTS = \
  test/fortran_test_programs/test_complex2_choose_kernel_with_api.$(OBJEXT) \
  $(am__objects_22) \
  $(am__objects_23)
elpa2_test_complex_choose_kernel_with_api@SUFFIX@_OBJECTS = $(am_elpa2_test_complex_choose_kernel_with_api@SUFFIX@_OBJECTS)
elpa2_test_complex_choose_kernel_with_api@SUFFIX@_DEPENDENCIES = $(build_lib)
# --- elpa2_test_complex_default_kernel -------------------------------
am__elpa2_test_complex_default_kernel@SUFFIX@_SOURCES_DIST = \
  test/fortran_test_programs/test_complex2_default_kernel.F90 \
  test/shared_sources/util.F90 \
  test/shared_sources/read_input_parameters.F90 \
  test/shared_sources/check_correctnes.F90 \
  test/shared_sources/setup_mpi.F90 \
  test/shared_sources/blacs_infrastructure.F90 \
  test/shared_sources/prepare_matrix.F90 \
  test/shared_sources/mod_output_types.F90 \
  test/shared_sources/redir.c \
  test/shared_sources/redirect.F90
am_elpa2_test_complex_default_kernel@SUFFIX@_OBJECTS = \
  test/fortran_test_programs/test_complex2_default_kernel.$(OBJEXT) \
  $(am__objects_22) \
  $(am__objects_23)
elpa2_test_complex_default_kernel@SUFFIX@_OBJECTS = $(am_elpa2_test_complex_default_kernel@SUFFIX@_OBJECTS)
elpa2_test_complex_default_kernel@SUFFIX@_DEPENDENCIES = $(build_lib)

# --- elpa2_test_real -------------------------------------------------
am__elpa2_test_real@SUFFIX@_SOURCES_DIST = \
  test/fortran_test_programs/test_real2.F90 \
  test/shared_sources/util.F90 \
  test/shared_sources/read_input_parameters.F90 \
  test/shared_sources/check_correctnes.F90 \
  test/shared_sources/setup_mpi.F90 \
  test/shared_sources/blacs_infrastructure.F90 \
  test/shared_sources/prepare_matrix.F90 \
  test/shared_sources/mod_output_types.F90 \
  test/shared_sources/redir.c \
  test/shared_sources/redirect.F90
am_elpa2_test_real@SUFFIX@_OBJECTS = \
  test/fortran_test_programs/test_real2.$(OBJEXT) \
  $(am__objects_22) \
  $(am__objects_23)
elpa2_test_real@SUFFIX@_OBJECTS = $(am_elpa2_test_real@SUFFIX@_OBJECTS)
elpa2_test_real@SUFFIX@_DEPENDENCIES = $(build_lib)

# --- elpa2_test_real_c_version (source list) -------------------------
am__elpa2_test_real_c_version@SUFFIX@_SOURCES_DIST = \
  test/c_test_programs/elpa2_test_real_c_version.c \
  test/shared_sources/util.F90 \
  test/shared_sources/read_input_parameters.F90 \
  test/shared_sources/check_correctnes.F90 \
  test/shared_sources/setup_mpi.F90 \
  test/shared_sources/blacs_infrastructure.F90 \
  test/shared_sources/prepare_matrix.F90 \
  test/shared_sources/mod_output_types.F90 \
  test/shared_sources/redir.c \
  test/shared_sources/redirect.F90
# --- elpa2_test_real_c_version (objects) -----------------------------
@WITH_OPENMP_FALSE@am_elpa2_test_real_c_version@SUFFIX@_OBJECTS = \
@WITH_OPENMP_FALSE@  test/c_test_programs/elpa2_test_real_c_version.$(OBJEXT) \
@WITH_OPENMP_FALSE@  $(am__objects_22) \
@WITH_OPENMP_FALSE@  $(am__objects_23)
elpa2_test_real_c_version@SUFFIX@_OBJECTS = $(am_elpa2_test_real_c_version@SUFFIX@_OBJECTS)
@WITH_OPENMP_FALSE@elpa2_test_real_c_version@SUFFIX@_DEPENDENCIES = $(build_lib)

# --- elpa2_test_real_choose_kernel_with_api --------------------------
am__elpa2_test_real_choose_kernel_with_api@SUFFIX@_SOURCES_DIST = \
  test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 \
  test/shared_sources/util.F90 \
  test/shared_sources/read_input_parameters.F90 \
  test/shared_sources/check_correctnes.F90 \
  test/shared_sources/setup_mpi.F90 \
  test/shared_sources/blacs_infrastructure.F90 \
  test/shared_sources/prepare_matrix.F90 \
  test/shared_sources/mod_output_types.F90 \
  test/shared_sources/redir.c \
  test/shared_sources/redirect.F90
am_elpa2_test_real_choose_kernel_with_api@SUFFIX@_OBJECTS = \
  test/fortran_test_programs/test_real2_choose_kernel_with_api.$(OBJEXT) \
  $(am__objects_22) \
  $(am__objects_23)
elpa2_test_real_choose_kernel_with_api@SUFFIX@_OBJECTS = $(am_elpa2_test_real_choose_kernel_with_api@SUFFIX@_OBJECTS)
elpa2_test_real_choose_kernel_with_api@SUFFIX@_DEPENDENCIES = $(build_lib)

# --- elpa2_test_real_default_kernel ----------------------------------
am__elpa2_test_real_default_kernel@SUFFIX@_SOURCES_DIST = \
  test/fortran_test_programs/test_real2_default_kernel.F90 \
  test/shared_sources/util.F90 \
  test/shared_sources/read_input_parameters.F90 \
  test/shared_sources/check_correctnes.F90 \
  test/shared_sources/setup_mpi.F90 \
  test/shared_sources/blacs_infrastructure.F90 \
  test/shared_sources/prepare_matrix.F90 \
  test/shared_sources/mod_output_types.F90 \
  test/shared_sources/redir.c \
  test/shared_sources/redirect.F90
am_elpa2_test_real_default_kernel@SUFFIX@_OBJECTS = \
  test/fortran_test_programs/test_real2_default_kernel.$(OBJEXT) \
  $(am__objects_22) \
  $(am__objects_23)
elpa2_test_real_default_kernel@SUFFIX@_OBJECTS = $(am_elpa2_test_real_default_kernel@SUFFIX@_OBJECTS)
elpa2_test_real_default_kernel@SUFFIX@_DEPENDENCIES = $(build_lib)

# --- elpa2_test_real_default_kernel_qr_decomposition -----------------
am__elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_SOURCES_DIST = \
  test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 \
  test/shared_sources/util.F90 \
  test/shared_sources/read_input_parameters.F90 \
  test/shared_sources/check_correctnes.F90 \
  test/shared_sources/setup_mpi.F90 \
  test/shared_sources/blacs_infrastructure.F90 \
  test/shared_sources/prepare_matrix.F90 \
  test/shared_sources/mod_output_types.F90 \
  test/shared_sources/redir.c \
  test/shared_sources/redirect.F90
am_elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_OBJECTS = \
  test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.$(OBJEXT) \
  $(am__objects_22) \
  $(am__objects_23)
elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_OBJECTS = $(am_elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_OBJECTS)
elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_DEPENDENCIES = $(build_lib)

# ---------------------------------------------------------------------
# Verbosity switches. Each AM_V_x selects am__v_x_<level>; level 0
# echoes a short tag, level 1 expands to nothing so the full command
# line is shown.
# ---------------------------------------------------------------------
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
am__v_P_0 = false
am__v_P_1 = :
AM_V_GEN = $(am__v_GEN_@AM_V@)
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
am__v_GEN_0 = @echo " GEN " $@;
am__v_GEN_1 =
AM_V_at = $(am__v_at_@AM_V@)
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
am__v_at_0 = @
am__v_at_1 =

DEFAULT_INCLUDES = -I.@am__isrc@
depcomp = $(SHELL) $(top_srcdir)/depcomp
am__depfiles_maybe = depfiles
am__mv = mv -f

# ---------------------------------------------------------------------
# Compile and link command lines.
# ---------------------------------------------------------------------
# Preprocessed Fortran.
PPFCCOMPILE = \
  $(FC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
  $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_FCFLAGS) $(FCFLAGS)
LTPPFCCOMPILE = \
  $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \
  $(LIBTOOLFLAGS) --mode=compile $(FC) $(DEFS) \
  $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
  $(AM_FCFLAGS) $(FCFLAGS)
AM_V_PPFC = $(am__v_PPFC_@AM_V@)
am__v_PPFC_ = $(am__v_PPFC_@AM_DEFAULT_V@)
am__v_PPFC_0 = @echo " PPFC " $@;
am__v_PPFC_1 =

# Fortran link.
FCLD = $(FC)
FCLINK = \
  $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \
  $(LIBTOOLFLAGS) --mode=link $(FCLD) $(AM_FCFLAGS) $(FCFLAGS) \
  $(AM_LDFLAGS) $(LDFLAGS) -o $@
AM_V_FCLD = $(am__v_FCLD_@AM_V@)
am__v_FCLD_ = $(am__v_FCLD_@AM_DEFAULT_V@)
am__v_FCLD_0 = @echo " FCLD " $@;
am__v_FCLD_1 =

# C compile.
COMPILE = \
  $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
  $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
LTCOMPILE = \
  $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
  $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
  $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
  $(AM_CFLAGS) $(CFLAGS)
AM_V_CC = $(am__v_CC_@AM_V@)
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
am__v_CC_0 = @echo " CC " $@;
am__v_CC_1 =

# C link.
CCLD = $(CC)
LINK = \
  $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
  $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
  $(AM_LDFLAGS) $(LDFLAGS) -o $@
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
am__v_CCLD_0 = @echo " CCLD " $@;
am__v_CCLD_1 =

# Plain Fortran (no preprocessor flags).
FCCOMPILE = $(FC) $(AM_FCFLAGS) $(FCFLAGS)
LTFCCOMPILE = \
  $(LIBTOOL) $(AM_V_lt) --tag=FC $(AM_LIBTOOLFLAGS) \
  $(LIBTOOLFLAGS) --mode=compile $(FC) $(AM_FCFLAGS) $(FCFLAGS)
AM_V_FC = $(am__v_FC_@AM_V@)
am__v_FC_ = $(am__v_FC_@AM_DEFAULT_V@)
am__v_FC_0 = @echo " FC " $@;
am__v_FC_1 =

# Assembler.
CCASCOMPILE = $(CCAS) $(AM_CCASFLAGS) $(CCASFLAGS)
LTCCASCOMPILE = \
  $(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) \
  $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(AM_CCASFLAGS) \
  $(CCASFLAGS)
AM_V_CCAS = $(am__v_CCAS_@AM_V@)
am__v_CCAS_ = $(am__v_CCAS_@AM_DEFAULT_V@)
am__v_CCAS_0 = @echo " CCAS " $@;
am__v_CCAS_1 =

# ---------------------------------------------------------------------
# Aggregate source lists over the library and every program.
# ---------------------------------------------------------------------
SOURCES = \
  $(libelpa@SUFFIX@_la_SOURCES) \
  $(elpa1_test_complex@SUFFIX@_SOURCES) \
  $(elpa1_test_complex_c_version@SUFFIX@_SOURCES) \
  $(elpa1_test_real@SUFFIX@_SOURCES) \
  $(elpa1_test_real_c_version@SUFFIX@_SOURCES) \
  $(elpa1_test_real_with_c@SUFFIX@_SOURCES) \
  $(elpa2_print_kernels@SUFFIX@_SOURCES) \
  $(elpa2_test_complex@SUFFIX@_SOURCES) \
  $(elpa2_test_complex_c_version@SUFFIX@_SOURCES) \
  $(elpa2_test_complex_choose_kernel_with_api@SUFFIX@_SOURCES) \
  $(elpa2_test_complex_default_kernel@SUFFIX@_SOURCES) \
  $(elpa2_test_real@SUFFIX@_SOURCES) \
  $(elpa2_test_real_c_version@SUFFIX@_SOURCES) \
  $(elpa2_test_real_choose_kernel_with_api@SUFFIX@_SOURCES) \
  $(elpa2_test_real_default_kernel@SUFFIX@_SOURCES) \
  $(elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_SOURCES)
DIST_SOURCES = \
  $(am__libelpa@SUFFIX@_la_SOURCES_DIST) \
  $(am__elpa1_test_complex@SUFFIX@_SOURCES_DIST) \
  $(am__elpa1_test_complex_c_version@SUFFIX@_SOURCES_DIST) \
  $(am__elpa1_test_real@SUFFIX@_SOURCES_DIST) \
  $(am__elpa1_test_real_c_version@SUFFIX@_SOURCES_DIST) \
  $(am__elpa1_test_real_with_c@SUFFIX@_SOURCES_DIST) \
  $(am__elpa2_print_kernels@SUFFIX@_SOURCES_DIST) \
  $(am__elpa2_test_complex@SUFFIX@_SOURCES_DIST) \
  $(am__elpa2_test_complex_c_version@SUFFIX@_SOURCES_DIST) \
  $(am__elpa2_test_complex_choose_kernel_with_api@SUFFIX@_SOURCES_DIST) \
  $(am__elpa2_test_complex_default_kernel@SUFFIX@_SOURCES_DIST) \
  $(am__elpa2_test_real@SUFFIX@_SOURCES_DIST) \
  $(am__elpa2_test_real_c_version@SUFFIX@_SOURCES_DIST) \
  $(am__elpa2_test_real_choose_kernel_with_api@SUFFIX@_SOURCES_DIST) \
  $(am__elpa2_test_real_default_kernel@SUFFIX@_SOURCES_DIST) \
  $(am__elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_SOURCES_DIST)

# False when AM_UPDATE_INFO_DIR is n/no/NO; otherwise true only if
# "install-info --version" succeeds.
am__can_run_installinfo = \
  case $$AM_UPDATE_INFO_DIR in \
    n|no|NO) false;; \
    *) (install-info --version) >/dev/null 2>&1;; \
  esac
man1dir = $(mandir)/man1
man3dir = $(mandir)/man3
NROFF = nroff
MANS = $(dist_man_MANS)
DATA = $(dist_doc_DATA) $(dist_files_DATA) $(pkgconfig_DATA)
HEADERS = $(nobase_elpa_include_HEADERS)
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) \
  $(LISP)config.h.in
# Read a list of newline-separated strings from the standard input,
# and print each of them once, without duplicates.  Input order is
# *not* preserved.
am__uniquify_input = $(AWK) '\
  BEGIN { nonempty = 0; } \
  { items[$$0] = 1; nonempty = 1; } \
  END { if (nonempty) { for (i in items) print i; }; } \
'
# Make sure the list of sources is unique.  This is necessary because,
# e.g., the same source file might be shared among _SOURCES variables
# for different programs/libraries.
am__define_uniq_tagged_files = \
  list='$(am__tagged_files)'; \
  unique=`for i in $$list; do \
    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
  done | $(am__uniquify_input)`
ETAGS = etags
CTAGS = ctags
CSCOPE = cscope
AM_RECURSIVE_TARGETS = cscope check recheck

# ---------------------------------------------------------------------
# Test-harness colour setup.
# ---------------------------------------------------------------------
# Colour variables all empty, colouring off.
am__tty_colors_dummy = \
  mgn= red= grn= lgn= blu= brg= std=; \
  am__color_tests=no
# Decide whether to colour test output and, if so, set the colour
# variables.  AM_COLOR_TESTS=no disables, =always enables; otherwise
# colours are enabled only when $TERM is not "dumb" and stdout is a tty.
# The escape sequences are produced with printf so that no raw ESC byte
# has to be stored in this file.
am__tty_colors = { \
  $(am__tty_colors_dummy); \
  if test "X$(AM_COLOR_TESTS)" = Xno; then \
    am__color_tests=no; \
  elif test "X$(AM_COLOR_TESTS)" = Xalways; then \
    am__color_tests=yes; \
  elif test "X$$TERM" != Xdumb && { test -t 1; } 2>/dev/null; then \
    am__color_tests=yes; \
  fi; \
  if test $$am__color_tests = yes; then \
    red=`printf '\033[0;31m'`; \
    grn=`printf '\033[0;32m'`; \
    lgn=`printf '\033[1;32m'`; \
    blu=`printf '\033[1;34m'`; \
    mgn=`printf '\033[0;35m'`; \
    brg=`printf '\033[1m'`; \
    std=`printf '\033[m'`; \
  fi; \
}
# Patterns matching the metadata fields of a .trs file.
# NOTE(review): the bracket expressions hold only a space here; the
# upstream template presumably also lists a TAB -- confirm against a
# freshly generated Makefile.in.
am__recheck_rx = ^[ ]*:recheck:[ ]*
am__global_test_result_rx = ^[ ]*:global-test-result:[ ]*
am__copy_in_global_log_rx = ^[ ]*:copy-in-global-log:[ ]*
# A command that, given a newline-separated list of test names on the
# standard input, prints the names of the tests that are to be re-run
# upon "make recheck".
am__list_recheck_tests = $(AWK) '{ \
  recheck = 1; \
  while ((rc = (getline line < ($$0 ".trs"))) != 0) \
    { \
      if (rc < 0) \
        { \
          if ((getline line2 < ($$0 ".log")) < 0) \
            recheck = 0; \
          break; \
        } \
      else if (line ~ /$(am__recheck_rx)[nN][Oo]/) \
        { \
          recheck = 0; \
          break; \
        } \
      else if (line ~ /$(am__recheck_rx)[yY][eE][sS]/) \
        { \
          break; \
        } \
    }; \
  if (recheck) \
    print $$0; \
  close ($$0 ".trs"); \
  close ($$0 ".log"); \
}'
# A command that, given a newline-separated list of test names on the
# standard input, creates the global log from their .trs and .log files.
# awk program: for every test name read from stdin, scan "<name>.trs"
# for the global result (default "RUN") and the copy-in-global-log flag;
# unless that flag says no, print a titled section followed by the
# contents of "<name>.log".  An unreadable .trs or .log is fatal.
am__create_global_log = $(AWK) ' \
function fatal(msg) \
{ \
  print "fatal: making $@: " msg | "cat >&2"; \
  exit 1; \
} \
function rst_section(header) \
{ \
  print header; \
  len = length(header); \
  for (i = 1; i <= len; i = i + 1) \
    printf "="; \
  printf "\n\n"; \
} \
{ \
  copy_in_global_log = 1; \
  global_test_result = "RUN"; \
  while ((rc = (getline line < ($$0 ".trs"))) != 0) \
    { \
      if (rc < 0) \
        fatal("failed to read from " $$0 ".trs"); \
      if (line ~ /$(am__global_test_result_rx)/) \
        { \
          sub("$(am__global_test_result_rx)", "", line); \
          sub("[ ]*$$", "", line); \
          global_test_result = line; \
        } \
      else if (line ~ /$(am__copy_in_global_log_rx)[nN][oO]/) \
        copy_in_global_log = 0; \
    }; \
  if (copy_in_global_log) \
    { \
      rst_section(global_test_result ": " $$0); \
      while ((rc = (getline line < ($$0 ".log"))) != 0) \
      { \
        if (rc < 0) \
          fatal("failed to read from " $$0 ".log"); \
        print line; \
      }; \
      printf "\n"; \
    }; \
  close ($$0 ".trs"); \
  close ($$0 ".log"); \
}'
# Restructured Text title: prints the input line between two rows of
# "=" characters, followed by an empty line.
am__rst_title = { sed 's/.*/ & /;h;s/./=/g;p;x;s/ *$$//;p;g' && echo; }
# Solaris 10 'make', and several other traditional 'make' implementations,
# pass "-e" to $(SHELL), and POSIX 2008 even requires this.  Work around it
# by disabling -e (using the XSI extension "set +e") if it's set.
am__sh_e_setup = case $$- in *e*) set +e;; esac
# Default flags passed to test drivers.
am__common_driver_flags = \
  --color-tests "$$am__color_tests" \
  --enable-hard-errors "$$am__enable_hard_errors" \
  --expect-failure "$$am__expect_failure"
# To be inserted before the command running the test.  Creates the
# directory for the log if needed.  Stores in $dir the directory
# containing $f, in $tst the test, in $log the log.  Executes the
# developer-defined test setup AM_TESTS_ENVIRONMENT (if any), and
# passes TESTS_ENVIRONMENT.  Sets up options for the wrapper that
# will run the test scripts (or their associated LOG_COMPILER, if
# they have one).
# Steps, in order: disable "set -e"; strip the srcdir prefix from $p
# into $f; set up colours; export srcdir; create the directory of the
# log target if it is missing; locate the test ($dir); derive
# am__enable_hard_errors from DISABLE_HARD_ERRORS and
# am__expect_failure from XFAIL_TESTS; finally expand the environment
# prefixes.
# NOTE(review): the XFAIL_TESTS bracket lists an escaped space twice;
# the second entry was presumably an escaped TAB -- confirm against a
# freshly generated Makefile.in.
am__check_pre = \
  $(am__sh_e_setup); \
  $(am__vpath_adj_setup) $(am__vpath_adj) \
  $(am__tty_colors); \
  srcdir=$(srcdir); export srcdir; \
  case "$@" in \
    */*) am__odir=`echo "./$@" | sed 's|/[^/]*$$||'`;; \
    *) am__odir=.;; \
  esac; \
  test "x$$am__odir" = x"." || test -d "$$am__odir" \
    || $(MKDIR_P) "$$am__odir" || exit $$?; \
  if test -f "./$$f"; then dir=./; \
  elif test -f "$$f"; then dir=; \
  else dir="$(srcdir)/"; fi; \
  tst=$$dir$$f; log='$@'; \
  if test -n '$(DISABLE_HARD_ERRORS)'; then \
    am__enable_hard_errors=no; \
  else \
    am__enable_hard_errors=yes; \
  fi; \
  case " $(XFAIL_TESTS) " in \
    *[\ \ ]$$f[\ \ ]* | *[\ \ ]$$dir$$f[\ \ ]*) \
      am__expect_failure=yes;; \
    *) \
      am__expect_failure=no;; \
  esac; \
  $(AM_TESTS_ENVIRONMENT) $(TESTS_ENVIRONMENT)
# A shell command to get the names of the test scripts with any registered
# extension removed (i.e., equivalently, the names of the test logs, with
# the '.log' extension removed).  The result is saved in the shell variable
# '$bases'.  This honors runtime overriding of TESTS and TEST_LOGS.  Sadly,
# we cannot use something simpler, involving e.g., "$(TEST_LOGS:.log=)",
# since that might cause problems with VPATH rewrites for suffix-less tests.
# See also 'test-harness-vpath-rewrite.sh' and 'test-trs-basic.sh'.
am__set_TESTS_bases = \ bases='$(TEST_LOGS)'; \ bases=`for i in $$bases; do echo $$i; done | sed 's/\.log$$//'`; \ bases=`echo $$bases` RECHECK_LOGS = $(TEST_LOGS) am__EXEEXT_2 = elpa1_test_real@SUFFIX@.sh \ elpa1_test_real_with_c@SUFFIX@.sh elpa2_test_real@SUFFIX@.sh \ elpa2_test_real_default_kernel@SUFFIX@.sh \ elpa1_test_complex@SUFFIX@.sh elpa2_test_complex@SUFFIX@.sh \ elpa2_test_complex_default_kernel@SUFFIX@.sh \ elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh \ elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh \ elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh \ elpa2_print_kernels@SUFFIX@$(EXEEXT) $(am__append_23) TEST_SUITE_LOG = test-suite.log TEST_EXTENSIONS = @EXEEXT@ .test LOG_DRIVER = $(SHELL) $(top_srcdir)/test-driver LOG_COMPILE = $(LOG_COMPILER) $(AM_LOG_FLAGS) $(LOG_FLAGS) am__set_b = \ case '$@' in \ */*) \ case '$*' in \ */*) b='$*';; \ *) b=`echo '$@' | sed 's/\.log$$//'`; \ esac;; \ *) \ b='$*';; \ esac am__test_logs1 = $(TESTS:=.log) am__test_logs2 = $(am__test_logs1:@EXEEXT@.log=.log) TEST_LOGS = $(am__test_logs2:.test.log=.log) TEST_LOG_DRIVER = $(SHELL) $(top_srcdir)/test-driver TEST_LOG_COMPILE = $(TEST_LOG_COMPILER) $(AM_TEST_LOG_FLAGS) \ $(TEST_LOG_FLAGS) am__DIST_COMMON = $(dist_man_MANS) $(srcdir)/Doxyfile.in \ $(srcdir)/Makefile.in $(srcdir)/config.h.in \ $(srcdir)/doxygen.am $(srcdir)/elpa.pc.in \ $(srcdir)/generated_headers.am ar-lib compile config.guess \ config.sub depcomp install-sh ltmain.sh missing test-driver DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) distdir = $(PACKAGE)-$(VERSION) top_distdir = $(distdir) am__remove_distdir = \ if test -d "$(distdir)"; then \ find "$(distdir)" -type d ! -perm -200 -exec chmod u+w {} ';' \ && rm -rf "$(distdir)" \ || { sleep 5 && rm -rf "$(distdir)"; }; \ else :; fi am__post_remove_distdir = $(am__remove_distdir) DIST_ARCHIVES = $(distdir).tar.gz GZIP_ENV = --best DIST_TARGETS = dist-gzip distuninstallcheck_listfiles = find . 
-type f -print am__distuninstallcheck_listfiles = $(distuninstallcheck_listfiles) \ | sed 's|^\./|$(prefix)/|' | grep -v '$(infodir)/dir$$' distcleancheck_listfiles = find . -type f -print ACLOCAL = @ACLOCAL@ AMTAR = @AMTAR@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ AR = @AR@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ AUTOMAKE = @AUTOMAKE@ AWK = @AWK@ CC = @CC@ CCAS = @CCAS@ CCASDEPMODE = @CCASDEPMODE@ CCASFLAGS = @CCASFLAGS@ CCDEPMODE = @CCDEPMODE@ CFLAGS = @CFLAGS@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ CYGPATH_W = @CYGPATH_W@ DEFS = @DEFS@ DEPDIR = @DEPDIR@ DLLTOOL = @DLLTOOL@ DOXYGEN_OUTPUT_DIR = @DOXYGEN_OUTPUT_DIR@ DOXYGEN_PAPER_SIZE = @DOXYGEN_PAPER_SIZE@ DSYMUTIL = @DSYMUTIL@ DUMPBIN = @DUMPBIN@ DX_CONFIG = @DX_CONFIG@ DX_DOCDIR = @DX_DOCDIR@ DX_DOT = @DX_DOT@ DX_DOXYGEN = @DX_DOXYGEN@ DX_DVIPS = @DX_DVIPS@ DX_EGREP = @DX_EGREP@ DX_ENV = @DX_ENV@ DX_FLAG_chi = @DX_FLAG_chi@ DX_FLAG_chm = @DX_FLAG_chm@ DX_FLAG_doc = @DX_FLAG_doc@ DX_FLAG_dot = @DX_FLAG_dot@ DX_FLAG_html = @DX_FLAG_html@ DX_FLAG_man = @DX_FLAG_man@ DX_FLAG_pdf = @DX_FLAG_pdf@ DX_FLAG_ps = @DX_FLAG_ps@ DX_FLAG_rtf = @DX_FLAG_rtf@ DX_FLAG_xml = @DX_FLAG_xml@ DX_HHC = @DX_HHC@ DX_LATEX = @DX_LATEX@ DX_MAKEINDEX = @DX_MAKEINDEX@ DX_PDFLATEX = @DX_PDFLATEX@ DX_PERL = @DX_PERL@ DX_PROJECT = @DX_PROJECT@ ECHO_C = @ECHO_C@ ECHO_N = @ECHO_N@ ECHO_T = @ECHO_T@ EGREP = @EGREP@ ELPA_SO_VERSION = @ELPA_SO_VERSION@ EXEEXT = @EXEEXT@ FC = @FC@ FCFLAGS = @FCFLAGS@ FCLIBS = @FCLIBS@ FC_MODINC = @FC_MODINC@ FC_MODOUT = @FC_MODOUT@ FGREP = @FGREP@ GREP = @GREP@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ LD = @LD@ LDFLAGS = @LDFLAGS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ LIPO = @LIPO@ LN_S = @LN_S@ LTLIBOBJS = @LTLIBOBJS@ LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@ MAINT = @MAINT@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ MKDIR_P = @MKDIR_P@ NM = @NM@ NMEDIT 
= @NMEDIT@ OBJDUMP = @OBJDUMP@ OBJEXT = @OBJEXT@ OPENMP_CFLAGS = @OPENMP_CFLAGS@ OPENMP_FCFLAGS = @OPENMP_FCFLAGS@ OPENMP_LDFLAGS = @OPENMP_LDFLAGS@ OTOOL = @OTOOL@ OTOOL64 = @OTOOL64@ PACKAGE = @PACKAGE@ PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ PACKAGE_NAME = @PACKAGE_NAME@ PACKAGE_STRING = @PACKAGE_STRING@ PACKAGE_TARNAME = @PACKAGE_TARNAME@ PACKAGE_URL = @PACKAGE_URL@ PACKAGE_VERSION = @PACKAGE_VERSION@ PATH_SEPARATOR = @PATH_SEPARATOR@ PKG_CONFIG_FILE = @PKG_CONFIG_FILE@ RANLIB = @RANLIB@ SCALAPACK_FCFLAGS = @SCALAPACK_FCFLAGS@ SCALAPACK_LDFLAGS = @SCALAPACK_LDFLAGS@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ STRIP = @STRIP@ SUFFIX = @SUFFIX@ VERSION = @VERSION@ WITH_BLACS = @WITH_BLACS@ WITH_MKL = @WITH_MKL@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ abs_top_srcdir = @abs_top_srcdir@ ac_ct_AR = @ac_ct_AR@ ac_ct_CC = @ac_ct_CC@ ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ ac_ct_FC = @ac_ct_FC@ ac_empty = @ac_empty@ am__include = @am__include@ am__leading_dot = @am__leading_dot@ am__quote = @am__quote@ am__tar = @am__tar@ am__untar = @am__untar@ bindir = @bindir@ build = @build@ build_alias = @build_alias@ build_cpu = @build_cpu@ build_os = @build_os@ build_vendor = @build_vendor@ builddir = @builddir@ datadir = @datadir@ datarootdir = @datarootdir@ docdir = @docdir@ dvidir = @dvidir@ exec_prefix = @exec_prefix@ host = @host@ host_alias = @host_alias@ host_cpu = @host_cpu@ host_os = @host_os@ host_vendor = @host_vendor@ htmldir = @htmldir@ ifGNUmake = @ifGNUmake@ includedir = @includedir@ infodir = @infodir@ install_sh = @install_sh@ libdir = @libdir@ libexecdir = @libexecdir@ localedir = @localedir@ localstatedir = @localstatedir@ mandir = @mandir@ mkdir_p = @mkdir_p@ oldincludedir = @oldincludedir@ pdfdir = @pdfdir@ prefix = @prefix@ program_transform_name = @program_transform_name@ psdir = @psdir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ srcdir = @srcdir@ sysconfdir = @sysconfdir@ target_alias = 
@target_alias@ top_build_prefix = @top_build_prefix@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ with_amd_bulldozer_kernel = @with_amd_bulldozer_kernel@ ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4 AM_FCFLAGS = $(SCALAPACK_FCFLAGS) @FC_MODINC@modules @FC_MODOUT@modules AM_LDFLAGS = $(SCALAPACK_LDFLAGS) # libelpa lib_LTLIBRARIES = libelpa@SUFFIX@.la libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION) -lstdc++ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 src/mod_mpi.F90 \ src/mod_mpi_stubs.F90 \ src/elpa2_kernels/mod_fortran_interfaces.F90 \ src/elpa_utilities.F90 src/elpa1_compute.F90 src/elpa1.F90 \ src/elpa2_utilities.F90 src/mod_pack_unpack_real.F90 \ src/elpa2_kernels/mod_single_hh_trafo_real.F90 \ src/mod_compute_hh_trafo_real.F90 \ src/mod_compute_hh_trafo_complex.F90 \ src/mod_pack_unpack_complex.F90 src/aligned_mem.F90 \ src/elpa2_compute.F90 src/elpa2.F90 src/elpa_c_interface.F90 \ src/elpa_qr/qr_utils.F90 src/elpa_qr/elpa_qrkernels.f90 \ src/elpa_qr/elpa_pdlarfb.F90 src/elpa_qr/elpa_pdgeqrf.F90 \ $(am__append_1) $(am__append_2) $(am__append_3) \ $(am__append_4) $(am__append_5) $(am__append_6) \ $(am__append_7) $(am__append_8) $(am__append_9) \ $(am__append_10) $(am__append_11) $(am__append_12) \ $(am__append_13) $(am__append_14) $(am__append_15) \ $(am__append_16) $(am__append_17) $(am__append_18) \ $(am__append_19) $(am__append_20) $(am__append_21) EXTRA_libelpa@SUFFIX@_la_DEPENDENCIES = \ src/elpa_reduce_add_vectors.X90 \ src/elpa_transpose_vectors.X90 \ src/redist_band.X90 generated_headers = config-f90.h elpa/elpa_generated.h test/shared_sources/generated.h elpa/elpa_generated_fortran_interfaces.h BUILT_SOURCES = $(generated_headers) # install any .mod files in the include/ dir elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@ nobase_elpa_include_HEADERS = $(wildcard modules/*) elpa/elpa.h \ elpa/elpa_kernel_constants.h elpa/elpa_generated.h dist_man_MANS = \ man/solve_evp_real.3 \ 
man/solve_evp_real_1stage.3 \ man/solve_evp_complex.3 \ man/solve_evp_complex_1stage.3 \ man/solve_evp_real_2stage.3 \ man/solve_evp_complex_2stage.3 \ man/get_elpa_row_col_comms.3 \ man/get_elpa_communicators.3 \ man/elpa2_print_kernels.1 # other files to distribute filesdir = $(docdir)/examples dist_files_DATA = \ test/fortran_test_programs/read_real.F90 \ test/fortran_test_programs/test_complex2.F90 \ test/fortran_test_programs/test_complex2_default_kernel.F90 \ test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 \ test/fortran_test_programs/test_complex.F90 \ test/fortran_test_programs/test_real2.F90 \ test/fortran_test_programs/test_real2_default_kernel.F90 \ test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 \ test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 \ test/fortran_test_programs/test_real.F90 \ test/fortran_test_programs/test_real_with_c.F90 \ src/elpa2_print_kernels.F90 dist_doc_DATA = README.md USERS_GUIDE.md INSTALL.md CONTRIBUTING.md LICENSE Changelog COPYING/COPYING COPYING/gpl.txt COPYING/lgpl.txt # pkg-config stuff pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = @PKG_CONFIG_FILE@ build_lib = libelpa@SUFFIX@.la @HAVE_REDIRECT_FALSE@redirect_sources = @HAVE_REDIRECT_TRUE@redirect_sources = test/shared_sources/redir.c test/shared_sources/redirect.F90 #test/shared_sources/mod_precision_created.f90: src/mod_precision.f90 # cp $(top_srcdir)/src/mod_precision.f90 $(top_srcdir)/test/shared_sources/mod_precision_created.f90 shared_sources = test/shared_sources/util.F90 test/shared_sources/read_input_parameters.F90 \ test/shared_sources/check_correctnes.F90 test/shared_sources/setup_mpi.F90 \ test/shared_sources/blacs_infrastructure.F90 test/shared_sources/prepare_matrix.F90 \ test/shared_sources/mod_output_types.F90 @WITH_OPENMP_FALSE@elpa1_test_real_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa1_test_real_c_version.c $(shared_sources) $(redirect_sources) 
@WITH_OPENMP_FALSE@elpa1_test_real_c_version@SUFFIX@_LDADD = $(build_lib) @WITH_OPENMP_FALSE@elpa1_test_real_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) @WITH_OPENMP_FALSE@EXTRA_elpa1_test_real_c_version@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 @WITH_OPENMP_FALSE@elpa1_test_complex_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa1_test_complex_c_version.c $(shared_sources) $(redirect_sources) @WITH_OPENMP_FALSE@elpa1_test_complex_c_version@SUFFIX@_LDADD = $(build_lib) @WITH_OPENMP_FALSE@elpa1_test_complex_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) @WITH_OPENMP_FALSE@EXTRA_elpa1_test_complex_c_version@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 @WITH_OPENMP_FALSE@elpa2_test_real_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa2_test_real_c_version.c $(shared_sources) $(redirect_sources) @WITH_OPENMP_FALSE@elpa2_test_real_c_version@SUFFIX@_LDADD = $(build_lib) @WITH_OPENMP_FALSE@elpa2_test_real_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) @WITH_OPENMP_FALSE@EXTRA_elpa2_test_real_c_version@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 @WITH_OPENMP_FALSE@elpa2_test_complex_c_version@SUFFIX@_SOURCES = test/c_test_programs/elpa2_test_complex_c_version.c $(shared_sources) $(redirect_sources) @WITH_OPENMP_FALSE@elpa2_test_complex_c_version@SUFFIX@_LDADD = $(build_lib) @WITH_OPENMP_FALSE@elpa2_test_complex_c_version@SUFFIX@_LINK = $(LINK) $(FCLIBS) @WITH_OPENMP_FALSE@EXTRA_elpa2_test_complex_c_version@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa1_test_real@SUFFIX@_SOURCES = test/fortran_test_programs/test_real.F90 $(shared_sources) $(redirect_sources) elpa1_test_real@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa1_test_real@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa1_test_real_with_c@SUFFIX@_SOURCES = test/fortran_test_programs/test_real_with_c.F90 
test/shared_sources/mod_from_c.F90 \ test/shared_sources/call_elpa1.c $(shared_sources) $(redirect_sources) elpa1_test_real_with_c@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa1_test_real_with_c@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 #elpa1_test_complex_with_c@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex_with_c.F90 test/shared_sources/mod_from_c.F90 test/shared_sources/call_elpa1.c $(shared_sources) $(redirect_sources) #elpa1_test_complex_with_c@SUFFIX@_LDADD = $(build_lib) #EXTRA_elpa1_test_complex_with_c@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa2_test_real@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2.F90 $(shared_sources) $(redirect_sources) elpa2_test_real@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa2_test_real@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa2_test_real_default_kernel@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel.F90 $(shared_sources) $(redirect_sources) elpa2_test_real_default_kernel@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa2_test_real_default_kernel@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 \ $(shared_sources) $(redirect_sources) elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa2_test_real_choose_kernel_with_api@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 \ $(shared_sources) $(redirect_sources) elpa2_test_real_choose_kernel_with_api@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa2_test_real_choose_kernel_with_api@SUFFIX@_DEPENDENCIES = 
test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa1_test_complex@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex.F90 $(shared_sources) $(redirect_sources) elpa1_test_complex@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa1_test_complex@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa2_test_complex@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2.F90 $(shared_sources) $(redirect_sources) elpa2_test_complex@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa2_test_complex@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa2_test_complex_default_kernel@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_default_kernel.F90 $(shared_sources) $(redirect_sources) elpa2_test_complex_default_kernel@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa2_test_complex_default_kernel@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa2_test_complex_choose_kernel_with_api@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 \ $(shared_sources) $(redirect_sources) elpa2_test_complex_choose_kernel_with_api@SUFFIX@_LDADD = $(build_lib) EXTRA_elpa2_test_complex_choose_kernel_with_api@SUFFIX@_DEPENDENCIES = test/fortran_test_programs/elpa_test_programs_print_headers.X90 elpa2_print_kernels@SUFFIX@_SOURCES = src/elpa2_print_kernels.F90 $(shared_sources) $(redirect_sources) elpa2_print_kernels@SUFFIX@_LDADD = $(build_lib) check_SCRIPTS = elpa1_test_real@SUFFIX@.sh \ elpa1_test_real_with_c@SUFFIX@.sh elpa2_test_real@SUFFIX@.sh \ elpa2_test_real_default_kernel@SUFFIX@.sh \ elpa1_test_complex@SUFFIX@.sh elpa2_test_complex@SUFFIX@.sh \ elpa2_test_complex_default_kernel@SUFFIX@.sh \ elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh \ elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh \ elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh \ elpa2_print_kernels@SUFFIX@ $(am__append_23) @WITH_MPI_FALSE@wrapper = "" # test 
scripts @WITH_MPI_TRUE@wrapper = "mpiexec -n 2 " @DX_COND_doc_TRUE@@DX_COND_html_TRUE@DX_CLEAN_HTML = @DX_DOCDIR@/html @DX_COND_chm_TRUE@@DX_COND_doc_TRUE@DX_CLEAN_CHM = @DX_DOCDIR@/chm @DX_COND_chi_TRUE@@DX_COND_chm_TRUE@@DX_COND_doc_TRUE@DX_CLEAN_CHI = @DX_DOCDIR@/@PACKAGE@.chi @DX_COND_doc_TRUE@@DX_COND_man_TRUE@DX_CLEAN_MAN = @DX_DOCDIR@/man @DX_COND_doc_TRUE@@DX_COND_rtf_TRUE@DX_CLEAN_RTF = @DX_DOCDIR@/rtf @DX_COND_doc_TRUE@@DX_COND_xml_TRUE@DX_CLEAN_XML = @DX_DOCDIR@/xml @DX_COND_doc_TRUE@@DX_COND_ps_TRUE@DX_CLEAN_PS = @DX_DOCDIR@/@PACKAGE@.ps @DX_COND_doc_TRUE@@DX_COND_ps_TRUE@DX_PS_GOAL = doxygen-ps @DX_COND_doc_TRUE@@DX_COND_pdf_TRUE@DX_CLEAN_PDF = @DX_DOCDIR@/@PACKAGE@.pdf @DX_COND_doc_TRUE@@DX_COND_pdf_TRUE@DX_PDF_GOAL = doxygen-pdf @DX_COND_doc_TRUE@@DX_COND_latex_TRUE@DX_CLEAN_LATEX = @DX_DOCDIR@/latex @DX_COND_doc_TRUE@DX_CLEANFILES = \ @DX_COND_doc_TRUE@ @DX_DOCDIR@/@PACKAGE@.tag \ @DX_COND_doc_TRUE@ -r \ @DX_COND_doc_TRUE@ $(DX_CLEAN_HTML) \ @DX_COND_doc_TRUE@ $(DX_CLEAN_CHM) \ @DX_COND_doc_TRUE@ $(DX_CLEAN_CHI) \ @DX_COND_doc_TRUE@ $(DX_CLEAN_MAN) \ @DX_COND_doc_TRUE@ $(DX_CLEAN_RTF) \ @DX_COND_doc_TRUE@ $(DX_CLEAN_XML) \ @DX_COND_doc_TRUE@ $(DX_CLEAN_PS) \ @DX_COND_doc_TRUE@ $(DX_CLEAN_PDF) \ @DX_COND_doc_TRUE@ $(DX_CLEAN_LATEX) CLEANFILES = \ elpa-generated.h \ elpa1_test* \ elpa2_test*\ *.i EXTRA_DIST = \ fdep/fortran_dependencies.pl \ fdep/fortran_dependencies.mk \ test/fortran_test_programs/elpa_test_programs_print_headers.X90 \ src/elpa_reduce_add_vectors.X90 \ src/elpa_transpose_vectors.X90 \ src/redist_band.X90 \ elpa.spec LIBTOOL_DEPS = @LIBTOOL_DEPS@ all: $(BUILT_SOURCES) config.h $(MAKE) $(AM_MAKEFLAGS) all-am .SUFFIXES: .SUFFIXES: .F90 .c .f90 .lo .log .o .obj .s .test .test$(EXEEXT) .trs am--refresh: Makefile @: $(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(srcdir)/generated_headers.am $(srcdir)/doxygen.am $(am__configure_deps) @for dep in $?; do \ case '$(am__configure_deps)' in \ *$$dep*) \ echo ' cd $(srcdir) 
&& $(AUTOMAKE) --foreign'; \ $(am__cd) $(srcdir) && $(AUTOMAKE) --foreign \ && exit 0; \ exit 1;; \ esac; \ done; \ echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign Makefile'; \ $(am__cd) $(top_srcdir) && \ $(AUTOMAKE) --foreign Makefile Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status @case '$?' in \ *config.status*) \ echo ' $(SHELL) ./config.status'; \ $(SHELL) ./config.status;; \ *) \ echo ' cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)'; \ cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe);; \ esac; $(srcdir)/generated_headers.am $(srcdir)/doxygen.am $(am__empty): $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) $(SHELL) ./config.status --recheck $(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) $(am__cd) $(srcdir) && $(AUTOCONF) $(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) $(am__cd) $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS) $(am__aclocal_m4_deps): config.h: stamp-h1 @test -f $@ || rm -f stamp-h1 @test -f $@ || $(MAKE) $(AM_MAKEFLAGS) stamp-h1 stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status @rm -f stamp-h1 cd $(top_builddir) && $(SHELL) ./config.status config.h $(srcdir)/config.h.in: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) ($(am__cd) $(top_srcdir) && $(AUTOHEADER)) rm -f stamp-h1 touch $@ distclean-hdr: -rm -f config.h stamp-h1 Doxyfile: $(top_builddir)/config.status $(srcdir)/Doxyfile.in cd $(top_builddir) && $(SHELL) ./config.status $@ ${PKG_CONFIG_FILE}: $(top_builddir)/config.status $(srcdir)/elpa.pc.in cd $(top_builddir) && $(SHELL) ./config.status $@ install-libLTLIBRARIES: $(lib_LTLIBRARIES) @$(NORMAL_INSTALL) @list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \ list2=; for p in $$list; do \ if test -f $$p; then \ list2="$$list2 $$p"; \ else :; fi; \ done; \ test -z "$$list2" || { \ echo " $(MKDIR_P) '$(DESTDIR)$(libdir)'"; \ $(MKDIR_P) "$(DESTDIR)$(libdir)" || exit 1; \ echo " $(LIBTOOL) 
$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libdir)'"; \ $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libdir)"; \ } uninstall-libLTLIBRARIES: @$(NORMAL_UNINSTALL) @list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \ for p in $$list; do \ $(am__strip_dir) \ echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$f'"; \ $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$f"; \ done clean-libLTLIBRARIES: -test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES) @list='$(lib_LTLIBRARIES)'; \ locs=`for p in $$list; do echo $$p; done | \ sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ sort -u`; \ test -z "$$locs" || { \ echo rm -f $${locs}; \ rm -f $${locs}; \ } src/$(am__dirstamp): @$(MKDIR_P) src @: > src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp): @$(MKDIR_P) src/$(DEPDIR) @: > src/$(DEPDIR)/$(am__dirstamp) src/mod_precision.lo: src/$(am__dirstamp) \ src/$(DEPDIR)/$(am__dirstamp) src/mod_mpi.lo: src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp) src/mod_mpi_stubs.lo: src/$(am__dirstamp) \ src/$(DEPDIR)/$(am__dirstamp) src/elpa2_kernels/$(am__dirstamp): @$(MKDIR_P) src/elpa2_kernels @: > src/elpa2_kernels/$(am__dirstamp) src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp): @$(MKDIR_P) src/elpa2_kernels/$(DEPDIR) @: > src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) src/elpa2_kernels/mod_fortran_interfaces.lo: \ src/elpa2_kernels/$(am__dirstamp) \ src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) src/elpa_utilities.lo: src/$(am__dirstamp) \ src/$(DEPDIR)/$(am__dirstamp) src/elpa1_compute.lo: src/$(am__dirstamp) \ src/$(DEPDIR)/$(am__dirstamp) src/elpa1.lo: src/$(am__dirstamp) src/$(DEPDIR)/$(am__dirstamp) src/elpa2_utilities.lo: src/$(am__dirstamp) \ src/$(DEPDIR)/$(am__dirstamp) src/mod_pack_unpack_real.lo: src/$(am__dirstamp) \ src/$(DEPDIR)/$(am__dirstamp) 
# Directory stamps for src/elpa_qr and src/ftimings and for their $(DEPDIR)
# subdirectories.  The recipe runs once per target: it creates the target's
# directory with $(MKDIR_P), then writes an empty stamp file.
src/elpa_qr/$(am__dirstamp) \
src/elpa_qr/$(DEPDIR)/$(am__dirstamp) \
src/ftimings/$(am__dirstamp) \
src/ftimings/$(DEPDIR)/$(am__dirstamp):
	@$(MKDIR_P) $(@D)
	@: > $@

# Prerequisite-only rules: each libtool object lists the stamp file of its
# own directory and of that directory's $(DEPDIR) as prerequisites.
# Objects are grouped by directory; no recipe is attached here.

# Objects built in src/
src/mod_compute_hh_trafo_real.lo \
src/mod_compute_hh_trafo_complex.lo \
src/mod_pack_unpack_complex.lo \
src/aligned_mem.lo \
src/elpa2_compute.lo \
src/elpa2.lo \
src/elpa_c_interface.lo \
src/timer.lo \
src/mod_time_c.lo: \
	src/$(am__dirstamp) \
	src/$(DEPDIR)/$(am__dirstamp)

# Objects built in src/elpa2_kernels/
src/elpa2_kernels/mod_single_hh_trafo_real.lo \
src/elpa2_kernels/elpa2_kernels_real.lo \
src/elpa2_kernels/elpa2_kernels_complex.lo \
src/elpa2_kernels/elpa2_kernels_real_simple.lo \
src/elpa2_kernels/elpa2_kernels_complex_simple.lo \
src/elpa2_kernels/elpa2_kernels_real_bgp.lo \
src/elpa2_kernels/elpa2_kernels_real_bgq.lo \
src/elpa2_kernels/elpa2_kernels_asm_x86_64.lo \
src/elpa2_kernels/elpa2_kernels_real_sse_2hv.lo \
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.lo \
src/elpa2_kernels/elpa2_kernels_real_sse_4hv.lo \
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.lo \
src/elpa2_kernels/elpa2_kernels_real_sse_6hv.lo: \
	src/elpa2_kernels/$(am__dirstamp) \
	src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp)

# Objects built in src/elpa_qr/
src/elpa_qr/qr_utils.lo \
src/elpa_qr/elpa_qrkernels.lo \
src/elpa_qr/elpa_pdlarfb.lo \
src/elpa_qr/elpa_pdgeqrf.lo: \
	src/elpa_qr/$(am__dirstamp) \
	src/elpa_qr/$(DEPDIR)/$(am__dirstamp)

# Objects built in src/ftimings/
src/ftimings/ftimings.lo \
src/ftimings/ftimings_type.lo \
src/ftimings/ftimings_value.lo \
src/ftimings/highwater_mark.lo \
src/ftimings/resident_set_size.lo \
src/ftimings/time.lo \
src/ftimings/virtual_memory.lo \
src/ftimings/papi.lo: \
	src/ftimings/$(am__dirstamp) \
	src/ftimings/$(DEPDIR)/$(am__dirstamp)
src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.lo: \ src/elpa2_kernels/$(am__dirstamp) \ src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.lo: \ src/elpa2_kernels/$(am__dirstamp) \ src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.lo: \ src/elpa2_kernels/$(am__dirstamp) \ src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.lo: \ src/elpa2_kernels/$(am__dirstamp) \ src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.lo: \ src/elpa2_kernels/$(am__dirstamp) \ src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) libelpa@SUFFIX@.la: $(libelpa@SUFFIX@_la_OBJECTS) $(libelpa@SUFFIX@_la_DEPENDENCIES) $(EXTRA_libelpa@SUFFIX@_la_DEPENDENCIES) $(AM_V_GEN)$(libelpa@SUFFIX@_la_LINK) -rpath $(libdir) $(libelpa@SUFFIX@_la_OBJECTS) $(libelpa@SUFFIX@_la_LIBADD) $(LIBS) install-binPROGRAMS: $(bin_PROGRAMS) @$(NORMAL_INSTALL) @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ if test -n "$$list"; then \ echo " $(MKDIR_P) '$(DESTDIR)$(bindir)'"; \ $(MKDIR_P) "$(DESTDIR)$(bindir)" || exit 1; \ fi; \ for p in $$list; do echo "$$p $$p"; done | \ sed 's/$(EXEEXT)$$//' | \ while read p p1; do if test -f $$p \ || test -f $$p1 \ ; then echo "$$p"; echo "$$p"; else :; fi; \ done | \ sed -e 'p;s,.*/,,;n;h' \ -e 's|.*|.|' \ -e 'p;x;s,.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/' | \ sed 'N;N;N;s,\n, ,g' | \ $(AWK) 'BEGIN { files["."] = ""; dirs["."] = 1 } \ { d=$$3; if (dirs[d] != 1) { print "d", d; dirs[d] = 1 } \ if ($$2 == $$4) files[d] = files[d] " " $$1; \ else { print "f", $$3 "/" $$4, $$1; } } \ END { for (d in files) print "f", d, files[d] }' | \ while read type dir files; do \ if test "$$dir" = .; then dir=; else dir=/$$dir; fi; \ test -z "$$files" || { \ echo " $(INSTALL_PROGRAM_ENV) $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_PROGRAM) $$files '$(DESTDIR)$(bindir)$$dir'"; \ 
$(INSTALL_PROGRAM_ENV) $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_PROGRAM) $$files "$(DESTDIR)$(bindir)$$dir" || exit $$?; \ } \ ; done uninstall-binPROGRAMS: @$(NORMAL_UNINSTALL) @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ files=`for p in $$list; do echo "$$p"; done | \ sed -e 'h;s,^.*/,,;s/$(EXEEXT)$$//;$(transform)' \ -e 's/$$/$(EXEEXT)/' \ `; \ test -n "$$list" || exit 0; \ echo " ( cd '$(DESTDIR)$(bindir)' && rm -f" $$files ")"; \ cd "$(DESTDIR)$(bindir)" && rm -f $$files clean-binPROGRAMS: @list='$(bin_PROGRAMS)'; test -n "$$list" || exit 0; \ echo " rm -f" $$list; \ rm -f $$list || exit $$?; \ test -n "$(EXEEXT)" || exit 0; \ list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \ echo " rm -f" $$list; \ rm -f $$list clean-noinstPROGRAMS: @list='$(noinst_PROGRAMS)'; test -n "$$list" || exit 0; \ echo " rm -f" $$list; \ rm -f $$list || exit $$?; \ test -n "$(EXEEXT)" || exit 0; \ list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \ echo " rm -f" $$list; \ rm -f $$list test/fortran_test_programs/$(am__dirstamp): @$(MKDIR_P) test/fortran_test_programs @: > test/fortran_test_programs/$(am__dirstamp) test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp): @$(MKDIR_P) test/fortran_test_programs/$(DEPDIR) @: > test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) test/fortran_test_programs/test_complex.$(OBJEXT): \ test/fortran_test_programs/$(am__dirstamp) \ test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) test/shared_sources/$(am__dirstamp): @$(MKDIR_P) test/shared_sources @: > test/shared_sources/$(am__dirstamp) test/shared_sources/$(DEPDIR)/$(am__dirstamp): @$(MKDIR_P) test/shared_sources/$(DEPDIR) @: > test/shared_sources/$(DEPDIR)/$(am__dirstamp) test/shared_sources/util.$(OBJEXT): \ test/shared_sources/$(am__dirstamp) \ test/shared_sources/$(DEPDIR)/$(am__dirstamp) test/shared_sources/read_input_parameters.$(OBJEXT): \ test/shared_sources/$(am__dirstamp) \ 
test/shared_sources/$(DEPDIR)/$(am__dirstamp) test/shared_sources/check_correctnes.$(OBJEXT): \ test/shared_sources/$(am__dirstamp) \ test/shared_sources/$(DEPDIR)/$(am__dirstamp) test/shared_sources/setup_mpi.$(OBJEXT): \ test/shared_sources/$(am__dirstamp) \ test/shared_sources/$(DEPDIR)/$(am__dirstamp) test/shared_sources/blacs_infrastructure.$(OBJEXT): \ test/shared_sources/$(am__dirstamp) \ test/shared_sources/$(DEPDIR)/$(am__dirstamp) test/shared_sources/prepare_matrix.$(OBJEXT): \ test/shared_sources/$(am__dirstamp) \ test/shared_sources/$(DEPDIR)/$(am__dirstamp) test/shared_sources/mod_output_types.$(OBJEXT): \ test/shared_sources/$(am__dirstamp) \ test/shared_sources/$(DEPDIR)/$(am__dirstamp) test/shared_sources/redir.$(OBJEXT): \ test/shared_sources/$(am__dirstamp) \ test/shared_sources/$(DEPDIR)/$(am__dirstamp) test/shared_sources/redirect.$(OBJEXT): \ test/shared_sources/$(am__dirstamp) \ test/shared_sources/$(DEPDIR)/$(am__dirstamp) elpa1_test_complex@SUFFIX@$(EXEEXT): $(elpa1_test_complex@SUFFIX@_OBJECTS) $(elpa1_test_complex@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa1_test_complex@SUFFIX@_DEPENDENCIES) @rm -f elpa1_test_complex@SUFFIX@$(EXEEXT) $(AM_V_FCLD)$(FCLINK) $(elpa1_test_complex@SUFFIX@_OBJECTS) $(elpa1_test_complex@SUFFIX@_LDADD) $(LIBS) test/c_test_programs/$(am__dirstamp): @$(MKDIR_P) test/c_test_programs @: > test/c_test_programs/$(am__dirstamp) test/c_test_programs/$(DEPDIR)/$(am__dirstamp): @$(MKDIR_P) test/c_test_programs/$(DEPDIR) @: > test/c_test_programs/$(DEPDIR)/$(am__dirstamp) test/c_test_programs/elpa1_test_complex_c_version.$(OBJEXT): \ test/c_test_programs/$(am__dirstamp) \ test/c_test_programs/$(DEPDIR)/$(am__dirstamp) elpa1_test_complex_c_version@SUFFIX@$(EXEEXT): $(elpa1_test_complex_c_version@SUFFIX@_OBJECTS) $(elpa1_test_complex_c_version@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa1_test_complex_c_version@SUFFIX@_DEPENDENCIES) @rm -f elpa1_test_complex_c_version@SUFFIX@$(EXEEXT) $(AM_V_GEN)$(elpa1_test_complex_c_version@SUFFIX@_LINK) 
$(elpa1_test_complex_c_version@SUFFIX@_OBJECTS) $(elpa1_test_complex_c_version@SUFFIX@_LDADD) $(LIBS) test/fortran_test_programs/test_real.$(OBJEXT): \ test/fortran_test_programs/$(am__dirstamp) \ test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) elpa1_test_real@SUFFIX@$(EXEEXT): $(elpa1_test_real@SUFFIX@_OBJECTS) $(elpa1_test_real@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa1_test_real@SUFFIX@_DEPENDENCIES) @rm -f elpa1_test_real@SUFFIX@$(EXEEXT) $(AM_V_FCLD)$(FCLINK) $(elpa1_test_real@SUFFIX@_OBJECTS) $(elpa1_test_real@SUFFIX@_LDADD) $(LIBS) test/c_test_programs/elpa1_test_real_c_version.$(OBJEXT): \ test/c_test_programs/$(am__dirstamp) \ test/c_test_programs/$(DEPDIR)/$(am__dirstamp) elpa1_test_real_c_version@SUFFIX@$(EXEEXT): $(elpa1_test_real_c_version@SUFFIX@_OBJECTS) $(elpa1_test_real_c_version@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa1_test_real_c_version@SUFFIX@_DEPENDENCIES) @rm -f elpa1_test_real_c_version@SUFFIX@$(EXEEXT) $(AM_V_GEN)$(elpa1_test_real_c_version@SUFFIX@_LINK) $(elpa1_test_real_c_version@SUFFIX@_OBJECTS) $(elpa1_test_real_c_version@SUFFIX@_LDADD) $(LIBS) test/fortran_test_programs/test_real_with_c.$(OBJEXT): \ test/fortran_test_programs/$(am__dirstamp) \ test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) test/shared_sources/mod_from_c.$(OBJEXT): \ test/shared_sources/$(am__dirstamp) \ test/shared_sources/$(DEPDIR)/$(am__dirstamp) test/shared_sources/call_elpa1.$(OBJEXT): \ test/shared_sources/$(am__dirstamp) \ test/shared_sources/$(DEPDIR)/$(am__dirstamp) elpa1_test_real_with_c@SUFFIX@$(EXEEXT): $(elpa1_test_real_with_c@SUFFIX@_OBJECTS) $(elpa1_test_real_with_c@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa1_test_real_with_c@SUFFIX@_DEPENDENCIES) @rm -f elpa1_test_real_with_c@SUFFIX@$(EXEEXT) $(AM_V_FCLD)$(FCLINK) $(elpa1_test_real_with_c@SUFFIX@_OBJECTS) $(elpa1_test_real_with_c@SUFFIX@_LDADD) $(LIBS) src/elpa2_print_kernels.$(OBJEXT): src/$(am__dirstamp) \ src/$(DEPDIR)/$(am__dirstamp) elpa2_print_kernels@SUFFIX@$(EXEEXT): $(elpa2_print_kernels@SUFFIX@_OBJECTS) 
$(elpa2_print_kernels@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_print_kernels@SUFFIX@_DEPENDENCIES) @rm -f elpa2_print_kernels@SUFFIX@$(EXEEXT) $(AM_V_FCLD)$(FCLINK) $(elpa2_print_kernels@SUFFIX@_OBJECTS) $(elpa2_print_kernels@SUFFIX@_LDADD) $(LIBS) test/fortran_test_programs/test_complex2.$(OBJEXT): \ test/fortran_test_programs/$(am__dirstamp) \ test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) elpa2_test_complex@SUFFIX@$(EXEEXT): $(elpa2_test_complex@SUFFIX@_OBJECTS) $(elpa2_test_complex@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_test_complex@SUFFIX@_DEPENDENCIES) @rm -f elpa2_test_complex@SUFFIX@$(EXEEXT) $(AM_V_FCLD)$(FCLINK) $(elpa2_test_complex@SUFFIX@_OBJECTS) $(elpa2_test_complex@SUFFIX@_LDADD) $(LIBS) test/c_test_programs/elpa2_test_complex_c_version.$(OBJEXT): \ test/c_test_programs/$(am__dirstamp) \ test/c_test_programs/$(DEPDIR)/$(am__dirstamp) elpa2_test_complex_c_version@SUFFIX@$(EXEEXT): $(elpa2_test_complex_c_version@SUFFIX@_OBJECTS) $(elpa2_test_complex_c_version@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_test_complex_c_version@SUFFIX@_DEPENDENCIES) @rm -f elpa2_test_complex_c_version@SUFFIX@$(EXEEXT) $(AM_V_GEN)$(elpa2_test_complex_c_version@SUFFIX@_LINK) $(elpa2_test_complex_c_version@SUFFIX@_OBJECTS) $(elpa2_test_complex_c_version@SUFFIX@_LDADD) $(LIBS) test/fortran_test_programs/test_complex2_choose_kernel_with_api.$(OBJEXT): \ test/fortran_test_programs/$(am__dirstamp) \ test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) elpa2_test_complex_choose_kernel_with_api@SUFFIX@$(EXEEXT): $(elpa2_test_complex_choose_kernel_with_api@SUFFIX@_OBJECTS) $(elpa2_test_complex_choose_kernel_with_api@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_test_complex_choose_kernel_with_api@SUFFIX@_DEPENDENCIES) @rm -f elpa2_test_complex_choose_kernel_with_api@SUFFIX@$(EXEEXT) $(AM_V_FCLD)$(FCLINK) $(elpa2_test_complex_choose_kernel_with_api@SUFFIX@_OBJECTS) $(elpa2_test_complex_choose_kernel_with_api@SUFFIX@_LDADD) $(LIBS) test/fortran_test_programs/test_complex2_default_kernel.$(OBJEXT): \ 
test/fortran_test_programs/$(am__dirstamp) \ test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) elpa2_test_complex_default_kernel@SUFFIX@$(EXEEXT): $(elpa2_test_complex_default_kernel@SUFFIX@_OBJECTS) $(elpa2_test_complex_default_kernel@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_test_complex_default_kernel@SUFFIX@_DEPENDENCIES) @rm -f elpa2_test_complex_default_kernel@SUFFIX@$(EXEEXT) $(AM_V_FCLD)$(FCLINK) $(elpa2_test_complex_default_kernel@SUFFIX@_OBJECTS) $(elpa2_test_complex_default_kernel@SUFFIX@_LDADD) $(LIBS) test/fortran_test_programs/test_real2.$(OBJEXT): \ test/fortran_test_programs/$(am__dirstamp) \ test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) elpa2_test_real@SUFFIX@$(EXEEXT): $(elpa2_test_real@SUFFIX@_OBJECTS) $(elpa2_test_real@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_test_real@SUFFIX@_DEPENDENCIES) @rm -f elpa2_test_real@SUFFIX@$(EXEEXT) $(AM_V_FCLD)$(FCLINK) $(elpa2_test_real@SUFFIX@_OBJECTS) $(elpa2_test_real@SUFFIX@_LDADD) $(LIBS) test/c_test_programs/elpa2_test_real_c_version.$(OBJEXT): \ test/c_test_programs/$(am__dirstamp) \ test/c_test_programs/$(DEPDIR)/$(am__dirstamp) elpa2_test_real_c_version@SUFFIX@$(EXEEXT): $(elpa2_test_real_c_version@SUFFIX@_OBJECTS) $(elpa2_test_real_c_version@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_test_real_c_version@SUFFIX@_DEPENDENCIES) @rm -f elpa2_test_real_c_version@SUFFIX@$(EXEEXT) $(AM_V_GEN)$(elpa2_test_real_c_version@SUFFIX@_LINK) $(elpa2_test_real_c_version@SUFFIX@_OBJECTS) $(elpa2_test_real_c_version@SUFFIX@_LDADD) $(LIBS) test/fortran_test_programs/test_real2_choose_kernel_with_api.$(OBJEXT): \ test/fortran_test_programs/$(am__dirstamp) \ test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) elpa2_test_real_choose_kernel_with_api@SUFFIX@$(EXEEXT): $(elpa2_test_real_choose_kernel_with_api@SUFFIX@_OBJECTS) $(elpa2_test_real_choose_kernel_with_api@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_test_real_choose_kernel_with_api@SUFFIX@_DEPENDENCIES) @rm -f elpa2_test_real_choose_kernel_with_api@SUFFIX@$(EXEEXT) 
$(AM_V_FCLD)$(FCLINK) $(elpa2_test_real_choose_kernel_with_api@SUFFIX@_OBJECTS) $(elpa2_test_real_choose_kernel_with_api@SUFFIX@_LDADD) $(LIBS) test/fortran_test_programs/test_real2_default_kernel.$(OBJEXT): \ test/fortran_test_programs/$(am__dirstamp) \ test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) elpa2_test_real_default_kernel@SUFFIX@$(EXEEXT): $(elpa2_test_real_default_kernel@SUFFIX@_OBJECTS) $(elpa2_test_real_default_kernel@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_test_real_default_kernel@SUFFIX@_DEPENDENCIES) @rm -f elpa2_test_real_default_kernel@SUFFIX@$(EXEEXT) $(AM_V_FCLD)$(FCLINK) $(elpa2_test_real_default_kernel@SUFFIX@_OBJECTS) $(elpa2_test_real_default_kernel@SUFFIX@_LDADD) $(LIBS) test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.$(OBJEXT): \ test/fortran_test_programs/$(am__dirstamp) \ test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@$(EXEEXT): $(elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_OBJECTS) $(elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_DEPENDENCIES) $(EXTRA_elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_DEPENDENCIES) @rm -f elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@$(EXEEXT) $(AM_V_FCLD)$(FCLINK) $(elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_OBJECTS) $(elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_LDADD) $(LIBS) mostlyclean-compile: -rm -f *.$(OBJEXT) -rm -f src/*.$(OBJEXT) -rm -f src/*.lo -rm -f src/elpa2_kernels/*.$(OBJEXT) -rm -f src/elpa2_kernels/*.lo -rm -f src/elpa_qr/*.$(OBJEXT) -rm -f src/elpa_qr/*.lo -rm -f src/ftimings/*.$(OBJEXT) -rm -f src/ftimings/*.lo -rm -f test/c_test_programs/*.$(OBJEXT) -rm -f test/fortran_test_programs/*.$(OBJEXT) -rm -f test/shared_sources/*.$(OBJEXT) distclean-compile: -rm -f *.tab.c @AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_complex_avx-avx2_1hv.Plo@am__quote@ @AMDEP_TRUE@@am__include@ 
@am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_complex_avx-avx2_2hv.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_complex_sse_1hv.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_complex_sse_2hv.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_real_avx-avx2_2hv.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_real_avx-avx2_4hv.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_real_avx-avx2_6hv.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_real_sse_2hv.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_real_sse_4hv.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@src/elpa2_kernels/$(DEPDIR)/elpa2_kernels_real_sse_6hv.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@src/ftimings/$(DEPDIR)/highwater_mark.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@src/ftimings/$(DEPDIR)/papi.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@src/ftimings/$(DEPDIR)/resident_set_size.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@src/ftimings/$(DEPDIR)/time.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@src/ftimings/$(DEPDIR)/virtual_memory.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@test/c_test_programs/$(DEPDIR)/elpa1_test_complex_c_version.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@test/c_test_programs/$(DEPDIR)/elpa1_test_real_c_version.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@test/c_test_programs/$(DEPDIR)/elpa2_test_complex_c_version.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@test/c_test_programs/$(DEPDIR)/elpa2_test_real_c_version.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@test/shared_sources/$(DEPDIR)/call_elpa1.Po@am__quote@ @AMDEP_TRUE@@am__include@ 
@am__quote@test/shared_sources/$(DEPDIR)/redir.Po@am__quote@ .F90.o: $(AM_V_PPFC)$(PPFCCOMPILE) -c -o $@ $< .F90.obj: $(AM_V_PPFC)$(PPFCCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` .F90.lo: $(AM_V_PPFC)$(LTPPFCCOMPILE) -c -o $@ $< .c.o: @am__fastdepCC_TRUE@ $(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\ @am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ @am__fastdepCC_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $< .c.obj: @am__fastdepCC_TRUE@ $(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.obj$$||'`;\ @am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ `$(CYGPATH_W) '$<'` &&\ @am__fastdepCC_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'` .c.lo: @am__fastdepCC_TRUE@ $(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.lo$$||'`;\ @am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ @am__fastdepCC_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Plo @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $< .f90.o: $(AM_V_FC)$(FCCOMPILE) -c -o $@ $< .f90.obj: $(AM_V_FC)$(FCCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` .f90.lo: $(AM_V_FC)$(LTFCCOMPILE) -c -o $@ $< .s.o: $(AM_V_CCAS)$(CCASCOMPILE) -c -o $@ $< .s.obj: $(AM_V_CCAS)$(CCASCOMPILE) -c -o $@ `$(CYGPATH_W) 
'$<'` .s.lo: $(AM_V_CCAS)$(LTCCASCOMPILE) -c -o $@ $< mostlyclean-libtool: -rm -f *.lo clean-libtool: -rm -rf .libs _libs -rm -rf src/.libs src/_libs -rm -rf src/elpa2_kernels/.libs src/elpa2_kernels/_libs -rm -rf src/elpa_qr/.libs src/elpa_qr/_libs -rm -rf src/ftimings/.libs src/ftimings/_libs distclean-libtool: -rm -f libtool config.lt install-man1: $(dist_man_MANS) @$(NORMAL_INSTALL) @list1=''; \ list2='$(dist_man_MANS)'; \ test -n "$(man1dir)" \ && test -n "`echo $$list1$$list2`" \ || exit 0; \ echo " $(MKDIR_P) '$(DESTDIR)$(man1dir)'"; \ $(MKDIR_P) "$(DESTDIR)$(man1dir)" || exit 1; \ { for i in $$list1; do echo "$$i"; done; \ if test -n "$$list2"; then \ for i in $$list2; do echo "$$i"; done \ | sed -n '/\.1[a-z]*$$/p'; \ fi; \ } | while read p; do \ if test -f $$p; then d=; else d="$(srcdir)/"; fi; \ echo "$$d$$p"; echo "$$p"; \ done | \ sed -e 'n;s,.*/,,;p;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \ -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,' | \ sed 'N;N;s,\n, ,g' | { \ list=; while read file base inst; do \ if test "$$base" = "$$inst"; then list="$$list $$file"; else \ echo " $(INSTALL_DATA) '$$file' '$(DESTDIR)$(man1dir)/$$inst'"; \ $(INSTALL_DATA) "$$file" "$(DESTDIR)$(man1dir)/$$inst" || exit $$?; \ fi; \ done; \ for i in $$list; do echo "$$i"; done | $(am__base_list) | \ while read files; do \ test -z "$$files" || { \ echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(man1dir)'"; \ $(INSTALL_DATA) $$files "$(DESTDIR)$(man1dir)" || exit $$?; }; \ done; } uninstall-man1: @$(NORMAL_UNINSTALL) @list=''; test -n "$(man1dir)" || exit 0; \ files=`{ for i in $$list; do echo "$$i"; done; \ l2='$(dist_man_MANS)'; for i in $$l2; do echo "$$i"; done | \ sed -n '/\.1[a-z]*$$/p'; \ } | sed -e 's,.*/,,;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \ -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,'`; \ dir='$(DESTDIR)$(man1dir)'; $(am__uninstall_files_from_dir) install-man3: $(dist_man_MANS) @$(NORMAL_INSTALL) @list1=''; \ list2='$(dist_man_MANS)'; \ test -n "$(man3dir)" \ && test -n "`echo 
$$list1$$list2`" \ || exit 0; \ echo " $(MKDIR_P) '$(DESTDIR)$(man3dir)'"; \ $(MKDIR_P) "$(DESTDIR)$(man3dir)" || exit 1; \ { for i in $$list1; do echo "$$i"; done; \ if test -n "$$list2"; then \ for i in $$list2; do echo "$$i"; done \ | sed -n '/\.3[a-z]*$$/p'; \ fi; \ } | while read p; do \ if test -f $$p; then d=; else d="$(srcdir)/"; fi; \ echo "$$d$$p"; echo "$$p"; \ done | \ sed -e 'n;s,.*/,,;p;h;s,.*\.,,;s,^[^3][0-9a-z]*$$,3,;x' \ -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,' | \ sed 'N;N;s,\n, ,g' | { \ list=; while read file base inst; do \ if test "$$base" = "$$inst"; then list="$$list $$file"; else \ echo " $(INSTALL_DATA) '$$file' '$(DESTDIR)$(man3dir)/$$inst'"; \ $(INSTALL_DATA) "$$file" "$(DESTDIR)$(man3dir)/$$inst" || exit $$?; \ fi; \ done; \ for i in $$list; do echo "$$i"; done | $(am__base_list) | \ while read files; do \ test -z "$$files" || { \ echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(man3dir)'"; \ $(INSTALL_DATA) $$files "$(DESTDIR)$(man3dir)" || exit $$?; }; \ done; } uninstall-man3: @$(NORMAL_UNINSTALL) @list=''; test -n "$(man3dir)" || exit 0; \ files=`{ for i in $$list; do echo "$$i"; done; \ l2='$(dist_man_MANS)'; for i in $$l2; do echo "$$i"; done | \ sed -n '/\.3[a-z]*$$/p'; \ } | sed -e 's,.*/,,;h;s,.*\.,,;s,^[^3][0-9a-z]*$$,3,;x' \ -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,'`; \ dir='$(DESTDIR)$(man3dir)'; $(am__uninstall_files_from_dir) install-dist_docDATA: $(dist_doc_DATA) @$(NORMAL_INSTALL) @list='$(dist_doc_DATA)'; test -n "$(docdir)" || list=; \ if test -n "$$list"; then \ echo " $(MKDIR_P) '$(DESTDIR)$(docdir)'"; \ $(MKDIR_P) "$(DESTDIR)$(docdir)" || exit 1; \ fi; \ for p in $$list; do \ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ echo "$$d$$p"; \ done | $(am__base_list) | \ while read files; do \ echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(docdir)'"; \ $(INSTALL_DATA) $$files "$(DESTDIR)$(docdir)" || exit $$?; \ done uninstall-dist_docDATA: @$(NORMAL_UNINSTALL) @list='$(dist_doc_DATA)'; test -n "$(docdir)" || 
list=; \ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ dir='$(DESTDIR)$(docdir)'; $(am__uninstall_files_from_dir) install-dist_filesDATA: $(dist_files_DATA) @$(NORMAL_INSTALL) @list='$(dist_files_DATA)'; test -n "$(filesdir)" || list=; \ if test -n "$$list"; then \ echo " $(MKDIR_P) '$(DESTDIR)$(filesdir)'"; \ $(MKDIR_P) "$(DESTDIR)$(filesdir)" || exit 1; \ fi; \ for p in $$list; do \ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ echo "$$d$$p"; \ done | $(am__base_list) | \ while read files; do \ echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(filesdir)'"; \ $(INSTALL_DATA) $$files "$(DESTDIR)$(filesdir)" || exit $$?; \ done uninstall-dist_filesDATA: @$(NORMAL_UNINSTALL) @list='$(dist_files_DATA)'; test -n "$(filesdir)" || list=; \ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ dir='$(DESTDIR)$(filesdir)'; $(am__uninstall_files_from_dir) install-pkgconfigDATA: $(pkgconfig_DATA) @$(NORMAL_INSTALL) @list='$(pkgconfig_DATA)'; test -n "$(pkgconfigdir)" || list=; \ if test -n "$$list"; then \ echo " $(MKDIR_P) '$(DESTDIR)$(pkgconfigdir)'"; \ $(MKDIR_P) "$(DESTDIR)$(pkgconfigdir)" || exit 1; \ fi; \ for p in $$list; do \ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ echo "$$d$$p"; \ done | $(am__base_list) | \ while read files; do \ echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(pkgconfigdir)'"; \ $(INSTALL_DATA) $$files "$(DESTDIR)$(pkgconfigdir)" || exit $$?; \ done uninstall-pkgconfigDATA: @$(NORMAL_UNINSTALL) @list='$(pkgconfig_DATA)'; test -n "$(pkgconfigdir)" || list=; \ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ dir='$(DESTDIR)$(pkgconfigdir)'; $(am__uninstall_files_from_dir) install-nobase_elpa_includeHEADERS: $(nobase_elpa_include_HEADERS) @$(NORMAL_INSTALL) @list='$(nobase_elpa_include_HEADERS)'; test -n "$(elpa_includedir)" || list=; \ if test -n "$$list"; then \ echo " $(MKDIR_P) '$(DESTDIR)$(elpa_includedir)'"; \ $(MKDIR_P) "$(DESTDIR)$(elpa_includedir)" || exit 1; \ fi; \ 
$(am__nobase_list) | while read dir files; do \ xfiles=; for file in $$files; do \ if test -f "$$file"; then xfiles="$$xfiles $$file"; \ else xfiles="$$xfiles $(srcdir)/$$file"; fi; done; \ test -z "$$xfiles" || { \ test "x$$dir" = x. || { \ echo " $(MKDIR_P) '$(DESTDIR)$(elpa_includedir)/$$dir'"; \ $(MKDIR_P) "$(DESTDIR)$(elpa_includedir)/$$dir"; }; \ echo " $(INSTALL_HEADER) $$xfiles '$(DESTDIR)$(elpa_includedir)/$$dir'"; \ $(INSTALL_HEADER) $$xfiles "$(DESTDIR)$(elpa_includedir)/$$dir" || exit $$?; }; \ done uninstall-nobase_elpa_includeHEADERS: @$(NORMAL_UNINSTALL) @list='$(nobase_elpa_include_HEADERS)'; test -n "$(elpa_includedir)" || list=; \ $(am__nobase_strip_setup); files=`$(am__nobase_strip)`; \ dir='$(DESTDIR)$(elpa_includedir)'; $(am__uninstall_files_from_dir) ID: $(am__tagged_files) $(am__define_uniq_tagged_files); mkid -fID $$unique tags: tags-am TAGS: tags tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) set x; \ here=`pwd`; \ $(am__define_uniq_tagged_files); \ shift; \ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ test -n "$$unique" || unique=$$empty_fix; \ if test $$# -gt 0; then \ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ "$$@" $$unique; \ else \ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ $$unique; \ fi; \ fi ctags: ctags-am CTAGS: ctags ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) $(am__define_uniq_tagged_files); \ test -z "$(CTAGS_ARGS)$$unique" \ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ $$unique GTAGS: here=`$(am__cd) $(top_builddir) && pwd` \ && $(am__cd) $(top_srcdir) \ && gtags -i $(GTAGS_ARGS) "$$here" cscope: cscope.files test ! 
-s cscope.files \ || $(CSCOPE) -b -q $(AM_CSCOPEFLAGS) $(CSCOPEFLAGS) -i cscope.files $(CSCOPE_ARGS) clean-cscope: -rm -f cscope.files cscope.files: clean-cscope cscopelist cscopelist: cscopelist-am cscopelist-am: $(am__tagged_files) list='$(am__tagged_files)'; \ case "$(srcdir)" in \ [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ *) sdir=$(subdir)/$(srcdir) ;; \ esac; \ for i in $$list; do \ if test -f "$$i"; then \ echo "$(subdir)/$$i"; \ else \ echo "$$sdir/$$i"; \ fi; \ done >> $(top_builddir)/cscope.files distclean-tags: -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags -rm -f cscope.out cscope.in.out cscope.po.out cscope.files # Recover from deleted '.trs' file; this should ensure that # "rm -f foo.log; make foo.trs" re-run 'foo.test', and re-create # both 'foo.log' and 'foo.trs'. Break the recipe in two subshells # to avoid problems with "make -n". .log.trs: rm -f $< $@ $(MAKE) $(AM_MAKEFLAGS) $< # Leading 'am--fnord' is there to ensure the list of targets does not # expand to empty, as could happen e.g. with make check TESTS=''. 
# Every test log and .trs file additionally depends on whatever
# $(am__force_recheck) names.  The 'recheck' target below passes
# am__force_recheck=am--force-recheck to its sub-make.
am--fnord $(TEST_LOGS) $(TEST_LOGS:.log=.trs): $(am__force_recheck)
# Target used as that forced prerequisite; its recipe is the shell no-op ':'.
am--force-recheck:
	@:

# Build the test-suite summary log from the per-test result files.
#
# First command: for every test base name (set up by $(am__set_TESTS_bases))
# whose .trs or .log file is missing or unreadable, remove both files and
# re-make the .log files through a sub-make.  The am__remaking_logs
# environment variable is the guard against endless recursion.  Unless this
# is a dry run, fail if any of those files still does not exist afterwards.
#
# Second command: count the ':test-result:' lines of each kind in the .trs
# files, write the report followed by the output of $(am__create_global_log)
# to $(TEST_SUITE_LOG) by way of a temporary file, print a summary (coloured
# when the colour variables are non-empty), and exit non-zero as soon as one
# FAIL, XPASS or ERROR result was recorded.
#
# 'ws' is the bracket expression matched around the ':test-result:' field.
# It must hold a space AND a TAB: with a space only, tab-separated result
# lines would not be counted and a failing suite could be reported as
# successful.
$(TEST_SUITE_LOG): $(TEST_LOGS)
	@$(am__set_TESTS_bases); \
	am__f_ok () { test -f "$$1" && test -r "$$1"; }; \
	redo_bases=`for i in $$bases; do \
	              am__f_ok $$i.trs && am__f_ok $$i.log || echo $$i; \
	            done`; \
	if test -n "$$redo_bases"; then \
	  redo_logs=`for i in $$redo_bases; do echo $$i.log; done`; \
	  redo_results=`for i in $$redo_bases; do echo $$i.trs; done`; \
	  if $(am__make_dryrun); then :; else \
	    rm -f $$redo_logs && rm -f $$redo_results || exit 1; \
	  fi; \
	fi; \
	if test -n "$$am__remaking_logs"; then \
	  echo "fatal: making $(TEST_SUITE_LOG): possible infinite" \
	       "recursion detected" >&2; \
	elif test -n "$$redo_logs"; then \
	  am__remaking_logs=yes $(MAKE) $(AM_MAKEFLAGS) $$redo_logs; \
	fi; \
	if $(am__make_dryrun); then :; else \
	  st=0; \
	  errmsg="fatal: making $(TEST_SUITE_LOG): failed to create"; \
	  for i in $$redo_bases; do \
	    test -f $$i.trs && test -r $$i.trs \
	      || { echo "$$errmsg $$i.trs" >&2; st=1; }; \
	    test -f $$i.log && test -r $$i.log \
	      || { echo "$$errmsg $$i.log" >&2; st=1; }; \
	  done; \
	  test $$st -eq 0 || exit 1; \
	fi
	@$(am__sh_e_setup); $(am__tty_colors); $(am__set_TESTS_bases); \
	ws='[ 	]'; \
	results=`for b in $$bases; do echo $$b.trs; done`; \
	test -n "$$results" || results=/dev/null; \
	all=` grep "^$$ws*:test-result:" $$results | wc -l`; \
	pass=` grep "^$$ws*:test-result:$$ws*PASS" $$results | wc -l`; \
	fail=` grep "^$$ws*:test-result:$$ws*FAIL" $$results | wc -l`; \
	skip=` grep "^$$ws*:test-result:$$ws*SKIP" $$results | wc -l`; \
	xfail=`grep "^$$ws*:test-result:$$ws*XFAIL" $$results | wc -l`; \
	xpass=`grep "^$$ws*:test-result:$$ws*XPASS" $$results | wc -l`; \
	error=`grep "^$$ws*:test-result:$$ws*ERROR" $$results | wc -l`; \
	if test `expr $$fail + $$xpass + $$error` -eq 0; then \
	  success=true; \
	else \
	  success=false; \
	fi; \
	br='==================='; br=$$br$$br$$br$$br; \
	result_count () \
	{ \
	    if test x"$$1" = x"--maybe-color"; then \
	      maybe_colorize=yes; \
	    elif test x"$$1" = x"--no-color"; then \
	      maybe_colorize=no; \
	    else \
	      echo "$@: invalid 'result_count' usage" >&2; exit 4; \
	    fi; \
	    shift; \
	    desc=$$1 count=$$2; \
	    if test $$maybe_colorize = yes && test $$count -gt 0; then \
	      color_start=$$3 color_end=$$std; \
	    else \
	      color_start= color_end=; \
	    fi; \
	    echo "$${color_start}# $$desc $$count$${color_end}"; \
	}; \
	create_testsuite_report () \
	{ \
	  result_count $$1 "TOTAL:" $$all "$$brg"; \
	  result_count $$1 "PASS: " $$pass "$$grn"; \
	  result_count $$1 "SKIP: " $$skip "$$blu"; \
	  result_count $$1 "XFAIL:" $$xfail "$$lgn"; \
	  result_count $$1 "FAIL: " $$fail "$$red"; \
	  result_count $$1 "XPASS:" $$xpass "$$red"; \
	  result_count $$1 "ERROR:" $$error "$$mgn"; \
	}; \
	{ \
	  echo "$(PACKAGE_STRING): $(subdir)/$(TEST_SUITE_LOG)" | \
	    $(am__rst_title); \
	  create_testsuite_report --no-color; \
	  echo; \
	  echo ".. contents:: :depth: 2"; \
	  echo; \
	  for b in $$bases; do echo $$b; done \
	    | $(am__create_global_log); \
	} >$(TEST_SUITE_LOG).tmp || exit 1; \
	mv $(TEST_SUITE_LOG).tmp $(TEST_SUITE_LOG); \
	if $$success; then \
	  col="$$grn"; \
	else \
	  col="$$red"; \
	  test x"$$VERBOSE" = x || cat $(TEST_SUITE_LOG); \
	fi; \
	echo "$${col}$$br$${std}"; \
	echo "$${col}Testsuite summary for $(PACKAGE_STRING)$${std}"; \
	echo "$${col}$$br$${std}"; \
	create_testsuite_report --maybe-color; \
	echo "$$col$$br$$std"; \
	if $$success; then :; else \
	  echo "$${col}See $(subdir)/$(TEST_SUITE_LOG)$${std}"; \
	  if test -n "$(PACKAGE_BUGREPORT)"; then \
	    echo "$${col}Please report to $(PACKAGE_BUGREPORT)$${std}"; \
	  fi; \
	  echo "$$col$$br$$std"; \
	fi; \
	$$success || exit 1

# Run the test suite: drop the logs listed in $(RECHECK_LOGS), their .trs
# counterparts and the old summary log, then re-make $(TEST_SUITE_LOG) in a
# sub-make with TEST_LOGS set to the .log name of every test base.  The
# recipe exits with the sub-make's status.
check-TESTS:
	@list='$(RECHECK_LOGS)'; test -z "$$list" || rm -f $$list
	@list='$(RECHECK_LOGS:.log=.trs)'; test -z "$$list" || rm -f $$list
	@test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG)
	@set +e; $(am__set_TESTS_bases); \
	log_list=`for i in $$bases; do echo $$i.log; done`; \
	trs_list=`for i in $$bases; do echo $$i.trs; done`; \
	log_list=`echo $$log_list`; trs_list=`echo $$trs_list`; \
	$(MAKE) $(AM_MAKEFLAGS) $(TEST_SUITE_LOG) TEST_LOGS="$$log_list"; \
	exit $$?;
# Re-run only the tests selected by $(am__list_recheck_tests): the filtered
# base names become TEST_LOGS for the sub-make, and am__force_recheck is set
# so that those logs gain the am--force-recheck prerequisite.
recheck: all $(check_SCRIPTS)
	@test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG)
	@set +e; $(am__set_TESTS_bases); \
	bases=`for i in $$bases; do echo $$i; done \
	         | $(am__list_recheck_tests)` || exit 1; \
	log_list=`for i in $$bases; do echo $$i.log; done`; \
	log_list=`echo $$log_list`; \
	$(MAKE) $(AM_MAKEFLAGS) $(TEST_SUITE_LOG) \
	        am__force_recheck=am--force-recheck \
	        TEST_LOGS="$$log_list"; \
	exit $$?

# Per-test rules.  Each one sets p (the test as listed) and b (the base name
# of its output files), then runs the test through $(LOG_DRIVER), which is
# told to write $$b.log and $$b.trs.  "$$f" and "$$tst" are not assigned in
# these recipes; presumably $(am__check_pre) derives them from p -- confirm
# against its definition.
elpa1_test_real@SUFFIX@.sh.log: elpa1_test_real@SUFFIX@.sh
	@p='elpa1_test_real@SUFFIX@.sh'; \
	b='elpa1_test_real@SUFFIX@.sh'; \
	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
	--log-file $$b.log --trs-file $$b.trs \
	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
	"$$tst" $(AM_TESTS_FD_REDIRECT)
elpa1_test_real_with_c@SUFFIX@.sh.log: elpa1_test_real_with_c@SUFFIX@.sh
	@p='elpa1_test_real_with_c@SUFFIX@.sh'; \
	b='elpa1_test_real_with_c@SUFFIX@.sh'; \
	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
	--log-file $$b.log --trs-file $$b.trs \
	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
	"$$tst" $(AM_TESTS_FD_REDIRECT)
elpa2_test_real@SUFFIX@.sh.log: elpa2_test_real@SUFFIX@.sh
	@p='elpa2_test_real@SUFFIX@.sh'; \
	b='elpa2_test_real@SUFFIX@.sh'; \
	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
	--log-file $$b.log --trs-file $$b.trs \
	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
	"$$tst" $(AM_TESTS_FD_REDIRECT)
elpa2_test_real_default_kernel@SUFFIX@.sh.log: elpa2_test_real_default_kernel@SUFFIX@.sh
	@p='elpa2_test_real_default_kernel@SUFFIX@.sh'; \
	b='elpa2_test_real_default_kernel@SUFFIX@.sh'; \
	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
	--log-file $$b.log --trs-file $$b.trs \
	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
	"$$tst" $(AM_TESTS_FD_REDIRECT)
elpa1_test_complex@SUFFIX@.sh.log: elpa1_test_complex@SUFFIX@.sh
	@p='elpa1_test_complex@SUFFIX@.sh'; \
	b='elpa1_test_complex@SUFFIX@.sh'; \
	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
	--log-file $$b.log --trs-file $$b.trs \
	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
	"$$tst" $(AM_TESTS_FD_REDIRECT)
elpa2_test_complex@SUFFIX@.sh.log: elpa2_test_complex@SUFFIX@.sh
	@p='elpa2_test_complex@SUFFIX@.sh'; \
	b='elpa2_test_complex@SUFFIX@.sh'; \
	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
	--log-file $$b.log --trs-file $$b.trs \
	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
	"$$tst" $(AM_TESTS_FD_REDIRECT)
elpa2_test_complex_default_kernel@SUFFIX@.sh.log: elpa2_test_complex_default_kernel@SUFFIX@.sh
	@p='elpa2_test_complex_default_kernel@SUFFIX@.sh'; \
	b='elpa2_test_complex_default_kernel@SUFFIX@.sh'; \
	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
	--log-file $$b.log --trs-file $$b.trs \
	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
	"$$tst" $(AM_TESTS_FD_REDIRECT)
elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh.log: elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh
	@p='elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh'; \
	b='elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh'; \
	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
	--log-file $$b.log --trs-file $$b.trs \
	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
	"$$tst" $(AM_TESTS_FD_REDIRECT)
elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh.log: elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh
	@p='elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh'; \
	b='elpa2_test_real_choose_kernel_with_api@SUFFIX@.sh'; \
	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
	--log-file $$b.log --trs-file $$b.trs \
	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
	"$$tst" $(AM_TESTS_FD_REDIRECT)
# Test-log rules for five more checks.  In every recipe p holds the test as
# listed and b the base name of its result files; the test is then handed to
# $(LOG_DRIVER), which is asked to write $$b.log and $$b.trs.  "$$f" and
# "$$tst" are not assigned here; presumably $(am__check_pre) computes them
# from p -- confirm against its definition.
elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh.log: elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh
	@p='elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh'; b='elpa2_test_complex_choose_kernel_with_api@SUFFIX@.sh'; \
	$(am__check_pre) \
	$(LOG_DRIVER) --test-name "$$f" --log-file $$b.log --trs-file $$b.trs \
	  $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) \
	  -- $(LOG_COMPILE) "$$tst" $(AM_TESTS_FD_REDIRECT)

# The only test in this group that is a built program rather than a .sh
# wrapper: p carries $(EXEEXT) while b (and hence the log name) does not.
elpa2_print_kernels@SUFFIX@.log: elpa2_print_kernels@SUFFIX@$(EXEEXT)
	@p='elpa2_print_kernels@SUFFIX@$(EXEEXT)'; b='elpa2_print_kernels@SUFFIX@'; \
	$(am__check_pre) \
	$(LOG_DRIVER) --test-name "$$f" --log-file $$b.log --trs-file $$b.trs \
	  $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) \
	  -- $(LOG_COMPILE) "$$tst" $(AM_TESTS_FD_REDIRECT)

elpa1_test_real_c_version@SUFFIX@.sh.log: elpa1_test_real_c_version@SUFFIX@.sh
	@p='elpa1_test_real_c_version@SUFFIX@.sh'; b='elpa1_test_real_c_version@SUFFIX@.sh'; \
	$(am__check_pre) \
	$(LOG_DRIVER) --test-name "$$f" --log-file $$b.log --trs-file $$b.trs \
	  $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) \
	  -- $(LOG_COMPILE) "$$tst" $(AM_TESTS_FD_REDIRECT)

elpa1_test_complex_c_version@SUFFIX@.sh.log: elpa1_test_complex_c_version@SUFFIX@.sh
	@p='elpa1_test_complex_c_version@SUFFIX@.sh'; b='elpa1_test_complex_c_version@SUFFIX@.sh'; \
	$(am__check_pre) \
	$(LOG_DRIVER) --test-name "$$f" --log-file $$b.log --trs-file $$b.trs \
	  $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) \
	  -- $(LOG_COMPILE) "$$tst" $(AM_TESTS_FD_REDIRECT)

elpa2_test_real_c_version@SUFFIX@.sh.log: elpa2_test_real_c_version@SUFFIX@.sh
	@p='elpa2_test_real_c_version@SUFFIX@.sh'; b='elpa2_test_real_c_version@SUFFIX@.sh'; \
	$(am__check_pre) \
	$(LOG_DRIVER) --test-name "$$f" --log-file $$b.log --trs-file $$b.trs \
	  $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) \
	  -- $(LOG_COMPILE) "$$tst" $(AM_TESTS_FD_REDIRECT)
elpa2_test_complex_c_version@SUFFIX@.sh.log: elpa2_test_complex_c_version@SUFFIX@.sh @p='elpa2_test_complex_c_version@SUFFIX@.sh'; \ b='elpa2_test_complex_c_version@SUFFIX@.sh'; \ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ "$$tst" $(AM_TESTS_FD_REDIRECT) .test.log: @p='$<'; \ $(am__set_b); \ $(am__check_pre) $(TEST_LOG_DRIVER) --test-name "$$f" \ --log-file $$b.log --trs-file $$b.trs \ $(am__common_driver_flags) $(AM_TEST_LOG_DRIVER_FLAGS) $(TEST_LOG_DRIVER_FLAGS) -- $(TEST_LOG_COMPILE) \ "$$tst" $(AM_TESTS_FD_REDIRECT) @am__EXEEXT_TRUE@.test$(EXEEXT).log: @am__EXEEXT_TRUE@ @p='$<'; \ @am__EXEEXT_TRUE@ $(am__set_b); \ @am__EXEEXT_TRUE@ $(am__check_pre) $(TEST_LOG_DRIVER) --test-name "$$f" \ @am__EXEEXT_TRUE@ --log-file $$b.log --trs-file $$b.trs \ @am__EXEEXT_TRUE@ $(am__common_driver_flags) $(AM_TEST_LOG_DRIVER_FLAGS) $(TEST_LOG_DRIVER_FLAGS) -- $(TEST_LOG_COMPILE) \ @am__EXEEXT_TRUE@ "$$tst" $(AM_TESTS_FD_REDIRECT) distdir: $(DISTFILES) $(am__remove_distdir) test -d "$(distdir)" || mkdir "$(distdir)" @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ list='$(DISTFILES)'; \ dist_files=`for file in $$list; do echo $$file; done | \ sed -e "s|^$$srcdirstrip/||;t" \ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ case $$dist_files in \ */*) $(MKDIR_P) `echo "$$dist_files" | \ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ sort -u` ;; \ esac; \ for file in $$dist_files; do \ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ if test -d $$d/$$file; then \ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ if test -d "$(distdir)/$$file"; then \ find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ fi; \ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ fi; \ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ else \ test -f "$(distdir)/$$file" \ || cp -p $$d/$$file "$(distdir)/$$file" \ || exit 1; \ fi; \ done -test -n "$(am__skip_mode_fix)" \ || find "$(distdir)" -type d ! -perm -755 \ -exec chmod u+rwx,go+rx {} \; -o \ ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \ ! -type d ! -perm -400 -exec chmod a+r {} \; -o \ ! -type d ! -perm -444 -exec $(install_sh) -c -m a+r {} {} \; \ || chmod -R a+r "$(distdir)" dist-gzip: distdir tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz $(am__post_remove_distdir) dist-bzip2: distdir tardir=$(distdir) && $(am__tar) | BZIP2=$${BZIP2--9} bzip2 -c >$(distdir).tar.bz2 $(am__post_remove_distdir) dist-lzip: distdir tardir=$(distdir) && $(am__tar) | lzip -c $${LZIP_OPT--9} >$(distdir).tar.lz $(am__post_remove_distdir) dist-xz: distdir tardir=$(distdir) && $(am__tar) | XZ_OPT=$${XZ_OPT--e} xz -c >$(distdir).tar.xz $(am__post_remove_distdir) dist-tarZ: distdir @echo WARNING: "Support for distribution archives compressed with" \ "legacy program 'compress' is deprecated." >&2 @echo WARNING: "It will be removed altogether in Automake 2.0" >&2 tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z $(am__post_remove_distdir) dist-shar: distdir @echo WARNING: "Support for shar distribution archives is" \ "deprecated." 
>&2 @echo WARNING: "It will be removed altogether in Automake 2.0" >&2 shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz $(am__post_remove_distdir) dist-zip: distdir -rm -f $(distdir).zip zip -rq $(distdir).zip $(distdir) $(am__post_remove_distdir) dist dist-all: $(MAKE) $(AM_MAKEFLAGS) $(DIST_TARGETS) am__post_remove_distdir='@:' $(am__post_remove_distdir) # This target untars the dist file and tries a VPATH configuration. Then # it guarantees that the distribution is self-contained by making another # tarfile. distcheck: dist case '$(DIST_ARCHIVES)' in \ *.tar.gz*) \ GZIP=$(GZIP_ENV) gzip -dc $(distdir).tar.gz | $(am__untar) ;;\ *.tar.bz2*) \ bzip2 -dc $(distdir).tar.bz2 | $(am__untar) ;;\ *.tar.lz*) \ lzip -dc $(distdir).tar.lz | $(am__untar) ;;\ *.tar.xz*) \ xz -dc $(distdir).tar.xz | $(am__untar) ;;\ *.tar.Z*) \ uncompress -c $(distdir).tar.Z | $(am__untar) ;;\ *.shar.gz*) \ GZIP=$(GZIP_ENV) gzip -dc $(distdir).shar.gz | unshar ;;\ *.zip*) \ unzip $(distdir).zip ;;\ esac chmod -R a-w $(distdir) chmod u+w $(distdir) mkdir $(distdir)/_build $(distdir)/_build/sub $(distdir)/_inst chmod a-w $(distdir) test -d $(distdir)/_build || exit 0; \ dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \ && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \ && am__cwd=`pwd` \ && $(am__cd) $(distdir)/_build/sub \ && ../../configure \ $(AM_DISTCHECK_CONFIGURE_FLAGS) \ $(DISTCHECK_CONFIGURE_FLAGS) \ --srcdir=../.. --prefix="$$dc_install_base" \ && $(MAKE) $(AM_MAKEFLAGS) \ && $(MAKE) $(AM_MAKEFLAGS) dvi \ && $(MAKE) $(AM_MAKEFLAGS) check \ && $(MAKE) $(AM_MAKEFLAGS) install \ && $(MAKE) $(AM_MAKEFLAGS) installcheck \ && $(MAKE) $(AM_MAKEFLAGS) uninstall \ && $(MAKE) $(AM_MAKEFLAGS) distuninstallcheck_dir="$$dc_install_base" \ distuninstallcheck \ && chmod -R a-w "$$dc_install_base" \ && ({ \ (cd ../.. 
&& umask 077 && mkdir "$$dc_destdir") \ && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" install \ && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" uninstall \ && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" \ distuninstallcheck_dir="$$dc_destdir" distuninstallcheck; \ } || { rm -rf "$$dc_destdir"; exit 1; }) \ && rm -rf "$$dc_destdir" \ && $(MAKE) $(AM_MAKEFLAGS) dist \ && rm -rf $(DIST_ARCHIVES) \ && $(MAKE) $(AM_MAKEFLAGS) distcleancheck \ && cd "$$am__cwd" \ || exit 1 $(am__post_remove_distdir) @(echo "$(distdir) archives ready for distribution: "; \ list='$(DIST_ARCHIVES)'; for i in $$list; do echo $$i; done) | \ sed -e 1h -e 1s/./=/g -e 1p -e 1x -e '$$p' -e '$$x' distuninstallcheck: @test -n '$(distuninstallcheck_dir)' || { \ echo 'ERROR: trying to run $@ with an empty' \ '$$(distuninstallcheck_dir)' >&2; \ exit 1; \ }; \ $(am__cd) '$(distuninstallcheck_dir)' || { \ echo 'ERROR: cannot chdir into $(distuninstallcheck_dir)' >&2; \ exit 1; \ }; \ test `$(am__distuninstallcheck_listfiles) | wc -l` -eq 0 \ || { echo "ERROR: files left after uninstall:" ; \ if test -n "$(DESTDIR)"; then \ echo " (check DESTDIR support)"; \ fi ; \ $(distuninstallcheck_listfiles) ; \ exit 1; } >&2 distcleancheck: distclean @if test '$(srcdir)' = . 
; then \ echo "ERROR: distcleancheck can only run from a VPATH build" ; \ exit 1 ; \ fi @test `$(distcleancheck_listfiles) | wc -l` -eq 0 \ || { echo "ERROR: files left in build directory after distclean:" ; \ $(distcleancheck_listfiles) ; \ exit 1; } >&2 check-am: all-am $(MAKE) $(AM_MAKEFLAGS) $(check_SCRIPTS) $(MAKE) $(AM_MAKEFLAGS) check-TESTS check: $(BUILT_SOURCES) $(MAKE) $(AM_MAKEFLAGS) check-am all-am: Makefile $(LTLIBRARIES) $(PROGRAMS) $(MANS) $(DATA) $(HEADERS) \ config.h install-binPROGRAMS: install-libLTLIBRARIES installdirs: for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(man1dir)" "$(DESTDIR)$(man3dir)" "$(DESTDIR)$(docdir)" "$(DESTDIR)$(filesdir)" "$(DESTDIR)$(pkgconfigdir)" "$(DESTDIR)$(elpa_includedir)"; do \ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ done install: $(BUILT_SOURCES) $(MAKE) $(AM_MAKEFLAGS) install-am install-exec: install-exec-am install-data: install-data-am uninstall: uninstall-am install-am: all-am @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am installcheck: installcheck-am install-strip: if test -z '$(STRIP)'; then \ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ install; \ else \ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ fi mostlyclean-generic: -test -z "$(TEST_LOGS)" || rm -f $(TEST_LOGS) -test -z "$(TEST_LOGS:.log=.trs)" || rm -f $(TEST_LOGS:.log=.trs) -test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG) clean-generic: -test -z "$(CLEANFILES)" || rm -f $(CLEANFILES) distclean-generic: -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) -test . 
= "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) -rm -f src/$(DEPDIR)/$(am__dirstamp) -rm -f src/$(am__dirstamp) -rm -f src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) -rm -f src/elpa2_kernels/$(am__dirstamp) -rm -f src/elpa_qr/$(DEPDIR)/$(am__dirstamp) -rm -f src/elpa_qr/$(am__dirstamp) -rm -f src/ftimings/$(DEPDIR)/$(am__dirstamp) -rm -f src/ftimings/$(am__dirstamp) -rm -f test/c_test_programs/$(DEPDIR)/$(am__dirstamp) -rm -f test/c_test_programs/$(am__dirstamp) -rm -f test/fortran_test_programs/$(DEPDIR)/$(am__dirstamp) -rm -f test/fortran_test_programs/$(am__dirstamp) -rm -f test/shared_sources/$(DEPDIR)/$(am__dirstamp) -rm -f test/shared_sources/$(am__dirstamp) maintainer-clean-generic: @echo "This command is intended for maintainers to use" @echo "it deletes files that may require special tools to rebuild." -test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES) clean: clean-am clean-am: clean-binPROGRAMS clean-generic clean-libLTLIBRARIES \ clean-libtool clean-local clean-noinstPROGRAMS mostlyclean-am distclean: distclean-am -rm -f $(am__CONFIG_DISTCLEAN_FILES) -rm -rf src/elpa2_kernels/$(DEPDIR) src/ftimings/$(DEPDIR) test/c_test_programs/$(DEPDIR) test/shared_sources/$(DEPDIR) -rm -f Makefile distclean-am: clean-am distclean-compile distclean-generic \ distclean-hdr distclean-libtool distclean-local distclean-tags dvi: dvi-am dvi-am: html: html-am html-am: info: info-am info-am: install-data-am: install-dist_docDATA install-dist_filesDATA \ install-man install-nobase_elpa_includeHEADERS \ install-pkgconfigDATA install-dvi: install-dvi-am install-dvi-am: install-exec-am: install-binPROGRAMS install-libLTLIBRARIES install-html: install-html-am install-html-am: install-info: install-info-am install-info-am: install-man: install-man1 install-man3 install-pdf: install-pdf-am install-pdf-am: install-ps: install-ps-am install-ps-am: installcheck-am: maintainer-clean: maintainer-clean-am -rm -f $(am__CONFIG_DISTCLEAN_FILES) -rm 
-rf $(top_srcdir)/autom4te.cache -rm -rf src/elpa2_kernels/$(DEPDIR) src/ftimings/$(DEPDIR) test/c_test_programs/$(DEPDIR) test/shared_sources/$(DEPDIR) -rm -f Makefile maintainer-clean-am: distclean-am maintainer-clean-generic mostlyclean: mostlyclean-am mostlyclean-am: mostlyclean-compile mostlyclean-generic \ mostlyclean-libtool pdf: pdf-am pdf-am: ps: ps-am ps-am: uninstall-am: uninstall-binPROGRAMS uninstall-dist_docDATA \ uninstall-dist_filesDATA uninstall-libLTLIBRARIES \ uninstall-man uninstall-nobase_elpa_includeHEADERS \ uninstall-pkgconfigDATA uninstall-man: uninstall-man1 uninstall-man3 .MAKE: all check check-am install install-am install-strip .PHONY: CTAGS GTAGS TAGS all all-am am--refresh check check-TESTS \ check-am clean clean-binPROGRAMS clean-cscope clean-generic \ clean-libLTLIBRARIES clean-libtool clean-local \ clean-noinstPROGRAMS cscope cscopelist-am ctags ctags-am dist \ dist-all dist-bzip2 dist-gzip dist-lzip dist-shar dist-tarZ \ dist-xz dist-zip distcheck distclean distclean-compile \ distclean-generic distclean-hdr distclean-libtool \ distclean-local distclean-tags distcleancheck distdir \ distuninstallcheck dvi dvi-am html html-am info info-am \ install install-am install-binPROGRAMS install-data \ install-data-am install-dist_docDATA install-dist_filesDATA \ install-dvi install-dvi-am install-exec install-exec-am \ install-html install-html-am install-info install-info-am \ install-libLTLIBRARIES install-man install-man1 install-man3 \ install-nobase_elpa_includeHEADERS install-pdf install-pdf-am \ install-pkgconfigDATA install-ps install-ps-am install-strip \ installcheck installcheck-am installdirs maintainer-clean \ maintainer-clean-generic mostlyclean mostlyclean-compile \ mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ recheck tags tags-am uninstall uninstall-am \ uninstall-binPROGRAMS uninstall-dist_docDATA \ uninstall-dist_filesDATA uninstall-libLTLIBRARIES \ uninstall-man uninstall-man1 uninstall-man3 \ 
uninstall-nobase_elpa_includeHEADERS uninstall-pkgconfigDATA .PRECIOUS: Makefile define extract_interface @echo "Generating $@..."; @grep -h "^ *$1" $^ | sed 's/^ *$1//;' >> $@ || { rm $@; exit 1; } endef elpa test: mkdir $@ test/shared_sources: | test mkdir $@ config-f90.h: config.h @echo "Generating $@..."; @grep "^#define" $^ > $@ || { rm $@; exit 1; } elpa/elpa_generated.h: $(top_srcdir)/src/elpa_c_interface.F90 | elpa $(call extract_interface,!c>) test/shared_sources/generated.h: $(wildcard $(top_srcdir)/test/shared_sources/*.F90) | test/shared_sources $(call extract_interface,!c>) elpa/elpa_generated_fortran_interfaces.h: $(wildcard $(top_srcdir)/src/elpa2_kernels/*.c) $(wildcard $(top_srcdir)/src/elpa2_kernels/*.s) | elpa $(call extract_interface,!f>) $(call extract_interface,#!f>) generated-headers: $(generated_headers) %.sh: % echo '$(wrapper)./$^ $$TEST_FLAGS' > $@ chmod +x $@ #elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh: # echo '$(wrapper)./elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@' > $@ # chmod +x $@ #elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@.sh: # echo '$(wrapper)./elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@' > $@ # chmod +x $@ # Preprocessed files (just used for manual inspection) elpa2_utilities.i: $(top_srcdir)/src/elpa2_utilities.F90 $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2_utilities.F90 -o $@ elpa2.i: $(top_srcdir)/src/elpa2.F90 $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2.F90 -o $@ elpa1.i: $(top_srcdir)/src/elpa1.F90 $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@ elpa2_kernels_real.i: $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90 $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90 -o $@ mod_compute_hh_trafo_real.i: $(top_srcdir)/src/mod_compute_hh_trafo_real.F90 $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c 
$(top_srcdir)/src/mod_compute_hh_trafo_real.F90 -o $@ mod_compute_hh_trafo_complex.i: $(top_srcdir)/src/mod_compute_hh_trafo_complex.F90 $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/mod_compute_hh_trafo_complex.F90 -o $@ @DX_COND_doc_TRUE@@DX_COND_ps_TRUE@doxygen-ps: @DX_DOCDIR@/@PACKAGE@.ps @DX_COND_doc_TRUE@@DX_COND_ps_TRUE@@DX_DOCDIR@/@PACKAGE@.ps: @DX_DOCDIR@/@PACKAGE@.tag @DX_COND_doc_TRUE@@DX_COND_ps_TRUE@ cd @DX_DOCDIR@/latex; \ @DX_COND_doc_TRUE@@DX_COND_ps_TRUE@ rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \ @DX_COND_doc_TRUE@@DX_COND_ps_TRUE@ $(DX_LATEX) refman.tex; \ @DX_COND_doc_TRUE@@DX_COND_ps_TRUE@ $(MAKEINDEX_PATH) refman.idx; \ @DX_COND_doc_TRUE@@DX_COND_ps_TRUE@ $(DX_LATEX) refman.tex; \ @DX_COND_doc_TRUE@@DX_COND_ps_TRUE@ countdown=5; \ @DX_COND_doc_TRUE@@DX_COND_ps_TRUE@ while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \ @DX_COND_doc_TRUE@@DX_COND_ps_TRUE@ refman.log > /dev/null 2>&1 \ @DX_COND_doc_TRUE@@DX_COND_ps_TRUE@ && test $$countdown -gt 0; do \ @DX_COND_doc_TRUE@@DX_COND_ps_TRUE@ $(DX_LATEX) refman.tex; \ @DX_COND_doc_TRUE@@DX_COND_ps_TRUE@ countdown=`expr $$countdown - 1`; \ @DX_COND_doc_TRUE@@DX_COND_ps_TRUE@ done; \ @DX_COND_doc_TRUE@@DX_COND_ps_TRUE@ $(DX_DVIPS) -o ../@PACKAGE@.ps refman.dvi @DX_COND_doc_TRUE@@DX_COND_pdf_TRUE@doxygen-pdf: @DX_DOCDIR@/@PACKAGE@.pdf @DX_COND_doc_TRUE@@DX_COND_pdf_TRUE@@DX_DOCDIR@/@PACKAGE@.pdf: @DX_DOCDIR@/@PACKAGE@.tag @DX_COND_doc_TRUE@@DX_COND_pdf_TRUE@ cd @DX_DOCDIR@/latex; \ @DX_COND_doc_TRUE@@DX_COND_pdf_TRUE@ rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \ @DX_COND_doc_TRUE@@DX_COND_pdf_TRUE@ $(DX_PDFLATEX) refman.tex; \ @DX_COND_doc_TRUE@@DX_COND_pdf_TRUE@ $(DX_MAKEINDEX) refman.idx; \ @DX_COND_doc_TRUE@@DX_COND_pdf_TRUE@ $(DX_PDFLATEX) refman.tex; \ @DX_COND_doc_TRUE@@DX_COND_pdf_TRUE@ countdown=5; \ @DX_COND_doc_TRUE@@DX_COND_pdf_TRUE@ while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \ @DX_COND_doc_TRUE@@DX_COND_pdf_TRUE@ refman.log > 
/dev/null 2>&1 \ @DX_COND_doc_TRUE@@DX_COND_pdf_TRUE@ && test $$countdown -gt 0; do \ @DX_COND_doc_TRUE@@DX_COND_pdf_TRUE@ $(DX_PDFLATEX) refman.tex; \ @DX_COND_doc_TRUE@@DX_COND_pdf_TRUE@ countdown=`expr $$countdown - 1`; \ @DX_COND_doc_TRUE@@DX_COND_pdf_TRUE@ done; \ @DX_COND_doc_TRUE@@DX_COND_pdf_TRUE@ mv refman.pdf ../@PACKAGE@.pdf @DX_COND_doc_TRUE@.PHONY: doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL) @DX_COND_doc_TRUE@.INTERMEDIATE: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL) @DX_COND_doc_TRUE@doxygen-run: @DX_DOCDIR@/@PACKAGE@.tag @DX_COND_doc_TRUE@doxygen-doc: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL) @DX_COND_doc_TRUE@@DX_DOCDIR@/@PACKAGE@.tag: $(DX_CONFIG) $(pkginclude_HEADERS) @DX_COND_doc_TRUE@ rm -rf @DX_DOCDIR@ @DX_COND_doc_TRUE@ $(DX_ENV) $(DX_DOXYGEN) $(DX_CONFIG) clean-local: -rm -rf modules/* .fortran_dependencies/* -rm -rf $(generated_headers) distclean-local: -rm config-f90.h -rm -rf ./src/elpa2_kernels/.deps -rm -rf ./src/.deps -rm -rf ./test/.deps -rmdir ./src/elpa2_kernels/ -rmdir ./src -rmdir ./test -rmdir ./m4 -rmdir modules/ -rmdir .fortran_dependencies/ libtool: $(LIBTOOL_DEPS) $(SHELL) ./config.status libtool @FORTRAN_MODULE_DEPS@ # Fortran module dependencies only work within each target, # specify that the test programs need a finished library before # one can compile them # $1 Object name define require_elpa_lib $1: libelpa@SUFFIX@.la endef $(foreach p,$(bin_PROGRAMS) $(noinst_PROGRAMS),$(foreach o,$($p_OBJECTS),$(eval $(call require_elpa_lib,$o)))) # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. .NOEXPORT: elpa-2016.05.001/missing0000755000312500001440000001533012717533405011432 00000000000000#! /bin/sh # Common wrapper for a few potentially missing GNU programs. scriptversion=2013-10-28.13; # UTC # Copyright (C) 1996-2014 Free Software Foundation, Inc. # Originally written by Fran,cois Pinard , 1996. 
# This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2, or (at your option) # any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program. If not, see . # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under # the same distribution terms that you use for the rest of that program. if test $# -eq 0; then echo 1>&2 "Try '$0 --help' for more information" exit 1 fi case $1 in --is-lightweight) # Used by our autoconf macros to check whether the available missing # script is modern enough. exit 0 ;; --run) # Back-compat with the calling convention used by older automake. shift ;; -h|--h|--he|--hel|--help) echo "\ $0 [OPTION]... PROGRAM [ARGUMENT]... Run 'PROGRAM [ARGUMENT]...', returning a proper advice when this fails due to PROGRAM being missing or too old. Options: -h, --help display this help and exit -v, --version output version information and exit Supported PROGRAM values: aclocal autoconf autoheader autom4te automake makeinfo bison yacc flex lex help2man Version suffixes to PROGRAM as well as the prefixes 'gnu-', 'gnu', and 'g' are ignored when checking the name. Send bug reports to ." exit $? ;; -v|--v|--ve|--ver|--vers|--versi|--versio|--version) echo "missing $scriptversion (GNU Automake)" exit $? ;; -*) echo 1>&2 "$0: unknown '$1' option" echo 1>&2 "Try '$0 --help' for more information" exit 1 ;; esac # Run the given program, remember its exit status. "$@"; st=$? # If it succeeded, we are done. 
test $st -eq 0 && exit 0 # Also exit now if we it failed (or wasn't found), and '--version' was # passed; such an option is passed most likely to detect whether the # program is present and works. case $2 in --version|--help) exit $st;; esac # Exit code 63 means version mismatch. This often happens when the user # tries to use an ancient version of a tool on a file that requires a # minimum version. if test $st -eq 63; then msg="probably too old" elif test $st -eq 127; then # Program was missing. msg="missing on your system" else # Program was found and executed, but failed. Give up. exit $st fi perl_URL=http://www.perl.org/ flex_URL=http://flex.sourceforge.net/ gnu_software_URL=http://www.gnu.org/software program_details () { case $1 in aclocal|automake) echo "The '$1' program is part of the GNU Automake package:" echo "<$gnu_software_URL/automake>" echo "It also requires GNU Autoconf, GNU m4 and Perl in order to run:" echo "<$gnu_software_URL/autoconf>" echo "<$gnu_software_URL/m4/>" echo "<$perl_URL>" ;; autoconf|autom4te|autoheader) echo "The '$1' program is part of the GNU Autoconf package:" echo "<$gnu_software_URL/autoconf/>" echo "It also requires GNU m4 and Perl in order to run:" echo "<$gnu_software_URL/m4/>" echo "<$perl_URL>" ;; esac } give_advice () { # Normalize program name to check for. normalized_program=`echo "$1" | sed ' s/^gnu-//; t s/^gnu//; t s/^g//; t'` printf '%s\n' "'$1' is $msg." configure_deps="'configure.ac' or m4 files included by 'configure.ac'" case $normalized_program in autoconf*) echo "You should only need it if you modified 'configure.ac'," echo "or m4 files included by it." program_details 'autoconf' ;; autoheader*) echo "You should only need it if you modified 'acconfig.h' or" echo "$configure_deps." program_details 'autoheader' ;; automake*) echo "You should only need it if you modified 'Makefile.am' or" echo "$configure_deps." 
program_details 'automake' ;; aclocal*) echo "You should only need it if you modified 'acinclude.m4' or" echo "$configure_deps." program_details 'aclocal' ;; autom4te*) echo "You might have modified some maintainer files that require" echo "the 'autom4te' program to be rebuilt." program_details 'autom4te' ;; bison*|yacc*) echo "You should only need it if you modified a '.y' file." echo "You may want to install the GNU Bison package:" echo "<$gnu_software_URL/bison/>" ;; lex*|flex*) echo "You should only need it if you modified a '.l' file." echo "You may want to install the Fast Lexical Analyzer package:" echo "<$flex_URL>" ;; help2man*) echo "You should only need it if you modified a dependency" \ "of a man page." echo "You may want to install the GNU Help2man package:" echo "<$gnu_software_URL/help2man/>" ;; makeinfo*) echo "You should only need it if you modified a '.texi' file, or" echo "any other file indirectly affecting the aspect of the manual." echo "You might want to install the Texinfo package:" echo "<$gnu_software_URL/texinfo/>" echo "The spurious makeinfo call might also be the consequence of" echo "using a buggy 'make' (AIX, DU, IRIX), in which case you might" echo "want to install GNU make:" echo "<$gnu_software_URL/make/>" ;; *) echo "You might have modified some files without having the proper" echo "tools for further handling them. Check the 'README' file, it" echo "often tells you about the needed prerequisites for installing" echo "this package. You may also peek at any GNU archive site, in" echo "case some other package contains this missing '$1' program." ;; esac } give_advice "$1" | sed -e '1s/^/WARNING: /' \ -e '2,$s/^/ /' >&2 # Propagate the correct exit status (expected to be 127 for a program # not found, 63 for a program that failed due to version mismatch). 
exit $st # Local variables: # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "scriptversion=" # time-stamp-format: "%:y-%02m-%02d.%02H" # time-stamp-time-zone: "UTC" # time-stamp-end: "; # UTC" # End: elpa-2016.05.001/config.sub0000755000312500001440000010622312717533405012020 00000000000000#! /bin/sh # Configuration validation subroutine script. # Copyright 1992-2014 Free Software Foundation, Inc. timestamp='2014-12-03' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, see . # # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under # the same distribution terms that you use for the rest of that # program. This Exception is an additional permission under section 7 # of the GNU General Public License, version 3 ("GPLv3"). # Please send patches to . # # Configuration subroutine to validate and canonicalize a configuration type. # Supply the specified configuration type as an argument. # If it is invalid, we print an error message on stderr and exit with code 1. # Otherwise, we print the canonical config type on stdout and succeed. 
# You can get the latest version of this script from: # http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD # This file is supposed to be the same for all GNU packages # and recognize all the CPU types, system types and aliases # that are meaningful with *any* GNU software. # Each package is responsible for reporting which valid configurations # it does not support. The user should be able to distinguish # a failure to support a valid configuration from a meaningless # configuration. # The goal of this file is to map all the various variations of a given # machine specification into a single specification in the form: # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM # or in some cases, the newer four-part form: # CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM # It is wrong to echo any other type of specification. me=`echo "$0" | sed -e 's,.*/,,'` usage="\ Usage: $0 [OPTION] CPU-MFR-OPSYS $0 [OPTION] ALIAS Canonicalize a configuration name. Operation modes: -h, --help print this help, then exit -t, --time-stamp print date of last modification, then exit -v, --version print version number, then exit Report bugs and patches to ." version="\ GNU config.sub ($timestamp) Copyright 1992-2014 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." help=" Try \`$me --help' for more information." # Parse command line while test $# -gt 0 ; do case $1 in --time-stamp | --time* | -t ) echo "$timestamp" ; exit ;; --version | -v ) echo "$version" ; exit ;; --help | --h* | -h ) echo "$usage"; exit ;; -- ) # Stop option processing shift; break ;; - ) # Use stdin as input. break ;; -* ) echo "$me: invalid option $1$help" exit 1 ;; *local*) # First pass through any local machine types. 
echo $1 exit ;; * ) break ;; esac done case $# in 0) echo "$me: missing argument$help" >&2 exit 1;; 1) ;; *) echo "$me: too many arguments$help" >&2 exit 1;; esac # Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any). # Here we must recognize all the valid KERNEL-OS combinations. maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` case $maybe_os in nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \ linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \ knetbsd*-gnu* | netbsd*-gnu* | \ kopensolaris*-gnu* | \ storm-chaos* | os2-emx* | rtmk-nova*) os=-$maybe_os basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` ;; android-linux) os=-linux-android basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`-unknown ;; *) basic_machine=`echo $1 | sed 's/-[^-]*$//'` if [ $basic_machine != $1 ] then os=`echo $1 | sed 's/.*-/-/'` else os=; fi ;; esac ### Let's recognize common machines as not being operating systems so ### that things like config.sub decstation-3100 work. We also ### recognize some manufacturers as not being operating systems, so we ### can provide default operating systems below. case $os in -sun*os*) # Prevent following clause from handling this invalid input. 
;; -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \ -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \ -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \ -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \ -apple | -axis | -knuth | -cray | -microblaze*) os= basic_machine=$1 ;; -bluegene*) os=-cnk ;; -sim | -cisco | -oki | -wec | -winbond) os= basic_machine=$1 ;; -scout) ;; -wrs) os=-vxworks basic_machine=$1 ;; -chorusos*) os=-chorusos basic_machine=$1 ;; -chorusrdb) os=-chorusrdb basic_machine=$1 ;; -hiux*) os=-hiuxwe2 ;; -sco6) os=-sco5v6 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco5) os=-sco3.2v5 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco4) os=-sco3.2v4 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco3.2.[4-9]*) os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco3.2v[4-9]*) # Don't forget version if it is 3.2v4 or newer. basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco5v6*) # Don't forget version if it is 3.2v4 or newer. basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco*) os=-sco3.2v2 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -udk*) basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -isc) os=-isc2.2 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -clix*) basic_machine=clipper-intergraph ;; -isc*) basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -lynx*178) os=-lynxos178 ;; -lynx*5) os=-lynxos5 ;; -lynx*) os=-lynxos ;; -ptx*) basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'` ;; -windowsnt*) os=`echo $os | sed -e 's/windowsnt/winnt/'` ;; -psos*) os=-psos ;; -mint | -mint[0-9]*) basic_machine=m68k-atari os=-mint ;; esac # Decode aliases for certain CPU-COMPANY combinations. 
case $basic_machine in # Recognize the basic CPU types without company name. # Some are omitted here because they have special meanings below. 1750a | 580 \ | a29k \ | aarch64 | aarch64_be \ | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \ | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ | am33_2.0 \ | arc | arceb \ | arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \ | avr | avr32 \ | be32 | be64 \ | bfin \ | c4x | c8051 | clipper \ | d10v | d30v | dlx | dsp16xx \ | epiphany \ | fido | fr30 | frv \ | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ | hexagon \ | i370 | i860 | i960 | ia64 \ | ip2k | iq2000 \ | k1om \ | le32 | le64 \ | lm32 \ | m32c | m32r | m32rle | m68000 | m68k | m88k \ | maxq | mb | microblaze | microblazeel | mcore | mep | metag \ | mips | mipsbe | mipseb | mipsel | mipsle \ | mips16 \ | mips64 | mips64el \ | mips64octeon | mips64octeonel \ | mips64orion | mips64orionel \ | mips64r5900 | mips64r5900el \ | mips64vr | mips64vrel \ | mips64vr4100 | mips64vr4100el \ | mips64vr4300 | mips64vr4300el \ | mips64vr5000 | mips64vr5000el \ | mips64vr5900 | mips64vr5900el \ | mipsisa32 | mipsisa32el \ | mipsisa32r2 | mipsisa32r2el \ | mipsisa32r6 | mipsisa32r6el \ | mipsisa64 | mipsisa64el \ | mipsisa64r2 | mipsisa64r2el \ | mipsisa64r6 | mipsisa64r6el \ | mipsisa64sb1 | mipsisa64sb1el \ | mipsisa64sr71k | mipsisa64sr71kel \ | mipsr5900 | mipsr5900el \ | mipstx39 | mipstx39el \ | mn10200 | mn10300 \ | moxie \ | mt \ | msp430 \ | nds32 | nds32le | nds32be \ | nios | nios2 | nios2eb | nios2el \ | ns16k | ns32k \ | open8 | or1k | or1knd | or32 \ | pdp10 | pdp11 | pj | pjl \ | powerpc | powerpc64 | powerpc64le | powerpcle \ | pyramid \ | riscv32 | riscv64 \ | rl78 | rx \ | score \ | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ | sh64 | sh64le \ | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \ 
| sparcv8 | sparcv9 | sparcv9b | sparcv9v \ | spu \ | tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \ | ubicom32 \ | v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \ | visium \ | we32k \ | x86 | xc16x | xstormy16 | xtensa \ | z8k | z80) basic_machine=$basic_machine-unknown ;; c54x) basic_machine=tic54x-unknown ;; c55x) basic_machine=tic55x-unknown ;; c6x) basic_machine=tic6x-unknown ;; leon|leon[3-9]) basic_machine=sparc-$basic_machine ;; m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | nvptx | picochip) basic_machine=$basic_machine-unknown os=-none ;; m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k) ;; ms1) basic_machine=mt-unknown ;; strongarm | thumb | xscale) basic_machine=arm-unknown ;; xgate) basic_machine=$basic_machine-unknown os=-none ;; xscaleeb) basic_machine=armeb-unknown ;; xscaleel) basic_machine=armel-unknown ;; # We use `pc' rather than `unknown' # because (1) that's what they normally are, and # (2) the word "unknown" tends to confuse beginning users. i*86 | x86_64) basic_machine=$basic_machine-pc ;; # Object if more than one company name word. *-*-*) echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 exit 1 ;; # Recognize the basic CPU types with company name. 
580-* \ | a29k-* \ | aarch64-* | aarch64_be-* \ | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \ | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ | alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \ | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ | avr-* | avr32-* \ | be32-* | be64-* \ | bfin-* | bs2000-* \ | c[123]* | c30-* | [cjt]90-* | c4x-* \ | c8051-* | clipper-* | craynv-* | cydra-* \ | d10v-* | d30v-* | dlx-* \ | elxsi-* \ | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \ | h8300-* | h8500-* \ | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \ | hexagon-* \ | i*86-* | i860-* | i960-* | ia64-* \ | ip2k-* | iq2000-* \ | k1om-* \ | le32-* | le64-* \ | lm32-* \ | m32c-* | m32r-* | m32rle-* \ | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ | m88110-* | m88k-* | maxq-* | mcore-* | metag-* \ | microblaze-* | microblazeel-* \ | mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \ | mips16-* \ | mips64-* | mips64el-* \ | mips64octeon-* | mips64octeonel-* \ | mips64orion-* | mips64orionel-* \ | mips64r5900-* | mips64r5900el-* \ | mips64vr-* | mips64vrel-* \ | mips64vr4100-* | mips64vr4100el-* \ | mips64vr4300-* | mips64vr4300el-* \ | mips64vr5000-* | mips64vr5000el-* \ | mips64vr5900-* | mips64vr5900el-* \ | mipsisa32-* | mipsisa32el-* \ | mipsisa32r2-* | mipsisa32r2el-* \ | mipsisa32r6-* | mipsisa32r6el-* \ | mipsisa64-* | mipsisa64el-* \ | mipsisa64r2-* | mipsisa64r2el-* \ | mipsisa64r6-* | mipsisa64r6el-* \ | mipsisa64sb1-* | mipsisa64sb1el-* \ | mipsisa64sr71k-* | mipsisa64sr71kel-* \ | mipsr5900-* | mipsr5900el-* \ | mipstx39-* | mipstx39el-* \ | mmix-* \ | mt-* \ | msp430-* \ | nds32-* | nds32le-* | nds32be-* \ | nios-* | nios2-* | nios2eb-* | nios2el-* \ | none-* | np1-* | ns16k-* | ns32k-* \ | open8-* \ | or1k*-* \ | orion-* \ | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \ | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \ | pyramid-* \ | rl78-* | romp-* | rs6000-* | rx-* \ 
| sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \ | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \ | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \ | sparclite-* \ | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx?-* \ | tahoe-* \ | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \ | tile*-* \ | tron-* \ | ubicom32-* \ | v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \ | vax-* \ | visium-* \ | we32k-* \ | x86-* | x86_64-* | xc16x-* | xps100-* \ | xstormy16-* | xtensa*-* \ | ymp-* \ | z8k-* | z80-*) ;; # Recognize the basic CPU types without company name, with glob match. xtensa*) basic_machine=$basic_machine-unknown ;; # Recognize the various machine names and aliases which stand # for a CPU type and a company and sometimes even an OS. 386bsd) basic_machine=i386-unknown os=-bsd ;; 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) basic_machine=m68000-att ;; 3b*) basic_machine=we32k-att ;; a29khif) basic_machine=a29k-amd os=-udi ;; abacus) basic_machine=abacus-unknown ;; adobe68k) basic_machine=m68010-adobe os=-scout ;; alliant | fx80) basic_machine=fx80-alliant ;; altos | altos3068) basic_machine=m68k-altos ;; am29k) basic_machine=a29k-none os=-bsd ;; amd64) basic_machine=x86_64-pc ;; amd64-*) basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'` ;; amdahl) basic_machine=580-amdahl os=-sysv ;; amiga | amiga-*) basic_machine=m68k-unknown ;; amigaos | amigados) basic_machine=m68k-unknown os=-amigaos ;; amigaunix | amix) basic_machine=m68k-unknown os=-sysv4 ;; apollo68) basic_machine=m68k-apollo os=-sysv ;; apollo68bsd) basic_machine=m68k-apollo os=-bsd ;; aros) basic_machine=i386-pc os=-aros ;; aux) basic_machine=m68k-apple os=-aux ;; balance) basic_machine=ns32k-sequent os=-dynix ;; blackfin) basic_machine=bfin-unknown os=-linux ;; blackfin-*) basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'` os=-linux ;; bluegene*) 
basic_machine=powerpc-ibm os=-cnk ;; c54x-*) basic_machine=tic54x-`echo $basic_machine | sed 's/^[^-]*-//'` ;; c55x-*) basic_machine=tic55x-`echo $basic_machine | sed 's/^[^-]*-//'` ;; c6x-*) basic_machine=tic6x-`echo $basic_machine | sed 's/^[^-]*-//'` ;; c90) basic_machine=c90-cray os=-unicos ;; cegcc) basic_machine=arm-unknown os=-cegcc ;; convex-c1) basic_machine=c1-convex os=-bsd ;; convex-c2) basic_machine=c2-convex os=-bsd ;; convex-c32) basic_machine=c32-convex os=-bsd ;; convex-c34) basic_machine=c34-convex os=-bsd ;; convex-c38) basic_machine=c38-convex os=-bsd ;; cray | j90) basic_machine=j90-cray os=-unicos ;; craynv) basic_machine=craynv-cray os=-unicosmp ;; cr16 | cr16-*) basic_machine=cr16-unknown os=-elf ;; crds | unos) basic_machine=m68k-crds ;; crisv32 | crisv32-* | etraxfs*) basic_machine=crisv32-axis ;; cris | cris-* | etrax*) basic_machine=cris-axis ;; crx) basic_machine=crx-unknown os=-elf ;; da30 | da30-*) basic_machine=m68k-da30 ;; decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn) basic_machine=mips-dec ;; decsystem10* | dec10*) basic_machine=pdp10-dec os=-tops10 ;; decsystem20* | dec20*) basic_machine=pdp10-dec os=-tops20 ;; delta | 3300 | motorola-3300 | motorola-delta \ | 3300-motorola | delta-motorola) basic_machine=m68k-motorola ;; delta88) basic_machine=m88k-motorola os=-sysv3 ;; dicos) basic_machine=i686-pc os=-dicos ;; djgpp) basic_machine=i586-pc os=-msdosdjgpp ;; dpx20 | dpx20-*) basic_machine=rs6000-bull os=-bosx ;; dpx2* | dpx2*-bull) basic_machine=m68k-bull os=-sysv3 ;; ebmon29k) basic_machine=a29k-amd os=-ebmon ;; elxsi) basic_machine=elxsi-elxsi os=-bsd ;; encore | umax | mmax) basic_machine=ns32k-encore ;; es1800 | OSE68k | ose68k | ose | OSE) basic_machine=m68k-ericsson os=-ose ;; fx2800) basic_machine=i860-alliant ;; genix) basic_machine=ns32k-ns ;; gmicro) basic_machine=tron-gmicro os=-sysv ;; go32) basic_machine=i386-pc os=-go32 ;; h3050r* | hiux*) basic_machine=hppa1.1-hitachi os=-hiuxwe2 ;; 
h8300hms) basic_machine=h8300-hitachi os=-hms ;; h8300xray) basic_machine=h8300-hitachi os=-xray ;; h8500hms) basic_machine=h8500-hitachi os=-hms ;; harris) basic_machine=m88k-harris os=-sysv3 ;; hp300-*) basic_machine=m68k-hp ;; hp300bsd) basic_machine=m68k-hp os=-bsd ;; hp300hpux) basic_machine=m68k-hp os=-hpux ;; hp3k9[0-9][0-9] | hp9[0-9][0-9]) basic_machine=hppa1.0-hp ;; hp9k2[0-9][0-9] | hp9k31[0-9]) basic_machine=m68000-hp ;; hp9k3[2-9][0-9]) basic_machine=m68k-hp ;; hp9k6[0-9][0-9] | hp6[0-9][0-9]) basic_machine=hppa1.0-hp ;; hp9k7[0-79][0-9] | hp7[0-79][0-9]) basic_machine=hppa1.1-hp ;; hp9k78[0-9] | hp78[0-9]) # FIXME: really hppa2.0-hp basic_machine=hppa1.1-hp ;; hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893) # FIXME: really hppa2.0-hp basic_machine=hppa1.1-hp ;; hp9k8[0-9][13679] | hp8[0-9][13679]) basic_machine=hppa1.1-hp ;; hp9k8[0-9][0-9] | hp8[0-9][0-9]) basic_machine=hppa1.0-hp ;; hppa-next) os=-nextstep3 ;; hppaosf) basic_machine=hppa1.1-hp os=-osf ;; hppro) basic_machine=hppa1.1-hp os=-proelf ;; i370-ibm* | ibm*) basic_machine=i370-ibm ;; i*86v32) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-sysv32 ;; i*86v4*) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-sysv4 ;; i*86v) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-sysv ;; i*86sol2) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-solaris2 ;; i386mach) basic_machine=i386-mach os=-mach ;; i386-vsta | vsta) basic_machine=i386-unknown os=-vsta ;; iris | iris4d) basic_machine=mips-sgi case $os in -irix*) ;; *) os=-irix4 ;; esac ;; isi68 | isi) basic_machine=m68k-isi os=-sysv ;; leon-*|leon[3-9]-*) basic_machine=sparc-`echo $basic_machine | sed 's/-.*//'` ;; m68knommu) basic_machine=m68k-unknown os=-linux ;; m68knommu-*) basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'` os=-linux ;; m88k-omron*) basic_machine=m88k-omron ;; magnum | m3230) basic_machine=mips-mips os=-sysv ;; merlin) basic_machine=ns32k-utek os=-sysv ;; 
microblaze*) basic_machine=microblaze-xilinx ;; mingw64) basic_machine=x86_64-pc os=-mingw64 ;; mingw32) basic_machine=i686-pc os=-mingw32 ;; mingw32ce) basic_machine=arm-unknown os=-mingw32ce ;; miniframe) basic_machine=m68000-convergent ;; *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*) basic_machine=m68k-atari os=-mint ;; mips3*-*) basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'` ;; mips3*) basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown ;; monitor) basic_machine=m68k-rom68k os=-coff ;; morphos) basic_machine=powerpc-unknown os=-morphos ;; moxiebox) basic_machine=moxie-unknown os=-moxiebox ;; msdos) basic_machine=i386-pc os=-msdos ;; ms1-*) basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'` ;; msys) basic_machine=i686-pc os=-msys ;; mvs) basic_machine=i370-ibm os=-mvs ;; nacl) basic_machine=le32-unknown os=-nacl ;; ncr3000) basic_machine=i486-ncr os=-sysv4 ;; netbsd386) basic_machine=i386-unknown os=-netbsd ;; netwinder) basic_machine=armv4l-rebel os=-linux ;; news | news700 | news800 | news900) basic_machine=m68k-sony os=-newsos ;; news1000) basic_machine=m68030-sony os=-newsos ;; news-3600 | risc-news) basic_machine=mips-sony os=-newsos ;; necv70) basic_machine=v70-nec os=-sysv ;; next | m*-next ) basic_machine=m68k-next case $os in -nextstep* ) ;; -ns2*) os=-nextstep2 ;; *) os=-nextstep3 ;; esac ;; nh3000) basic_machine=m68k-harris os=-cxux ;; nh[45]000) basic_machine=m88k-harris os=-cxux ;; nindy960) basic_machine=i960-intel os=-nindy ;; mon960) basic_machine=i960-intel os=-mon960 ;; nonstopux) basic_machine=mips-compaq os=-nonstopux ;; np1) basic_machine=np1-gould ;; neo-tandem) basic_machine=neo-tandem ;; nse-tandem) basic_machine=nse-tandem ;; nsr-tandem) basic_machine=nsr-tandem ;; op50n-* | op60c-*) basic_machine=hppa1.1-oki os=-proelf ;; openrisc | openrisc-*) basic_machine=or32-unknown ;; os400) basic_machine=powerpc-ibm os=-os400 ;; OSE68000 | ose68000) basic_machine=m68000-ericsson os=-ose ;; os68k) 
basic_machine=m68k-none os=-os68k ;; pa-hitachi) basic_machine=hppa1.1-hitachi os=-hiuxwe2 ;; paragon) basic_machine=i860-intel os=-osf ;; parisc) basic_machine=hppa-unknown os=-linux ;; parisc-*) basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'` os=-linux ;; pbd) basic_machine=sparc-tti ;; pbb) basic_machine=m68k-tti ;; pc532 | pc532-*) basic_machine=ns32k-pc532 ;; pc98) basic_machine=i386-pc ;; pc98-*) basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentium | p5 | k5 | k6 | nexgen | viac3) basic_machine=i586-pc ;; pentiumpro | p6 | 6x86 | athlon | athlon_*) basic_machine=i686-pc ;; pentiumii | pentium2 | pentiumiii | pentium3) basic_machine=i686-pc ;; pentium4) basic_machine=i786-pc ;; pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*) basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentiumpro-* | p6-* | 6x86-* | athlon-*) basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*) basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentium4-*) basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pn) basic_machine=pn-gould ;; power) basic_machine=power-ibm ;; ppc | ppcbe) basic_machine=powerpc-unknown ;; ppc-* | ppcbe-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ppcle | powerpclittle | ppc-le | powerpc-little) basic_machine=powerpcle-unknown ;; ppcle-* | powerpclittle-*) basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ppc64) basic_machine=powerpc64-unknown ;; ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ppc64le | powerpc64little | ppc64-le | powerpc64-little) basic_machine=powerpc64le-unknown ;; ppc64le-* | powerpc64little-*) basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ps2) basic_machine=i386-ibm ;; pw32) basic_machine=i586-unknown os=-pw32 ;; rdos | rdos64) basic_machine=x86_64-pc os=-rdos ;; rdos32) basic_machine=i386-pc os=-rdos ;; 
rom68k) basic_machine=m68k-rom68k os=-coff ;; rm[46]00) basic_machine=mips-siemens ;; rtpc | rtpc-*) basic_machine=romp-ibm ;; s390 | s390-*) basic_machine=s390-ibm ;; s390x | s390x-*) basic_machine=s390x-ibm ;; sa29200) basic_machine=a29k-amd os=-udi ;; sb1) basic_machine=mipsisa64sb1-unknown ;; sb1el) basic_machine=mipsisa64sb1el-unknown ;; sde) basic_machine=mipsisa32-sde os=-elf ;; sei) basic_machine=mips-sei os=-seiux ;; sequent) basic_machine=i386-sequent ;; sh) basic_machine=sh-hitachi os=-hms ;; sh5el) basic_machine=sh5le-unknown ;; sh64) basic_machine=sh64-unknown ;; sparclite-wrs | simso-wrs) basic_machine=sparclite-wrs os=-vxworks ;; sps7) basic_machine=m68k-bull os=-sysv2 ;; spur) basic_machine=spur-unknown ;; st2000) basic_machine=m68k-tandem ;; stratus) basic_machine=i860-stratus os=-sysv4 ;; strongarm-* | thumb-*) basic_machine=arm-`echo $basic_machine | sed 's/^[^-]*-//'` ;; sun2) basic_machine=m68000-sun ;; sun2os3) basic_machine=m68000-sun os=-sunos3 ;; sun2os4) basic_machine=m68000-sun os=-sunos4 ;; sun3os3) basic_machine=m68k-sun os=-sunos3 ;; sun3os4) basic_machine=m68k-sun os=-sunos4 ;; sun4os3) basic_machine=sparc-sun os=-sunos3 ;; sun4os4) basic_machine=sparc-sun os=-sunos4 ;; sun4sol2) basic_machine=sparc-sun os=-solaris2 ;; sun3 | sun3-*) basic_machine=m68k-sun ;; sun4) basic_machine=sparc-sun ;; sun386 | sun386i | roadrunner) basic_machine=i386-sun ;; sv1) basic_machine=sv1-cray os=-unicos ;; symmetry) basic_machine=i386-sequent os=-dynix ;; t3e) basic_machine=alphaev5-cray os=-unicos ;; t90) basic_machine=t90-cray os=-unicos ;; tile*) basic_machine=$basic_machine-unknown os=-linux-gnu ;; tx39) basic_machine=mipstx39-unknown ;; tx39el) basic_machine=mipstx39el-unknown ;; toad1) basic_machine=pdp10-xkl os=-tops20 ;; tower | tower-32) basic_machine=m68k-ncr ;; tpf) basic_machine=s390x-ibm os=-tpf ;; udi29k) basic_machine=a29k-amd os=-udi ;; ultra3) basic_machine=a29k-nyu os=-sym1 ;; v810 | necv810) basic_machine=v810-nec os=-none ;; vaxv) 
basic_machine=vax-dec os=-sysv ;; vms) basic_machine=vax-dec os=-vms ;; vpp*|vx|vx-*) basic_machine=f301-fujitsu ;; vxworks960) basic_machine=i960-wrs os=-vxworks ;; vxworks68) basic_machine=m68k-wrs os=-vxworks ;; vxworks29k) basic_machine=a29k-wrs os=-vxworks ;; w65*) basic_machine=w65-wdc os=-none ;; w89k-*) basic_machine=hppa1.1-winbond os=-proelf ;; xbox) basic_machine=i686-pc os=-mingw32 ;; xps | xps100) basic_machine=xps100-honeywell ;; xscale-* | xscalee[bl]-*) basic_machine=`echo $basic_machine | sed 's/^xscale/arm/'` ;; ymp) basic_machine=ymp-cray os=-unicos ;; z8k-*-coff) basic_machine=z8k-unknown os=-sim ;; z80-*-coff) basic_machine=z80-unknown os=-sim ;; none) basic_machine=none-none os=-none ;; # Here we handle the default manufacturer of certain CPU types. It is in # some cases the only manufacturer, in others, it is the most popular. w89k) basic_machine=hppa1.1-winbond ;; op50n) basic_machine=hppa1.1-oki ;; op60c) basic_machine=hppa1.1-oki ;; romp) basic_machine=romp-ibm ;; mmix) basic_machine=mmix-knuth ;; rs6000) basic_machine=rs6000-ibm ;; vax) basic_machine=vax-dec ;; pdp10) # there are many clones, so DEC is not a safe bet basic_machine=pdp10-unknown ;; pdp11) basic_machine=pdp11-dec ;; we32k) basic_machine=we32k-att ;; sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele) basic_machine=sh-unknown ;; sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v) basic_machine=sparc-sun ;; cydra) basic_machine=cydra-cydrome ;; orion) basic_machine=orion-highlevel ;; orion105) basic_machine=clipper-highlevel ;; mac | mpw | mac-mpw) basic_machine=m68k-apple ;; pmac | pmac-mpw) basic_machine=powerpc-apple ;; *-unknown) # Make sure to match an already-canonicalized machine name. ;; *) echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 exit 1 ;; esac # Here we canonicalize certain aliases for manufacturers. 
case $basic_machine in *-digital*) basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'` ;; *-commodore*) basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'` ;; *) ;; esac # Decode manufacturer-specific aliases for certain operating systems. if [ x"$os" != x"" ] then case $os in # First match some system type aliases # that might get confused with valid system types. # -solaris* is a basic system type, with this one exception. -auroraux) os=-auroraux ;; -solaris1 | -solaris1.*) os=`echo $os | sed -e 's|solaris1|sunos4|'` ;; -solaris) os=-solaris2 ;; -svr4*) os=-sysv4 ;; -unixware*) os=-sysv4.2uw ;; -gnu/linux*) os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` ;; # First accept the basic system types. # The portable systems comes first. # Each alternative MUST END IN A *, to match a version number. # -sysv* is not here because it comes later, after sysvr4. -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ | -sym* | -kopensolaris* | -plan9* \ | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ | -aos* | -aros* \ | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \ | -bitrig* | -openbsd* | -solidbsd* \ | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \ | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ | -chorusos* | -chorusrdb* | -cegcc* \ | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \ | -linux-newlib* | -linux-musl* | -linux-uclibc* \ | -uxpv* | -beos* | -mpeix* | -udk* | -moxiebox* \ | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \ 
| -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \ | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \ | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \ | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* | -tirtos*) # Remember, each alternative MUST END IN *, to match a version number. ;; -qnx*) case $basic_machine in x86-* | i*86-*) ;; *) os=-nto$os ;; esac ;; -nto-qnx*) ;; -nto*) os=`echo $os | sed -e 's|nto|nto-qnx|'` ;; -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \ | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \ | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*) ;; -mac*) os=`echo $os | sed -e 's|mac|macos|'` ;; -linux-dietlibc) os=-linux-dietlibc ;; -linux*) os=`echo $os | sed -e 's|linux|linux-gnu|'` ;; -sunos5*) os=`echo $os | sed -e 's|sunos5|solaris2|'` ;; -sunos6*) os=`echo $os | sed -e 's|sunos6|solaris3|'` ;; -opened*) os=-openedition ;; -os400*) os=-os400 ;; -wince*) os=-wince ;; -osfrose*) os=-osfrose ;; -osf*) os=-osf ;; -utek*) os=-bsd ;; -dynix*) os=-bsd ;; -acis*) os=-aos ;; -atheos*) os=-atheos ;; -syllable*) os=-syllable ;; -386bsd) os=-bsd ;; -ctix* | -uts*) os=-sysv ;; -nova*) os=-rtmk-nova ;; -ns2 ) os=-nextstep2 ;; -nsk*) os=-nsk ;; # Preserve the version number of sinix5. -sinix5.*) os=`echo $os | sed -e 's|sinix|sysv|'` ;; -sinix*) os=-sysv4 ;; -tpf*) os=-tpf ;; -triton*) os=-sysv3 ;; -oss*) os=-sysv3 ;; -svr4) os=-sysv4 ;; -svr3) os=-sysv3 ;; -sysvr4) os=-sysv4 ;; # This must come after -sysvr4. -sysv*) ;; -ose*) os=-ose ;; -es1800*) os=-ose ;; -xenix) os=-xenix ;; -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) os=-mint ;; -aros*) os=-aros ;; -zvmoe) os=-zvmoe ;; -dicos*) os=-dicos ;; -nacl*) ;; -none) ;; *) # Get rid of the `-' at the beginning of $os. 
os=`echo $os | sed 's/[^-]*-//'` echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2 exit 1 ;; esac else # Here we handle the default operating systems that come with various machines. # The value should be what the vendor currently ships out the door with their # machine or put another way, the most popular os provided with the machine. # Note that if you're going to try to match "-MANUFACTURER" here (say, # "-sun"), then you have to tell the case statement up towards the top # that MANUFACTURER isn't an operating system. Otherwise, code above # will signal an error saying that MANUFACTURER isn't an operating # system, and we'll never get to this point. case $basic_machine in score-*) os=-elf ;; spu-*) os=-elf ;; *-acorn) os=-riscix1.2 ;; arm*-rebel) os=-linux ;; arm*-semi) os=-aout ;; c4x-* | tic4x-*) os=-coff ;; c8051-*) os=-elf ;; hexagon-*) os=-elf ;; tic54x-*) os=-coff ;; tic55x-*) os=-coff ;; tic6x-*) os=-coff ;; # This must come before the *-dec entry. pdp10-*) os=-tops20 ;; pdp11-*) os=-none ;; *-dec | vax-*) os=-ultrix4.2 ;; m68*-apollo) os=-domain ;; i386-sun) os=-sunos4.0.2 ;; m68000-sun) os=-sunos3 ;; m68*-cisco) os=-aout ;; mep-*) os=-elf ;; mips*-cisco) os=-elf ;; mips*-*) os=-elf ;; or32-*) os=-coff ;; *-tti) # must be before sparc entry or we get the wrong os. 
os=-sysv3 ;; sparc-* | *-sun) os=-sunos4.1.1 ;; *-be) os=-beos ;; *-haiku) os=-haiku ;; *-ibm) os=-aix ;; *-knuth) os=-mmixware ;; *-wec) os=-proelf ;; *-winbond) os=-proelf ;; *-oki) os=-proelf ;; *-hp) os=-hpux ;; *-hitachi) os=-hiux ;; i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent) os=-sysv ;; *-cbm) os=-amigaos ;; *-dg) os=-dgux ;; *-dolphin) os=-sysv3 ;; m68k-ccur) os=-rtu ;; m88k-omron*) os=-luna ;; *-next ) os=-nextstep ;; *-sequent) os=-ptx ;; *-crds) os=-unos ;; *-ns) os=-genix ;; i370-*) os=-mvs ;; *-next) os=-nextstep3 ;; *-gould) os=-sysv ;; *-highlevel) os=-bsd ;; *-encore) os=-bsd ;; *-sgi) os=-irix ;; *-siemens) os=-sysv4 ;; *-masscomp) os=-rtu ;; f30[01]-fujitsu | f700-fujitsu) os=-uxpv ;; *-rom68k) os=-coff ;; *-*bug) os=-coff ;; *-apple) os=-macos ;; *-atari*) os=-mint ;; *) os=-none ;; esac fi # Here we handle the case where we know the os, and the CPU type, but not the # manufacturer. We pick the logical manufacturer. vendor=unknown case $basic_machine in *-unknown) case $os in -riscix*) vendor=acorn ;; -sunos*) vendor=sun ;; -cnk*|-aix*) vendor=ibm ;; -beos*) vendor=be ;; -hpux*) vendor=hp ;; -mpeix*) vendor=hp ;; -hiux*) vendor=hitachi ;; -unos*) vendor=crds ;; -dgux*) vendor=dg ;; -luna*) vendor=omron ;; -genix*) vendor=ns ;; -mvs* | -opened*) vendor=ibm ;; -os400*) vendor=ibm ;; -ptx*) vendor=sequent ;; -tpf*) vendor=ibm ;; -vxsim* | -vxworks* | -windiss*) vendor=wrs ;; -aux*) vendor=apple ;; -hms*) vendor=hitachi ;; -mpw* | -macos*) vendor=apple ;; -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) vendor=atari ;; -vos*) vendor=stratus ;; esac basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"` ;; esac echo $basic_machine$os exit # Local variables: # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "timestamp='" # time-stamp-format: "%:y-%02m-%02d" # time-stamp-end: "'" # End: elpa-2016.05.001/INSTALL.md0000644000312500001440000001275412717402663011473 00000000000000# Installation guide # ## Preamble ## 
This file provides documentation on how to build the *ELPA* library in **version ELPA-2016.05.001**. Although most of the documentation is generic to any *ELPA* release, some configure options described in this document might be specific to the above mentioned version of *ELPA*. ## How to install ELPA ## First of all, if you do not want to build *ELPA* yourself, and you run Linux, it is worth having a look at the [*ELPA* webpage](http://elpa.mpcdf.mpg.de) and/or the repositories of your Linux distribution: there exist pre-built packages for a number of Linux distributions like Fedora, Debian, and OpenSuse. More will hopefully follow in the future. If you want to build (or have to since no packages are available) *ELPA* yourself, please note that *ELPA* is shipped with a typical "configure" and "make" autotools procedure. This is the **only supported way** to build and install *ELPA*. If you obtained *ELPA* from the official git repository, you will not find the needed configure script! Please look at the "**INSTALL_FROM_GIT_VERSION**" file for the documentation on how to proceed. ## (A): Installing ELPA as library with configure ## *ELPA* can be installed with the build steps - configure - make - make check - make install Please look at configure --help for all available options. ### Setting of MPI compiler and libraries ### In the standard case *ELPA* needs an MPI compiler and MPI libraries. The configure script will try to set this by itself. If, however, on the build system the compiler wrapper cannot be found automatically, it is recommended to set it by hand with a variable, e.g. configure FC=mpif90 ### Hybrid MPI/OpenMP library build ### The *ELPA* library can be built with hybrid MPI/OpenMP support. To do this the "--enable-openmp" configure option should be set. If a hybrid version of *ELPA* is also wanted, it is recommended to build two versions of *ELPA*: one with pure MPI and a hybrid version. 
They can both be installed in the same path, since they have different shared-library (.so) names. ### Standard libraries in default installation paths ### In order to build the *ELPA* library, some (depending on the settings during the configure step, see below) libraries are needed. Typically these are: - Basic Linear Algebra Subroutines (BLAS) - Lapack routines - Basic Linear Algebra Communication Subroutines (BLACS) - Scalapack routines - a working MPI library If the needed libraries are installed on the build system in standard paths (e.g. /usr/lib64) then in most cases the *ELPA* configure step will recognize the needed libraries automatically. No setting of any library paths should be necessary. ### Non standard paths or non standard libraries ### If standard libraries are on the build system either installed in non standard paths, or special non standard libraries (e.g. *Intel's MKL*) should be used, it might be necessary to specify the appropriate link-line with the **SCALAPACK_LDFLAGS** and **SCALAPACK_FCFLAGS** variables. For example, due to performance reasons it might be beneficial to use the *BLAS*, *BLACS*, *LAPACK*, and *SCALAPACK* implementation from *Intel's MKL* library. 
Together with the Intel Fortran Compiler the call to configure might then look like: configure SCALAPACK_LDFLAGS="-L$MKL_HOME/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential \ -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" \ SCALAPACK_FCFLAGS="-L$MKL_HOME/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential \ -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKL_HOME/include/intel64/lp64" and for *INTEL MKL* together with *GNU GFORTRAN* : configure SCALAPACK_LDFLAGS="-L$MKL_HOME/lib/intel64 -lmkl_scalapack_lp64 -lmkl_gf_lp64 -lmkl_sequential \ -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -Wl,-rpath,$MKL_HOME/lib/intel64" \ SCALAPACK_FCFLAGS="-L$MKL_HOME/lib/intel64 -lmkl_scalapack_lp64 -lmkl_gf_lp64 -lmkl_sequential \ -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -I$MKL_HOME/include/intel64/lp64" Please refer to the documentation of the corresponding library for the correct link-line. In case of *Intel's MKL* we suggest the [Intel Math Kernel Library Link Line Advisor](https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor). ### Choice of ELPA2 compute kernels ### By default the configure script tries to configure and build all ELPA2 compute kernels which are available for the architecture. Then the specific kernel can be chosen at run-time via the api or an environment variable (see the **USERS_GUIDE** for details). If this is not desired, it is possible to build *ELPA* with only one (not necessarily the same) kernel for the real and complex valued case, respectively. This can be done with the "--with-real-..-kernel-only" and "--with-complex-..-kernel-only" configure options. For details please do a "configure --help" ### No MPI, one node shared-memory version of ELPA ### Since release 2016.05.001 it is possible to build *ELPA* without any MPI support. This version can be used by applications which do not have any MPI parallelisation. 
To set this version, use the "--with-mpi=0" configure flag. It is strongly recommended to also set the "--enable-openmp" option, otherwise no parallelisation whatsoever will be present. ### Doxygen documentation ### A doxygen documentation can be created with the "--enable-doxygen-doc" configure option
elpa-2016.05.001/generated_headers.am0000644000312500001440000000160712717402663014006 00000000000000
# extract_interface MARKER
#
# Appends to $@ every line of the prerequisites ($^) that starts, after
# optional blanks, with MARKER; the marker itself is stripped by sed.
# The output is appended (">>") so that one header can be assembled from
# several markers.  Because of that, every rule using this macro has to
# remove a stale $@ first, otherwise a rebuild would duplicate the content.
define extract_interface
	@echo "Generating $@...";
	@grep -h "^ *$1" $^ | sed 's/^ *$1//;' >> $@ || { rm -f $@; exit 1; }
endef

# Output directories.  They are only referenced as order-only prerequisites
# (after "|"), so their timestamps never trigger a regeneration.
# "mkdir -p" keeps the rule harmless when the directory already exists.
elpa test:
	mkdir -p $@

test/shared_sources: | test
	mkdir -p $@

# Keeps only the lines of config.h that start with "#define".
config-f90.h: config.h
	@echo "Generating $@...";
	@grep "^#define" $^ > $@ || { rm -f $@; exit 1; }

# Lines tagged "!c>" in the Fortran C-interface source.
elpa/elpa_generated.h: $(top_srcdir)/src/elpa_c_interface.F90 | elpa
	@rm -f $@
	$(call extract_interface,!c>)

# Lines tagged "!c>" in the shared test sources.
test/shared_sources/generated.h: $(wildcard $(top_srcdir)/test/shared_sources/*.F90) | test/shared_sources
	@rm -f $@
	$(call extract_interface,!c>)

# Lines tagged "!f>" or "#!f>" in the C and assembly kernels; the two
# extractions are appended to the same file, in this order.
elpa/elpa_generated_fortran_interfaces.h: $(wildcard $(top_srcdir)/src/elpa2_kernels/*.c) $(wildcard $(top_srcdir)/src/elpa2_kernels/*.s) | elpa
	@rm -f $@
	$(call extract_interface,!f>)
	$(call extract_interface,#!f>)

generated_headers= config-f90.h elpa/elpa_generated.h test/shared_sources/generated.h elpa/elpa_generated_fortran_interfaces.h

# Convenience target, not a file.
.PHONY: generated-headers
generated-headers: $(generated_headers)
elpa-2016.05.001/configure.ac0000644000312500001440000007066612717516040012332 00000000000000
AC_PREREQ([2.69])

# Remember to change the version also in elpa.spec
AC_INIT([elpa],[2016.05.001], [elpa-library@mpcdf.mpg.de])
AC_SUBST([PACKAGE_VERSION])

AC_CONFIG_SRCDIR([src/elpa1.F90])

AM_INIT_AUTOMAKE([foreign -Wall subdir-objects])

# Without this, automake tries to be smart and rebuilt
# the autoconf generated files such as configure, aclocal.m4, etc.,
# in case the timestamps of files such as configure.ac are newer
#
# This only makes trouble for end users with out-of-date autoconf versions
# that cannot produce these files
AM_MAINTAINER_MODE([disable])

AC_CONFIG_MACRO_DIR([m4])
AC_CONFIG_HEADERS([config.h])
AM_SILENT_RULES([yes])

# Set the libtool library version, see LIBRARY_INTERFACE
#
# See http://www.gnu.org/software/libtool/manual/html_node/Updating-version-info.html
#
# [c:r:a]
#
# c: The currently implemented interface
# r: The revision number of the current interface, that is the number
#    of released source-code changes for the current interface
# a: The "age" is the number of interfaces prior to c that are also supported
#    by the current interface, as they are ABI compatible (e.g. only new symbols
#    were added by the new interface)
#
AC_SUBST([ELPA_SO_VERSION], [5:0:1])
#

# Abort configure when no GNU make was detected.
# NOTE(review): _cv_gnu_make_command is presumably set by AX_CHECK_GNU_MAKE;
# confirm against the macro shipped in m4/.
AX_CHECK_GNU_MAKE()
if test x$_cv_gnu_make_command = x ; then
        AC_MSG_ERROR([Need GNU Make])
fi

#AC_CHECK_PROG(CPP_FOUND,cpp,yes,no)
#if test x"${CPP_FOUND}" = xno; then
#  AC_MSG_ERROR([no cpp found])
#fi

# gnu-make fortran module dependencies
m4_include([fdep/fortran_dependencies.m4])
FDEP_F90_GNU_MAKE_DEPS

###
m4_include([m4/ax_elpa_openmp.m4])

dnl --enable-openmp: enable_openmp falls back to "no" when the option is not
dnl given.  The value drives both the WITH_OPENMP automake conditional and the
dnl WITH_OPENMP define.
AC_MSG_CHECKING(whether --enable-openmp is specified)
AC_ARG_ENABLE([openmp],
                AS_HELP_STRING([--enable-openmp],
                               [use OpenMP threading, default no.]),
              [],
              [enable_openmp=no])
AC_MSG_RESULT([${enable_openmp}])
AM_CONDITIONAL([WITH_OPENMP],[test x"$enable_openmp" = x"yes"])
if test x"${enable_openmp}" = x"yes"; then
        AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading])
fi

dnl mpi
dnl with_mpi falls back to "yes" when --with-mpi is not given.
AC_ARG_WITH(mpi, [AS_HELP_STRING([--with-mpi=[[yes|no]]], [compile with MPI. 
Default: yes])],,[with_mpi=yes])
dnl The WITH_MPI automake conditional and the WITH_MPI define must both follow
dnl the shell variable $with_mpi.  Without the "$" the test compares two fixed
dnl strings, so the conditional could never become true.
AM_CONDITIONAL([WITH_MPI],[test x"$with_mpi" = x"yes"])
if test x"${with_mpi}" = x"yes"; then
  AC_DEFINE([WITH_MPI], [1], [use MPI])
fi

# C
AC_LANG([C])
dnl Look for an MPI-capable C compiler only when MPI was requested; a missing
dnl one is fatal in that case.
AX_PROG_CC_MPI([test x"$with_mpi" = x"yes"],[found_mpi_c=yes],[found_mpi_c=no])
if test x"$with_mpi" = x"yes"; then
  if test x"$found_mpi_c" = x"no"; then
    AC_MSG_ERROR([Could not compile an MPI C program])
  fi
fi

if test x"${enable_openmp}" = x"yes"; then
        AX_ELPA_OPENMP
        if test "$ac_cv_prog_cc_openmp" = unsupported; then
          AC_MSG_ERROR([Could not compile a C program with OpenMP, adjust CFLAGS])
        fi
        CFLAGS="$OPENMP_CFLAGS $CFLAGS"
fi

AC_PROG_INSTALL
AM_PROG_AR
AM_PROG_AS

# Fortran
AC_LANG([Fortran])
m4_include([m4/ax_prog_fc_mpi.m4])
dnl Same policy as for C: an MPI Fortran compiler is mandatory only with MPI.
AX_PROG_FC_MPI([test x"$with_mpi" = x"yes"],[found_mpi_f=yes],[found_mpi_f=no])
if test x"$with_mpi" = x"yes"; then
  if test x"$found_mpi_f" = x"no"; then
    AC_MSG_ERROR([Could not compile an MPI Fortran program])
  fi
fi

if test x"${enable_openmp}" = x"yes"; then
        AX_ELPA_OPENMP
        if test "$ac_cv_prog_fc_openmp" = unsupported; then
          AC_MSG_ERROR([Could not compile a Fortran program with OpenMP, adjust FCFLAGS])
        fi
        FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS"
fi

## C++
#AC_LANG([C++])
#AC_PROG_CXX
#
#if test x"${enable_openmp}" = x"yes"; then
#       AX_ELPA_OPENMP
#       if test "$ac_cv_prog_cxx_openmp" = unsupported; then
#         AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS])
#       fi
#       CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS"
#fi

dnl variables needed for the tests

dnl do NOT remove any variables here, until
dnl 1. you know 100% what you are doing
dnl 2. 
you tested ALL configure functionality afterwards
dnl Otherwise, you most likely break some functionality

dnl as default always define the generic kernels to be build
dnl this is only unset if gpu_support_only is defined, or
dnl other specific real/complex kernels are wanted

install_real_generic=yes
install_real_generic_simple=yes

install_complex_generic=yes
install_complex_generic_simple=yes

#want_avx=yes
#want_avx2=yes
#want_sse=yes

AC_LANG([C])

dnl build with ftimings support
dnl
dnl The action-if-given branch of AC_ARG_WITH also runs for --without-ftimings
dnl and --with-ftimings=no (withval is then "no"), so it must not force "yes"
dnl unconditionally.  Every other value keeps meaning "yes", as before.
AC_MSG_CHECKING(whether ELPA should be build with ftimings support)
AC_ARG_WITH([ftimings],
            AS_HELP_STRING([--with-ftimings],
                           [detailed timings, default no.]),
            [AS_IF([test x"$withval" = x"no"], [with_ftimings=no], [with_ftimings=yes])],
            [with_ftimings=no])
AC_MSG_RESULT([${with_ftimings}])

dnl build with the possibility to redirect stdout and stderr
dnl per MPI task in a file (same --without handling as for ftimings)
AC_MSG_CHECKING(whether stdout/stderr file redirect should be enabled)
AC_ARG_WITH([redirect],
            AS_HELP_STRING([--with-redirect],
                           [for test programs, allow redirection of stdout/stderr per MPI taks in a file (useful for ftimings), default no.]),
            [AS_IF([test x"$withval" = x"no"], [with_redirect=no], [with_redirect=yes])],
            [with_redirect=no])
AC_MSG_RESULT([${with_redirect}])

if test x"${with_redirect}" = x"yes"; then
  AC_DEFINE([HAVE_REDIRECT], [1], [Redirect stdout and stderr of test programs per MPI tasks to a file])
fi
AM_CONDITIONAL([HAVE_REDIRECT],[test x"$with_redirect" = x"yes"])

if test x"${with_ftimings}" = x"yes"; then
  AC_DEFINE([HAVE_DETAILED_TIMINGS], [1], [Enable more timings])
  dnl PAPI is optional: want_papi is "auto" unless --enable-papi or
  dnl --disable-papi is given.  A missing library is an error only when PAPI
  dnl was requested explicitly.
  AC_ARG_ENABLE([papi],
        [AS_HELP_STRING([--disable-papi],[Do not use PAPI to also measure flop count, autodetected by default])],
        [want_papi=$enableval],[want_papi="auto"])
  papi_found=unknown
  if test x"$want_papi" != x"no" ; then
    AC_CHECK_LIB([papi],[PAPI_library_init],[papi_found="yes"],[papi_found="no"])
    if test x"$want_papi" = x"yes" ; then
      if test x"$papi_found" = x"no" ; then
        AC_MSG_ERROR(["Could not find usable PAPI installation, please adjust CFLAGS, LDFLAGS"])
      fi
    fi
  fi
  if test x"$papi_found" = x"yes"; then
    AC_DEFINE([HAVE_LIBPAPI], [1], 
[Use the PAPI library])
    LIBS="-lpapi $LIBS"
  fi
fi
AM_CONDITIONAL([HAVE_DETAILED_TIMINGS],[test x"$with_ftimings" = x"yes"])

dnl Try to assemble the SSE kernel with $CC.  The exit status of the compiler
dnl is tested directly; the former test "$?" == 0 used "==", which POSIX test
dnl does not define and which fails when /bin/sh is not bash.
AC_MSG_CHECKING(whether SSE assembly kernel can be compiled)

if $CC -c $srcdir/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -o test.o 2>/dev/null; then
  can_compile_sse_assembly=yes
  install_real_sse_assembly=yes
  install_complex_sse_assembly=yes
else
  can_compile_sse_assembly=no
  install_real_sse_assembly=no
  install_complex_sse_assembly=no
fi
rm -f ./test.o
AC_MSG_RESULT([${can_compile_sse_assembly}])

dnl check whether on can compile with sse-gcc intrinsics
dnl NOTE(review): the header name of the #include in the probe programs was
dnl missing in the reviewed text and has been restored as x86intrin.h; confirm
dnl against the upstream configure.ac.
AC_MSG_CHECKING(whether we can compile SSE with gcc intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
 #include <x86intrin.h>
 int main(int argc, char **argv){
 double* q;
 __m128d h1 = _mm_loaddup_pd(q);
 return 0;
 }
 ])],
 [can_compile_sse_intrinsics=yes],
 [can_compile_sse_intrinsics=no]
)
AC_MSG_RESULT([${can_compile_sse_intrinsics}])

dnl All SSE intrinsic kernels are switched on or off together.
if test "${can_compile_sse_intrinsics}" = "yes"; then
  install_real_sse_intrinsics=yes
  install_real_sse_block2=yes
  install_real_sse_block4=yes
  install_real_sse_block6=yes

  install_complex_sse_intrinsics=yes
  install_complex_sse_block1=yes
  install_complex_sse_block2=yes
else
  install_real_sse_intrinsics=no
  install_real_sse_block2=no
  install_real_sse_block4=no
  install_real_sse_block6=no

  install_complex_sse_intrinsics=no
  install_complex_sse_block1=no
  install_complex_sse_block2=no
fi

dnl check whether one can compile with avx - gcc intrinsics

dnl first pass: try with specified CFLAGS and CXXFLAGS
AC_MSG_CHECKING([whether we can compile AVX intrinsics in C])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
 #include <x86intrin.h>
 int main(int argc, char **argv){
 double* q;
 __m256d a1_1 = _mm256_load_pd(q);
 return 0;
 }
 ])],
 [can_compile_avx=yes],
 [can_compile_avx=no]
)
AC_MSG_RESULT([${can_compile_avx}])

#if test "${can_compile_avx}" = "yes" ; then
#  AC_MSG_CHECKING([whether we can compile AVX intrinsics in C++])
#  AC_LANG_PUSH([C++])
#  AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#    #include <x86intrin.h>
#    int main(int 
argc, char **argv){
#    double* q;
#    __m256d a1_1 = _mm256_load_pd(q);
#    return 0;
#    }
#    ])],
#    [can_compile_avx=yes],
#    [can_compile_avx=no]
#  )
#  AC_LANG_POP([C++])
#  AC_MSG_RESULT([${can_compile_avx}])
#  if test "${can_compile_avx}" = "no" ; then
#    AC_MSG_WARN([Cannot compile C++ with AVX: disabling AVX alltogether])
#  fi
#fi

dnl AVX2 probe: uses an FMA intrinsic in addition to the AVX load.
dnl NOTE(review): the header name of the #include was missing in the reviewed
dnl text and has been restored as x86intrin.h; confirm against the upstream
dnl configure.ac.
AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
 #include <x86intrin.h>
 int main(int argc, char **argv){
 double* q;
 __m256d q1 = _mm256_load_pd(q);
 __m256d y1 = _mm256_fmadd_pd(q1, q1, q1);
 return 0;
 }
 ])],
 [can_compile_avx2=yes],
 [can_compile_avx2=no]
)
AC_MSG_RESULT([${can_compile_avx2}])

#if test "${can_compile_avx2}" = "yes" ; then
#  AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C++])
#  AC_LANG_PUSH([C++])
#  AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#    #include <x86intrin.h>
#    int main(int argc, char **argv){
#    double* q;
#    __m256d q1 = _mm256_load_pd(q);
#    __m256d y1 = _mm256_fmadd_pd(q1, q1, q1);
#    return 0;
#    }
#    ])],
#    [can_compile_avx2=yes],
#    [can_compile_avx2=no]
#  )
#  AC_LANG_POP([C++])
#  AC_MSG_RESULT([${can_compile_avx2}])
#  if test "${can_compile_avx2}" = "no" ; then
#    AC_MSG_WARN([Cannot compile C++ with AVX2!])
#  fi
#fi

dnl All AVX kernels are switched on or off together ...
if test "${can_compile_avx}" = "yes" ; then
  install_real_avx_block2=yes
  install_real_avx_block4=yes
  install_real_avx_block6=yes

  install_complex_avx_block1=yes
  install_complex_avx_block2=yes
else
  install_real_avx_block2=no
  install_real_avx_block4=no
  install_real_avx_block6=no

  install_complex_avx_block1=no
  install_complex_avx_block2=no
fi

dnl ... and likewise the AVX2 kernels.
if test "${can_compile_avx2}" = "yes" ; then
  install_real_avx2_block2=yes
  install_real_avx2_block4=yes
  install_real_avx2_block6=yes

  install_complex_avx2_block1=yes
  install_complex_avx2_block2=yes
else
  install_real_avx2_block2=no
  install_real_avx2_block4=no
  install_real_avx2_block6=no

  install_complex_avx2_block1=no
  install_complex_avx2_block2=no
fi

AM_CONDITIONAL([HAVE_SSE_ASSEMBLY],[test x"$can_compile_sse_assembly" = x"yes"])
if test 
x"${can_compile_sse_assembly}" = x"yes" ; then AC_DEFINE([HAVE_SSE_ASSEMBLY],[1],[assembly SSE is supported on this CPU]) fi AM_CONDITIONAL([HAVE_SSE_INTRINSICS],[test x"$can_compile_sse_intrinsics" = x"yes"]) if test x"${can_compile_sse_intrinsics}" = x"yes" ; then AC_DEFINE([HAVE_SSE_INTRINSICS],[1],[gcc intrinsics SSE is supported on this CPU]) fi AM_CONDITIONAL([HAVE_AVX],[test x"$can_compile_avx" = x"yes"]) if test x"${can_compile_avx}" = x"yes" ; then AC_DEFINE([HAVE_AVX],[1],[AVX is supported on this CPU]) fi AM_CONDITIONAL([HAVE_AVX2],[test x"$can_compile_avx2" = x"yes"]) if test x"${can_compile_avx2}" = x"yes" ; then AC_DEFINE([HAVE_AVX2],[1],[AVX2 is supported on this CPU]) fi dnl set the AVX optimization flags if this option is specified AC_MSG_CHECKING(whether AVX optimization flags should be set automatically) AC_ARG_WITH([avx-optimization], AS_HELP_STRING([--with-avx-optimization], [use AVX optimization, default no.]), [with_avx_optimization=yes], [with_avx_optimization=no]) AC_MSG_RESULT([${with_avx_optimization}]) if test x"${with_avx_optimization}" = x"yes"; then CFLAGS="$CFLAGS -funsafe-loop-optimizations -funsafe-math-optimizations -ftree-vect-loop-version -ftree-vectorize" CXXFLAGS="$CXXFLAGS -funsafe-loop-optimizations -funsafe-math-optimizations -ftree-vect-loop-version -ftree-vectorize" fi AC_LANG([Fortran]) AC_FC_FREEFORM AC_FC_MODULE_FLAG AC_FC_MODULE_OUTPUT_FLAG AC_FC_LIBRARY_LDFLAGS save_FCFLAGS=$FCFLAGS save_LDFLAGS=$LDFLAGS AC_ARG_VAR([SCALAPACK_LDFLAGS],[Extra LDFLAGS necessary to link a program with Scalapack]) AC_ARG_VAR([SCALAPACK_FCFLAGS],[Extra FCFLAGS necessary to compile a Fortran program with Scalapack]) FCFLAGS="$FCFLAGS $SCALAPACK_FCFLAGS" LDFLAGS="$LDFLAGS $SCALAPACK_LDFLAGS" dnl check whether fortran error_unit is defined AC_MSG_CHECKING([whether Fortran module iso_fortran_env is available]) AC_COMPILE_IFELSE([AC_LANG_SOURCE([ program test_error_unit use ISO_FORTRAN_ENV, only : error_unit implicit none write(error_unit,*) 
"error_unit is defined" end program ])], [can_use_iso_fortran_env=yes], [can_use_iso_fortran_env=no] ) AC_MSG_RESULT([${can_use_iso_fortran_env}]) dnl check whether one can link with specified MKL (desired method) AC_MSG_CHECKING([whether we can compile a Fortran program using MKL]) AC_COMPILE_IFELSE([AC_LANG_SOURCE([ program test_mkl use mkl_service character*198 :: string call mkl_get_version_string(string) write(*,'(a)') string end program ])], [can_compile_with_mkl=yes], [can_compile_with_mkl=no] ) AC_MSG_RESULT([${can_compile_with_mkl}]) if test x"$can_compile_with_mkl" = x"yes" ; then AC_MSG_CHECKING([whether we can link a Fortran program with MKL]) AC_LINK_IFELSE([AC_LANG_SOURCE([ program test_mkl use mkl_service character*198 :: string call mkl_get_version_string(string) write(*,'(a)') string end program ])], [have_mkl=yes], [have_mkl=no] ) AC_MSG_RESULT([${have_mkl}]) fi dnl if not mkl, check all the necessary individually if test x"${have_mkl}" = x"yes" ; then WITH_MKL=1 else dnl first check blas AC_SEARCH_LIBS([dgemm],[blas],[have_blas=yes],[have_blas=no]) AC_MSG_CHECKING([whether we can link a program with a blas lib]) AC_MSG_RESULT([${have_blas}]) if test x"${have_blas}" = x"no" ; then AC_MSG_ERROR([could not link with blas: specify path]) fi dnl now lapack AC_SEARCH_LIBS([dlarrv],[lapack],[have_lapack=yes],[have_lapack=no]) AC_MSG_CHECKING([whether we can link a program with a lapack lib]) AC_MSG_RESULT([${have_lapack}]) if test x"${have_lapack}" = x"no" ; then AC_MSG_ERROR([could not link with lapack: specify path]) fi if test x"${with_mpi}" = x"yes"; then dnl test whether scalapack already contains blacs scalapack_libs="mpiscalapack scalapack scalapack-openmpi" old_LIBS="$LIBS" for lib in ${scalapack_libs}; do LIBS="-l${lib} ${old_LIBS}" AC_MSG_CHECKING([whether -l${lib} already contains a BLACS implementation]) AC_LINK_IFELSE([AC_LANG_FUNC_LINK_TRY([blacs_gridinit])],[blacs_in_scalapack=yes],[blacs_in_scalapack=no]) 
AC_MSG_RESULT([${blacs_in_scalapack}]) if test x"${blacs_in_scalapack}" = x"yes"; then break fi done if test x"${blacs_in_scalapack}" = x"no"; then LIBS="${old_LIBS}" dnl Test for stand-alone blacs AC_SEARCH_LIBS([bi_f77_init],[mpiblacsF77init],[],[],[-lmpiblacs]) dnl for debian AC_SEARCH_LIBS([blacs_gridinit],[blacs-openmpi],[have_blacs=yes],[have_blacs=no],[-lblacsCinit-openmpi -lscalapack-openmpi]) if test x"${have_blacs}" = x"no"; then unset ac_cv_search_blacs_gridinit fi AC_SEARCH_LIBS([blacs_gridinit],[mpiblacs blacs],[have_blacs=yes],[have_blacs=no]) if test x"${have_blacs}" = x"no"; then AC_MSG_ERROR([No usable BLACS found. If installed in a non-standard place, please specify suitable LDFLAGS and FCFLAGS as arguments to configure]) fi fi AC_SEARCH_LIBS([pdtran],[$scalapack_libs],[have_scalapack=yes],[have_scalapack=no]) if test x"${have_scalapack}" = x"no" ; then AC_MSG_ERROR([could not link with scalapack: specify path]) fi fi dnl check whether we can link alltogehter AC_MSG_CHECKING([whether we can link a Fortran program with all blacs/scalapack]) AC_LINK_IFELSE([AC_LANG_SOURCE([ program dgemm_test integer , parameter:: M = 4, N = 3, K = 2 real :: A(M,K), B(K,N), C(M,N) call dgemm('N','N',M,N,K,1.0,A,M,B,K,0.0,C,M) end program dgemm_test ])], [can_link_with_blacs_scalapack=yes], [can_link_with_blacs_scalapack=no] ) AC_MSG_RESULT([${can_link_with_blacs_scalapack}]) if test x"${can_link_with_blacs_scalapack}" = x"yes" ; then WITH_BLACS=1 else AC_MSG_ERROR([We can neither link with MKL or another Scalpack. Please specify SCALAPACK_LDFLAGS and SCALAPACK_FCFLAGS!]) fi fi dnl important: reset them again! 
FCFLAGS=$save_FCFLAGS
LDFLAGS=$save_LDFLAGS

dnl ---------------------------------------------------------------------------
dnl Fortran 2003 intrinsic: can the program query environment variables?
dnl ---------------------------------------------------------------------------
AC_MSG_CHECKING([whether we can use the intrinsic Fortran function "get_environment_variable"])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
  program test_get_environment
    character(len=256) :: homedir
    call get_environment_variable("HOME",homedir)
  end program
])],
  [fortran_can_check_environment=yes],
  [fortran_can_check_environment=no]
)
AC_MSG_RESULT([${fortran_can_check_environment}])

dnl Kernel availability. SSE and AVX were probed earlier in this file,
dnl only the two BlueGene variants remain to be tested here.

dnl BlueGene/P kernel: link a program using the fxcpmadd intrinsic
AC_MSG_CHECKING([whether we can compile with BGP intrinsics])
AC_LINK_IFELSE([AC_LANG_SOURCE([
  program test_bgp
    complex*16 :: y3,q3,h2
    y3 = fxcpmadd(y3,q3,h2)
  end program
])],
  [can_compile_bgp=yes],
  [can_compile_bgp=no]
)
AC_MSG_RESULT([${can_compile_bgp}])

dnl the probe result is exactly yes or no, so both install flags simply copy it
install_real_bgp=${can_compile_bgp}
install_complex_bgp=${can_compile_bgp}

dnl BlueGene/Q kernel: link a program using the QPX vector type and VEC_SPLATS
AC_MSG_CHECKING([whether we can compile with BGQ intrinsics])
AC_LINK_IFELSE([AC_LANG_SOURCE([
  program test_bgq
    VECTOR(REAL(8))::QPX_h2
    real*8 :: hh(10,2)
    QPX_h2 = VEC_SPLATS(hh(2,2))
  end program
])],
  [can_compile_bgq=yes],
  [can_compile_bgq=no]
)
AC_MSG_RESULT([${can_compile_bgq}])

dnl same scheme as for BGP
install_real_bgq=${can_compile_bgq}
install_complex_bgq=${can_compile_bgq}

dnl kernel selection through an environment variable needs the intrinsic tested above
AS_IF([test x"${fortran_can_check_environment}" = x"yes"],
      [AC_DEFINE([HAVE_ENVIRONMENT_CHECKING],[1],[Fortran can querry environment variables])])

dnl Finally find out whether the user asked for exactly one specific kernel.
dnl The option macros used for that are pulled in from the included m4 file.
m4_include([m4/ax_elpa_specific_kernels.m4])

dnl real kernels
dnl use_specific_real_kernel must stay: the option macros rely on it
use_specific_real_kernel=no

dnl generic kernel
dnl Each call below registers one option that restricts the build to a single
dnl real kernel. Arguments as used here: option name, kernel name, and the
dnl install_* shell variable belonging to that kernel.
dnl All three arguments are fully quoted; two of the option names previously
dnl had the closing quote bracket in front of the -only suffix.
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-generic-kernel-only],[generic-kernel],[install_real_generic])

dnl generic-simple kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-generic-simple-kernel-only],[generic-simple-kernel],[install_real_generic_simple])

dnl sse kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-assembly-kernel-only],[sse-assembly-kernel],[install_real_sse_assembly])

dnl bgp kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgp-kernel-only],[bgp-kernel],[install_real_bgp])

dnl bgq kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgq-kernel-only],[bgq-kernel],[install_real_bgq])

dnl real-sse-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block2-kernel-only],[real-sse-block2-kernel],[install_real_sse_block2])

dnl real-sse-block4 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block4-kernel-only],[real-sse-block4-kernel],[install_real_sse_block4])

dnl real-sse-block6 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-block6-kernel-only],[real-sse-block6-kernel],[install_real_sse_block6])

dnl real-avx-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block2-kernel-only],[real-avx-block2-kernel],[install_real_avx_block2])

dnl real-avx-block4 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block4-kernel-only],[real-avx-block4-kernel],[install_real_avx_block4])

dnl real-avx-block6 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block6-kernel-only],[real-avx-block6-kernel],[install_real_avx_block6])

dnl complex kernels
dnl do not remove this variable it is needed in the macros
use_specific_complex_kernel=no

dnl generic kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-generic-kernel-only],[generic-kernel],[install_complex_generic])

dnl generic-simple kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-generic-simple-kernel-only],[generic-simple-kernel],[install_complex_generic_simple])

dnl sse kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-assembly-kernel-only],[sse-assembly-kernel],[install_complex_sse_assembly])

dnl complex-bgp kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgp-kernel-only],[bgp-kernel],[install_complex_bgp])

dnl complex-bgq kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgq-kernel-only],[bgq-kernel],[install_complex_bgq])

dnl complex-sse-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block1-kernel-only],[complex-sse-block1-kernel],[install_complex_sse_block1])

dnl complex-sse-block2 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-block2-kernel-only],[complex-sse-block2-kernel],[install_complex_sse_block2])

dnl complex-avx-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block1-kernel-only],[complex-avx-block1-kernel],[install_complex_avx_block1])

dnl complex-avx-block2 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block2-kernel-only],[complex-avx-block2-kernel],[install_complex_avx_block2])

dnl ---------------------------------------------------------------------------
dnl Turn the results of the previous tests into automake conditionals and
dnl config.h macros. For every kernel the conditional is declared
dnl unconditionally and the macro is defined only when the install flag is yes.
dnl ---------------------------------------------------------------------------
AS_IF([test x"${can_use_iso_fortran_env}" = x"yes"],
      [AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env])])

dnl generic kernels
AM_CONDITIONAL([WITH_REAL_GENERIC_KERNEL],[test x"$install_real_generic" = x"yes"])
AS_IF([test x"${install_real_generic}" = x"yes"],
      [AC_DEFINE([WITH_REAL_GENERIC_KERNEL],[1],[can use real generic kernel])])

AM_CONDITIONAL([WITH_COMPLEX_GENERIC_KERNEL],[test x"$install_complex_generic" = x"yes"])
AS_IF([test x"${install_complex_generic}" = x"yes"],
      [AC_DEFINE([WITH_COMPLEX_GENERIC_KERNEL],[1],[can use complex generic kernel])])

dnl generic-simple kernels
AM_CONDITIONAL([WITH_REAL_GENERIC_SIMPLE_KERNEL],[test x"$install_real_generic_simple" = x"yes"])
AS_IF([test x"${install_real_generic_simple}" = x"yes"],
      [AC_DEFINE([WITH_REAL_GENERIC_SIMPLE_KERNEL],[1],[can use real generic-simple kernel])])

AM_CONDITIONAL([WITH_COMPLEX_GENERIC_SIMPLE_KERNEL],[test x"$install_complex_generic_simple" = x"yes"])
AS_IF([test x"${install_complex_generic_simple}" = x"yes"],
      [AC_DEFINE([WITH_COMPLEX_GENERIC_SIMPLE_KERNEL],[1],[can use complex generic-simple kernel])])

dnl SSE assembly kernels
AM_CONDITIONAL([WITH_REAL_SSE_ASSEMBLY_KERNEL],[test x"$install_real_sse_assembly" = x"yes"])
AS_IF([test x"${install_real_sse_assembly}" = x"yes"],
      [AC_DEFINE([WITH_REAL_SSE_ASSEMBLY_KERNEL],[1],[can use real SSE assembly kernel])])

AM_CONDITIONAL([WITH_COMPLEX_SSE_ASSEMBLY_KERNEL],[test x"$install_complex_sse_assembly" = x"yes"])
AS_IF([test x"${install_complex_sse_assembly}" = x"yes"],
      [AC_DEFINE([WITH_COMPLEX_SSE_ASSEMBLY_KERNEL],[1],[can use complex SSE assembly kernel])])

dnl real SSE intrinsic kernels
AM_CONDITIONAL([WITH_REAL_SSE_BLOCK2_KERNEL],[test x"$install_real_sse_block2" = x"yes"])
AS_IF([test x"${install_real_sse_block2}" = x"yes"],
      [AC_DEFINE([WITH_REAL_SSE_BLOCK2_KERNEL],[1],[can use real_sse_block2 kernel])])

AM_CONDITIONAL([WITH_REAL_SSE_BLOCK4_KERNEL],[test x"$install_real_sse_block4" = x"yes"])
AS_IF([test x"${install_real_sse_block4}" = x"yes"],
      [AC_DEFINE([WITH_REAL_SSE_BLOCK4_KERNEL],[1],[can use real_sse_block4 kernel])])

AM_CONDITIONAL([WITH_REAL_SSE_BLOCK6_KERNEL],[test x"$install_real_sse_block6" = x"yes"])
AS_IF([test x"${install_real_sse_block6}" = x"yes"],
      [AC_DEFINE([WITH_REAL_SSE_BLOCK6_KERNEL],[1],[can use real_sse_block6 kernel])])

dnl real AVX intrinsic kernels
AM_CONDITIONAL([WITH_REAL_AVX_BLOCK2_KERNEL],[test x"$install_real_avx_block2" = x"yes"])
AS_IF([test x"${install_real_avx_block2}" = x"yes"],
      [AC_DEFINE([WITH_REAL_AVX_BLOCK2_KERNEL],[1],[can use real_avx_block2 kernel])])

AM_CONDITIONAL([WITH_REAL_AVX_BLOCK4_KERNEL],[test x"$install_real_avx_block4" = x"yes"])
AS_IF([test x"${install_real_avx_block4}" = x"yes"],
      [AC_DEFINE([WITH_REAL_AVX_BLOCK4_KERNEL],[1],[can use real_avx_block4 kernel])])

AM_CONDITIONAL([WITH_REAL_AVX_BLOCK6_KERNEL],[test x"$install_real_avx_block6" = x"yes"])
AS_IF([test x"${install_real_avx_block6}" = x"yes"],
      [AC_DEFINE([WITH_REAL_AVX_BLOCK6_KERNEL],[1],[can use real_avx_block6 kernel])])
dnl real AVX2 intrinsic kernels: conditional is always declared, the config.h
dnl macro only when the matching install flag is yes
AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK2_KERNEL],[test x"$install_real_avx2_block2" = x"yes"])
AS_IF([test x"${install_real_avx2_block2}" = x"yes"],
      [AC_DEFINE([WITH_REAL_AVX2_BLOCK2_KERNEL],[1],[can use real_avx2_block2 kernel])])

AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK4_KERNEL],[test x"$install_real_avx2_block4" = x"yes"])
AS_IF([test x"${install_real_avx2_block4}" = x"yes"],
      [AC_DEFINE([WITH_REAL_AVX2_BLOCK4_KERNEL],[1],[can use real_avx2_block4 kernel])])

AM_CONDITIONAL([WITH_REAL_AVX2_BLOCK6_KERNEL],[test x"$install_real_avx2_block6" = x"yes"])
AS_IF([test x"${install_real_avx2_block6}" = x"yes"],
      [AC_DEFINE([WITH_REAL_AVX2_BLOCK6_KERNEL],[1],[can use real_avx2_block6 kernel])])

dnl complex SSE intrinsic kernels
AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[test x"$install_complex_sse_block1" = x"yes"])
AS_IF([test x"${install_complex_sse_block1}" = x"yes"],
      [AC_DEFINE([WITH_COMPLEX_SSE_BLOCK1_KERNEL],[1],[can use complex_sse_block1 kernel])])

AM_CONDITIONAL([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[test x"$install_complex_sse_block2" = x"yes"])
AS_IF([test x"${install_complex_sse_block2}" = x"yes"],
      [AC_DEFINE([WITH_COMPLEX_SSE_BLOCK2_KERNEL],[1],[can use complex_sse_block2 kernel])])

dnl complex AVX intrinsic kernels
AM_CONDITIONAL([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[test x"$install_complex_avx_block1" = x"yes"])
AS_IF([test x"${install_complex_avx_block1}" = x"yes"],
      [AC_DEFINE([WITH_COMPLEX_AVX_BLOCK1_KERNEL],[1],[can use complex_avx_block1 kernel])])

AM_CONDITIONAL([WITH_COMPLEX_AVX_BLOCK2_KERNEL],[test x"$install_complex_avx_block2" = x"yes"])
AS_IF([test x"${install_complex_avx_block2}" = x"yes"],
      [AC_DEFINE([WITH_COMPLEX_AVX_BLOCK2_KERNEL],[1],[can use complex_avx_block2 kernel])])

dnl complex AVX2 intrinsic kernels
AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[test x"$install_complex_avx2_block1" = x"yes"])
AS_IF([test x"${install_complex_avx2_block1}" = x"yes"],
      [AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK1_KERNEL],[1],[can use complex_avx2_block1 kernel])])

AM_CONDITIONAL([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[test x"$install_complex_avx2_block2" = x"yes"])
AS_IF([test x"${install_complex_avx2_block2}" = x"yes"],
      [AC_DEFINE([WITH_COMPLEX_AVX2_BLOCK2_KERNEL],[1],[can use complex_avx2_block2 kernel])])

dnl BlueGene kernels (real variants only)
AM_CONDITIONAL([WITH_REAL_BGP_KERNEL],[test x"$install_real_bgp" = x"yes"])
AS_IF([test x"${install_real_bgp}" = x"yes"],
      [AC_DEFINE([WITH_REAL_BGP_KERNEL],[1],[can use real BGP kernel])])

AM_CONDITIONAL([WITH_REAL_BGQ_KERNEL],[test x"$install_real_bgq" = x"yes"])
AS_IF([test x"${install_real_bgq}" = x"yes"],
      [AC_DEFINE([WITH_REAL_BGQ_KERNEL],[1],[can use real BGQ kernel])])

dnl exactly one of the two macros of each pair below ends up in config.h
AS_IF([test x"${use_specific_complex_kernel}" = x"no"],
      [AC_DEFINE([WITH_NO_SPECIFIC_COMPLEX_KERNEL],[1],[do not use only one specific complex kernel (set at compile time)])],
      [AC_DEFINE([WITH_ONE_SPECIFIC_COMPLEX_KERNEL],[1],[use only one specific complex kernel (set at compile time)])])

AS_IF([test x"${use_specific_real_kernel}" = x"no"],
      [AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)])],
      [AC_DEFINE([WITH_ONE_SPECIFIC_REAL_KERNEL],[1],[use only one specific real kernel (set at compile time)])])

dnl libtool
LT_INIT

dnl doxygen: man pages and HTML are switched on, PDF and PostScript off
DX_PDF_FEATURE(OFF)
DX_PS_FEATURE(OFF)
DX_MAN_FEATURE(ON)
DX_HTML_FEATURE(ON)
DX_INIT_DOXYGEN([ELPA], [Doxyfile], [docs])

dnl NOTE(review): the variable is set to 0 right before it is compared with
dnl yes, so the define below is never emitted as written - presumably a manual
dnl switch, confirm the intended value.
DESPERATELY_WANT_ASSUMED_SIZE=0
AS_IF([test x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes"],
      [AC_DEFINE([DESPERATELY_WANT_ASSUMED_SIZE],[1],[use assumed size arrays, even if not debuggable])])

dnl values substituted into the generated files
AC_SUBST([WITH_MKL])
AC_SUBST([WITH_BLACS])
AC_SUBST([with_amd_bulldozer_kernel])
AC_SUBST([FC_MODINC])
AC_SUBST([FC_MODOUT])
AC_SUBST([OPENMP_CFLAGS])
AC_SUBST([OPENMP_FCFLAGS])
AC_SUBST([OPENMP_LDFLAGS])
#AC_SUBST(OPT_FCFLAGS)
AC_SUBST([DOXYGEN_OUTPUT_DIR], [docs])

dnl start from an empty modules directory and drop stale Fortran dependency data
rm -rf modules/ .fortran_dependencies/
mkdir modules

#gl_VISIBILITY
#AH_BOTTOM([#if HAVE_VISIBILITY
#define EXPORTED __attribute__((__visibility__("default")))
#define HIDDEN __attribute__((__visibility__("hidden")))
#else
#define EXPORTED
#define HIDDEN
#endif])

# Some part of libtool is too smart and tries to
parse the output of # gfortran -v # and catches anything that resembles a -l library link specification. # Unfortunately, recent versions of gfortran emit # -l gfortran # with a space between -l and gfortran. The empty -l is then included # into "postdeps_FC" and causes linking errors later on. postdeps_FC=$(echo $postdeps_FC | sed 's/-l //g') if test x"${enable_openmp}" = x"yes"; then SUFFIX="_openmp" else SUFFIX="" fi AC_SUBST([SUFFIX]) AC_SUBST([PKG_CONFIG_FILE],[elpa${SUFFIX}-${PACKAGE_VERSION}.pc]) AC_CONFIG_FILES([ Makefile Doxyfile ${PKG_CONFIG_FILE}:elpa.pc.in ]) AC_OUTPUT if test "${can_compile_avx}" = "no" ; then # if test x"${want_avx}" = x"yes" ; then AC_MSG_WARN([Could not compile AVX instructions]) # fi fi if test "${can_compile_avx2}" = "no" ; then # if test x"${want_avx2}" = x"yes" ; then AC_MSG_WARN([Could not compile AVX2 instructions]) # fi fi make -f $srcdir/generated_headers.am generated-headers top_srcdir="$srcdir" elpa-2016.05.001/config.guess0000755000312500001440000012442112717533405012355 00000000000000#! /bin/sh # Attempt to guess a canonical system name. # Copyright 1992-2014 Free Software Foundation, Inc. timestamp='2014-11-04' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, see . 
# # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under # the same distribution terms that you use for the rest of that # program. This Exception is an additional permission under section 7 # of the GNU General Public License, version 3 ("GPLv3"). # # Originally written by Per Bothner; maintained since 2000 by Ben Elliston. # # You can get the latest version of this script from: # http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD # # Please send patches to . me=`echo "$0" | sed -e 's,.*/,,'` usage="\ Usage: $0 [OPTION] Output the configuration name of the system \`$me' is run on. Operation modes: -h, --help print this help, then exit -t, --time-stamp print date of last modification, then exit -v, --version print version number, then exit Report bugs and patches to ." version="\ GNU config.guess ($timestamp) Originally written by Per Bothner. Copyright 1992-2014 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." help=" Try \`$me --help' for more information." # Parse command line while test $# -gt 0 ; do case $1 in --time-stamp | --time* | -t ) echo "$timestamp" ; exit ;; --version | -v ) echo "$version" ; exit ;; --help | --h* | -h ) echo "$usage"; exit ;; -- ) # Stop option processing shift; break ;; - ) # Use stdin as input. break ;; -* ) echo "$me: invalid option $1$help" >&2 exit 1 ;; * ) break ;; esac done if test $# != 0; then echo "$me: too many arguments$help" >&2 exit 1 fi trap 'exit 1' 1 2 15 # CC_FOR_BUILD -- compiler used by this script. Note that the use of a # compiler to aid in system detection is discouraged as it requires # temporary files to be created and, as you can see below, it is a # headache to deal with in a portable fashion. 
# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still # use `HOST_CC' if defined, but it is deprecated. # Portable tmp directory creation inspired by the Autoconf team. set_cc_for_build=' trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ; : ${TMPDIR=/tmp} ; { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; dummy=$tmp/dummy ; tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; case $CC_FOR_BUILD,$HOST_CC,$CC in ,,) echo "int x;" > $dummy.c ; for c in cc gcc c89 c99 ; do if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then CC_FOR_BUILD="$c"; break ; fi ; done ; if test x"$CC_FOR_BUILD" = x ; then CC_FOR_BUILD=no_compiler_found ; fi ;; ,,*) CC_FOR_BUILD=$CC ;; ,*,*) CC_FOR_BUILD=$HOST_CC ;; esac ; set_cc_for_build= ;' # This is needed to find uname on a Pyramid OSx when run in the BSD universe. # (ghazi@noc.rutgers.edu 1994-08-24) if (test -f /.attbin/uname) >/dev/null 2>&1 ; then PATH=$PATH:/.attbin ; export PATH fi UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown case "${UNAME_SYSTEM}" in Linux|GNU|GNU/*) # If the system lacks a compiler, then just pick glibc. # We could probably try harder. 
LIBC=gnu eval $set_cc_for_build cat <<-EOF > $dummy.c #include #if defined(__UCLIBC__) LIBC=uclibc #elif defined(__dietlibc__) LIBC=dietlibc #else LIBC=gnu #endif EOF eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC' | sed 's, ,,g'` ;; esac case "${UNAME_MACHINE}" in i?86) test -z "$VENDOR" && VENDOR=pc ;; *) test -z "$VENDOR" && VENDOR=unknown ;; esac test -f /etc/SuSE-release -o -f /.buildenv && VENDOR=suse # Note: order is significant - the case branches are not exclusive. case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in *:NetBSD:*:*) # NetBSD (nbsd) targets should (where applicable) match one or # more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*, # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently # switched to ELF, *-*-netbsd* would select the old # object file format. This provides both forward # compatibility and a consistent mechanism for selecting the # object file format. # # Note: NetBSD doesn't particularly care about the vendor # portion of the name. We always set it to "unknown". sysctl="sysctl -n hw.machine_arch" UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \ /usr/sbin/$sysctl 2>/dev/null || echo unknown)` case "${UNAME_MACHINE_ARCH}" in armeb) machine=armeb-unknown ;; arm*) machine=arm-unknown ;; sh3el) machine=shl-unknown ;; sh3eb) machine=sh-unknown ;; sh5el) machine=sh5le-unknown ;; *) machine=${UNAME_MACHINE_ARCH}-unknown ;; esac # The Operating System including object format, if it has switched # to ELF recently, or will in the future. case "${UNAME_MACHINE_ARCH}" in arm*|i386|m68k|ns32k|sh3*|sparc|vax) eval $set_cc_for_build if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ELF__ then # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). # Return netbsd for either. FIX? os=netbsd else os=netbsdelf fi ;; *) os=netbsd ;; esac # The OS release # Debian GNU/NetBSD machines have a different userland, and # thus, need a distinct triplet. 
However, they do not need # kernel version information, so it can be replaced with a # suitable tag, in the style of linux-gnu. case "${UNAME_VERSION}" in Debian*) release='-gnu' ;; *) release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` ;; esac # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: # contains redundant information, the shorter form: # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. echo "${machine}-${os}${release}" exit ;; *:Bitrig:*:*) UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'` echo ${UNAME_MACHINE_ARCH}-${VENDOR}-bitrig${UNAME_RELEASE} exit ;; *:OpenBSD:*:*) UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` echo ${UNAME_MACHINE_ARCH}-${VENDOR}-openbsd${UNAME_RELEASE} exit ;; *:ekkoBSD:*:*) echo ${UNAME_MACHINE}-${VENDOR}-ekkobsd${UNAME_RELEASE} exit ;; *:SolidBSD:*:*) echo ${UNAME_MACHINE}-${VENDOR}-solidbsd${UNAME_RELEASE} exit ;; macppc:MirBSD:*:*) echo powerpc-${VENDOR}-mirbsd${UNAME_RELEASE} exit ;; *:MirBSD:*:*) echo ${UNAME_MACHINE}-${VENDOR}-mirbsd${UNAME_RELEASE} exit ;; alpha:OSF1:*:*) case $UNAME_RELEASE in *4.0) UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` ;; *5.*) UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` ;; esac # According to Compaq, /usr/sbin/psrinfo has been available on # OSF/1 and Tru64 systems produced since 1995. I hope that # covers most systems running today. This code pipes the CPU # types through head -n 1, so we only detect the type of CPU 0. 
ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` case "$ALPHA_CPU_TYPE" in "EV4 (21064)") UNAME_MACHINE="alpha" ;; "EV4.5 (21064)") UNAME_MACHINE="alpha" ;; "LCA4 (21066/21068)") UNAME_MACHINE="alpha" ;; "EV5 (21164)") UNAME_MACHINE="alphaev5" ;; "EV5.6 (21164A)") UNAME_MACHINE="alphaev56" ;; "EV5.6 (21164PC)") UNAME_MACHINE="alphapca56" ;; "EV5.7 (21164PC)") UNAME_MACHINE="alphapca57" ;; "EV6 (21264)") UNAME_MACHINE="alphaev6" ;; "EV6.7 (21264A)") UNAME_MACHINE="alphaev67" ;; "EV6.8CB (21264C)") UNAME_MACHINE="alphaev68" ;; "EV6.8AL (21264B)") UNAME_MACHINE="alphaev68" ;; "EV6.8CX (21264D)") UNAME_MACHINE="alphaev68" ;; "EV6.9A (21264/EV69A)") UNAME_MACHINE="alphaev69" ;; "EV7 (21364)") UNAME_MACHINE="alphaev7" ;; "EV7.9 (21364A)") UNAME_MACHINE="alphaev79" ;; esac # A Pn.n version is a patched version. # A Vn.n version is a released version. # A Tn.n version is a released field test version. # A Xn.n version is an unreleased experimental baselevel. # 1.2 uses "1.2" for uname -r. echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` # Reset EXIT trap before exiting to avoid spurious non-zero exit code. exitcode=$? trap '' 0 exit $exitcode ;; Alpha\ *:Windows_NT*:*) # How do we know it's Interix rather than the generic POSIX subsystem? # Should we change UNAME_MACHINE based on the output of uname instead # of the specific Alpha model? 
echo alpha-pc-interix exit ;; 21064:Windows_NT:50:3) echo alpha-dec-winnt3.5 exit ;; Amiga*:UNIX_System_V:4.0:*) echo m68k-${VENDOR}-sysv4 exit ;; *:[Aa]miga[Oo][Ss]:*:*) echo ${UNAME_MACHINE}-${VENDOR}-amigaos exit ;; *:[Mm]orph[Oo][Ss]:*:*) echo ${UNAME_MACHINE}-${VENDOR}-morphos exit ;; *:OS/390:*:*) echo i370-ibm-openedition exit ;; *:z/VM:*:*) echo s390-ibm-zvmoe exit ;; *:OS400:*:*) echo powerpc-ibm-os400 exit ;; arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) echo arm-acorn-riscix${UNAME_RELEASE} exit ;; arm*:riscos:*:*|arm*:RISCOS:*:*) echo arm-${VENDOR}-riscos exit ;; SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) echo hppa1.1-hitachi-hiuxmpp exit ;; Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. if test "`(/bin/universe) 2>/dev/null`" = att ; then echo pyramid-pyramid-sysv3 else echo pyramid-pyramid-bsd fi exit ;; NILE*:*:*:dcosx) echo pyramid-pyramid-svr4 exit ;; DRS?6000:unix:4.0:6*) echo sparc-icl-nx6 exit ;; DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) case `/usr/bin/uname -p` in sparc) echo sparc-icl-nx7; exit ;; esac ;; s390x:SunOS:*:*) echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4H:SunOS:5.*:*) echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*) echo i386-pc-auroraux${UNAME_RELEASE} exit ;; i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) eval $set_cc_for_build SUN_ARCH="i386" # If there is a compiler, see if it is configured for 64-bit objects. # Note that the Sun cc does not turn __LP64__ into 1 like gcc does. # This test works for both compilers. 
if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \ (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ grep IS_64BIT_ARCH >/dev/null then SUN_ARCH="x86_64" fi fi echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4*:SunOS:6*:*) # According to config.sub, this is the proper way to canonicalize # SunOS6. Hard to guess exactly what SunOS6 will be like, but # it's likely to be more like Solaris than SunOS4. echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4*:SunOS:*:*) case "`/usr/bin/arch -k`" in Series*|S4*) UNAME_RELEASE=`uname -v` ;; esac # Japanese Language versions have a version number like `4.1.3-JL'. echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` exit ;; sun3*:SunOS:*:*) echo m68k-sun-sunos${UNAME_RELEASE} exit ;; sun*:*:4.2BSD:*) UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 case "`/bin/arch`" in sun3) echo m68k-sun-sunos${UNAME_RELEASE} ;; sun4) echo sparc-sun-sunos${UNAME_RELEASE} ;; esac exit ;; aushp:SunOS:*:*) echo sparc-auspex-sunos${UNAME_RELEASE} exit ;; # The situation for MiNT is a little confusing. The machine name # can be virtually everything (everything which is not # "atarist" or "atariste" at least should have a processor # > m68000). The system name ranges from "MiNT" over "FreeMiNT" # to the lowercase version "mint" (or "freemint"). Finally # the system name "TOS" denotes a system which is actually not # MiNT. But MiNT is downward compatible to TOS, so this should # be no problem. 
atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) echo m68k-atari-mint${UNAME_RELEASE} exit ;; atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) echo m68k-atari-mint${UNAME_RELEASE} exit ;; *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) echo m68k-atari-mint${UNAME_RELEASE} exit ;; milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) echo m68k-milan-mint${UNAME_RELEASE} exit ;; hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) echo m68k-hades-mint${UNAME_RELEASE} exit ;; *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) echo m68k-${VENDOR}-mint${UNAME_RELEASE} exit ;; m68k:machten:*:*) echo m68k-apple-machten${UNAME_RELEASE} exit ;; powerpc:machten:*:*) echo powerpc-apple-machten${UNAME_RELEASE} exit ;; RISC*:Mach:*:*) echo mips-dec-mach_bsd4.3 exit ;; RISC*:ULTRIX:*:*) echo mips-dec-ultrix${UNAME_RELEASE} exit ;; VAX*:ULTRIX*:*:*) echo vax-dec-ultrix${UNAME_RELEASE} exit ;; 2020:CLIX:*:* | 2430:CLIX:*:*) echo clipper-intergraph-clix${UNAME_RELEASE} exit ;; mips:*:*:UMIPS | mips:*:*:RISCos) eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #ifdef __cplusplus #include /* for printf() prototype */ int main (int argc, char *argv[]) { #else int main (argc, argv) int argc; char *argv[]; { #endif #if defined (host_mips) && defined (MIPSEB) #if defined (SYSTYPE_SYSV) printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); #endif #if defined (SYSTYPE_SVR4) printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); #endif #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); #endif #endif exit (-1); } EOF $CC_FOR_BUILD -o $dummy $dummy.c && dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` && SYSTEM_NAME=`$dummy $dummyarg` && { echo "$SYSTEM_NAME"; exit; } echo mips-mips-riscos${UNAME_RELEASE} exit ;; Motorola:PowerMAX_OS:*:*) echo powerpc-motorola-powermax exit ;; Motorola:*:4.3:PL8-*) echo powerpc-harris-powermax exit ;; Night_Hawk:*:*:PowerMAX_OS | 
Synergy:PowerMAX_OS:*:*) echo powerpc-harris-powermax exit ;; Night_Hawk:Power_UNIX:*:*) echo powerpc-harris-powerunix exit ;; m88k:CX/UX:7*:*) echo m88k-harris-cxux7 exit ;; m88k:*:4*:R4*) echo m88k-motorola-sysv4 exit ;; m88k:*:3*:R3*) echo m88k-motorola-sysv3 exit ;; AViiON:dgux:*:*) # DG/UX returns AViiON for all architectures UNAME_PROCESSOR=`/usr/bin/uname -p` if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] then if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ [ ${TARGET_BINARY_INTERFACE}x = x ] then echo m88k-dg-dgux${UNAME_RELEASE} else echo m88k-dg-dguxbcs${UNAME_RELEASE} fi else echo i586-dg-dgux${UNAME_RELEASE} fi exit ;; M88*:DolphinOS:*:*) # DolphinOS (SVR3) echo m88k-dolphin-sysv3 exit ;; M88*:*:R3*:*) # Delta 88k system running SVR3 echo m88k-motorola-sysv3 exit ;; XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) echo m88k-tektronix-sysv3 exit ;; Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) echo m68k-tektronix-bsd exit ;; *:IRIX*:*:*) echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` exit ;; ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. 
echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id exit ;; # Note that: echo "'`uname -s`'" gives 'AIX ' i*86:AIX:*:*) echo i386-ibm-aix exit ;; ia64:AIX:*:*) if [ -x /usr/bin/oslevel ] ; then IBM_REV=`/usr/bin/oslevel` else IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} fi echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} exit ;; *:AIX:2:3) if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #include main() { if (!__power_pc()) exit(1); puts("powerpc-ibm-aix3.2.5"); exit(0); } EOF if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` then echo "$SYSTEM_NAME" else echo rs6000-ibm-aix3.2.5 fi elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then echo rs6000-ibm-aix3.2.4 else echo rs6000-ibm-aix3.2 fi exit ;; *:AIX:*:[4567]) IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then IBM_ARCH=rs6000 else IBM_ARCH=powerpc fi if [ -x /usr/bin/lslpp ] ; then IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc | awk -F: '{ print $3 }' | sed s/[0-9]*$/0/` else IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} fi echo ${IBM_ARCH}-ibm-aix${IBM_REV} exit ;; *:AIX:*:*) echo rs6000-ibm-aix exit ;; ibmrt:4.4BSD:*|romp-ibm:BSD:*) echo romp-ibm-bsd4.4 exit ;; ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to exit ;; # report: romp-ibm BSD 4.3 *:BOSX:*:*) echo rs6000-bull-bosx exit ;; DPX/2?00:B.O.S.:*:*) echo m68k-bull-sysv3 exit ;; 9000/[34]??:4.3bsd:1.*:*) echo m68k-hp-bsd exit ;; hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) echo m68k-hp-bsd4.4 exit ;; 9000/[34678]??:HP-UX:*:*) HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` case "${UNAME_MACHINE}" in 9000/31? ) HP_ARCH=m68000 ;; 9000/[34]?? 
) HP_ARCH=m68k ;; 9000/[678][0-9][0-9]) if [ -x /usr/bin/getconf ]; then sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` case "${sc_cpu_version}" in 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 532) # CPU_PA_RISC2_0 case "${sc_kernel_bits}" in 32) HP_ARCH="hppa2.0n" ;; 64) HP_ARCH="hppa2.0w" ;; '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 esac ;; esac fi if [ "${HP_ARCH}" = "" ]; then eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #define _HPUX_SOURCE #include #include int main () { #if defined(_SC_KERNEL_BITS) long bits = sysconf(_SC_KERNEL_BITS); #endif long cpu = sysconf (_SC_CPU_VERSION); switch (cpu) { case CPU_PA_RISC1_0: puts ("hppa1.0"); break; case CPU_PA_RISC1_1: puts ("hppa1.1"); break; case CPU_PA_RISC2_0: #if defined(_SC_KERNEL_BITS) switch (bits) { case 64: puts ("hppa2.0w"); break; case 32: puts ("hppa2.0n"); break; default: puts ("hppa2.0"); break; } break; #else /* !defined(_SC_KERNEL_BITS) */ puts ("hppa2.0"); break; #endif default: puts ("hppa1.0"); break; } exit (0); } EOF (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` test -z "$HP_ARCH" && HP_ARCH=hppa fi ;; esac if [ ${HP_ARCH} = "hppa2.0w" ] then eval $set_cc_for_build # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler # generating 64-bit code. 
GNU and HP use different nomenclature: # # $ CC_FOR_BUILD=cc ./config.guess # => hppa2.0w-hp-hpux11.23 # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess # => hppa64-hp-hpux11.23 if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | grep -q __LP64__ then HP_ARCH="hppa2.0w" else HP_ARCH="hppa64" fi fi echo ${HP_ARCH}-hp-hpux${HPUX_REV} exit ;; ia64:HP-UX:*:*) HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` echo ia64-hp-hpux${HPUX_REV} exit ;; 3050*:HI-UX:*:*) eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #include int main () { long cpu = sysconf (_SC_CPU_VERSION); /* The order matters, because CPU_IS_HP_MC68K erroneously returns true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct results, however. */ if (CPU_IS_PA_RISC (cpu)) { switch (cpu) { case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; default: puts ("hppa-hitachi-hiuxwe2"); break; } } else if (CPU_IS_HP_MC68K (cpu)) puts ("m68k-hitachi-hiuxwe2"); else puts ("unknown-hitachi-hiuxwe2"); exit (0); } EOF $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` && { echo "$SYSTEM_NAME"; exit; } echo unknown-hitachi-hiuxwe2 exit ;; 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) echo hppa1.1-hp-bsd exit ;; 9000/8??:4.3bsd:*:*) echo hppa1.0-hp-bsd exit ;; *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) echo hppa1.0-hp-mpeix exit ;; hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) echo hppa1.1-hp-osf exit ;; hp8??:OSF1:*:*) echo hppa1.0-hp-osf exit ;; i*86:OSF1:*:*) if [ -x /usr/sbin/sysversion ] ; then echo ${UNAME_MACHINE}-${VENDOR}-osf1mk else echo ${UNAME_MACHINE}-${VENDOR}-osf1 fi exit ;; parisc*:Lites*:*:*) echo hppa1.1-hp-lites exit ;; C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) echo c1-convex-bsd exit ;; C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) if getsysinfo -f scalar_acc then echo c32-convex-bsd else echo c2-convex-bsd fi exit ;; C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) 
echo c34-convex-bsd exit ;; C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) echo c38-convex-bsd exit ;; C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) echo c4-convex-bsd exit ;; CRAY*Y-MP:*:*:*) echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*[A-Z]90:*:*:*) echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ -e 's/\.[^.]*$/.X/' exit ;; CRAY*TS:*:*:*) echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*T3E:*:*:*) echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*SV1:*:*:*) echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; *:UNICOS/mp:*:*) echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" exit ;; 5000:UNIX_System_V:4.*:*) FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" exit ;; i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} exit ;; sparc*:BSD/OS:*:*) echo sparc-${VENDOR}-bsdi${UNAME_RELEASE} exit ;; *:BSD/OS:*:*) echo ${UNAME_MACHINE}-${VENDOR}-bsdi${UNAME_RELEASE} exit ;; *:FreeBSD:*:*) UNAME_PROCESSOR=`/usr/bin/uname -p` case ${UNAME_PROCESSOR} in amd64) echo x86_64-${VENDOR}-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; *) echo ${UNAME_PROCESSOR}-${VENDOR}-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; 
esac exit ;; i*:CYGWIN*:*) echo ${UNAME_MACHINE}-pc-cygwin exit ;; *:MINGW64*:*) echo ${UNAME_MACHINE}-pc-mingw64 exit ;; *:MINGW*:*) echo ${UNAME_MACHINE}-pc-mingw32 exit ;; *:MSYS*:*) echo ${UNAME_MACHINE}-pc-msys exit ;; i*:windows32*:*) # uname -m includes "-pc" on this system. echo ${UNAME_MACHINE}-mingw32 exit ;; i*:PW*:*) echo ${UNAME_MACHINE}-pc-pw32 exit ;; *:Interix*:*) case ${UNAME_MACHINE} in x86) echo i586-pc-interix${UNAME_RELEASE} exit ;; authenticamd | genuineintel | EM64T) echo x86_64-${VENDOR}-interix${UNAME_RELEASE} exit ;; IA64) echo ia64-${VENDOR}-interix${UNAME_RELEASE} exit ;; esac ;; [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) echo i${UNAME_MACHINE}-pc-mks exit ;; 8664:Windows_NT:*) echo x86_64-pc-mks exit ;; i*:Windows_NT*:* | Pentium*:Windows_NT*:*) # How do we know it's Interix rather than the generic POSIX subsystem? # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we # UNAME_MACHINE based on the output of uname instead of i386? 
echo i586-pc-interix exit ;; i*:UWIN*:*) echo ${UNAME_MACHINE}-pc-uwin exit ;; amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) echo x86_64-${VENDOR}-cygwin exit ;; p*:CYGWIN*:*) echo powerpcle-${VENDOR}-cygwin exit ;; prep*:SunOS:5.*:*) echo powerpcle-${VENDOR}-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; *:GNU:*:*) # the GNU system echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-${VENDOR}-${LIBC}`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` exit ;; *:GNU/*:*:*) # other systems with GNU libc and userland echo ${UNAME_MACHINE}-${VENDOR}-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-${LIBC} exit ;; i*86:Minix:*:*) echo ${UNAME_MACHINE}-pc-minix exit ;; aarch64:Linux:*:*) echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} exit ;; aarch64_be:Linux:*:*) UNAME_MACHINE=aarch64_be echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} exit ;; alpha:Linux:*:*) case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in EV5) UNAME_MACHINE=alphaev5 ;; EV56) UNAME_MACHINE=alphaev56 ;; PCA56) UNAME_MACHINE=alphapca56 ;; PCA57) UNAME_MACHINE=alphapca56 ;; EV6) UNAME_MACHINE=alphaev6 ;; EV67) UNAME_MACHINE=alphaev67 ;; EV68*) UNAME_MACHINE=alphaev68 ;; esac objdump --private-headers /bin/sh | grep -q ld.so.1 if test "$?" 
= 0 ; then LIBC="gnulibc1" ; fi echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} exit ;; arc:Linux:*:* | arceb:Linux:*:*) echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} exit ;; arm*:Linux:*:*) eval $set_cc_for_build if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_EABI__ then echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} else if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_PCS_VFP then echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC}eabi else echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC}eabihf fi fi exit ;; avr32*:Linux:*:*) echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} exit ;; cris:Linux:*:*) echo ${UNAME_MACHINE}-axis-linux-${LIBC} exit ;; crisv32:Linux:*:*) echo ${UNAME_MACHINE}-axis-linux-${LIBC} exit ;; frv:Linux:*:*) echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} exit ;; hexagon:Linux:*:*) echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} exit ;; i*86:Linux:*:*) echo ${UNAME_MACHINE}-pc-linux-${LIBC} exit ;; ia64:Linux:*:*) echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} exit ;; m32r*:Linux:*:*) echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} exit ;; m68*:Linux:*:*) echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} exit ;; mips:Linux:*:* | mips64:Linux:*:*) eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #undef CPU #undef ${UNAME_MACHINE} #undef ${UNAME_MACHINE}el #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) CPU=${UNAME_MACHINE}el #else #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) CPU=${UNAME_MACHINE} #else CPU= #endif #endif EOF eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'` test x"${CPU}" != x && { echo "${CPU}-${VENDOR}-linux-${LIBC}"; exit; } ;; openrisc*:Linux:*:*) echo or1k-${VENDOR}-linux-${LIBC} exit ;; or32:Linux:*:* | or1k*:Linux:*:*) echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} exit ;; padre:Linux:*:*) echo sparc-${VENDOR}-linux-${LIBC} exit ;; parisc64:Linux:*:* | hppa64:Linux:*:*) echo hppa64-${VENDOR}-linux-${LIBC} exit ;; 
parisc:Linux:*:* | hppa:Linux:*:*) # Look for CPU level case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in PA7*) echo hppa1.1-${VENDOR}-linux-${LIBC} ;; PA8*) echo hppa2.0-${VENDOR}-linux-${LIBC} ;; *) echo hppa-${VENDOR}-linux-${LIBC} ;; esac exit ;; ppc64:Linux:*:*) echo powerpc64-${VENDOR}-linux-${LIBC} exit ;; ppc:Linux:*:*) echo powerpc-${VENDOR}-linux-${LIBC} exit ;; ppc64le:Linux:*:*) echo powerpc64le-${VENDOR}-linux-${LIBC} exit ;; ppcle:Linux:*:*) echo powerpcle-${VENDOR}-linux-${LIBC} exit ;; s390:Linux:*:* | s390x:Linux:*:*) echo ${UNAME_MACHINE}-ibm-linux-${LIBC} exit ;; sh64*:Linux:*:*) echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} exit ;; sh*:Linux:*:*) echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} exit ;; sparc:Linux:*:* | sparc64:Linux:*:*) echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} exit ;; tile*:Linux:*:*) echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} exit ;; vax:Linux:*:*) echo ${UNAME_MACHINE}-dec-linux-${LIBC} exit ;; x86_64:Linux:*:*) echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} exit ;; xtensa*:Linux:*:*) echo ${UNAME_MACHINE}-${VENDOR}-linux-${LIBC} exit ;; i*86:DYNIX/ptx:4*:*) # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. # earlier versions are messed up and put the nodename in both # sysname and nodename. echo i386-sequent-sysv4 exit ;; i*86:UNIX_SV:4.2MP:2.*) # Unixware is an offshoot of SVR4, but it has its own version # number series starting with 2... # I am not positive that other SVR4 systems won't match this, # I just have to hope. -- rms. # Use sysv4.2uw... so that sysv4* matches it. echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} exit ;; i*86:OS/2:*:*) # If we were able to find `uname', then EMX Unix compatibility # is probably installed. 
# (continues the i*86:OS/2 arm opened on the previous line)
echo ${UNAME_MACHINE}-pc-os2-emx
    exit ;;
i*86:XTS-300:*:STOP)
    echo ${UNAME_MACHINE}-${VENDOR}-stop
    exit ;;
i*86:atheos:*:*)
    echo ${UNAME_MACHINE}-${VENDOR}-atheos
    exit ;;
i*86:syllable:*:*)
    echo ${UNAME_MACHINE}-pc-syllable
    exit ;;
i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*)
    # Vendor field comes from ${VENDOR}, as in every other arm; the former
    # misspelling expanded to nothing and produced an "i386--lynxos" triplet.
    echo i386-${VENDOR}-lynxos${UNAME_RELEASE}
    exit ;;
i*86:*DOS:*:*)
    echo ${UNAME_MACHINE}-pc-msdosdjgpp
    exit ;;
i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*)
    # Strip a trailing "/MP" from the release before using it in the triplet.
    UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
    if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
        echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL}
    else
        echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL}
    fi
    exit ;;
i*86:*:5:[678]*)
    # UnixWare 7.x, OpenUNIX and OpenServer 6.
    case `/bin/uname -X | grep "^Machine"` in
        *486*)           UNAME_MACHINE=i486 ;;
        *Pentium)        UNAME_MACHINE=i586 ;;
        *Pent*|*Celeron) UNAME_MACHINE=i686 ;;
    esac
    echo ${UNAME_MACHINE}-${VENDOR}-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
    exit ;;
i*86:*:3.2:*)
    # NOTE(review): the input redirection, the "-pc-isc" echo and the start
    # of the elif were missing from this copy (text between '<' and '>' had
    # been stripped); restored to match upstream GNU config.guess -- confirm
    # against the pristine file.
    if test -f /usr/options/cb.name; then
        UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
        echo ${UNAME_MACHINE}-pc-isc$UNAME_REL
    elif /bin/uname -X 2>/dev/null >/dev/null ; then
        UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')`
        # Later matches override earlier ones, so the most specific CPU wins.
        (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
        (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \
            && UNAME_MACHINE=i586
        (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \
            && UNAME_MACHINE=i686
        (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
            && UNAME_MACHINE=i686
        echo ${UNAME_MACHINE}-pc-sco$UNAME_REL
    else
        echo ${UNAME_MACHINE}-pc-sysv32
    fi
    exit ;;
pc:*:*:*)
    # Left here for compatibility:
    # uname -m prints for DJGPP always 'pc', but it prints nothing about
    # the processor, so we play safe by assuming i586.
    # Note: whatever this is, it MUST be the same as what config.sub
    # prints for the "djgpp" host, or else GDB configury will decide that
    # this is a cross-build.
# (continues the pc:*:*:* DJGPP arm opened on the previous line)
echo i586-pc-msdosdjgpp
    exit ;;
Intel:Mach:3*:*)
    echo i386-pc-mach3
    exit ;;
paragon:*:*:*)
    echo i860-intel-osf1
    exit ;;
i860:*:4.*:*) # i860-SVR4
    if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
        echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
    else # Add other i860-SVR4 vendors below as they are discovered.
        # Vendor field comes from ${VENDOR}, as in every other arm; the
        # former misspelling expanded to nothing ("i860--sysv...").
        echo i860-${VENDOR}-sysv${UNAME_RELEASE}  # Unknown i860-SVR4
    fi
    exit ;;
mini*:CTIX:SYS*5:*)
    # "miniframe"
    echo m68010-convergent-sysv
    exit ;;
mc68k:UNIX:SYSTEM5:3.51m)
    echo m68k-convergent-sysv
    exit ;;
M680?0:D-NIX:5.3:*)
    echo m68k-diab-dnix
    exit ;;
M68*:*:R3V[5678]*:*)
    # Falls through to the next matching logic when /sysV68 is not readable.
    test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;;
3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0)
    # OS_REL is ".NN" taken from the third field of /etc/.relid, if readable.
    OS_REL=''
    test -r /etc/.relid \
        && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
    /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
        && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
    /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
        && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
    /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
        && { echo i486-ncr-sysv4; exit; } ;;
NCR*:*:4.2:* | MPRAS*:*:4.2:*)
    # Same probing as above, but the release suffix defaults to ".3".
    OS_REL='.3'
    test -r /etc/.relid \
        && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
    /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
        && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
    /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
        && { echo i586-ncr-sysv4.3${OS_REL}; exit; }
    /bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \
        && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
    echo m68k-${VENDOR}-lynxos${UNAME_RELEASE}
    exit ;;
mc68030:UNIX_System_V:4.*:*)
    echo m68k-atari-sysv4
    exit ;;
TSUNAMI:LynxOS:2.*:*)
    echo sparc-${VENDOR}-lynxos${UNAME_RELEASE}
    exit ;;
rs6000:LynxOS:2.*:*)
    # The argument of this echo starts the next physical line; the
    # backslash keeps the two joined.
    echo \
rs6000-${VENDOR}-lynxos${UNAME_RELEASE} exit ;; PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*) echo powerpc-${VENDOR}-lynxos${UNAME_RELEASE} exit ;; SM[BE]S:UNIX_SV:*:*) echo mips-dde-sysv${UNAME_RELEASE} exit ;; RM*:ReliantUNIX-*:*:*) echo mips-sni-sysv4 exit ;; RM*:SINIX-*:*:*) echo mips-sni-sysv4 exit ;; *:SINIX-*:*:*) if uname -p 2>/dev/null >/dev/null ; then UNAME_MACHINE=`(uname -p) 2>/dev/null` echo ${UNAME_MACHINE}-sni-sysv4 else echo ns32k-sni-sysv fi exit ;; PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort # says echo i586-unisys-sysv4 exit ;; *:UNIX_System_V:4*:FTX*) # From Gerald Hewes . # How about differentiating between stratus architectures? -djm echo hppa1.1-stratus-sysv4 exit ;; *:*:*:FTX*) # From seanf@swdc.stratus.com. echo i860-stratus-sysv4 exit ;; i*86:VOS:*:*) # From Paul.Green@stratus.com. echo ${UNAME_MACHINE}-stratus-vos exit ;; *:VOS:*:*) # From Paul.Green@stratus.com. echo hppa1.1-stratus-vos exit ;; mc68*:A/UX:*:*) echo m68k-apple-aux${UNAME_RELEASE} exit ;; news*:NEWS-OS:6*:*) echo mips-sony-newsos6 exit ;; R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) if [ -d /usr/nec ]; then echo mips-nec-sysv${UNAME_RELEASE} else echo mips-${VENDOR}-sysv${UNAME_RELEASE} fi exit ;; BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. echo powerpc-be-beos exit ;; BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. echo powerpc-apple-beos exit ;; BePC:BeOS:*:*) # BeOS running on Intel PC compatible. echo i586-pc-beos exit ;; BePC:Haiku:*:*) # Haiku running on Intel PC compatible. 
echo i586-pc-haiku exit ;; x86_64:Haiku:*:*) echo x86_64-${VENDOR}-haiku exit ;; SX-4:SUPER-UX:*:*) echo sx4-nec-superux${UNAME_RELEASE} exit ;; SX-5:SUPER-UX:*:*) echo sx5-nec-superux${UNAME_RELEASE} exit ;; SX-6:SUPER-UX:*:*) echo sx6-nec-superux${UNAME_RELEASE} exit ;; SX-7:SUPER-UX:*:*) echo sx7-nec-superux${UNAME_RELEASE} exit ;; SX-8:SUPER-UX:*:*) echo sx8-nec-superux${UNAME_RELEASE} exit ;; SX-8R:SUPER-UX:*:*) echo sx8r-nec-superux${UNAME_RELEASE} exit ;; Power*:Rhapsody:*:*) echo powerpc-apple-rhapsody${UNAME_RELEASE} exit ;; *:Rhapsody:*:*) echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} exit ;; *:Darwin:*:*) UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown eval $set_cc_for_build if test "$UNAME_PROCESSOR" = unknown ; then UNAME_PROCESSOR=powerpc fi if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ grep IS_64BIT_ARCH >/dev/null then case $UNAME_PROCESSOR in i386) UNAME_PROCESSOR=x86_64 ;; powerpc) UNAME_PROCESSOR=powerpc64 ;; esac fi fi elif test "$UNAME_PROCESSOR" = i386 ; then # Avoid executing cc on OS X 10.9, as it ships with a stub # that puts up a graphical alert prompting to install # developer tools. Any system running Mac OS X 10.7 or # later (Darwin 11 and later) is required to have a 64-bit # processor. This is not true of the ARM version of Darwin # that Apple uses in portable devices. 
UNAME_PROCESSOR=x86_64 fi echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} exit ;; *:procnto*:*:* | *:QNX:[0123456789]*:*) UNAME_PROCESSOR=`uname -p` if test "$UNAME_PROCESSOR" = "x86"; then UNAME_PROCESSOR=i386 UNAME_MACHINE=pc fi echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} exit ;; *:QNX:*:4*) echo i386-pc-qnx exit ;; NEO-?:NONSTOP_KERNEL:*:*) echo neo-tandem-nsk${UNAME_RELEASE} exit ;; NSE-*:NONSTOP_KERNEL:*:*) echo nse-tandem-nsk${UNAME_RELEASE} exit ;; NSR-?:NONSTOP_KERNEL:*:*) echo nsr-tandem-nsk${UNAME_RELEASE} exit ;; *:NonStop-UX:*:*) echo mips-compaq-nonstopux exit ;; BS2000:POSIX*:*:*) echo bs2000-siemens-sysv exit ;; DS/*:UNIX_System_V:*:*) echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} exit ;; *:Plan9:*:*) # "uname -m" is not consistent, so use $cputype instead. 386 # is converted to i386 for consistency with other x86 # operating systems. if test "$cputype" = "386"; then UNAME_MACHINE=i386 else UNAME_MACHINE="$cputype" fi echo ${UNAME_MACHINE}-${VENDOR}-plan9 exit ;; *:TOPS-10:*:*) echo pdp10-${VENDOR}-tops10 exit ;; *:TENEX:*:*) echo pdp10-${VENDOR}-tenex exit ;; KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) echo pdp10-dec-tops20 exit ;; XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) echo pdp10-xkl-tops20 exit ;; *:TOPS-20:*:*) echo pdp10-${VENDOR}-tops20 exit ;; *:ITS:*:*) echo pdp10-${VENDOR}-its exit ;; SEI:*:*:SEIUX) echo mips-sei-seiux${UNAME_RELEASE} exit ;; *:DragonFly:*:*) echo ${UNAME_MACHINE}-${VENDOR}-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` exit ;; *:*VMS:*:*) UNAME_MACHINE=`(uname -p) 2>/dev/null` case "${UNAME_MACHINE}" in A*) echo alpha-dec-vms ; exit ;; I*) echo ia64-dec-vms ; exit ;; V*) echo vax-dec-vms ; exit ;; esac ;; *:XENIX:*:SysV) echo i386-pc-xenix exit ;; i*86:skyos:*:*) echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//' exit ;; i*86:rdos:*:*) echo ${UNAME_MACHINE}-pc-rdos exit ;; i*86:AROS:*:*) echo ${UNAME_MACHINE}-pc-aros exit ;; x86_64:VMkernel:*:*) 
echo ${UNAME_MACHINE}-${VENDOR}-esx exit ;; esac cat >&2 < in order to provide the needed information to handle your system. config.guess timestamp = $timestamp uname -m = `(uname -m) 2>/dev/null || echo unknown` uname -r = `(uname -r) 2>/dev/null || echo unknown` uname -s = `(uname -s) 2>/dev/null || echo unknown` uname -v = `(uname -v) 2>/dev/null || echo unknown` /usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` /bin/uname -X = `(/bin/uname -X) 2>/dev/null` hostinfo = `(hostinfo) 2>/dev/null` /bin/universe = `(/bin/universe) 2>/dev/null` /usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` /bin/arch = `(/bin/arch) 2>/dev/null` /usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` /usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` UNAME_MACHINE = ${UNAME_MACHINE} UNAME_RELEASE = ${UNAME_RELEASE} UNAME_SYSTEM = ${UNAME_SYSTEM} UNAME_VERSION = ${UNAME_VERSION} EOF exit 1 # Local variables: # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "timestamp='" # time-stamp-format: "%:y-%02m-%02d" # time-stamp-end: "'" # End: elpa-2016.05.001/CONTRIBUTING.md0000644000312500001440000000114512717402663012264 00000000000000How to contribute to the ELPA library: We are very happy and grateful if you are willing to help us improve ELPA. Thus, we would like to make this process as simple as possible for you, but at the same time still keep it manageable for us. For recommendations and suggestions, a simple email to us is sufficient! If you would like to share with us your improvements, we suggest the following ways: 1. If you use a publicly accessible git repository, please send us a merge request. This is the preferred way. 2. An email with a patch will also be OK. Thank you for supporting ELPA!
The ELPA development team elpa-2016.05.001/elpa.spec0000644000312500001440000002115512717402663011633 00000000000000# # spec file for package elpa # # Copyright (c) 2015 Lorenz Hüdepohl # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed # upon. The license for this file, and modifications and additions to the # file, is the same license as for the pristine package itself (unless the # license for the pristine package is not an Open Source License, in which # case the license is the MIT License). An "Open Source License" is a # license that conforms to the Open Source Definition (Version 1.9) # published by the Open Source Initiative. %define so_version 4 # OpenMP support requires an MPI implementation with MPI_THREAD_MULTIPLE support, # which is only available for a sufficiently configured openmpi >= 1.8 # Set to 0 to disable %define with_openmp 1 Name: elpa Version: 2016.05.001 Release: 2 Summary: A massively parallel eigenvector solver License: LGPL-3.0 Group: System/Libraries Url: https://elpa.rzg.mpg.de/ Source0: https://elpa.mpcdf.mpg.de/html/Releases/%{version}/%{name}-%{version}.tar.gz Requires: openmpi BuildRequires: gcc-c++ BuildRequires: gcc-fortran BuildRequires: openmpi-devel BuildRequires: blas BuildRequires: blas-devel BuildRequires: lapack BuildRequires: lapack-devel BuildRequires: pkg-config %if %{defined fedora} BuildRequires: scalapack-openmpi BuildRequires: scalapack-openmpi-devel BuildRequires: blacs-openmpi BuildRequires: blacs-openmpi-devel BuildRequires: environment-modules %endif %if %{defined suse_version} BuildRequires: libscalapack2-openmpi-devel %endif # For make check, mpirun of openmpi needs an installed openssh BuildRequires: openssh %description A new efficient distributed parallel direct eigenvalue solver for symmetric matrices. It contains both an improved one-step ScaLAPACK type solver (ELPA1) and the two-step solver ELPA2. 
ELPA uses the same matrix layout as ScaLAPACK. The actual parallel linear algebra routines are completely rewritten. ELPA1 implements the same linear algebra as traditional solutions (reduction to tridiagonal form by Householder transforms, divide & conquer solution, eigenvector backtransform). In ELPA2, the reduction to tridiagonal form and the corresponding backtransform are replaced by a two-step version, giving an additional significant performance improvement. ELPA has demonstrated good scalability for large matrices on up to 294.000 cores of a BlueGene/P system. %package -n lib%{name}%{so_version} Summary: A massively parallel eigenvector solver Group: System/Libraries Provides: %{name} = %{version} Requires: %{name}-tools >= %{version} %description -n lib%{name}%{so_version} A new efficient distributed parallel direct eigenvalue solver for symmetric matrices. It contains both an improved one-step ScaLAPACK type solver (ELPA1) and the two-step solver ELPA2. ELPA uses the same matrix layout as ScaLAPACK. The actual parallel linear algebra routines are completely rewritten. ELPA1 implements the same linear algebra as traditional solutions (reduction to tridiagonal form by Householder transforms, divide & conquer solution, eigenvector backtransform). In ELPA2, the reduction to tridiagonal form and the corresponding backtransform are replaced by a two-step version, giving an additional significant performance improvement. ELPA has demonstrated good scalability for large matrices on up to 294.000 cores of a BlueGene/P system. %package tools Summary: Utility program for %{name} Group: Development/Libraries Requires: %{name} = %{version} %description tools A small tool program for %{name}, elpa2_print_kernels, which prints the available and currently selected numerical kernel for ELPA2. 
%package devel Summary: Development files for %{name} Group: Development/Libraries Requires: %{name} = %{version} Requires: openmpi Requires: libstdc++-devel Requires: lapack-devel Requires: blas-devel Requires: libscalapack2-openmpi-devel %description devel The %{name}-devel package contains libraries and header files for developing applications that use %{name}. %package devel-static Summary: Development files for %{name} - static libraries Group: Development/Libraries Requires: %{name}-devel %description devel-static This package provides the static libraries for developing applications that use %{name}. %if %{defined with_openmp} %package -n lib%{name}_openmp%{so_version} Requires: openmpi >= 1.8 BuildRequires: openmpi-devel >= 1.8 Summary: A massively parallel eigenvector solver Group: System/Libraries Provides: %{name}_openmp = %{version} Requires: %{name}_openmp-tools >= %{version} %description -n lib%{name}_openmp%{so_version} OpenMP parallelized version of %{name}, use with an Open MPI implementation that was configured and tested with MPI_THREAD_MULTIPLE support. %package -n %{name}_openmp-tools Summary: Utility program for %{name}_openmp Group: Development/Libraries Provides: %{name}_openmp = %{version} %description -n %{name}_openmp-tools A small tool program for %{name}_openmp, elpa2_print_kernels_openmp, which prints the available and currently selected numerical kernel for ELPA2. %package -n %{name}_openmp-devel Summary: Development files for %{name}_openmp Group: Development/Libraries Requires: %{name}_openmp = %{version} Requires: openmpi Requires: libstdc++-devel Requires: lapack-devel Requires: blas-devel Requires: libscalapack2-openmpi-devel %description -n %{name}_openmp-devel The %{name}_openmp-devel package contains libraries and header files for developing applications that use %{name}_openmp. 
%package -n %{name}_openmp-devel-static Summary: Development files for %{name} - static libraries Group: Development/Libraries Requires: %{name}-devel %description -n %{name}_openmp-devel-static This package provides the static libraries for developing applications that use %{name}_openmp. %endif %prep %setup %build %if %{defined fedora} module load mpi/openmpi-%{_arch} %endif mkdir build pushd build %define _configure ../configure %configure --docdir=%{_docdir}/%{name}-%{version} make %{?_smp_mflags} V=1 popd %if %{defined with_openmp} mkdir build_openmp pushd build_openmp %configure --docdir=%{_docdir}/%{name}_openmp-%{version} --enable-openmp make %{?_smp_mflags} V=1 popd %endif %check %if %{defined fedora} module load mpi/openmpi-%{_arch} %endif pushd build make check TEST_FLAGS="1500 50 16" || { echo "Tests failed: Content of ./test-suite.log:"; cat ./test-suite.log; echo; exit 1; } popd %if %{defined with_openmp} pushd build_openmp make check TEST_FLAGS="1500 50 16" || { echo "Tests failed: Content of ./test-suite.log:"; cat ./test-suite.log; echo; exit 1; } popd %endif %install %if %{defined with_openmp} pushd build_openmp make V=1 install DESTDIR=%{buildroot} popd %endif pushd build make V=1 install DESTDIR=%{buildroot} popd %post -n lib%{name}%{so_version} -p /sbin/ldconfig %postun -n lib%{name}%{so_version} -p /sbin/ldconfig %if %{defined with_openmp} %post -n lib%{name}_openmp%{so_version} -p /sbin/ldconfig %postun -n lib%{name}_openmp%{so_version} -p /sbin/ldconfig %endif %files -n lib%{name}%{so_version} # See http://en.opensuse.org/openSUSE:Shared_library_packaging_policy # to explain this package's name %defattr(0755,root,root) %{_libdir}/lib%{name}.so.* %doc %defattr(0644,root,root) %{_docdir}/%{name}-%{version}/* %dir %{_docdir}/%{name}-%{version} %files tools %attr(0755,root,root) %{_bindir}/elpa2_print_kernels %attr(0644,root,root) %_mandir/man1/elpa2_print_kernels.1.gz %files devel %defattr(0644,root,root) 
%{_libdir}/pkgconfig/%{name}-%{version}.pc
%{_includedir}/%{name}-%{version}
%{_libdir}/lib%{name}.so
%{_libdir}/lib%{name}.la
%_mandir/man3/*

%files devel-static
%defattr(0644,root,root)
%{_libdir}/lib%{name}.a

%if %{defined with_openmp}
%files -n lib%{name}_openmp%{so_version}
%defattr(0755,root,root)
%{_libdir}/lib%{name}_openmp.so.*
%doc
%defattr(0644,root,root)
%{_docdir}/%{name}_openmp-%{version}/*
%dir %{_docdir}/%{name}_openmp-%{version}

%files -n %{name}_openmp-tools
%defattr(0755,root,root)
%{_bindir}/elpa2_print_kernels_openmp

%files -n %{name}_openmp-devel
%defattr(0644,root,root)
%{_libdir}/pkgconfig/%{name}_openmp-%{version}.pc
%{_includedir}/%{name}_openmp-%{version}
%{_libdir}/lib%{name}_openmp.so
%{_libdir}/lib%{name}_openmp.la

%files -n %{name}_openmp-devel-static
%defattr(0644,root,root)
%{_libdir}/lib%{name}_openmp.a
%endif

%changelog
elpa-2016.05.001/fdep/0000755000312500001440000000000012717541041011022 500000000000000elpa-2016.05.001/fdep/fortran_dependencies.m40000644000312500001440000000135112717402663015373 00000000000000
dnl Copyright 2015 Lorenz Hüdepohl
dnl
dnl This file is part of fdep and licensed under the MIT license
dnl see the file LICENSE for more information
dnl
dnl FDEP_F90_GNU_MAKE_DEPS
dnl
dnl Checks that a GNU make is available, trying the MAKE variable first and
dnl then the names make, gmake and gnumake, and aborts configure otherwise.
dnl On success it defines the substitution FORTRAN_MODULE_DEPS, a two-line
dnl makefile snippet that seeds CLEANFILES and includes
dnl fdep/fortran_dependencies.mk from the source tree.
AC_DEFUN([FDEP_F90_GNU_MAKE_DEPS],[
AC_MSG_CHECKING([for GNU make])
dnl Reset the result so that a value inherited from the environment cannot
dnl be mistaken for a successful detection.
_fdep_gnu_make_command=
for a in "$MAKE" make gmake gnumake ; do
  if test -z "$a" ; then continue ; fi ;
dnl Redirect stdout first and only then duplicate stderr onto it, so that
dnl both streams of grep are silenced.
  if ( sh -c "$a --version" 2> /dev/null | grep GNU > /dev/null 2>&1 ) ; then
    _fdep_gnu_make_command=$a ;
    break;
  fi
done ;
AC_MSG_RESULT([$_fdep_gnu_make_command])
dnl Quoted, because a MAKE value such as "make -j4" contains blanks.
if test "x$_fdep_gnu_make_command" = x ; then
  AC_MSG_ERROR([Need GNU Make])
fi
dnl The value is makefile text and its line breaks are significant: the first
dnl line is an empty append to CLEANFILES, the second an include directive.
AC_SUBST([FORTRAN_MODULE_DEPS], ["
CLEANFILES +=
include ${srcdir}/fdep/fortran_dependencies.mk
"])
dnl Keep automake from turning the multi-line value into a make variable.
AM_SUBST_NOTMAKE([FORTRAN_MODULE_DEPS])
])
elpa-2016.05.001/fdep/fortran_dependencies.pl0000755000312500001440000000365312717402663015500 00000000000000
#!/usr/bin/perl -w
#
# Copyright 2015 Lorenz Hüdepohl
#
# This file is part of fdep and licensed under the MIT license
# see the file LICENSE
# for more information
#
# Usage: fortran_dependencies.pl TARGET FILE...
#
# Every FILE is a module-information file whose name contains either
# ".def_mods_<x>" (one "module NAME" statement per line) or ".use_mods_<x>"
# (one "use NAME" statement per line).  Removing that name component yields
# the object file the statements belong to.  For every object that uses a
# module defined by another object of the same run, one make dependency line
# "USER: DEFINER" is printed to standard output.

use strict;

# Module name (lower case) -> object file that defines it.
my %defs = ();
# Object file -> { module name => 1 } for every module that object uses.
my %uses = ();
# Input files already processed; repeated arguments are read only once.
my %files = ();

# Each input line has to consist of exactly one statement.
my $use_re = qr/^\s*use\s+(\S+)\s*$/;
my $def_re = qr/^\s*module\s+(\S+)\s*$/;

# Record that object $file uses $module.
sub add_use {
	my ($file, $module) = @_;
	if (defined($defs{$module}) && $defs{$module} eq $file) {
		# do not add self-dependencies
		return;
	}
	if (!defined($uses{$file})) {
		$uses{$file} = { $module => 1 };
	} else {
		$uses{$file}{$module} = 1;
	}
}

# Record that object $file defines $module; dies on a second definition.
sub add_def {
	my ($file, $module) = @_;
	if (!defined($defs{$module})) {
		$defs{$module} = $file;
		# A use recorded before the definition was seen may have been a
		# self-dependency; drop it now that the defining object is known.
		if (defined($uses{$file}) && defined($uses{$file}{$module})) {
			delete $uses{$file}{$module};
		}
	} else {
		die "Module $module both defined in $file, $defs{$module}";
	}
}

# First argument: name of the make target, only used in diagnostics.
my $target = shift;

foreach my $file (@ARGV) {
	if (exists $files{$file}) {
		next;
	} else {
		$files{$file} = 1;
	}
	my $re;
	my $add;
	my $object;
	if (defined($ENV{V}) && $ENV{V} ge "2") {
		print STDERR "fdep: Considering file $file for target $target\n";
	}
	# The file name decides which kind of statement is expected inside.
	if ($file =~ /^(.*)\.def_mods_[^.]*(\..*)$/) {
		$re = $def_re;
		$add = \&add_def;
		$object = $1 . $2;
	} elsif ($file =~ /^(.*)\.use_mods_[^.]*(\..*)$/) {
		$re = $use_re;
		$add = \&add_use;
		$object = $1 . $2;
	} else {
		die "Unrecognized file extension for '$file'";
	}
	open(my $fh, "<", $file) || die "\nCan't open $file: $!\n\n";
	# Read the file line by line.  An empty "while()" condition is always
	# true in Perl and would never read anything from the file.
	while (<$fh>) {
		chomp;
		$_ = lc($_);
		if ($_ =~ $re) {
			&$add($object, $1);
		} else {
			die "At $file:$.\nCannot parse module statement '$_', was expecting $re";
		}
	}
	close($fh);
}

# Emit the dependency lines; both levels are sorted so that the generated
# dependency file does not change from run to run.
foreach my $object (sort keys %uses) {
	for my $m (sort keys %{$uses{$object}}) {
		if (defined $defs{$m}) {
			print "$object: ", $defs{$m}, "\n";
		} elsif (defined($ENV{V}) && $ENV{V} ge "1") {
			print STDERR "Warning: Cannot find definition of module $m in files for current target $target, might be external\n";
		}
	}
}
elpa-2016.05.001/fdep/fortran_dependencies.mk0000644000312500001440000000722712717402663015472 00000000000000
# Copyright 2015 Lorenz Hüdepohl
#
# This file is part of fdep and licensed under the MIT license
# see the file LICENSE for more information
#

define translate_name
$(subst -,_,$(subst .,_,$1))
endef

_f90_verbose = $(_f90_verbose_$(V))
_f90_verbose_ = $(_f90_verbose_$(AM_DEFAULT_VERBOSITY))
_f90_verbose_0 = @echo " $1";
_f90_targets = $(call translate_name,$(PROGRAMS) $(LTLIBRARIES))

FORTRAN_CPP ?= cpp -P -traditional -Wall -Werror

# $1 source files
#
# returns: file without any .F90 .f90 .F .f extension
define strip_fortran_ext
$(patsubst %.F90,%,$(patsubst %.f90,%,$(patsubst %.F,%,$(patsubst %.f,%,$1))))
endef

# $1 program
#
# returns:
# '1' if object files for target $1 are prefixed due to 'per-target' flags,
# '' (the empty string) otherwise. See the automake manual for 'per-target'
# compilation
#
define is_per_target
$(if $(filter $(call strip_fortran_ext,$(firstword $(call fortran_sources,$1))),$(patsubst %.o,%,$(patsubst %.lo,%,$($1_OBJECTS)))),,1)
endef

# $1 top-level target name (i.e. an entry of _f90_targets)
#
# returns: all target source files matching *.F90 *.f90 *.F *.f
define fortran_sources
$(filter %.F90 %.f90 %.F %.f,$($1_SOURCES))
endef

# $1 top-level target name
#
# returns: the appropriate extension (i.e.
# 'o' for normal programs, '.lo' for libraries)
define object_extension
$(if $(filter $1,$(PROGRAMS)),o,lo)
endef

# $1 source file
# $2 stem
# $3 program
# $4 kind of file ('use' or 'def')
#
# returns: the name of the module-information file for source $1 of target $3
define modinfo_name
$(dir $1)$(2)$(call strip_fortran_ext,$(notdir $1)).$4_mods_$(patsubst .,_,$3).$(call object_extension,$3)
endef

# $1 source_file
# $2 stem
# $3 program
#
# Template, instantiated through $(eval $(call module_targets,...)).  It
# registers the 'use' and 'def' module-information files of source $1 in
# _$3_use_mods / _$3_def_mods and defines the rules that create them by
# running the source through $(FORTRAN_CPP) and collecting its 'use' and
# 'module' statements.
#
# The per-target preprocessor flags are looked up through parameter $3, so
# the template does not depend on the name of the caller's loop variable.
#
# In the 'def' rule the last grep drops 'module procedure' and 'module
# intrinsic' lines, which do not define a module; the pattern uses the GNU
# grep word anchors.  The trailing '|| true' keeps the rule from failing
# when that grep selects no line, i.e. when a source defines no module.
define module_targets
$(eval _$(3)_use_mods += $(call modinfo_name,$1,$2,$3,use))
$(call modinfo_name,$1,$2,$3,use): $1 $(dir $1)$(am__dirstamp)
	$(call _f90_verbose,F90 USE [$3] $$<)$(FORTRAN_CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $($3_CPPFLAGS) $(CPPFLAGS) -o /dev/stdout $$< | \
		grep -i -o '^ *use [^ ,!:]*' | sed 's/^[[:space:]]*//;' | tr '[:upper:]' '[:lower:]' | sort -u > $$@

$(eval _$(3)_def_mods += $(call modinfo_name,$1,$2,$3,def))
$(call modinfo_name,$1,$2,$3,def): $1 $(dir $1)$(am__dirstamp)
	$(call _f90_verbose,F90 MOD [$3] $$<)$(FORTRAN_CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $($3_CPPFLAGS) $(CPPFLAGS) -o /dev/stdout $$< | \
		grep -i -o '^ *module [^!]*' | sed 's/^[[:space:]]*//;' | tr '[:upper:]' '[:lower:]' | grep -v "\<procedure\>\|\<intrinsic\>" > $$@ || true

endef

# Instantiate the template for every Fortran source of every target.  Targets
# built with per-target flags get the "<target>-" stem, all others an empty one.
$(foreach p,$(_f90_targets),$(if $(call is_per_target,$p),$(foreach s,$(call fortran_sources,$p),$(eval $(call module_targets,$s,$p-,$p))),$(foreach s,$(call fortran_sources,$p),$(eval $(call module_targets,$s,,$p)))))

_f90_depdir=$(abs_builddir)/.fortran_dependencies
_f90_depfile = $(_f90_depdir)/dependencies.mk

# $1 target-name
#
# returns: the translated names of everything in $1_LDADD and $1_LIBADD,
# followed recursively by the same list for each of those names
define recursive_lib_deps
$(foreach l,$(call translate_name,$($1_LDADD) $($1_LIBADD)),$l $(call recursive_lib_deps,$l))
endef

# returns: '0' if a goal other than the four clean goals was given on the
# command line, '1' otherwise (this includes an empty goal list)
define is_clean
$(if $(filter-out mostlyclean clean distclean maintainer-clean,$(MAKECMDGOALS)),0,1)
endef

ifneq ($(call is_clean),1)
include $(_f90_depfile)
endif

# The dependency file is rebuilt from scratch: one run of the Perl script per
# target, each given the module-information files of the target itself and of
# all libraries it links against.
$(_f90_depfile): $(top_srcdir)/fdep/fortran_dependencies.pl $(foreach p,$(_f90_targets),$(_$p_use_mods) $(_$p_def_mods)) | $(foreach p,$(_f90_targets),$(_f90_depdir)/$p)
	$(call _f90_verbose,F90 DEPS $@)echo > $@; $(foreach p,$(_f90_targets),$(top_srcdir)/fdep/fortran_dependencies.pl $p $(_$p_use_mods) $(_$p_def_mods) $(foreach l,$(call recursive_lib_deps,$p),$(_$l_use_mods) $(_$l_def_mods)) >> $@; )

# -p: do not fail when the directory already exists.
$(_f90_depdir):
	@mkdir -p $@

$(foreach p,$(_f90_targets),$(_f90_depdir)/$p): | $(_f90_depdir)
	@mkdir -p $@

CLEANFILES += $(foreach p,$(_f90_targets),$(_$p_def_mods) $(_$p_use_mods))
CLEANFILES += $(foreach p,$(_f90_targets),$(_f90_depdir)/$p/*)
CLEANFILES += $(_f90_depfile)
elpa-2016.05.001/elpa.pc.in0000644000312500001440000000105112664056454011705 00000000000000
prefix=@prefix@
exec_prefix=@exec_prefix@
libdir=@libdir@
includedir=@includedir@

Name: @PACKAGE_NAME@@SUFFIX@
Description: ELPA is a Fortran-based high-performance computational library for the (massively) parallel solution of symmetric or Hermitian, standard or generalized eigenvalue problems.
Version: @PACKAGE_VERSION@
URL: @PACKAGE_URL@
Libs: -L${libdir} -lelpa@SUFFIX@ @LIBS@ @OPENMP_FCFLAGS@
Cflags: -I${includedir}/elpa@SUFFIX@-@PACKAGE_VERSION@ @OPENMP_CFLAGS@
fcflags= -I${includedir}/elpa@SUFFIX@-@PACKAGE_VERSION@/modules @OPENMP_FCFLAGS@
elpa-2016.05.001/Doxyfile.in0000644000312500001440000032500012717516040012140 00000000000000
# Doxyfile 1.8.10

# This file describes the settings to be used by the documentation system
# doxygen (www.doxygen.org) for a project.
#
# All text after a double hash (##) is considered a comment and is placed in
# front of the TAG it is preceding.
#
# All text after a single hash (#) is considered a comment and will be ignored.
# The format is:
# TAG = value [value, ...]
# For lists, items can also be appended using:
# TAG += value [value, ...]
# Values that contain spaces should be placed between quotes (\" \").

#---------------------------------------------------------------------------
# Project related configuration options
#---------------------------------------------------------------------------

# This tag specifies the encoding used for all characters in the config file
# that follow.
The default is UTF-8 which is also the encoding used for all text # before the first occurrence of this tag. Doxygen uses libiconv (or the iconv # built into libc) for the transcoding. See http://www.gnu.org/software/libiconv # for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded by # double-quotes, unless you are using Doxywizard) that should identify the # project for which the documentation is generated. This name is used in the # title of most generated pages and in a few other places. # The default value is: My Project. PROJECT_NAME = "Eigenvalue SoLvers for Petaflop-Applications (ELPA)" # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version # control system is used. PROJECT_NUMBER = @PACKAGE_VERSION@ # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = "Eigenvalue SoLvers for Petaflop-Applications (ELPA)" # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used.
OUTPUT_DIRECTORY = @DOXYGEN_OUTPUT_DIR@ # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and # will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise cause # performance problems for the file system. # The default value is: NO. CREATE_SUBDIRS = NO # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode # U+3044. # The default value is: NO. ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, # Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), # Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, # Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), # Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, # Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, # Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, # Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. # The default value is: YES.
BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief # description of a member or function before the detailed description # # Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. # The default value is: YES. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator that is # used to form the text in various listings. Each string in this list, if found # as the leading text of the brief description, will be stripped from the text # and the result, after processing the whole list, is used as the annotated # text. Otherwise, the brief description is used as-is. If left blank, the # following values are used ($name is automatically replaced with the name of # the entity):The $name class, The $name widget, The $name file, is, provides, # specifies, contains, represents, a, an and the. ABBREVIATE_BRIEF = # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # doxygen will generate a detailed section even if there is only a brief # description. # The default value is: NO. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. # The default value is: NO. INLINE_INHERITED_MEMB = NO # If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path # before files name in the file list and in the header files. If set to NO the # shortest path that makes the file name unique will be used # The default value is: YES. FULL_PATH_NAMES = YES # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand # part of the path. 
The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the path to # strip. # # Note that you can specify absolute paths here, but also relative paths, which # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which # header file to include in order to use a class. If left blank only the name of # the header file containing the class definition is used. Otherwise one should # specify the list of include paths that are normally passed to the compiler # using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful if your file system doesn't # support long names like on DOS, Mac, or CD-ROM. # The default value is: NO. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the # first line (until the first dot) of a Javadoc-style comment as the brief # description. If set to NO, the Javadoc-style will behave just like regular Qt- # style comments (thus requiring an explicit @brief command for a brief # description.) # The default value is: NO. JAVADOC_AUTOBRIEF = NO # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus # requiring an explicit \brief command for a brief description.) # The default value is: NO. QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a # multi-line C++ special comment block (i.e. a block of //! or /// comments) as # a brief description.
This used to be the default behavior. The new default is # to treat a multi-line C++ comment block as a detailed description. Set this # tag to YES if you prefer the old behavior instead. # # Note that setting this tag to YES also means that rational rose comments are # not recognized any more. # The default value is: NO. MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new # page for each member. If set to NO, the documentation of a member will be part # of the file/class/namespace that contains it. # The default value is: NO. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen # uses this value to replace tabs by spaces in code fragments. # Minimum value: 1, maximum value: 16, default value: 4. TAB_SIZE = 4 # This tag can be used to specify a number of aliases that act as commands in # the documentation. An alias has the form: # name=value # For example adding # "sideeffect=@par Side Effects:\n" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all # members will be omitted, etc. # The default value is: NO. 
OPTIMIZE_OUTPUT_FOR_C = NO # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or # Python sources only. Doxygen will then generate output that is more tailored # for that language. For instance, namespaces will be presented as packages, # qualified scopes will look different, etc. # The default value is: NO. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources. Doxygen will then generate output that is tailored for Fortran. # The default value is: NO. OPTIMIZE_FOR_FORTRAN = Yes # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for VHDL. # The default value is: NO. OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, Javascript, # C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: # FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: # Fortran. In the later case the parser tries to guess whether the code is fixed # or free formatted code, this is the default for Fortran type files), VHDL. For # instance to make doxygen treat .inc files as Fortran files (default is PHP), # and .f files as C (default is Fortran), use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. 
EXTENSION_MAPPING = f=FortranFixed f90=FortranFree F=FortranFixed F90=FortranFree # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable # documentation. See http://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibility issues. # The default value is: YES. MARKDOWN_SUPPORT = YES # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or # globally by setting AUTOLINK_SUPPORT to NO. # The default value is: YES. AUTOLINK_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should set this # tag to YES in order to let doxygen match function declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); # versus func(std::string) {}). This also makes the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. # The default value is: NO. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. # The default value is: NO. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: # http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO.
SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property. Setting this option to YES will make # doxygen replace the get and set methods by a property in the documentation. # This will only work if the methods are indeed getting or setting a simple # type. If this is not the case, or you want to show the methods anyway, you # should set this option to NO. # The default value is: YES. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. # The default value is: NO. DISTRIBUTE_GROUP_DOC = NO # If one adds a struct or class to a group and this option is enabled, then also # any nested class or struct is added to the same group. By default this option # is disabled and one has to add nested compounds explicitly via \ingroup. # The default value is: NO. GROUP_NESTED_COMPOUNDS = NO # Set the SUBGROUPING tag to YES to allow class member groups of the same type # (for instance a group of public functions) to be put as a subgroup of that # type (e.g. under the Public Functions section). Set it to NO to prevent # subgrouping. Alternatively, this can be done per class using the # \nosubgrouping command. # The default value is: YES. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions # are shown inside the group in which they are included (e.g. using \ingroup) # instead of on a separate page (for HTML and Man pages) or section (for LaTeX # and RTF). # # Note that this feature does not work in combination with # SEPARATE_MEMBER_PAGES. # The default value is: NO.
INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions # with only public data fields or simple typedef fields will be shown inline in # the documentation of the scope in which they are defined (i.e. file, # namespace, or group documentation), provided this scope is documented. If set # to NO, structs, classes, and unions are shown on a separate page (for HTML and # Man pages) or section (for LaTeX and RTF). # The default value is: NO. INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or # enum is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically be # useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. # The default value is: NO. TYPEDEF_HIDES_STRUCT = NO # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be # an expensive process and often the same symbol appears multiple times in the # code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small # doxygen will become slower. If the cache is too large, memory is wasted. The # cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range # is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 # symbols. At the end of a run doxygen will report the cache usage and suggest # the optimal cache size from a speed point of view. # Minimum value: 0, maximum value: 9, default value: 0. 
LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in # documentation are documented, even if no documentation was available. Private # class members and static file members will be hidden unless the # EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. # Note: This will also disable the warnings about undocumented members that are # normally produced when WARNINGS is set to YES. # The default value is: NO. EXTRACT_ALL = YES # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. # The default value is: NO. EXTRACT_PRIVATE = YES # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. EXTRACT_PACKAGE = YES # If the EXTRACT_STATIC tag is set to YES, all static members of a file will be # included in the documentation. # The default value is: NO. EXTRACT_STATIC = YES # If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined # locally in source files will be included in the documentation. If set to NO, # only classes defined in header files are included. Does not have any effect # for Java sources. # The default value is: YES. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. If set to YES, local methods, # which are defined in the implementation section but not in the interface are # included in the documentation. If set to NO, only methods in the interface are # included. # The default value is: NO. 
EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base name of # the file that contains the anonymous namespace. By default anonymous namespace # are hidden. # The default value is: NO. EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation # section is generated. This option has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option # has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend # (class|struct|union) declarations. If set to NO, these declarations will be # included in the documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any # documentation blocks found inside the body of a function. If set to NO, these # blocks will be appended to the function's detailed documentation block. # The default value is: NO. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation that is typed after a # \internal command is included. If the tag is set to NO then the documentation # will be excluded. Set it to YES to include the internal documentation. # The default value is: NO. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file # names in lower-case letters. 
If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = YES # If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with # their full class and namespace scopes in the documentation. If set to YES, the # scope will be hidden. # The default value is: NO. HIDE_SCOPE_NAMES = NO # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. If set to # YES the compound reference will be hidden. # The default value is: NO. HIDE_COMPOUND_REFERENCE= NO # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. SHOW_INCLUDE_FILES = YES # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each # grouped member an include statement to the documentation, telling the reader # which file to include in order to use the member. # The default value is: NO. SHOW_GROUPED_MEMB_INC = NO # If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include # files with double quotes in the documentation rather than with sharp brackets. # The default value is: NO. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the # documentation for inline members. # The default value is: YES. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the # (detailed) documentation of file and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. # The default value is: YES. 
SORT_MEMBER_DOCS = YES # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief # descriptions of file, namespace and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. Note that # this will also influence the order of the classes in the class list. # The default value is: NO. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the # (brief and detailed) documentation of class members so that constructors and # destructors are listed first. If set to NO the constructors will appear in the # respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. # Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief # member documentation. # Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting # detailed member documentation. # The default value is: NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy # of group names into alphabetical order. If set to NO the group names will # appear in their defined order. # The default value is: NO. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by # fully-qualified names, including namespaces. If set to NO, the class list will # be sorted only by class name, not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the alphabetical # list. # The default value is: NO. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper # type resolution of all parameters of a function it will reject a match between # the prototype and the implementation of a member function even if there is # only one candidate or it is obvious which candidate to choose by doing a # simple string match. 
By disabling STRICT_PROTO_MATCHING doxygen will still # accept a match between prototype and implementation in such cases. # The default value is: NO. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo # list. This list is created by putting \todo commands in the documentation. # The default value is: YES. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test # list. This list is created by putting \test commands in the documentation. # The default value is: YES. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug # list. This list is created by putting \bug commands in the documentation. # The default value is: YES. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) # the deprecated list. This list is created by putting \deprecated commands in # the documentation. # The default value is: YES. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional documentation # sections, marked by \if ... \endif and \cond # ... \endcond blocks. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the # documentation. If the initializer consists of more lines than specified here # it will be hidden. Use a value of 0 to hide initializers completely. The # appearance of the value of individual variables and macros / defines can be # controlled using \showinitializer or \hideinitializer command in the # documentation regardless of this setting. # Minimum value: 0, maximum value: 10000, default value: 30. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated at # the bottom of the documentation of classes and structs. 
If set to YES, the # list will mention the files that were used to generate the documentation. # The default value is: YES. SHOW_USED_FILES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. This # will remove the Files entry from the Quick Index and from the Folder Tree View # (if specified). # The default value is: YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces # page. This will remove the Namespaces entry from the Quick Index and from the # Folder Tree View (if specified). # The default value is: YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command command input-file, where command is the value of the # FILE_VERSION_FILTER tag, and input-file is the name of an input file provided # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml # will be used as the name of the layout file. # # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. 
The .bib # extension is automatically appended if omitted. This requires the bibtex tool # to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated to # standard output by doxygen. If QUIET is set to YES this implies that the # messages are off. # The default value is: NO. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES # this implies that the warnings are on. # # Tip: Turn warnings on while writing the documentation. # The default value is: YES. WARNINGS = YES # If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag # will automatically be disabled. # The default value is: YES. WARN_IF_UNDOCUMENTED = YES # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some parameters # in a documented function, or documenting parameters that don't exist or using # markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete # parameter documentation, but not about the absence of documentation. 
# The default value is: NO. WARN_NO_PARAMDOC = NO # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which # will be replaced by the file and line number from which the warning originated # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard # error (stderr). WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag is used to specify the files and/or directories that contain # documented source files. You may enter file names like myfile.cpp or # directories like /usr/src/myproject. Separate the files or directories with # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. INPUT = @top_srcdir@/src @top_srcdir@/test @builddir@/elpa # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: http://www.gnu.org/software/libiconv) for the list of # possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. 
# # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, # *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, # *.vhdl, *.ucf, *.qsf, *.as and *.js. FILE_PATTERNS = # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. # The default value is: NO. RECURSIVE = YES # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # # Note that relative paths are relative to the directory from which doxygen is # run. 
EXCLUDE = @top_srcdir@/src/elpa1_compute.F90 \ @top_srcdir@/src/mod_precision.f90 \ @top_srcdir@/src/aligned_mem.F90 \ @top_srcdir@/src/mod_compute_hh_trafo_real.F90 \ @top_srcdir@/src/mod_compute_hh_trafo_complex.F90 \ @top_srcdir@/src/mod_mpi.F90 \ @top_srcdir@/src/mod_mpi_stubs.F90 \ @top_srcdir@/src/mod_time_c.F90 \ @top_srcdir@/src/mod_pack_unpack_complex.F90 \ @top_srcdir@/src/mod_pack_unpack_real.F90 \ @top_srcdir@/src/elpa2_compute.F90 \ @top_srcdir@/src/elpa2_utilities.F90 \ @top_srcdir@/src/elpa_c_interface.F90 \ @top_srcdir@/src/elpa_reduce_add_vectors.X90 \ @top_srcdir@/src/elpa_transpose_vectors.X90 \ @top_srcdir@/src/elpa_utilities.F90 \ @top_srcdir@/src/timer.F90 \ @top_srcdir@/src/redist_band.X90 \ @top_srcdir@/src/elpa2_kernels/elpa2_kernels_asm_x86_64.s \ @top_srcdir@/src/elpa2_kernels/mod_single_hh_trafo_real.F90 \ @top_srcdir@/src/elpa2_kernels/mod_fortran_interfaces.F90 \ @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real.F90 \ @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_simple.F90 \ @top_srcdir@/src/elpa2_kernels/elpa2_kernels_complex.F90 \ @top_srcdir@/src/elpa2_kernels/elpa2_kernels_complex_simple.F90 \ @top_srcdir@/src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.c \ @top_srcdir@/src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.c \ @top_srcdir@/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.c \ @top_srcdir@/src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.c \ @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \ @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \ @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_sse_2hv.c \ @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_sse_4hv.c \ @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_sse_6hv.c \ @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_2hv.c \ @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_4hv.c \ @top_srcdir@/src/elpa2_kernels/elpa2_kernels_real_avx-avx2_6hv.c \ @top_srcdir@/src/elpa_qr/elpa_pdgeqrf.F90 \ 
@top_srcdir@/src/elpa_qr/elpa_pdlarfb.F90 \ @top_srcdir@/src/elpa_qr/elpa_qrkernels.f90 \ @top_srcdir@/src/elpa_qr/qr_utils.F90 \ @top_srcdir@/src/ftimings/ftimings.F90 \ @top_srcdir@/src/ftimings/ftimings_type.F90 \ @top_srcdir@/src/ftimings/ftimings_value.F90 \ @top_srcdir@/src/ftimings/highwater_mark.c \ @top_srcdir@/src/ftimings/papi.c \ @top_srcdir@/src/ftimings/resident_set_size.c \ @top_srcdir@/src/ftimings/time.c \ @top_srcdir@/src/ftimings/virtual_memory.c \ @top_srcdir@/test/shared_sources/mod_output_types.F90 \ @top_srcdir@/test/c_test_programs/elpa1_test_complex_c_version.c \ @top_srcdir@/test/c_test_programs/elpa1_test_real_c_version.c \ @top_srcdir@/test/c_test_programs/elpa2_test_complex_c_version.c \ @top_srcdir@/test/c_test_programs/elpa2_test_real_c_version.c \ @top_srcdir@/test/fortran_test_programs/read_real.F90 \ @top_srcdir@/test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 \ @top_srcdir@/test/fortran_test_programs/test_complex2_default_kernel.F90 \ @top_srcdir@/test/fortran_test_programs/test_complex2.F90 \ @top_srcdir@/test/fortran_test_programs/test_complex.F90 \ @top_srcdir@/test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 \ @top_srcdir@/test/fortran_test_programs/test_real2_default_kernel.F90 \ @top_srcdir@/test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 \ @top_srcdir@/test/fortran_test_programs/test_real2.F90 \ @top_srcdir@/test/fortran_test_programs/test_real.F90 \ @top_srcdir@/test/fortran_test_programs/test_real_with_c.F90 \ @top_srcdir@/test/shared_sources/blacs_infrastructure.F90 \ @top_srcdir@/test/shared_sources/call_elpa1.c \ @top_srcdir@/test/shared_sources/call_elpa2.c \ @top_srcdir@/test/shared_sources/check_correctnes.F90 \ @top_srcdir@/test/shared_sources/mod_from_c.F90 \ @top_srcdir@/test/shared_sources/prepare_matrix.F90 \ @top_srcdir@/test/shared_sources/read_input_parameters.F90 \ @top_srcdir@/test/shared_sources/redir.c \ 
@top_srcdir@/test/shared_sources/redirect.F90 \ @top_srcdir@/test/shared_sources/setup_mpi.F90 \ @top_srcdir@/test/shared_sources/util.F90 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. # The default value is: NO. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and # *.h) to filter out the source-files in the directories. If left blank all # files are included. EXAMPLE_PATTERNS = # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude commands # irrespective of the value of the RECURSIVE tag. # The default value is: NO. 
EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or directories # that contain images that are to be included in the documentation (see the # \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command: # # <filter> <input-file> # # where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the # name of an input file. Doxygen will then use the output that the filter # program writes to standard output. If FILTER_PATTERNS is specified, this tag # will be ignored. # # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the # filter if there is a match. The filters are a list of the form: pattern=filter # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for # producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). # The default value is: NO. FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERNS (if any) and # it is also possible to disable source filtering for a specific pattern using # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. 
FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. USE_MDFILE_AS_MAINPAGE = #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will be # generated. Documented entities will be cross-referenced with these sources. # # Note: To get rid of all source code in the generated output, make sure that # also VERBATIM_HEADERS is set to NO. # The default value is: NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body of functions, # classes and enums directly into the documentation. # The default value is: NO. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any # special comment blocks from generated source code fragments. Normal C, C++ and # Fortran comments will always remain visible. # The default value is: YES. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented # function all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES then for each documented function # all documented entities called/used by that function will be listed. # The default value is: NO. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set # to YES then the hyperlinks from functions in REFERENCES_RELATION and # REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will # link to the documentation. 
# The default value is: YES. REFERENCES_LINK_SOURCE = YES # If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the # source code will show a tooltip with additional information such as prototype, # brief description and links to the definition and documentation. Since this # will make the HTML file larger and loading of large files a bit slower, you # can opt to disable this feature. # The default value is: YES. # This tag requires that the tag SOURCE_BROWSER is set to YES. SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system # (see http://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global # - Enable SOURCE_BROWSER and USE_HTAGS in the config file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # # Doxygen will invoke htags (and that will in turn invoke gtags), so these # tools must be available from the command line (i.e. in the search path). # # The result: instead of the source browser generated by doxygen, the links to # source code will now point to the output of htags. # The default value is: NO. # This tag requires that the tag SOURCE_BROWSER is set to YES. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a # verbatim copy of the header file for each class for which an include is # specified. Set to NO to disable this. # See also: Section \class. # The default value is: YES. 
VERBATIM_HEADERS = YES #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all # compounds will be generated. Enable this if the project contains a lot of # classes, structs, unions or interfaces. # The default value is: YES. ALPHABETICAL_INDEX = YES # The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in # which the alphabetical index list will be split. # Minimum value: 1, maximum value: 20, default value: 5. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag # can be used to specify a prefix (or a list of prefixes) that should be ignored # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of # it. # The default directory is: html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for each # generated HTML page (for example: .htm, .php, .asp). # The default value is: .html. # This tag requires that the tag GENERATE_HTML is set to YES. 
HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a user-defined HTML header file for # each generated HTML page. If the tag is left blank doxygen will generate a # standard header. # # To get valid HTML the header file that includes any scripts and style sheets # that doxygen needs, which is dependent on the configuration options used (e.g. # the setting GENERATE_TREEVIEW). It is highly recommended to start with a # default header using # doxygen -w html new_header.html new_footer.html new_stylesheet.css # YourConfigFile # and then modify the file new_header.html. See also section "Doxygen usage" # for information on how to generate the default header that doxygen normally # uses. # Note: The header is subject to change so you typically have to regenerate the # default header when upgrading to a newer version of doxygen. For a description # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard # footer. See HTML_HEADER for more information on how to generate a default # footer and what special commands can be used inside the footer. See also # section "Doxygen usage" for information on how to generate the default footer # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of # the HTML output. If left blank doxygen will generate a default style sheet. # See also section "Doxygen usage" for information on how to generate the style # sheet that doxygen normally uses. 
# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as # it is more robust and this tag (HTML_STYLESHEET) will in the future become # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the # standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. Use the # $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that the # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a colorwheel, see # http://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. 
# Minimum value: 0, maximum value: 359, default value: 220. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors # in the HTML output. For a value of 0 the output will use grayscales only. A # value of 255 will produce the most vivid colors. # Minimum value: 0, maximum value: 255, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the # luminance component of the colors in the HTML output. Values below 100 # gradually make the output lighter, whereas values above 100 make the output # darker. The value divided by 100 is the actual gamma applied, so 80 represents # a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not # change the gamma. # Minimum value: 40, maximum value: 240, default value: 80. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_GAMMA = 80 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML # page will contain the date and time when the page was generated. Setting this # to YES can help to show when doxygen was last run and thus if the # documentation is up to date. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_TIMESTAMP = NO # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_DYNAMIC_SECTIONS = NO # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on. 
Doxygen will expand the tree to # such a level that at most the specified number of entries are visible (unless # a fully collapsed tree already exceeds this amount). So setting the number of # entries 1 will produce a full collapsed tree by default. 0 is a special value # representing an infinite number of entries and will result in a full expanded # tree by default. # Minimum value: 0, maximum value: 9999, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development # environment (see: http://developer.apple.com/tools/xcode/), introduced with # OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a # Makefile in the HTML output directory. Running make will produce the docset in # that directory and running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html # for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_DOCSET = NO # This tag determines the name of the docset feed. A documentation feed provides # an umbrella under which multiple documentation sets from a single provider # (such as a company or product suite) can be grouped. # The default value is: Doxygen generated docs. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_FEEDNAME = "Doxygen generated docs" # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_DOCSET is set to YES. 
DOCSET_BUNDLE_ID = org.doxygen.Project # The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify # the documentation publisher. This should be a reverse domain-name style # string, e.g. com.mycompany.MyDocSet.documentation. # The default value is: org.doxygen.Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. # The default value is: Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop # (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on # Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML # files are now used as the Windows 98 help format, and will replace the old # Windows help format (.hlp) on all Windows platforms in the future. Compressed # HTML files also contain an index, a table of contents, and you can search for # words in the documentation. The HTML workshop also contains a viewer for # compressed HTML files. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_HTMLHELP = NO # The CHM_FILE tag can be used to specify the file name of the resulting .chm # file. You can add a path in front of the file if the result should not be # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). 
If non-empty, # doxygen will try to run the HTML help compiler on the generated index.hhp. # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. GENERATE_CHI = NO # The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it # enables the Previous and Next buttons. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members to # the table of contents of the HTML help documentation and to the tree view. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help # (.qch) of the generated HTML documentation. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify # the file name of the resulting .qch file. The path specified is relative to # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. 
For more information please see Qt Help Project / Namespace # (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual # Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- # folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom # Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom # Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. 
QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated; together with the HTML files, they form an Eclipse help plugin. To # install this plugin and make it available under the help contents menu in # Eclipse, the contents of the directory containing the HTML and XML files need # to be copied into the plugins directory of Eclipse. The name of the directory # within the plugins directory should be the same as the ECLIPSE_DOC_ID value. # After copying, Eclipse needs to be restarted before the help appears. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_ECLIPSEHELP = NO # A unique identifier for the Eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have this # name. Each documentation set should have its own identifier. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. ECLIPSE_DOC_ID = org.doxygen.Project # If you want full control over the layout of the generated HTML pages it might # be necessary to disable the index and replace it with your own. The # DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top # of each HTML page. A value of NO enables the index and the value YES disables # it. Since the tabs in the index contain the same information as the navigation # tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. If the tag # value is set to YES, a side panel will be generated containing a tree-like # index structure (just like the one that is generated for HTML Help).
For this # to work a browser that supports JavaScript, DHTML, CSS and frames is required # (i.e. any modern browser). Windows users are probably better off using the # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can # further fine-tune the look of the index. As an example, the default style # sheet generated by doxygen has an example that shows how to put an image at # the root of the tree instead of the PROJECT_NAME. Since the tree basically has # the same information as the tab index, you could consider setting # DISABLE_INDEX to YES when enabling this option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = NO # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. # # Note that a value of 0 will completely suppress the enum values from appearing # in the overview section. # Minimum value: 0, maximum value: 20, default value: 4. # This tag requires that the tag GENERATE_HTML is set to YES. ENUM_VALUES_PER_LINE = 4 # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used # to set the initial width (in pixels) of the frame in which the tree is shown. # Minimum value: 0, maximum value: 1500, default value: 250. # This tag requires that the tag GENERATE_HTML is set to YES. TREEVIEW_WIDTH = 250 # If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to # external symbols imported via tag files in a separate window. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. EXT_LINKS_IN_WINDOW = NO # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful # doxygen run you need to manually remove any form_*.png images from the HTML # output directory to force them to be regenerated. 
# Minimum value: 8, maximum value: 50, default value: 10. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_FONTSIZE = 10 # Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. # # Note that when changing this option you need to delete any form_*.png files in # the HTML output directory before the changes take effect. # The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_TRANSPARENT = YES # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see # http://www.mathjax.org) which uses client side Javascript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want formulas to look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path # to it using the MATHJAX_RELPATH option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. USE_MATHJAX = NO # When MathJax is enabled you can set the default output format to be used for # the MathJax output. See the MathJax site (see: # http://docs.mathjax.org/en/latest/output.html) for more details. # Possible values are: HTML-CSS (which is slower, but has the best # compatibility), NativeMML (i.e. MathML) and SVG. # The default value is: HTML-CSS. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_FORMAT = HTML-CSS # When MathJax is enabled you need to specify the location relative to the HTML # output directory using the MATHJAX_RELPATH option. The destination directory # should contain the MathJax.js script. For instance, if the mathjax directory # is located at the same level as the HTML output directory, then # MATHJAX_RELPATH should be ../mathjax.
The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of # MathJax from http://www.mathjax.org before deployment. # The default value is: http://cdn.mathjax.org/mathjax/latest. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. For example # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site # (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and # should work on any modern browser. Note that when using HTML help # (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) # there is already a search function so this one should typically be disabled. # For large projects the javascript based search engine can be slow, then # enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to # search using the keyboard; to jump to the search box use + S # (what the is depends on the OS and browser, but it is typically # , /