==> opa-psm2-PSM2_11.2.185/40-psm.rules <==
#
# This file is provided under a dual BSD/GPLv2 license.  When using or
# redistributing this file, you may do so under either license.
#
# GPL LICENSE SUMMARY
#
# Copyright(c) 2015 Intel Corporation.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# Contact Information:
# Intel Corporation, www.intel.com
#
# BSD LICENSE
#
# Copyright(c) 2015 Intel Corporation.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in
#     the documentation and/or other materials provided with the
#     distribution.
#   * Neither the name of Intel Corporation nor the names of its
#     contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
KERNEL=="hfi1", MODE="0666"
KERNEL=="hfi1_[0-9]", MODE="0666"

==> opa-psm2-PSM2_11.2.185/COMMIT <==
30c52a0fd155774e18cc06328a1ba83c2a6a8104

==> opa-psm2-PSM2_11.2.185/CONTRIBUTORS <==
The following developers have all contributed bug fixes to the open source version of the PSM library. Intel gratefully thanks them for their contributions:

Michal Schmidt (michich on github.com)
Lisanna Dettwyler (LisannaDettwyler on github.com)
Ana Guerrero Lopez (ana on github.com)
Brian Smith (bsmith94 on github.com)
Michael J OConnor (michael-j-oconnor on github.com)
Nicolas Morey-Chaismartin (nmorey on github.com)
Bernhard M. Wiedemann (bmwidemann on github.com)
Dmitry (dmitrygx on github.com)
Florian Weimer (fweimer on github.com)
Jonas Hahnfeld (hahnjo on github.com)
Tom Stellard (tstellar on github.com)

==> opa-psm2-PSM2_11.2.185/COPYING <==
This software is available to you under a choice of one of two licenses.  You may choose to be licensed under the terms of the BSD license or the GNU General Public License (GPL) Version 2, both included below.

Copyright(c) 2016 Intel Corporation. All rights reserved.

==================================================================

                      BSD Simplified License

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

==================================================================

                    GNU GENERAL PUBLIC LICENSE
                       Version 2, June 1991

Copyright (C) 1989, 1991 Free Software Foundation, Inc.
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.

                            Preamble

The licenses for most software are designed to take away your freedom to share and change it.  By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users.  This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it.  (Some other Free Software Foundation software is covered by the GNU Library General Public License instead.)  You can apply it to your programs, too.

When we speak of free software, we are referring to freedom, not price.  Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things.

To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights.
These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it.

For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have.  You must make sure that they, too, receive or can get the source code.  And you must show them these terms so they know their rights.

We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software.

Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software.  If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations.

Finally, any free program is threatened constantly by software patents.  We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary.  To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all.

The precise terms and conditions for copying, distribution and modification follow.

                    GNU GENERAL PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License.  The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language.  (Hereinafter, translation is included without limitation in the term "modification".)  Each licensee is addressed as "you".

Activities other than copying, distribution and modification are not covered by this License; they are outside its scope.  The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program).  Whether that is true depends on what the Program does.

1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program.

You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee.

2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions:

a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change.

b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License.

c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License.  (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.)

These requirements apply to the modified work as a whole.  If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works.  But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it.

Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program.

In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License.

3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following:

a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or,

b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or,

c) Accompany it with the information you received as to the offer to distribute corresponding source code.  (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.)

The source code for a work means the preferred form of the work for making modifications to it.  For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable.

However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable.

If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code.

4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License.  Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License.  However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance.

5. You are not required to accept this License, since you have not signed it.  However, nothing else grants you permission to modify or distribute the Program or its derivative works.  These actions are prohibited by law if you do not accept this License.  Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it.

6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions.  You may not impose any further restrictions on the recipients' exercise of the rights granted herein.  You are not responsible for enforcing compliance by third parties to this License.

7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License.  If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all.  For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program.

If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances.

It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices.
Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice.

This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License.

8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded.  In such case, this License incorporates the limitation as if written in the body of this License.

9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time.  Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns.

Each version is given a distinguishing version number.  If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation.  If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation.

10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission.  For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this.  Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally.

                            NO WARRANTY

11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.

12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.

                     END OF TERMS AND CONDITIONS

==> opa-psm2-PSM2_11.2.185/Makefile <==
#
# This file is provided under a dual BSD/GPLv2 license.  When using or
# redistributing this file, you may do so under either license.
#
# GPL LICENSE SUMMARY
#
# Copyright(c) 2017 Intel Corporation.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# Contact Information:
# Intel Corporation, www.intel.com
#
# BSD LICENSE
#
# Copyright(c) 2017 Intel Corporation.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in
#     the documentation and/or other materials provided with the
#     distribution.
#   * Neither the name of Intel Corporation nor the names of its
#     contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
HISTORY = .outdirs
HISTORIC_TARGETS = $(patsubst %, %_clean, $(shell cat $(HISTORY) 2> /dev/null))

RPM_NAME := libpsm2
CONFIG_FILE := .config
TEMP_INST_DIR := $(shell mktemp -d)

ifeq ($(CONFIG_FILE), $(wildcard $(CONFIG_FILE)))
include $(CONFIG_FILE)
endif

PSM_HAL_ENABLE ?= *
PSM_HAL_ENABLE_D = $(wildcard $(addprefix psm_hal_,$(PSM_HAL_ENABLE)))
PSM_HAL_INSTANCE_OBJFILES = $(addsuffix /*.o,$(PSM_HAL_ENABLE_D))

SUBDIRS = ptl_self ptl_ips ptl_am libuuid opa ${wildcard $(PSM_HAL_ENABLE_D)}

top_srcdir := $(shell readlink -m .)

# Default locations
OUTDIR := $(top_srcdir)/build_release
MOCK_OUTDIR := $(top_srcdir)/build_mock
DEBUG_OUTDIR := $(top_srcdir)/build_debug

# We need a temporary test variable, as the OUTDIR macro
# can be overridden by the shell and thus not run.
TESTOUTDIR = $(shell readlink -m $(OUTDIR))
ifeq ($(top_srcdir), $(TESTOUTDIR))
$(error OUTDIR cannot be the same as your source folder ${top_srcdir})
endif
ifeq (/,$(TESTOUTDIR))
$(error OUTDIR cannot be the / folder)
endif

# Forces any value to be full path.
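# (For example, a hypothetical relative invocation such as
#  "make OUTDIR=build_foo" is canonicalized by the readlink -m call
#  below into an absolute path like $(top_srcdir)/build_foo before the
#  sanity checks above and the sub-make invocations see it.)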
# We don't need to override MOCK_OUTDIR or DEBUG_OUTDIR
# as they are recursive make invocations and use OUTDIR
ifneq ($(MAKECMDGOALS), mock)
ifneq ($(MAKECMDGOALS), debug)
override OUTDIR := $(shell readlink -m $(OUTDIR))
endif
endif

PSM2_VERNO_MAJOR := $(shell sed -n 's/^\#define.*PSM2_VERNO_MAJOR.*0x0\?\([1-9a-f]\?[0-9a-f]\+\).*/\1/p' $(top_srcdir)/psm2.h)
PSM2_VERNO_MINOR := $(shell sed -n 's/^\#define.*PSM2_VERNO_MINOR.*0x\([0-9]\?[0-9a-f]\+\).*/\1/p' $(top_srcdir)/psm2.h)
PSM2_LIB_MAJOR := $(shell printf "%d" ${PSM2_VERNO_MAJOR})
PSM2_LIB_MINOR := $(shell printf "%d" `sed -n 's/^\#define.*PSM2_VERNO_MINOR.*\(0x[0-9a-f]\+\).*/\1/p' $(top_srcdir)/psm2.h`)

LINKER_SCRIPT_FILE = ${OUTDIR}/psm2_linker_script.map
SOURCES_CHKSUM_FILES = Makefile buildflags.mak $(LINKER_SCRIPT_FILE) \
	`find . -regex '\(.*\.h\|.*\.c\)' -not -path "./test/*" -not -path "./tools/*" -not -path "_revision.c" | sort`
SOURCES_CHKSUM_VALUE = $(shell cat ${SOURCES_CHKSUM_FILES} | sha1sum | cut -d' ' -f 1)

OPA_LIB_MAJOR := 4
OPA_LIB_MINOR := 0

export PSM2_VERNO_MAJOR
export PSM2_LIB_MAJOR
export PSM2_VERNO_MINOR
export PSM2_LIB_MINOR
export OPA_LIB_MAJOR
export OPA_LIB_MINOR
export CCARCH ?= gcc
export FCARCH ?= gfortran
export AR ?= ar

include $(top_srcdir)/buildflags.mak

# We need to unexport these environs because, during both mock testing and
# normal calls, exported values would be evaluated again in each submake.
# This is costly, and the LINKER_SCRIPT_FILE doesn't exist until after its
# target rule runs.
unexport SOURCES_CHKSUM_FILES
unexport SOURCES_CHKSUM_VALUE
unexport LINKER_SCRIPT_FILE

INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips -I$(OUTDIR)

ifneq (x86_64,$(arch))
ifneq (i386,$(arch))
$(error Unsupported architecture $(arch))
endif
endif

ifndef LIBDIR
ifeq (${arch},x86_64)
INSTALL_LIB_TARG = /usr/lib64
else
INSTALL_LIB_TARG = /usr/lib
endif
else
INSTALL_LIB_TARG = ${LIBDIR}
endif
export DESTDIR
export INSTALL_LIB_TARG

TARGLIB := libpsm2
COMPATMAJOR := $(shell sed -n 's/^\#define.*PSM2_VERNO_COMPAT_MAJOR.*0x0\?\([1-9a-f]\?[0-9a-f]\+\).*/\1/p' \
	$(top_srcdir)/psm2.h)
COMPATLIB := libpsm_infinipath

MAJOR := $(PSM2_LIB_MAJOR)
MINOR := $(PSM2_LIB_MINOR)

nthreads := $(shell echo $$(( `nproc` * 2 )) )

# The following line sets the DISTRO variable to:
#  'rhel' if the host is running RHEL.
#  'sles' if the host is running SUSE.
#  'fedora' if the host is running Fedora.
#  'ubuntu' if the host is running Ubuntu.
#
# The DISTRO variable is used subsequently for distro-specific
# behaviors.
DISTRO := $(shell . /etc/os-release; if [[ "$$ID" == "sle_hpc" ]]; then ID="sles"; fi; echo $$ID)

# By default the following two variables have the following values:
LIBPSM2_COMPAT_CONF_DIR := /etc
LIBPSM2_COMPAT_SYM_CONF_DIR := /etc

# We can't set SPEC_FILE_RELEASE_DIST to an empty value, as a space would
# result.  It then messes up sed operations for PSM_CUDA=1.
# So leaving the commented-out line here as documentation to NOT set it.
# SPEC_FILE_RELEASE_DIST :=
UDEV_40_PSM_RULES := %{_udevrulesdir}/40-psm.rules

ifeq (fedora,$(DISTRO))
# On Fedora, we change these two variables to these values:
LIBPSM2_COMPAT_CONF_DIR := /usr/lib
LIBPSM2_COMPAT_SYM_CONF_DIR := %{_prefix}/lib
SPEC_FILE_RELEASE_DIST := %{?dist}
UDEV_40_PSM_RULES :=#
else ifeq (rhel,${DISTRO})
# Insert code specific to RHEL here.
else ifeq (sles,${DISTRO})
# Insert code specific to SLES here.
endif

ifdef PSM_CUDA
# Value needs to be something without spaces or dashes '-'
SPEC_FILE_RELEASE_DIST += cuda
endif

export LIBPSM2_COMPAT_CONF_DIR

# The desired version number comes from the most recent tag starting with "psm-v"
ifeq (true, $(shell git rev-parse --is-inside-work-tree 2>/dev/null))
ISGIT := 1 # Cache the result for later
# Note, we don't define ISGIT if we are not in a git folder
VERSION := $(shell git describe --tags --abbrev=0 --match='psm-v*' | sed -e 's/^psm-v//' -e 's/-/_/')
else
ISGIT := 0
VERSION := version
endif

# If we have a file called 'rpm_release_extension' (as on github),
# we take the release extension number from this file
RELEASE_EXT := $(shell if [ -e rpm_release_extension ] ;\
	then cat rpm_release_extension; fi)
CURRENTSHA := $(shell if [ $(ISGIT) = 1 -a -f rpm_release_extension ] ;\
	then git log --pretty=format:'%h' -n 1; fi)
RPMEXTHASH := $(shell if [ $(ISGIT) = 1 -a -f rpm_release_extension ] ;\
	then git log --pretty=format:'%h' -n 1 rpm_release_extension; fi)

# This logic should kick in only on github
ifdef RELEASE_EXT
ifneq ($(CURRENTSHA), $(RPMEXTHASH))
# On github, the last commit for each release should be the one to bump up
# the release extension number in 'rpm_release_extension'. Further commits
# are counted here and appended to the final rpm name to distinguish commits
# present only on github
NCOMMITS := $(shell if [ $(ISGIT) = 1 -a -f rpm_release_extension ] ;\
	then git log --children $(RPMEXTHASH)..$(CURRENTSHA) \
	--pretty=oneline . | wc -l; fi)
RELEASE := $(RELEASE_EXT)_$(NCOMMITS)
endif
endif

# The desired release number comes from the git describe following the version,
# which is the number of commits since the version tag was planted, suffixed by the g
ifndef RELEASE
RELTAG := "psm-v$(VERSION)"
RELEASE := $(shell if [ -f rpm_release_extension ]; then cat rpm_release_extension;\
	elif [ $(ISGIT) = 1 ] ; then git rev-list $(RELTAG)..HEAD -- . | wc -l; \
	else echo "release" ; fi)
endif

DIST_SHA := ${shell if [ $(ISGIT) = 1 ] ; then git log -n1 --pretty=format:%H .; \
	else echo DIST_SHA ; fi}

# Concatenated version and release
ifndef VERSION_RELEASE_OVERRIDE
VERSION_RELEASE := $(VERSION).$(RELEASE)
else
VERSION_RELEASE := ${VERSION_RELEASE_OVERRIDE}
endif

LDLIBS := -lrt -ldl -lnuma ${EXTRA_LIBS} -pthread

PKG_CONFIG ?= pkg-config
UDEVDIR := $(shell $(PKG_CONFIG) --variable=udevdir udev 2>/dev/null)
ifndef UDEVDIR
UDEVDIR = /lib/udev
endif
export UDEVDIR

# The DIST variable is a base name corresponding to:
# 1. The name of the directory containing the source code distribution
#    (see dist: target below).
# 2. The basename of the filename of the tar file created in the dist:
#    target.
DIST := ${RPM_NAME}-${VERSION_RELEASE}

# If the user has an empty RPM_NAME_BASEEXT (defined or not), then attempt
# to see if we are running on SLES 12.3 or newer.  If we are, then change
# the base package name, but not the supporting packages, to libpsm2-2.
# Do note this requires support both in the Makefile specfile target rule
# and in the libpsm2.spec.in file.
ifeq ($(RPM_NAME_BASEEXT),)
# Detect current version of the OS
OS := $(shell grep -m1 NAME /etc/os-release | cut -f 2 -d\")
OSVERSION := $(shell grep VERSION_ID /etc/os-release | cut -f 2 -d\" | cut -f 1 -d.)
OSSUBVERSION := $(shell grep VERSION_ID /etc/os-release | cut -f 2 -d\" | cut -f 2 -d.)
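# (Illustration, assuming a hypothetical SLES 12.3 host: /etc/os-release
#  would yield OS=SLES, OSVERSION=12 and OSSUBVERSION=3, so the shell
#  logic below emits "-2" and the base package becomes libpsm2-2, as
#  noted in the README's RPM naming discussion.)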
override RPM_NAME_BASEEXT := $(shell \
	if [ "$(OS)" = "SLES" -o "$(OS)" = "SLE_HPC" ]; then \
		if [ $(OSVERSION) -gt 11 ]; then \
			if [ $(OSVERSION) -eq 12 ]; then \
				if [ $(OSSUBVERSION) -gt 2 ]; then \
					echo "-2"; \
				fi \
			else \
				echo "-2"; \
			fi \
		fi \
	fi)
endif

HALDECLFILE = $(OUTDIR)/psm2_hal_inlines_d.h
HALIMPLFILE = $(OUTDIR)/psm2_hal_inlines_i.h

all: symlinks $(HALDECLFILE) $(HALIMPLFILE) | $(OUTDIR)
	@if [ ! -e $(HISTORY) ] || [ -z "`grep -E '^$(OUTDIR)$$' $(HISTORY)`" ]; then \
		echo $(OUTDIR) >> $(HISTORY); \
	fi
# Our buildflags.mak exports all variables; all are propagated to submakes.
	@for subdir in $(SUBDIRS); do \
		mkdir -p $(OUTDIR)/$$subdir; \
		$(MAKE) -j $(nthreads) -C $$subdir OUTDIR=$(OUTDIR)/$$subdir; \
		if [ $$? -ne 0 ]; then exit 1; fi ;\
	done
	$(MAKE) -j $(nthreads) $(OUTDIR)/${TARGLIB}.so
	$(MAKE) -j $(nthreads) $(OUTDIR)/${TARGLIB}.a
	@mkdir -p $(OUTDIR)/compat
	$(MAKE) -j $(nthreads) -C compat OUTDIR=$(OUTDIR)/compat

$(HALDECLFILE): | $(OUTDIR)
	@test -f $(HALDECLFILE) || ( \
	n_hal_insts=$(words $(wildcard $(PSM_HAL_ENABLE_D)));\
	echo "#define PSMI_HAL_INST_CNT $$n_hal_insts" > $(HALDECLFILE);\
	if [ $$n_hal_insts -eq 1 ]; then \
		echo "#define PSMI_HAL_INLINE inline" >> $(HALDECLFILE);\
		hal_inst_dir=$(PSM_HAL_ENABLE_D); \
		echo "#define PSMI_HAL_CAT_INL_SYM(KERNEL) hfp_$(subst psm_hal_,,$(PSM_HAL_ENABLE_D))" \
			"## _ ## KERNEL" >> $(HALDECLFILE);\
		echo "#include \"psm2_hal_inline_t.h\"" >> $(HALDECLFILE);\
	else \
		echo "#define PSMI_HAL_INLINE /* nothing */" >> $(HALDECLFILE);\
	fi )

$(HALIMPLFILE): | $(OUTDIR)
	@test -f $(HALIMPLFILE) || ( \
	n_hal_insts=$(words $(wildcard $(PSM_HAL_ENABLE_D)));\
	if [ $$n_hal_insts -eq 1 ]; then\
		hal_inst=$(PSM_HAL_ENABLE_D);\
		echo "#include \"$$hal_inst/psm_hal_inline_i.h\"" >> $(HALIMPLFILE);\
	else\
		echo "/* no inlining since more than 1 hal instance" >> $(HALIMPLFILE);\
		echo " is included in the libpsm2 linkage. */" >> $(HALIMPLFILE);\
	fi )

%_clean:
	make OUTDIR=$* clean

clean: cleanlinks
	rm -rf ${OUTDIR}
	@if [ -e $(HISTORY) ]; then \
		grep -v -E "^$(OUTDIR)$$" $(HISTORY) > $(HISTORY)_tmp; \
		mv $(HISTORY)_tmp $(HISTORY); \
		if [ "`wc -c $(HISTORY) | cut -d ' ' -f 1`" -eq 0 ]; then \
			rm -f $(HISTORY); \
		fi; \
	fi
	rm -fr $(TEMP_INST_DIR)

# Easily add more items to the config target if more options need
# to be cached.
config: $(CONFIG_FILE)

$(CONFIG_FILE):
	@echo PSM_HAL_ENABLE=$(PSM_HAL_ENABLE) > $(CONFIG_FILE)
	@echo CCARCH=$(CCARCH) >> $(CONFIG_FILE)
	@echo HFI_BRAKE_DEBUG=$(HFI_BRAKE_DEBUG) >> $(CONFIG_FILE)
	@echo PSM_DEBUG=$(PSM_DEBUG) >> $(CONFIG_FILE)
	@echo PSM_AVX512=$(PSM_AVX512) >> $(CONFIG_FILE)
	@echo PSM_LOG=$(PSM_LOG) >> $(CONFIG_FILE)
	@echo PSM_LOG_FAST_IO=$(PSM_LOG_FAST_IO) >> $(CONFIG_FILE)
	@echo PSM_PERF=$(PSM_PERF) >> $(CONFIG_FILE)
	@echo PSM_HEAP_DEBUG=$(PSM_HEAP_DEBUG) >> $(CONFIG_FILE)
	@echo PSM_PROFILE=$(PSM_PROFILE) >> $(CONFIG_FILE)
	@echo PSM_CUDA=$(PSM_CUDA) >> $(CONFIG_FILE)
	@echo Wrote $(CONFIG_FILE)

mock: OUTDIR := $(MOCK_OUTDIR)
mock:
	$(MAKE) OUTDIR=$(OUTDIR) PSM2_MOCK_TESTING=1

debug: OUTDIR := $(DEBUG_OUTDIR)
debug:
	$(MAKE) OUTDIR=$(OUTDIR) PSM_DEBUG=1

test_clean:
	if [ -d ./test ]; then \
		$(MAKE) -C test clean; \
	fi

specfile_clean:
	rm -f ${OUTDIR}/${RPM_NAME}.spec

distclean: specfile_clean cleanlinks $(HISTORIC_TARGETS) test_clean
	rm -f $(CONFIG_FILE)
	rm -rf ${OUTDIR}/${DIST}
	rm -f ${OUTDIR}/${DIST}.tar.gz
	rm -fr temp.* *.rej.patch

$(OUTDIR):
	mkdir -p ${OUTDIR}

symlinks:
	@test -L $(top_srcdir)/include/linux-x86_64 || \
		ln -sf linux-i386 $(top_srcdir)/include/linux-x86_64

cleanlinks:
	rm -rf $(top_srcdir)/include/linux-x86_64

install: all
	for subdir in $(SUBDIRS) ; do \
		mkdir -p $(OUTDIR)/$$subdir ; \
		$(MAKE) -j $(nthreads) -C $$subdir OUTDIR=$(OUTDIR)/$$subdir install ; \
	done
	$(MAKE) -j $(nthreads) $(OUTDIR)/${TARGLIB}.so OUTDIR=$(OUTDIR)
	$(MAKE) -j $(nthreads) -C compat OUTDIR=$(OUTDIR)/compat install
	install -D $(OUTDIR)/${TARGLIB}.so.${MAJOR}.${MINOR} \
		${DESTDIR}${INSTALL_LIB_TARG}/${TARGLIB}.so.${MAJOR}.${MINOR}
	(cd ${DESTDIR}${INSTALL_LIB_TARG} ; \
		ln -sf ${TARGLIB}.so.${MAJOR}.${MINOR} ${TARGLIB}.so.${MAJOR} ; \
		ln -sf ${TARGLIB}.so.${MAJOR} ${TARGLIB}.so)
	install -D $(OUTDIR)/${TARGLIB}.a \
		${DESTDIR}${INSTALL_LIB_TARG}/${TARGLIB}.a
	install -m 0644 -D psm2.h ${DESTDIR}/usr/include/psm2.h
	install -m 0644 -D psm2_mq.h ${DESTDIR}/usr/include/psm2_mq.h
	install -m 0644 -D psm2_am.h ${DESTDIR}/usr/include/psm2_am.h
ifneq (fedora,${DISTRO})
	install -m 0644 -D 40-psm.rules ${DESTDIR}$(UDEVDIR)/rules.d/40-psm.rules
endif
# The following files and dirs were part of the noship rpm:
	mkdir -p ${DESTDIR}/usr/include/hfi1diag
	mkdir -p ${DESTDIR}/usr/include/hfi1diag/linux-x86_64
	install -m 0644 -D include/linux-x86_64/bit_ops.h ${DESTDIR}/usr/include/hfi1diag/linux-x86_64/bit_ops.h
	install -m 0644 -D include/linux-x86_64/sysdep.h ${DESTDIR}/usr/include/hfi1diag/linux-x86_64/sysdep.h
	install -m 0644 -D include/opa_udebug.h ${DESTDIR}/usr/include/hfi1diag/opa_udebug.h
	install -m 0644 -D include/opa_debug.h ${DESTDIR}/usr/include/hfi1diag/opa_debug.h
	install -m 0644 -D include/opa_intf.h ${DESTDIR}/usr/include/hfi1diag/opa_intf.h
	for h in opa_user_gen1.h opa_service_gen1.h opa_common_gen1.h ; do \
		sed -e 's/#include "opa_user_gen1.h"/#include "opa_user.h"/' \
			-e 's/#include "opa_common_gen1.h"/#include "opa_common.h"/' \
			-e 's/#include "hfi1_deprecated_gen1.h"/#include "hfi1_deprecated.h"/' \
			-e 's/#include "opa_service_gen1.h"/#include "opa_service.h"/' psm_hal_gen1/$$h \
			> $(TEMP_INST_DIR)/$$h ; \
	done
	cat include/opa_user.h $(TEMP_INST_DIR)/opa_user_gen1.h > $(TEMP_INST_DIR)/opa_user.h
	cat include/opa_service.h $(TEMP_INST_DIR)/opa_service_gen1.h > $(TEMP_INST_DIR)/opa_service.h
	install -m 0644 -D $(TEMP_INST_DIR)/opa_user.h ${DESTDIR}/usr/include/hfi1diag/opa_user.h
	install -m 0644 -D $(TEMP_INST_DIR)/opa_service.h ${DESTDIR}/usr/include/hfi1diag/opa_service.h
	install -m 0644 -D $(TEMP_INST_DIR)/opa_common_gen1.h ${DESTDIR}/usr/include/hfi1diag/opa_common.h
	install -m 0644 -D include/opa_byteorder.h ${DESTDIR}/usr/include/hfi1diag/opa_byteorder.h
	install -m 0644 -D include/psm2_mock_testing.h ${DESTDIR}/usr/include/hfi1diag/psm2_mock_testing.h
	install -m 0644 -D include/opa_revision.h ${DESTDIR}/usr/include/hfi1diag/opa_revision.h
	install -m 0644 -D psmi_wrappers.h ${DESTDIR}/usr/include/hfi1diag/psmi_wrappers.h
	install -m 0644 -D psm_hal_gen1/hfi1_deprecated_gen1.h ${DESTDIR}/usr/include/hfi1diag/hfi1_deprecated.h
	rm -fr $(TEMP_INST_DIR)

specfile: specfile_clean | $(OUTDIR)
	sed -e 's/@VERSION@/'${VERSION_RELEASE}'/g' libpsm2.spec.in | \
		sed -e 's/@TARGLIB@/'${TARGLIB}'/g' \
			-e 's/@RPM_NAME@/'${RPM_NAME}'/g' \
			-e 's/@RPM_NAME_BASEEXT@/'${RPM_NAME_BASEEXT}'/g' \
			-e 's/@COMPATLIB@/'${COMPATLIB}'/g' \
			-e 's/@COMPATMAJOR@/'${COMPATMAJOR}'/g' \
			-e 's;@UDEVDIR@;'${UDEVDIR}';g' \
			-e 's/@MAJOR@/'${MAJOR}'/g' \
			-e 's/@MINOR@/'${MINOR}'/g' \
			-e 's:@LIBPSM2_COMPAT_CONF_DIR@:'${LIBPSM2_COMPAT_CONF_DIR}':g' \
			-e 's:@LIBPSM2_COMPAT_SYM_CONF_DIR@:'${LIBPSM2_COMPAT_SYM_CONF_DIR}':g' \
			-e 's;@SPEC_FILE_RELEASE_DIST@;'${SPEC_FILE_RELEASE_DIST}';g' \
			-e 's/@DIST_SHA@/'${DIST_SHA}'/g' > \
			${OUTDIR}/${RPM_NAME}.spec
	if [ -f /etc/redhat-release ] && [ `grep -o "[0-9.]*" /etc/redhat-release | cut -d"." -f1` -lt 7 ]; then \
		sed -i 's;@40_PSM_RULES@;'${UDEVDIR}'/rules.d/40-psm.rules;g' ${OUTDIR}/${RPM_NAME}.spec; \
	else \
		sed -i 's;@40_PSM_RULES@;'${UDEV_40_PSM_RULES}';g' ${OUTDIR}/${RPM_NAME}.spec; \
	fi

# We can't totally prevent two make dist calls in a row from packaging
# the previous make dist, unless we switch to using a dedicated ./src folder.
# That will come in the next major revision of the Makefile; for now we can
# prevent the easy and default cases.
#
# Notes on PRUNE_LIST:
# To make the dist, we always eliminate the psm_hal_MOCK dir.
# We also eliminate the psm hal instances that are not enabled via the
# PSM_HAL_ENABLE variable.
# To implement this, we build the prune list in two passes:
# 1. The first pass includes all of the common items we want to exclude.
# 2. In the second pass we include the difference of
#    (all of the PSM HAL instances) minus (the PSM hal instances that are enabled).
# The final prune list is supplied to find, and the dist is created.
dist: distclean
	mkdir -p ${OUTDIR}/${DIST}
	PRUNE_LIST=""; \
	for pd in ".git" "cscope*" "$(shell realpath --relative-to=${top_srcdir} ${OUTDIR})" \
		"*.orig" "*~" "#*" ".gitignore" "doc" "libcm" "psm.supp" "test" "psm_hal_MOCK" \
		"psm_test" "tools" "artifacts" "*.rej.patch"; do \
		PRUNE_LIST="$$PRUNE_LIST -name $$pd -prune -o"; \
	done; \
	for hid in psm_hal_* ; do \
		found=0; \
		for ehid in $(PSM_HAL_ENABLE_D) ; do \
			if [ "$$hid" = "$$ehid" ]; then \
				found=1; \
				break; \
			fi; \
		done; \
		if [ $$found -eq 0 ]; then \
			PRUNE_LIST="$$PRUNE_LIST -name $$hid -prune -o"; \
		fi; \
	done; \
	for x in $$(/usr/bin/find . \
		$$PRUNE_LIST \
		-print); do \
		dir=$$(dirname $$x); \
		mkdir -p ${OUTDIR}/${DIST}/$$dir; \
		[ ! -d $$x ] && cp $$x ${OUTDIR}/${DIST}/$$dir; \
	done
	if [ $(ISGIT) = 1 ] ; then git log -n1 --pretty=format:%H . > ${OUTDIR}/${DIST}/COMMIT ; fi
	echo ${RELEASE} > ${OUTDIR}/${DIST}/rpm_release_extension
	cd ${OUTDIR}; tar czvf ${DIST}.tar.gz ${DIST}
	@echo "${DIST}.tar.gz is located in ${OUTDIR}/${DIST}.tar.gz"

ofeddist:
	$(MAKE) -j $(nthreads) dist

# rebuild the cscope database, skipping sccs files, done once for
# top level
cscope:
	find * -type f ! -name '[ps].*' \( -iname '*.[cfhs]' -o \
		-iname \\*.cc -o -name \\*.cpp -o -name \\*.f90 \) -print | cscope -bqu -i -

sources-checksum:
	@echo ${SOURCES_CHKSUM_VALUE}

${TARGLIB}-objs := ptl_am/am_reqrep_shmem.o \
	ptl_am/am_reqrep.o \
	ptl_am/ptl.o \
	ptl_am/cmarwu.o \
	ptl_am/am_cuda_memhandle_cache.o \
	psm_context.o \
	psm_ep.o \
	psm_ep_connect.o \
	psm_error.o \
	psm_utils.o \
	psm_sysbuf.o \
	psm_timer.o \
	psm_am.o \
	psm_mq.o \
	psm_mq_utils.o \
	psm_mq_recv.o \
	psm_mpool.o \
	psm_stats.o \
	psm_memcpy.o \
	psm_mock.o \
	psm.o \
	psm_perf.o \
	libuuid/psm_uuid.o \
	libuuid/parse.o \
	libuuid/pack.o \
	libuuid/unpack.o \
	libuuid/unparse.o \
	ptl_ips/ptl.o \
	ptl_ips/ptl_rcvthread.o \
	ptl_ips/ips_scb.o \
	ptl_ips/ips_epstate.o \
	ptl_ips/ips_recvq.o \
	ptl_ips/ips_recvhdrq.o \
	ptl_ips/ips_proto.o \
	ptl_ips/ips_proto_recv.o \
	ptl_ips/ips_proto_connect.o \
	ptl_ips/ips_proto_expected.o \
	ptl_ips/ips_tid.o \
	ptl_ips/ips_tidcache.o \
	ptl_ips/ips_tidflow.o \
	ptl_ips/ips_crc32.o \
	ptl_ips/ips_proto_dump.o \
	ptl_ips/ips_proto_mq.o \
	ptl_ips/ips_proto_am.o \
	ptl_ips/ips_path_rec.o \
	ptl_ips/ips_opp_path_rec.o \
	ptl_ips/ips_writehdrq.o \
	ptl_self/ptl.o \
	opa/*.o \
	psm_diags.o \
	psm2_hal.o \
	$(PSM_HAL_INSTANCE_OBJFILES) \
	psmi_wrappers.o

${TARGLIB}-objs := $(patsubst %.o, ${OUTDIR}/%.o, ${${TARGLIB}-objs})
DEPS := $(${TARGLIB}-objs:.o=.d)
-include $(DEPS)

${OUTDIR}/${TARGLIB}.so: ${OUTDIR}/${TARGLIB}.so.${MAJOR}
	ln -fs ${TARGLIB}.so.${MAJOR}.${MINOR} $@

${OUTDIR}/${TARGLIB}.so.${MAJOR}: ${OUTDIR}/${TARGLIB}.so.${MAJOR}.${MINOR}
	ln -fs ${TARGLIB}.so.${MAJOR}.${MINOR} $@

# when we build the shared library, generate a revision and date
# string in it, for easier identification when people may have copied the
# file around. Generate it such that the ident command can find it
# and strings -a | grep OPA does a reasonable job as well.
$(OUTDIR)/${TARGLIB}.so.${MAJOR}.${MINOR}: ${${TARGLIB}-objs} $(LINKER_SCRIPT_FILE)
	echo "char psmi_hfi_IFS_version[]=\"`printenv RELEASE_TAG`\";" > ${OUTDIR}/_revision.c
	date -u -d@$${SOURCE_DATE_EPOCH:-$$(date +%s)} +'char psmi_hfi_build_timestamp[] ="%F %T%:z";' >> ${OUTDIR}/_revision.c
	echo "char psmi_hfi_sources_checksum[] =\"${SOURCES_CHKSUM_VALUE}\";" >> ${OUTDIR}/_revision.c
	echo "char psmi_hfi_git_checksum[] =\"`git rev-parse HEAD`\";" >> ${OUTDIR}/_revision.c
	$(CC) -c $(CFLAGS) $(BASECFLAGS) $(INCLUDES) ${OUTDIR}/_revision.c -o $(OUTDIR)/_revision.o
	$(CC) $(LINKER_SCRIPT) $(LDFLAGS) -o $@ -Wl,-soname=${TARGLIB}.so.${MAJOR} -shared \
		${${TARGLIB}-objs} $(OUTDIR)/_revision.o $(LDLIBS)

$(OUTDIR)/${TARGLIB}.a: $(OUTDIR)/${TARGLIB}.so.${MAJOR}.${MINOR}
	$(AR) rcs $(OUTDIR)/${TARGLIB}.a ${${TARGLIB}-objs} $(OUTDIR)/_revision.o

${OUTDIR}/%.o: ${top_srcdir}/%.c
	$(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -MMD -c $< -o $@

$(LINKER_SCRIPT_FILE): psm2_linker_script_map.in
	sed "s/_psm2_additional_globals_;/$(PSM2_ADDITIONAL_GLOBALS)/" \
		psm2_linker_script_map.in > ${OUTDIR}/psm2_linker_script.map

.PHONY: all %_clean clean config mock debug distclean symlinks cleanlinks install specfile dist ofeddist cscope sources-checksum

==> opa-psm2-PSM2_11.2.185/README <==

This file is provided under a dual BSD/GPLv2 license.  When using or redistributing this file, you may do so under either license.

GPL LICENSE SUMMARY

Copyright(c) 2017 Intel Corporation.

This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.

Contact Information:
Intel Corporation, www.intel.com

BSD LICENSE

Copyright(c) 2017 Intel Corporation.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

  * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
  * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
  * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Copyright (c) 2003-2017 Intel Corporation. All rights reserved.

================================================================================

ABSTRACT
--------

Discusses how to build, install and test the PSM2 library source code.

Contains the following sections:
- INTRODUCTION
- DEPENDENCIES
- BUILDING
  * BUILDING USING MAKEFILES
  * BUILDING USING RPMBUILD (CREATING SOURCE AND BINARY RPM'S)
- INSTALLING
  * INSTALLING USING MAKEFILE
  * INSTALLING USING EITHER YUM OR DNF
- TESTING
- RELATED SOFTWARE TO PSM2
- SUPPORTING DOCUMENTATION

INTRODUCTION
============

This README file discusses how to build, install and test the PSM2 library source code.

The PSM2 library supports a number of fabric media and stacks, and all of them run on version 7.X of Red Hat Enterprise Linux (abbreviated: RHEL), and SuSE SLES. Only the x86_64 architecture is supported.

Building PSM2 is possible on RHEL 7.2+ as it ships with the hfi1 kernel driver. For older RHEL 7.x versions and SuSE SLES, OPA is not natively supported in the kernel and therefore, building PSM2 is not possible unless you have the correct kernel-devel package or use the latest versions of IFS.

There are two mechanisms for building and installing the PSM2 library:
1. Use the provided Makefiles to build and install, or
2. Generate the *.rpm files which you can then install using either the yum or dnf command

DEPENDENCIES
============

The following packages are required to build the PSM2 library source code (all packages are for the x86_64 architecture):

  compat-rdma-devel
  gcc-4.8.2
  glibc-devel
  glibc-headers
  kernel-headers

Additional packages for GPU Direct support include:

  NVIDIA CUDA toolkit 8.0 or greater. Older versions are not supported.
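As a quick illustration, on a RHEL host the build dependencies listed above could be pulled in, as root, with a command along these lines (illustrative only; the repository that provides compat-rdma-devel varies by setup):

   $ yum install compat-rdma-devel gcc glibc-devel glibc-headers kernel-headers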
In addition to depending on these packages, root privileges are required to install the runtime libraries and development header files into standard system locations.

BUILDING
========

The instructions below use $BASENAME, $PRODUCT and $RELEASE to refer to the base name of the tarball and RPM that will be generated, and the product and release identifiers of the RPM.

The base name of the RPM changes depending on which version/branch of code you derive the tar file from. Up until v10.2 of PSM2, the base name for the RPM is hfi1-psm. From v10.2 onwards, the base name will be libpsm2. The internal library remains unchanged and is still libpsm2.so.2.

BUILDING USING MAKEFILES
------------------------

1. Untar the tarball:
   $ tar zxvf $BASENAME-$PRODUCT-$RELEASE.tar.gz

2. Change directory into the untarred location:
   $ cd $BASENAME-$PRODUCT-$RELEASE

3. Build:

   3.1. To build with GNU C (gcc), run make on the command line:
        $ make
        - or -
        $ make CCARCH=gcc

   3.2. To build with Intel C (icc), specify the correct CCARCH:
        $ make CCARCH=icc

   3.3. To build with CUDA support, specify PSM_CUDA=1 on the command line along with the desired compiler:
        $ make PSM_CUDA=1 CCARCH=gcc
        - or -
        $ make PSM_CUDA=1 CCARCH=icc

BUILDING USING RPMBUILD
-----------------------

1. Run this command from your $PWD to generate rpm, srpm files:
   $ ./makesrpm.sh a

   This command results in the following collection of rpm's and source code rpm's under your $PWD/temp.X/ directory. ("X" is the pid of the bash script that created the srpm and rpm files.) (Result shown here for RHEL systems.)

   RPMS/x86_64/libpsm2-compat-10.3.7-1x86_64.rpm
   RPMS/x86_64/libpsm2-devel-10.3.7-1x86_64.rpm
   RPMS/x86_64/libpsm2-10.3.7-1x86_64.rpm
   RPMS/x86_64/libpsm2-debuginfo-10.3.7-1x86_64.rpm
   SRPMS/libpsm2-10.3.7-1.src.rpm

   1.1. Optionally, for GPU Direct support, run this command from your $PWD to generate rpm, srpm files:
        $ ./makesrpm.sh a -cuda

        This command results in the following collection of rpm's and source code rpm's under your $PWD/temp.X/ directory ("X" is the pid of the bash script that created the srpm and rpm files):

        RPMS/x86_64/libpsm2-10.3.7-1cuda.x86_64.rpm
        RPMS/x86_64/libpsm2-compat-10.3.7-1cuda.x86_64.rpm
        RPMS/x86_64/libpsm2-devel-10.3.7-1cuda.x86_64.rpm
        SRPMS/x86_64/libpsm2-10.3.7-1cuda.src.rpm

   On systems with SLES 12.3 or newer, the package name for the base libpsm2 RPM will be: libpsm2-2-10.3.7-1.x86_64.rpm
   Other supporting RPM package names will be as listed above.

2. To build rpm files from the srpm file with Intel C (icc), specify the correct CCARCH in the rpmbuild environment:
   $ env CCARCH=icc rpmbuild --rebuild SRPMS/libpsm2-10.3.7-1.src.rpm

INSTALLING
==========

INSTALLING USING MAKEFILE
-------------------------

Install the libraries and header files on the system (as root):
   $ make install

The libraries will be installed in /usr/lib64, and the header files will be installed in /usr/include.

This behavior can be altered by using the "DESTDIR" and "LIBDIR" variables on the "make install" command line. "DESTDIR" will add a leading path component to the overall install path and "LIBDIR" will change the path where libraries will be installed. For example, "make DESTDIR=/tmp/psm-install install" will install all files (libraries and headers) into "/tmp/psm-install/usr/...", "make DESTDIR=/tmp/psm-install LIBDIR=/libraries install" will install the libraries in "/tmp/psm-install/libraries" and the headers in "/tmp/psm-install/usr/include", and "make LIBDIR=/tmp/libs install" will install the libraries in "/tmp/libs" and the headers in "/usr/include".
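As a concrete sketch of the staged-install variant described above (the staging directory is arbitrary):

   $ make DESTDIR=/tmp/psm-install LIBDIR=/usr/lib64 install
   $ find /tmp/psm-install -name 'libpsm2.so*'

The second command simply lists the staged libraries so the layout can be verified before copying or packaging them for the target system.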
INSTALLING USING EITHER YUM OR DNF
----------------------------------

You can install the rpm's and source rpm's previously built using rpmbuild with either the yum or dnf command as the root user. See the appropriate man page for details of installing rpm's.

Note: It is also possible to use the rpm command to install rpm's, but it is recommended that one use yum/dnf, as the rpm tool has issues with name changes and obsoletes tags. yum or dnf should be better able to resolve dependency issues.

RELATED SOFTWARE TO PSM2
========================

MPI Libraries supported
-----------------------

A large number of open source (Open MPI, MVAPICH2) and vendor MPI implementations support PSM2 for optimized communication on HCAs. Vendor MPI implementations (HP-MPI, Intel MPI 4.0 with PMI, Platform/Scali MPI) require that the PSM2 runtime libraries be installed and available on each node. Usually a configuration file or a command line switch to mpirun needs to be specified to utilize the PSM2 transport.

Open MPI support
----------------

If using a version of Open MPI that is not packaged within an IFS release, it is required to use at least v1.10.4. Older versions are not supported. Since v1.10.4 is not in active development, it is further recommended to use upstream versions v2.1.2 or newer.

If NVIDIA* CUDA* support is desired, you can use Open MPI built with CUDA* support provided by Intel in the IFS installer 10.4 or newer. This Open MPI build is identified with the "-cuda-hfi" tag to the Open MPI base version name. The NVIDIA* CUDA* support changes have also been accepted into the v2.1.3, v3.0.1 and v3.1.0 branches of the upstream Open MPI repository.

PSM2 header and runtime files need to be installed on a node where the Open MPI build is performed. All compute nodes additionally should have the PSM2 runtime libraries available on them. Open MPI provides a standard configure, make and make install mechanism which will detect and build the relevant PSM2 network modules for Open MPI once the header and runtime files are detected.

MVAPICH2 support
----------------

MVAPICH2 supports the PSM2 transport for optimized communication on HFI hardware. OPA IFS supports MVAPICH2 v2.1 (or later). PSM2 header and runtime files need to be installed on a node where MVAPICH2 builds are performed. All compute nodes should also have the PSM2 runtime libraries available on them.

For building and installing MVAPICH2 with OPA support, refer to the MVAPICH2 user guides here: http://mvapich.cse.ohio-state.edu/userguide/ (Note: Support for PSM2 is included in v2.2 and newer)

OFED Support
------------

Intel OPA is not yet included within OFED, but the hfi1 driver is available publicly at kernel.org.

SUPPORTING DOCUMENTATION
------------------------

The PSM2 Programmer's Guide is published along with documentation for "Intel® Omni-Path Host Fabric Interface PCIe Adapter 100 Series" (https://www.intel.com/content/www/us/en/support/articles/000016242/network-and-i-o/fabric-products.html). Refer to this document for a description of the APIs and environment variables that are available for use. For sample code on writing applications leveraging the PSM2 APIs, refer to Section 5.

PSM Compatibility Support
-------------------------

libpsm2-compat supports applications that use the PSM API instead of the PSM2 API, through a compatibility library. This library is an interface between PSM applications and the PSM2 API. If the system has an application that is coded to use PSM and has requirements to use PSM2 (i.e. the host has Omni-Path hardware), the compatibility library must be used.

Please refer to your operating system's documentation to find how to modify the order in which system directories are searched for dynamic libraries. The libpsm2-compat version of libpsm_infinipath.so.1 must be earlier on the search path than that of libpsm_infinipath. Doing so allows applications coded to PSM to transparently use the PSM2 API and devices which require it.

Please note that the installation path for the libpsm2-compat version of libpsm_infinipath.so.1 will differ depending on your operating system specifics. Common locations include:
- /usr/lib64/psm2-compat/
- /usr/lib/psm2-compat/

==> opa-psm2-PSM2_11.2.185/buildflags.mak <==
#
# This file is provided under a dual BSD/GPLv2 license.  When using or
# redistributing this file, you may do so under either license.
#
# GPL LICENSE SUMMARY
#
# Copyright(c) 2016 Intel Corporation.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# Contact Information:
# Intel Corporation, www.intel.com
#
# BSD LICENSE
#
# Copyright(c) 2016 Intel Corporation.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in
#     the documentation and/or other materials provided with the
#     distribution.
#   * Neither the name of Intel Corporation nor the names of its
#     contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Copyright (c) 2003-2016 Intel Corporation. All rights reserved.
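# (Usage sketch, grounded in the note just below: a consumer makefile
#  sets top_srcdir and then includes this fragment, e.g. the top-level
#  Makefile does
#    top_srcdir := $(shell readlink -m .)
#    include $(top_srcdir)/buildflags.mak
#  The ifeq guard that follows errors out if top_srcdir is unset.)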
#
# set top_srcdir and include this file
ifeq (,$(top_srcdir))
$(error top_srcdir must be set to include makefile fragment)
endif

export os ?= $(shell uname -s | tr '[A-Z]' '[a-z]')
export arch := $(shell uname -m | sed -e 's,\(i[456]86\|athlon$$\),i386,')

ifeq (${CCARCH},$(filter ${CCARCH},gcc gcc4 icc clang))
export CC := ${CCARCH}
else
anerr := $(error Unknown C compiler arch: ${CCARCH})
endif

ifeq (${FCARCH},gfortran)
export FC := gfortran
else
anerr := $(error Unknown Fortran compiler arch: ${FCARCH})
endif # gfortran

BASECFLAGS := $(BASE_FLAGS) -pthread
LDFLAGS += $(BASE_FLAGS)
ASFLAGS += $(BASE_FLAGS)

ifeq ($(PSM2_MOCK_TESTING),1)
BASECFLAGS += -DPSM2_MOCK_TESTING=1
unexport LINKER_SCRIPT
# We skip the linker script for the mock testing version; we want all symbols
# to be reachable from outside the library
else
LINKER_SCRIPT := -Wl,--version-script $(LINKER_SCRIPT_FILE)
endif

WERROR := -Werror
INCLUDES := -I$(top_srcdir)/include -I$(top_srcdir)/mpspawn -I$(top_srcdir)/include/$(os)-$(arch)

#
# use IFS provided hfi1_user.h if installed.
#
IFS_HFI_HEADER_PATH := /usr/include/uapi
INCLUDES += -I${IFS_HFI_HEADER_PATH}

BASECFLAGS += -Wall $(WERROR)

#
# test if compiler supports 32B(AVX2)/64B(AVX512F) move instruction.
#
ifeq (${CC},icc)
ifeq ($(PSM_DISABLE_AVX2),)
MAVX2 = -xATOM_SSE4.2 -DPSM_AVX512
else
MAVX2 = -march=core-avx-i
endif
else
ifeq ($(PSM_DISABLE_AVX2),)
MAVX2 = -mavx2
else
MAVX2 = -mavx
endif
endif

ifneq (icc,${CC})
ifeq ($(PSM_DISABLE_AVX2),)
RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX2 ; echo $$?)
else
RET := $(shell echo "int main() {}" | ${CC} ${MAVX2} -E -dM -xc - 2>&1 | grep -q AVX ; echo $$?)
$(warning ***NOTE TO USER**** Disabling AVX2 will harm performance)
endif
ifeq (0,${RET})
BASECFLAGS += ${MAVX2}
else
$(error Compiler does not support ${MAVX2})
endif
else
BASECFLAGS += ${MAVX2}
endif

# This support is dynamic at runtime, so it is OK to enable as long as the
# compiler can generate the code.
ifneq (,${PSM_AVX512})
ifneq (icc,${CC})
RET := $(shell echo "int main() {}" | ${CC} -mavx512f -E -dM -xc - 2>&1 | grep -q AVX512 ; echo $$?)
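# (The probe above asks the compiler to preprocess a trivial program and
#  dump its predefined macros, then greps for an AVX512 define. Run by
#  hand with gcc it looks like:
#    $ echo 'int main() {}' | gcc -mavx512f -E -dM -xc - | grep AVX512F
#    #define __AVX512F__ 1
#  RET captures grep's exit status, so 0 below means the compiler can
#  generate AVX512 code.)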
ifeq (0,${RET}) BASECFLAGS += -mavx512f else $(error Compiler does not support AVX512 ) endif BASECFLAGS += -DPSM_AVX512 endif endif # # feature test macros for drand48_r # BASECFLAGS += -D_DEFAULT_SOURCE -D_SVID_SOURCE -D_BSD_SOURCE ifneq (,${HFI_BRAKE_DEBUG}) BASECFLAGS += -DHFI_BRAKE_DEBUG endif ifneq (,${PSM_FI}) BASECFLAGS += -DPSM_FI endif ifneq (,${PSM_DEBUG}) BASECFLAGS += -O -g3 -DPSM_DEBUG -D_HFI_DEBUGGING -funit-at-a-time -Wp,-D_FORTIFY_SOURCE=2 else BASECFLAGS += -O3 -g3 endif ifneq (,${PSM_COVERAGE}) # This check must come after PSM_DEBUG to override optimization setting BASECFLAGS += -O -fprofile-arcs -ftest-coverage LDFLAGS += -fprofile-arcs endif ifneq (,${PSM_LOG}) BASECFLAGS += -DPSM_LOG ifneq (,${PSM_LOG_FAST_IO}) BASECFLAGS += -DPSM_LOG_FAST_IO PSM2_ADDITIONAL_GLOBALS += psmi_log_fini;psmi_log_message; endif endif ifneq (,${PSM_PERF}) BASECFLAGS += -DRDPMC_PERF_FRAMEWORK endif ifneq (,${PSM_HEAP_DEBUG}) BASECFLAGS += -DPSM_HEAP_DEBUG PSM2_ADDITIONAL_GLOBALS += _psmi_heapdebug_val_heapallocs; endif ifneq (,${PSM_PROFILE}) BASECFLAGS += -DPSM_PROFILE endif BASECFLAGS += -DNVIDIA_GPU_DIRECT ifneq (,${PSM_CUDA}) BASECFLAGS += -DPSM_CUDA CUDA_HOME ?= /usr/local/cuda INCLUDES += -I$(CUDA_HOME)/include endif BASECFLAGS += -fpic -fPIC -D_GNU_SOURCE ASFLAGS += -g3 -fpic BASECFLAGS += ${OPA_CFLAGS} ifeq (${CCARCH},icc) BASECFLAGS += -fpic -fPIC -D_GNU_SOURCE -DPACK_STRUCT_STL=packed, LDFLAGS += -static-intel else LDFLAGS += -Wl,--build-id ifeq (${CCARCH},$(filter ${CCARCH},gcc clang)) BASECFLAGS += -funwind-tables -Wno-strict-aliasing -Wformat-security else ifneq (${CCARCH},gcc4) $(error Unknown compiler arch "${CCARCH}") endif # gcc4 endif # gcc endif # icc # We run export here to ensure all the above setup is in the environment # for sub makes. However, we exclude this during clean and distclean # to avoid resolution of some variables that don't need to be resolved # and avoid unnecessary missing file warnings during cleanup. ifneq ($(MAKECMDGOALS), clean) ifneq ($(MAKECMDGOALS), distclean) export endif endif opa-psm2-PSM2_11.2.185/compat/000077500000000000000000000000001370564314600155045ustar00rootroot00000000000000opa-psm2-PSM2_11.2.185/compat/40-psm-compat.rules000066400000000000000000000045101370564314600210610ustar00rootroot00000000000000# # This file is provided under a dual BSD/GPLv2 license. When using or # redistributing this file, you may do so under either license. # # GPL LICENSE SUMMARY # # Copyright(c) 2015 Intel Corporation. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # Contact Information: # Intel Corporation, www.intel.com # # BSD LICENSE # # Copyright(c) 2015 Intel Corporation. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. 
# * Neither the name of Intel Corporation nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # KERNEL=="hfi1", SYMLINK+="ipath" KERNEL=="hfi1_[0-9]", MODE="0666", SYMLINK+="ipath" opa-psm2-PSM2_11.2.185/compat/Makefile000066400000000000000000000071721370564314600171530ustar00rootroot00000000000000# # This file is provided under a dual BSD/GPLv2 license. When using or # redistributing this file, you may do so under either license. # # GPL LICENSE SUMMARY # # Copyright(c) 2015 Intel Corporation. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # Contact Information: # Intel Corporation, www.intel.com # # BSD LICENSE # # Copyright(c) 2015 Intel Corporation. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # * Neither the name of Intel Corporation nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # OUTDIR = . COMPATLIB := libpsm_infinipath COMPAT_LIB_TARG := $(INSTALL_LIB_TARG)/psm2-compat compat_build_dir := $(shell readlink -m .) 
MAJOR := $(shell sed -n 's/^\#define.*PSM2_VERNO_COMPAT_MAJOR.*0x0\?\([1-9a-f]\?[0-9a-f]\+\).*/\1/p' ../psm2.h) top_srcdir := $(compat_build_dir)/.. include $(compat_build_dir)/buildflags.mak INCLUDES += -I$(top_srcdir) ${COMPATLIB}-objs := psm-compat.o ${COMPATLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${COMPATLIB}-objs}) DEPS:= $(${COMPATLIB}-objs:.o=.d) -include $(DEPS) all .DEFAULT: ${${COMPATLIB}-objs} $(OUTDIR)/${COMPATLIB}.so.${MAJOR} install: all install -m 0644 -D 40-psm-compat.rules ${DESTDIR}$(UDEVDIR)/rules.d/40-psm-compat.rules install -m 0644 -D libpsm2-compat.conf ${DESTDIR}${LIBPSM2_COMPAT_CONF_DIR}/modprobe.d/libpsm2-compat.conf install -m 0755 -D libpsm2-compat.cmds ${DESTDIR}/usr/lib/libpsm2/libpsm2-compat.cmds install -D $(OUTDIR)/${COMPATLIB}.so.${MAJOR} ${DESTDIR}${COMPAT_LIB_TARG}/${COMPATLIB}.so.${MAJOR} $(OUTDIR)/%.o: $(compat_build_dir)/%.c $(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -MMD -c $< -o $@ $(OUTDIR)/${COMPATLIB}.so.${MAJOR}: ${${COMPATLIB}-objs} $(CC) $(BASECFLAGS) $(LINKER_SCRIPT) $(LDFLAGS) -Wl,-soname=${COMPATLIB}.so.${MAJOR} -shared \ -L$(OUTDIR)/.. ${${COMPATLIB}-objs} -lpsm2 -o $@ clean: @if [ -d $(OUTDIR) ]; then \ cd $(OUTDIR); \ rm -f *.o *.d *.gcda *.gcno ${COMPATLIB}.*; \ cd -; \ fi opa-psm2-PSM2_11.2.185/compat/buildflags.mak000066400000000000000000000064031370564314600203150ustar00rootroot00000000000000# # This file is provided under a dual BSD/GPLv2 license. When using or # redistributing this file, you may do so under either license. # # GPL LICENSE SUMMARY # # Copyright(c) 2015 Intel Corporation. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # Contact Information: # Intel Corporation, www.intel.com # # BSD LICENSE # # Copyright(c) 2015 Intel Corporation. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # * Neither the name of Intel Corporation nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ifeq (,$(top_srcdir)) $(error top_srcdir must be set to include makefile fragment) endif export os ?= $(shell uname -s | tr '[A-Z]' '[a-z]') export arch := $(shell uname -m | sed -e 's,\(i[456]86\|athlon$$\),i386,') export CCARCH ?= gcc ifeq (${CCARCH},$(filter ${CCARCH},gcc gcc4 icc clang)) export CC := ${CCARCH} else anerr := $(error Unknown C compiler arch: ${CCARCH}) endif BASECFLAGS += $(BASE_FLAGS) LDFLAGS += $(BASE_FLAGS) ASFLAGS += $(BASE_FLAGS) LINKER_SCRIPT_FILE := psm2_compat_linker_script.map LINKER_SCRIPT := -Wl,--version-script $(LINKER_SCRIPT_FILE) WERROR := -Werror INCLUDES := -I$(top_srcdir)/include -I$(top_srcdir)/include/$(os)-$(arch) -I$(top_srcdir)/mpspawn BASECFLAGS +=-Wall $(WERROR) BASECFLAGS += -fpic -fPIC ASFLAGS += -g3 -fpic ifeq (${CCARCH},icc) BASECFLAGS += -O3 -g3 LDFLAGS += -static-intel else ifeq (${CCARCH},$(filter ${CCARCH},gcc clang)) BASECFLAGS += -Wno-strict-aliasing else ifneq (${CCARCH},gcc4) $(error Unknown compiler arch "${CCARCH}") endif endif endif opa-psm2-PSM2_11.2.185/compat/libpsm2-compat.cmds000077500000000000000000000053371370564314600212200ustar00rootroot00000000000000#!/bin/sh # # This file is provided under a dual BSD/GPLv2 license. When using or # redistributing this file, you may do so under either license. # # GPL LICENSE SUMMARY # # Copyright(c) 2015 Intel Corporation. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # Contact Information: # Intel Corporation, www.intel.com # # BSD LICENSE # # Copyright(c) 2015 Intel Corporation. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # * Neither the name of Intel Corporation nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # This script was created to allow for both an hfi1 and qib adapter # to co-exist on the same machine. # The symlink from /dev/ipath is removed to allow ib_qib to load # correctly and create a proper device file. case "$1" in start) # Remove symlink if hfi1 was loaded first if [ -L "/dev/ipath" ]; then rm /dev/ipath fi ;; stop) # Restore symlink if hfi1 is loaded if [ -e "/dev/hfi1" ] && ! [ -L "/dev/ipath" ]; then ln -s /dev/hfi1 /dev/ipath fi ;; esac opa-psm2-PSM2_11.2.185/compat/libpsm2-compat.conf000066400000000000000000000046401370564314600212100ustar00rootroot00000000000000# # This file is provided under a dual BSD/GPLv2 license. When using or # redistributing this file, you may do so under either license. # # GPL LICENSE SUMMARY # # Copyright(c) 2015 Intel Corporation. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # Contact Information: # Intel Corporation, www.intel.com # # BSD LICENSE # # Copyright(c) 2015 Intel Corporation. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # * Neither the name of Intel Corporation nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
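The udev rule, the device-node script above, and the modprobe hooks that follow exist so that binaries built against the original PSM API keep running unchanged on PSM2 systems. At the source level such a binary is plain PSM1 code; the sketch below is illustrative only (psm.h is the original PSM1 header, assumed to be installed separately, and is not part of this tree):

/* psm1_hello.c - minimal PSM1 client, a sketch only.
 * Build: cc psm1_hello.c -o psm1_hello -lpsm_infinipath */
#include <stdio.h>
#include <psm.h>        /* original PSM1 API header (assumed available) */

int main(void)
{
        int major = PSM_VERNO_MAJOR;
        int minor = PSM_VERNO_MINOR;

        /* With the compat library first on the search path this lands in
         * psm_init() in compat/psm-compat.c, which forwards to psm2_init(). */
        if (psm_init(&major, &minor) != PSM_OK) {
                fprintf(stderr, "psm_init failed\n");
                return 1;
        }
        printf("PSM ready, API version %d.%d\n", major, minor);
        psm_finalize();
        return 0;
}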
# install ib_qib /usr/lib/libpsm2/libpsm2-compat.cmds start; modprobe -i ib_qib $CMDLINE_OPTS remove ib_qib modprobe -r -i ib_qib && /usr/lib/libpsm2/libpsm2-compat.cmds stop opa-psm2-PSM2_11.2.185/compat/psm-compat.c000066400000000000000000000207601370564314600177350ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include <stdint.h> #include "../psm2.h" #include "../psm2_mq.h" #include "../psm2_am.h" /* Functions from TS psm.h */ psm2_error_t psm_init(int *major, int *minor) { return psm2_init(major, minor); } psm2_error_t psm_finalize(void) { return psm2_finalize(); } psm2_error_t psm_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames) { return psm2_map_nid_hostname(num, nids, hostnames); } void psm_epaddr_setlabel(psm2_epaddr_t epaddr, char const *epaddr_label) { return psm2_epaddr_setlabel(epaddr, epaddr_label); } void psm_epaddr_setctxt(psm2_epaddr_t epaddr, void *ctxt) { psm2_epaddr_setctxt(epaddr, ctxt); } void * psm_epaddr_getctxt(psm2_epaddr_t epaddr) { return psm2_epaddr_getctxt(epaddr); } psm2_error_t psm_setopt(psm2_component_t component, const void *component_obj, int optname, const void *optval, uint64_t optlen) { return psm2_setopt(component, component_obj, optname, optval, optlen); } psm2_error_t psm_getopt(psm2_component_t component, const void *component_obj, int optname, void *optval, uint64_t *optlen) { return psm2_getopt(component, component_obj, optname, optval, optlen); } psm2_error_t psm_poll(psm2_ep_t ep) { return psm2_poll(ep); } void psm_uuid_generate(psm2_uuid_t uuid_out) { psm2_uuid_generate(uuid_out); } /* Functions from TS psm_am.h */ psm2_error_t psm_am_register_handlers(psm2_ep_t ep, const psm2_am_handler_fn_t *handlers, int num_handlers, int *handlers_idx) { return psm2_am_register_handlers(ep, handlers, num_handlers, handlers_idx); } psm2_error_t psm_am_request_short(psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, int flags, psm2_am_completion_fn_t completion_fn, void *completion_ctxt) { return psm2_am_request_short(epaddr, handler, args, nargs, src, len, flags, completion_fn, completion_ctxt); } psm2_error_t psm_am_reply_short(psm2_am_token_t token, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, int flags, psm2_am_completion_fn_t completion_fn, void *completion_ctxt) { return psm2_am_reply_short(token, handler, args, nargs, src, len, flags, completion_fn, completion_ctxt); } psm2_error_t psm_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters, size_t sizeof_parameters_in, size_t *sizeof_parameters_out) { return psm2_am_get_parameters(ep, parameters, sizeof_parameters_in, sizeof_parameters_out); } /* Functions from TS psm_error.h */ psm2_error_t psm_error_defer(psm2_error_token_t token) { return psm2_error_defer(token); } psm2_error_t psm_error_register_handler(psm2_ep_t ep, const psm2_ep_errhandler_t errhandler) { return psm2_error_register_handler(ep, errhandler); } const char * psm_error_get_string(psm2_error_t error) { return psm2_error_get_string(error); } /* Functions from TS psm_mq.h */ psm2_error_t psm_mq_iprobe(psm2_mq_t mq, uint64_t tag, uint64_t tagsel, psm2_mq_status_t *status) { return psm2_mq_iprobe(mq, tag, tagsel, status); } psm2_error_t psm_mq_cancel(psm2_mq_req_t *ireq) { return psm2_mq_cancel(ireq); } psm2_error_t psm_mq_wait(psm2_mq_req_t *ireq, psm2_mq_status_t *status) { return psm2_mq_wait(ireq, status); } psm2_error_t psm_mq_test(psm2_mq_req_t *ireq, psm2_mq_status_t *status) { return psm2_mq_test(ireq, status); } psm2_error_t psm_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, const void *buf, uint32_t len, void *context, psm2_mq_req_t *req) { return psm2_mq_isend(mq, dest, flags, stag, buf, len, context, req); } psm2_error_t psm_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, 
const void *buf, uint32_t len) { return psm2_mq_send(mq, dest, flags, stag, buf, len); } psm2_error_t psm_mq_irecv(psm2_mq_t mq, uint64_t tag, uint64_t tagsel, uint32_t flags, void *buf, uint32_t len, void *context, psm2_mq_req_t *reqo) { return psm2_mq_irecv(mq, tag, tagsel, flags, buf, len, context, reqo); } psm2_error_t psm_mq_ipeek(psm2_mq_t mq, psm2_mq_req_t *oreq, psm2_mq_status_t *status) { return psm2_mq_ipeek(mq, oreq, status); } psm2_error_t psm_mq_getopt(psm2_mq_t mq, int key, void *value) { return psm2_mq_getopt(mq, key, value); } psm2_error_t psm_mq_setopt(psm2_mq_t mq, int key, const void *value) { return psm2_mq_setopt(mq, key, value); } psm2_error_t psm_mq_init(psm2_ep_t ep, uint64_t ignored, const struct psm2_optkey *opts, int numopts, psm2_mq_t *mqo) { return psm2_mq_init(ep, ignored, opts, numopts, mqo); } psm2_error_t psm_mq_finalize(psm2_mq_t mq) { return psm2_mq_finalize(mq); } void psm_mq_get_stats(psm2_mq_t mq, psm2_mq_stats_t *stats) { psm2_mq_get_stats(mq, stats); } /* Functions from TS psm_mq.h */ psm2_error_t psm_ep_num_devunits(uint32_t *num_units_o) { return psm2_ep_num_devunits(num_units_o); } uint64_t psm_epid_nid(psm2_epid_t epid) { return psm2_epid_nid(epid); } uint64_t psm_epid_context(psm2_epid_t epid) { return psm2_epid_context(epid); } uint64_t psm_epid_port(psm2_epid_t epid) { return psm2_epid_port(epid); } psm2_error_t psm_ep_query (int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo) { return psm2_ep_query (num_of_epinfo, array_of_epinfo); } psm2_error_t psm_ep_epid_lookup (psm2_epid_t epid, psm2_epconn_t *epconn) { return psm2_ep_epid_lookup (epid, epconn); } psm2_error_t psm_ep_epid_share_memory(psm2_ep_t ep, psm2_epid_t epid, int *result_o) { return psm2_ep_epid_share_memory(ep, epid, result_o); } psm2_error_t psm_ep_open_opts_get_defaults(struct psm2_ep_open_opts *opts) { return psm2_ep_open_opts_get_defaults(opts); } psm2_error_t psm_ep_open(psm2_uuid_t const unique_job_key, struct psm2_ep_open_opts const *opts_i, psm2_ep_t *epo, psm2_epid_t *epido) { return psm2_ep_open(unique_job_key, opts_i, epo, epido); } psm2_error_t psm_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in) { return psm2_ep_close(ep, mode, timeout_in); } psm2_error_t psm_ep_connect(psm2_ep_t ep, int num_of_epid, psm2_epid_t const *array_of_epid, int const *array_of_epid_mask, psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr, int64_t timeout) { return psm2_ep_connect(ep, num_of_epid, array_of_epid, array_of_epid_mask, array_of_errors, array_of_epaddr, timeout); } opa-psm2-PSM2_11.2.185/compat/psm2_compat_linker_script.map000066400000000000000000000050631370564314600233630ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* See http://sourceware.org/binutils/docs/ld/VERSION.html#VERSION for more info. C++ // Comments don't work in this file. */ PSM_1.0 { /* Expose only those symbols we choose to. This way we do not pollute users namespace more than absolutely necessary. */ global: psm_*; /* Make all other symbols local */ local: *; }; opa-psm2-PSM2_11.2.185/debian/000077500000000000000000000000001370564314600154435ustar00rootroot00000000000000opa-psm2-PSM2_11.2.185/debian/changelog.in000066400000000000000000000004551370564314600177260ustar00rootroot00000000000000libpsm2 (10.2.91) UNRELEASED; urgency=medium * Add Ubuntu support -- Tymoteusz Kielan Thu, 08 Dec 2016 11:49:12 +0100 hfi1-psm (0.7) UNRELEASED; urgency=medium * Initial release -- Brian T. Smith Mon, 14 Mar 2016 12:26:35 -0500 opa-psm2-PSM2_11.2.185/debian/changelog.tmpl000066400000000000000000000004551370564314600202740ustar00rootroot00000000000000libpsm2 (10.2.91) UNRELEASED; urgency=medium * Add Ubuntu support -- Tymoteusz Kielan Thu, 08 Dec 2016 11:49:12 +0100 hfi1-psm (0.7) UNRELEASED; urgency=medium * Initial release -- Brian T. Smith Mon, 14 Mar 2016 12:26:35 -0500 opa-psm2-PSM2_11.2.185/debian/compat000066400000000000000000000000021370564314600166410ustar00rootroot000000000000009 opa-psm2-PSM2_11.2.185/debian/control000066400000000000000000000020211370564314600170410ustar00rootroot00000000000000Source: libpsm2 Maintainer: Tymoteusz Kielan Section: libs Priority: optional Standards-Version: 3.9.8 Build-Depends: debhelper (>= 9), uuid-dev, libnuma-dev Package: libpsm2 Architecture: linux-any Depends: ${misc:Depends}, ${shlibs:Depends} Description: Intel PSM2 library PSM2 is Intel's low-level user-space communications interface for the Intel(R) OPA family of products. PSM2 users are enabled with mechanisms necessary to implement higher level communications interfaces in parallel environments. 
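The PSM_1.0 version script above exports only the psm_* wrappers from the compat library and makes every other symbol local. That visibility can be probed at runtime; the program below is an illustrative sketch, not part of this tree (note that dlsym() on a handle also searches the handle's dependencies, so psm2_* names are still reachable through the libpsm2.so.2 that the compat library links against):

/* check_exports.c - sketch only.
 * Build: cc check_exports.c -o check_exports -ldl */
#include <dlfcn.h>
#include <stdio.h>

static void probe(void *h, const char *name)
{
        printf("%-20s %s\n", name, dlsym(h, name) ? "resolvable" : "not found");
}

int main(void)
{
        void *h = dlopen("libpsm_infinipath.so.1", RTLD_NOW);
        if (h == NULL) {
                fprintf(stderr, "%s\n", dlerror());
                return 1;
        }
        probe(h, "psm_poll");   /* matches "global: psm_*" in the map */
        probe(h, "psm2_poll");  /* found via the libpsm2.so.2 dependency */
        probe(h, "psmi_log_message");   /* expected local unless built with
                                         * PSM_LOG_FAST_IO; see
                                         * PSM2_ADDITIONAL_GLOBALS in
                                         * buildflags.mak */
        dlclose(h);
        return 0;
}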
Package: libpsm2-dev Architecture: linux-any Section: libdevel Depends: ${misc:Depends}, libpsm2 (= ${binary:Version}), uuid-dev, libnuma-dev Description: Development files for Intel PSM2 library PSM2 is Intel's low-level user-space communications interface for the Intel(R) OPA family of products. PSM2 users are enabled with mechanisms necessary to implement higher level communications interfaces in parallel environments. This package contains the development headers for Intel PSM2 library. opa-psm2-PSM2_11.2.185/debian/copyright000066400000000000000000000032671370564314600174060ustar00rootroot00000000000000 /usr/share/common-licenses/GPL-2 Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2017 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Copyright(c) 2014-2017 Intel Corporation. All rights reserved. Copyright(c) 2016 System Fabric Works, Inc. All Rights Reserved. 
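The debian/symbols list below records every public entry point of libpsm2.so.2 under the PSM2_1.0 version tag, together with the first upstream version that provided it. A version-qualified lookup can be made explicit with glibc's dlvsym(); the file name and code below are an illustrative sketch only:

/* check_symver.c - versioned symbol lookup, sketch only.
 * Build: cc check_symver.c -o check_symver -ldl */
#define _GNU_SOURCE             /* dlvsym() is a GNU extension */
#include <dlfcn.h>
#include <stdio.h>

int main(void)
{
        void *h = dlopen("libpsm2.so.2", RTLD_NOW);
        if (h == NULL) {
                fprintf(stderr, "%s\n", dlerror());
                return 1;
        }
        /* Same tag as the "psm2_init@PSM2_1.0 10.2" entry in debian/symbols */
        void *f = dlvsym(h, "psm2_init", "PSM2_1.0");
        printf("psm2_init@PSM2_1.0 is %s\n", f ? "present" : "missing");
        dlclose(h);
        return 0;
}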
opa-psm2-PSM2_11.2.185/debian/libpsm2-dev.install000066400000000000000000000013361370564314600211620ustar00rootroot00000000000000/usr/lib/libpsm2.so /usr/include/psm2.h /usr/include/psm2_mq.h /usr/include/psm2_am.h /usr/include/hfi1diag/hfi1_deprecated_gen1.h /usr/include/hfi1diag/linux-x86_64/bit_ops.h /usr/include/hfi1diag/linux-x86_64/sysdep.h /usr/include/hfi1diag/opa_udebug.h /usr/include/hfi1diag/opa_debug.h /usr/include/hfi1diag/opa_intf.h /usr/include/hfi1diag/opa_user.h /usr/include/hfi1diag/opa_service.h /usr/include/hfi1diag/opa_byteorder.h /usr/include/hfi1diag/opa_common_gen1.h /usr/include/hfi1diag/opa_revision.h /usr/include/hfi1diag/opa_service.h /usr/include/hfi1diag/opa_service_gen1.h /usr/include/hfi1diag/opa_user.h /usr/include/hfi1diag/opa_user_gen1.h /usr/include/hfi1diag/psm2_mock_testing.h /usr/include/hfi1diag/psmi_wrappers.h opa-psm2-PSM2_11.2.185/debian/libpsm2.install000066400000000000000000000001151370564314600204000ustar00rootroot00000000000000/usr/lib/libpsm2.so.2.1 /usr/lib/libpsm2.so.2 /lib/udev/rules.d/40-psm.rules opa-psm2-PSM2_11.2.185/debian/rules000077500000000000000000000003321370564314600165210ustar00rootroot00000000000000#!/usr/bin/make -f export DEB_BUILD_MAINT_OPTIONS=hardening=+all # Specify the library installation directory export LIBDIR=/usr/lib %: dh $@ --parallel override_dh_installdocs: dh_installdocs --link-doc=libpsm2 opa-psm2-PSM2_11.2.185/debian/source/000077500000000000000000000000001370564314600167435ustar00rootroot00000000000000opa-psm2-PSM2_11.2.185/debian/source/format000066400000000000000000000000151370564314600201520ustar00rootroot000000000000003.0 (native) opa-psm2-PSM2_11.2.185/debian/source/options000066400000000000000000000000231370564314600203540ustar00rootroot00000000000000compression = "xz" opa-psm2-PSM2_11.2.185/debian/symbols000066400000000000000000000045421370564314600170630ustar00rootroot00000000000000libpsm2.so.2 libpsm2 #MINVER# PSM2_1.0@PSM2_1.0 10.2 __hfi_dbgout@PSM2_1.0 10.2 __hfi_mylabel@PSM2_1.0 10.2 __hfi_pico_per_cycle@PSM2_1.0 10.2 hfi_cmd_write@PSM2_1.0 10.2 hfi_context_close@PSM2_1.0 10.2 hfi_context_open@PSM2_1.0 10.2 hfi_debug@PSM2_1.0 10.2 hfi_get_mylabel@PSM2_1.0 10.2 hfi_get_port_lid@PSM2_1.0 10.2 hfi_get_port_vl2mtu@PSM2_1.0 10.2 hfi_mmap64@PSM2_1.0 10.2 hfi_poll_type@PSM2_1.0 10.2 hfi_set_mylabel@PSM2_1.0 10.2 hfi_userinit@PSM2_1.0 10.2 hfi_wait_for_packet@PSM2_1.0 10.2 psm2_am_get_parameters@PSM2_1.0 10.2 psm2_am_get_source@PSM2_1.0 10.2 psm2_am_register_handlers@PSM2_1.0 10.2 psm2_am_reply_short@PSM2_1.0 10.2 psm2_am_request_short@PSM2_1.0 10.2 psm2_capabilities_bitset@PSM2_1.0 10.3.0 psm2_ep_close@PSM2_1.0 10.2 psm2_ep_connect@PSM2_1.0 10.2 psm2_ep_disconnect2@PSM2_1.0 10.3.0 psm2_ep_disconnect@PSM2_1.0 10.2 psm2_ep_epid_lookup2@PSM2_1.0 10.3.0 psm2_ep_epid_lookup@PSM2_1.0 10.2 psm2_ep_epid_share_memory@PSM2_1.0 10.2 psm2_ep_num_devunits@PSM2_1.0 10.2 psm2_ep_open@PSM2_1.0 10.2 psm2_ep_open_opts_get_defaults@PSM2_1.0 10.2 psm2_ep_query@PSM2_1.0 10.2 psm2_epaddr_getctxt@PSM2_1.0 10.2 psm2_epaddr_setctxt@PSM2_1.0 10.2 psm2_epaddr_setlabel@PSM2_1.0 10.2 psm2_epaddr_to_epid@PSM2_1.0 10.3.0 psm2_epid_context@PSM2_1.0 10.2 psm2_epid_nid@PSM2_1.0 10.2 psm2_epid_port@PSM2_1.0 10.2 psm2_error_defer@PSM2_1.0 10.2 psm2_error_get_string@PSM2_1.0 10.2 psm2_error_register_handler@PSM2_1.0 10.2 psm2_finalize@PSM2_1.0 10.2 psm2_get_capability_mask@PSM2_1.0 10.3.0 psm2_getopt@PSM2_1.0 10.2 psm2_init@PSM2_1.0 10.2 psm2_map_nid_hostname@PSM2_1.0 10.2 psm2_mq_cancel@PSM2_1.0 10.2 psm2_mq_finalize@PSM2_1.0 10.2 
psm2_mq_get_stats@PSM2_1.0 10.2 psm2_mq_getopt@PSM2_1.0 10.2 psm2_mq_improbe2@PSM2_1.0 10.2 psm2_mq_improbe@PSM2_1.0 10.2 psm2_mq_imrecv@PSM2_1.0 10.2 psm2_mq_init@PSM2_1.0 10.2 psm2_mq_ipeek2@PSM2_1.0 10.2 psm2_mq_ipeek@PSM2_1.0 10.2 psm2_mq_iprobe2@PSM2_1.0 10.2 psm2_mq_iprobe@PSM2_1.0 10.2 psm2_mq_irecv2@PSM2_1.0 10.2 psm2_mq_irecv@PSM2_1.0 10.2 psm2_mq_isend2@PSM2_1.0 10.2 psm2_mq_isend@PSM2_1.0 10.2 psm2_mq_send2@PSM2_1.0 10.2 psm2_mq_send@PSM2_1.0 10.2 psm2_mq_setopt@PSM2_1.0 10.2 psm2_mq_test2@PSM2_1.0 10.2 psm2_mq_test@PSM2_1.0 10.2 psm2_mq_wait2@PSM2_1.0 10.2 psm2_mq_wait@PSM2_1.0 10.2 psm2_poll@PSM2_1.0 10.2 psm2_setopt@PSM2_1.0 10.2 psm2_uuid_generate@PSM2_1.0 10.2 opa-psm2-PSM2_11.2.185/include/000077500000000000000000000000001370564314600156445ustar00rootroot00000000000000opa-psm2-PSM2_11.2.185/include/linux-i386/000077500000000000000000000000001370564314600174725ustar00rootroot00000000000000opa-psm2-PSM2_11.2.185/include/linux-i386/bit_ops.h000066400000000000000000000065701370564314600213120ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ #ifndef _HFI_i386_BIT_OPS_H #define _HFI_i386_BIT_OPS_H static __inline__ void ips_clear_bit(int nr, volatile unsigned long *addr) { asm volatile (LOCK_PREFIX "btrl %1,%0" : "=m"(*addr) : "dIr"(nr)); } static __inline__ void ips_change_bit(int nr, volatile unsigned long *addr) { asm volatile (LOCK_PREFIX "btcl %1,%0" : "=m"(*addr) : "dIr"(nr)); } static __inline__ int ips_test_and_set_bit(int nr, volatile unsigned long *addr) { int oldbit; asm volatile (LOCK_PREFIX "btsl %2,%1\n\tsbbl %0,%0" : "=r"(oldbit), "=m"(*addr) : "dIr"(nr) : "memory"); return oldbit; } static __inline__ void ips___clear_bit(int nr, volatile unsigned long *addr) { asm volatile ("btrl %1,%0" : "=m" (*addr) : "dIr"(nr)); } static __inline__ void ips___change_bit(int nr, volatile unsigned long *addr) { asm volatile ("btcl %1,%0" : "=m" (*addr) : "dIr"(nr)); } static __inline__ int ips___test_and_set_bit(int nr, volatile unsigned long *addr) { int oldbit; asm volatile ("btsl %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "=m"(*addr) : "dIr"(nr) : "memory"); return oldbit; } #endif /* _HFI_i386_BIT_OPS_H */ opa-psm2-PSM2_11.2.185/include/linux-i386/sysdep.h000066400000000000000000000107271370564314600211610ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ #ifndef _HFI_i386_SYSDEP_H #define _HFI_i386_SYSDEP_H typedef struct cpuid { unsigned eax, ebx, ecx, edx; } cpuid_t; static __inline__ void get_cpuid(const unsigned func, const unsigned subfunc, cpuid_t *id) { unsigned a, b, c, d; asm (" \ mov %4, %%eax \n\ mov %5, %%ecx \n\ cpuid \n\ mov %%eax, %0 \n\ mov %%ebx, %1 \n\ mov %%ecx, %2 \n\ mov %%edx, %3 \n\ " : "=g" (a), "=g" (b), "=g" (c), "=g" (d) : "g" (func), "g" (subfunc) : "%eax", "%ebx", "%ecx", "%edx" ); id->eax = a; id->ebx = b; id->ecx = c; id->edx = d; } static __inline__ uint64_t get_cycles(void) { uint64_t v; uint32_t a, d; asm volatile ("rdtsc" : "=a" (a), "=d"(d)); v = ((uint64_t) a) | (((uint64_t) d) << 32); return v; } #ifndef LOCK_PREFIX #define LOCK_PREFIX "lock " #endif static __inline__ void ips_barrier() { asm volatile ("" : : : "memory"); } static __inline__ void ips_mb() { asm volatile ("mfence" : : : "memory"); } /* gcc-3.4 has a bug with this function body at -O0 */ static #if defined(__GNUC__) && __GNUC__ == 3 && __GNUC_MINOR__ == 4 #else __inline__ #endif void ips_rmb() { asm volatile ("" : : : "memory"); } static __inline__ void ips_wmb() { asm volatile ("sfence" : : : "memory"); } static __inline__ void ips_sync_writes() { asm volatile ("sfence" : : : "memory"); } static __inline__ void ips_sync_reads() { asm volatile ("lfence" : : : "memory"); } static __inline__ uint32_t ips_cmpxchg(volatile uint32_t *ptr, uint32_t old_val, uint32_t new_val) { uint32_t prev; struct xchg_dummy { uint32_t a[100]; }; asm volatile (LOCK_PREFIX "cmpxchgl %1,%2" : "=a"(prev) : "q"(new_val), "m"(*(struct xchg_dummy *)ptr), "0"(old_val) : "memory"); return prev; } typedef struct { volatile int32_t counter; } ips_atomic_t; #define ips_atomic_set(v, i) (((v)->counter) = (i)) #define ips_atomic_cmpxchg(p, oval, nval) \ ips_cmpxchg((volatile uint32_t *) &((p)->counter), oval, nval) #if 0 static __inline__ int32_t ips_cmpxchg(volatile int32_t *p, int32_t old_value, int32_t new_value) { asm volatile ("lock cmpxchg %2, %0" : "+m" (*p), "+a"(old_value) : "r"(new_value) : "memory"); return old_value; } #endif #endif /* _HFI_i386_SYSDEP_H */ opa-psm2-PSM2_11.2.185/include/opa_byteorder.h000066400000000000000000000153711370564314600206620ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef OPA_BYTEORDER_H #define OPA_BYTEORDER_H #ifdef __cplusplus extern "C" { #endif #include <endian.h> #include <linux/types.h> #include <sys/param.h> #ifndef __BYTE_ORDER # error "BYTE_ORDER undefined" #endif typedef __u16 __le16; typedef __u16 __be16; typedef __u32 __le32; typedef __u32 __be32; typedef __u64 __le64; typedef __u64 __be64; static __inline__ __u16 __hfi_fswab16(__u16) __attribute__ ((always_inline)); static __inline__ __u32 __hfi_fswab32(__u32) __attribute__ ((always_inline)); static __inline__ __u64 __hfi_fswab64(__u64) __attribute__ ((always_inline)); static __inline__ __u16 __hfi_fswab16(__u16 x) { return ((x & (__u16) 0x00ffU) << 8) | ((x & (__u16) 0xff00U) >> 8); } static __inline__ __u32 __hfi_fswab32(__u32 x) { return ((x & (__u32) 0x000000ffUL) << 24) | ((x & (__u32) 0x0000ff00UL) << 8) | ((x & (__u32) 0x00ff0000UL) >> 8) | ((x & (__u32) 0xff000000UL) >> 24); } static __inline__ __u64 __hfi_fswab64(__u64 x) { return ((x & (__u64) 0x00000000000000ffULL) << 56) | ((x & (__u64) 0x000000000000ff00ULL) << 40) | ((x & (__u64) 0x0000000000ff0000ULL) << 24) | ((x & (__u64) 0x00000000ff000000ULL) << 8) | ((x & (__u64) 0x000000ff00000000ULL) >> 8) | ((x & (__u64) 0x0000ff0000000000ULL) >> 24) | ((x & (__u64) 0x00ff000000000000ULL) >> 40) | ((x & (__u64) 0xff00000000000000ULL) >> 56); } static __inline__ __u16 __cpu_to_le16(__le16) __attribute__ ((always_inline)); static __inline__ __u32 __cpu_to_le32(__le32) __attribute__ ((always_inline)); static __inline__ __u64 __cpu_to_le64(__le64) __attribute__ ((always_inline)); static __inline__ __u16 __le16_to_cpu(__le16) __attribute__ ((always_inline)); static __inline__ __u32 __le32_to_cpu(__le32) __attribute__ ((always_inline)); static __inline__ __u64 __le64_to_cpu(__le64) __attribute__ ((always_inline)); static __inline__ __u16 __cpu_to_be16(__be16) __attribute__ ((always_inline)); static __inline__ __u32 __cpu_to_be32(__be32) __attribute__ ((always_inline)); static __inline__ __u64 __cpu_to_be64(__be64) __attribute__ ((always_inline)); static __inline__ __u16 __be16_to_cpu(__be16) __attribute__ ((always_inline)); static __inline__ __u32 __be32_to_cpu(__be32) __attribute__ ((always_inline)); static __inline__ __u64 __be64_to_cpu(__be64) __attribute__ ((always_inline)); #if __BYTE_ORDER == __LITTLE_ENDIAN /* * __cpu_to_le* routines */ static __inline__ __le16 __cpu_to_le16(__u16 x) { return x; } static __inline__ __le32 __cpu_to_le32(__u32 x) { return x; } static __inline__ __le64 __cpu_to_le64(__u64 x) { return x; } /* * __le*_to_cpu routines */ static __inline__ __u16 
__le16_to_cpu(__le16 x) { return x; } static __inline__ __u32 __le32_to_cpu(__le32 x) { return x; } static __inline__ __u64 __le64_to_cpu(__le64 x) { return x; } /* * __cpu_to_be* routines */ static __inline__ __be16 __cpu_to_be16(__u16 x) { return __hfi_fswab16(x); } static __inline__ __be32 __cpu_to_be32(__u32 x) { return __hfi_fswab32(x); } static __inline__ __be64 __cpu_to_be64(__u64 x) { return __hfi_fswab64(x); } /* * __be*_to_cpu routines */ static __inline__ __u16 __be16_to_cpu(__be16 x) { return __hfi_fswab16(x); } static __inline__ __u32 __be32_to_cpu(__be32 x) { return __hfi_fswab32(x); } static __inline__ __u64 __be64_to_cpu(__be64 x) { return __hfi_fswab64(x); } #elif __BYTE_ORDER == __BIG_ENDIAN /* * __cpu_to_le* routines */ static __inline__ __le16 __cpu_to_le16(__u16 x) { return __hfi_fswab16(x); } static __inline__ __le32 __cpu_to_le32(__u32 x) { return __hfi_fswab32(x); } static __inline__ __le64 __cpu_to_le64(__u64 x) { return __hfi_fswab64(x); } /* * __le*_to_cpu routines */ static __inline__ __u16 __le16_to_cpu(__le16 x) { return __hfi_fswab16(x); } static __inline__ __u32 __le32_to_cpu(__le32 x) { return __hfi_fswab32(x); } static __inline__ __u64 __le64_to_cpu(__le64 x) { return __hfi_fswab64(x); } /* * __cpu_to_be* routines */ static __inline__ __be16 __cpu_to_be16(__u16 x) { return x; } static __inline__ __be32 __cpu_to_be32(__u32 x) { return x; } static __inline__ __be64 __cpu_to_be64(__u64 x) { return x; } /* * __be*_to_cpu routines */ static __inline__ __u16 __be16_to_cpu(__be16 x) { return x; } static __inline__ __u32 __be32_to_cpu(__be32 x) { return x; } static __inline__ __u64 __be64_to_cpu(__be64 x) { return x; } #else # error "unsupported BYTE_ORDER: " #BYTE_ORDER #endif #ifdef __cplusplus } /* extern "C" */ #endif #endif /* OPA_BYTEORDER_H */ opa-psm2-PSM2_11.2.185/include/opa_debug.h000066400000000000000000000102221370564314600177370ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef OPA_DEBUG_H #define OPA_DEBUG_H #ifndef _HFI_DEBUGGING /* debugging enabled or not */ #define _HFI_DEBUGGING 1 #endif #if _HFI_DEBUGGING /* * Mask values for debugging. The scheme allows us to compile out any * of the debug tracing stuff, and if compiled in, to enable or disable * dynamically. This can be set at modprobe time also: * modprobe hfi.ko hfi_debug=7 */ #define __HFI_INFO 0x1 /* generic low verbosity stuff */ #define __HFI_DBG 0x2 /* generic debug */ #define __HFI_TRSAMPLE 0x8 /* generate trace buffer sample entries */ /* leave some low verbosity spots open */ #define __HFI_VERBDBG 0x40 /* very verbose debug */ #define __HFI_PKTDBG 0x80 /* print packet data */ /* print process startup (init)/exit messages and important env vars */ #define __HFI_PROCDBG 0x100 /* print mmap/nopage stuff, not using VDBG any more */ #define __HFI_MMDBG 0x200 /* low-level environment variables */ #define __HFI_ENVDBG 0x400 #define __HFI_EPKTDBG 0x800 /* print error packet data */ #define __HFI_CCADBG 0x1000 /* print CCA related events */ #else /* _HFI_DEBUGGING */ /* * define all of these even with debugging off, for the few places that do * if(hfi_debug & _HFI_xyzzy), but in a way that will make the * compiler eliminate the code */ #define __HFI_INFO 0x0 /* generic low verbosity stuff */ #define __HFI_DBG 0x0 /* generic debug */ #define __HFI_TRSAMPLE 0x0 /* generate trace buffer sample entries */ #define __HFI_VERBDBG 0x0 /* very verbose debug */ #define __HFI_PKTDBG 0x0 /* print packet data */ #define __HFI_PROCDBG 0x0 /* print process startup (init)/exit messages */ /* print mmap/nopage stuff, not using VDBG any more */ #define __HFI_MMDBG 0x0 #define __HFI_CCADBG 0x0 /* print CCA related events */ #endif /* _HFI_DEBUGGING */ #define __HFI_VERBOSEDBG __HFI_VERBDBG #endif /* OPA_DEBUG_H */ opa-psm2-PSM2_11.2.185/include/opa_intf.h000066400000000000000000000063121370564314600176160ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef OPA_INTF_H #define OPA_INTF_H #include <stdint.h> #include <stdbool.h> #include <sched.h> #ifdef __inline__ #undef __inline__ #endif #define __inline__ inline __attribute__((always_inline, unused)) #include "sysdep.h" #include "bit_ops.h" /* these aren't implemented for user mode, which is OK until we multi-thread */ typedef struct _atomic { uint32_t counter; } atomic_t; /* no atomic_t type in user-land */ #define atomic_set(a, v) ((a)->counter = (v)) #define atomic_inc_return(a) (++(a)->counter) #if defined(__GNUC__) #ifndef likely #define likely(x) __builtin_expect(!!(x), 1L) #endif #ifndef unlikely #define unlikely(x) __builtin_expect(!!(x), 0L) #endif #ifndef if_pt #define if_pt(cond) if (likely(cond)) #endif #ifndef if_pf #define if_pf(cond) if (unlikely(cond)) #endif #define _Pragma_unlikely #define _Pragma_likely #else #error "Unsupported compiler" #endif #define yield() sched_yield() #endif /* OPA_INTF_H */ opa-psm2-PSM2_11.2.185/include/opa_queue.h000066400000000000000000000424141370564314600200050ustar00rootroot00000000000000/* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)queue.h 8.5 (Berkeley) 8/20/94 * $FreeBSD: src/sys/sys/queue.h,v 1.32.2.7 2002/04/17 14:21:02 des Exp $ */ #ifndef OPA_QUEUE_H_ #define OPA_QUEUE_H_ /* * This file defines five types of data structures: singly-linked lists, * singly-linked tail queues, lists, tail queues, and circular queues. * * A singly-linked list is headed by a single forward pointer. The elements * are singly linked for minimum space and pointer manipulation overhead at * the expense of O(n) removal for arbitrary elements. New elements can be * added to the list after an existing element or at the head of the list. * Elements being removed from the head of the list should use the explicit * macro for this purpose for optimum efficiency. A singly-linked list may * only be traversed in the forward direction. Singly-linked lists are ideal * for applications with large datasets and few or no removals or for * implementing a LIFO queue. * * A singly-linked tail queue is headed by a pair of pointers, one to the * head of the list and the other to the tail of the list. The elements are * singly linked for minimum space and pointer manipulation overhead at the * expense of O(n) removal for arbitrary elements. New elements can be added * to the list after an existing element, at the head of the list, or at the * end of the list. Elements being removed from the head of the tail queue * should use the explicit macro for this purpose for optimum efficiency. * A singly-linked tail queue may only be traversed in the forward direction. * Singly-linked tail queues are ideal for applications with large datasets * and few or no removals or for implementing a FIFO queue. * * A list is headed by a single forward pointer (or an array of forward * pointers for a hash table header). The elements are doubly linked * so that an arbitrary element can be removed without a need to * traverse the list. New elements can be added to the list before * or after an existing element or at the head of the list. A list * may only be traversed in the forward direction. * * A tail queue is headed by a pair of pointers, one to the head of the * list and the other to the tail of the list. The elements are doubly * linked so that an arbitrary element can be removed without a need to * traverse the list. New elements can be added to the list before or * after an existing element, at the head of the list, or at the end of * the list. A tail queue may be traversed in either direction. * * A circle queue is headed by a pair of pointers, one to the head of the * list and the other to the tail of the list. The elements are doubly * linked so that an arbitrary element can be removed without a need to * traverse the list. New elements can be added to the list before or after * an existing element, at the head of the list, or at the end of the list. * A circle queue may be traversed in either direction, but has a more * complex end of list detection. * * For details on the use of these macros, see the queue(3) manual page. 
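 *
 * As a quick orientation before the capability table below, here is a
 * minimal usage sketch for a tail queue; the element type 'struct entry',
 * its fields, and do_work() are hypothetical and not part of this header:
 *
 *	struct entry {
 *		int value;
 *		TAILQ_ENTRY(entry) link;	// linkage embedded in the element
 *	};
 *	TAILQ_HEAD(entry_head, entry) head = TAILQ_HEAD_INITIALIZER(head);
 *
 *	struct entry *e = malloc(sizeof(*e));	// caller owns all storage
 *	e->value = 42;
 *	TAILQ_INSERT_TAIL(&head, e, link);
 *	TAILQ_FOREACH(e, &head, link)
 *		do_work(e->value);		// visits elements in insertion order
 *	while (!TAILQ_EMPTY(&head)) {		// drain; the macros never free
 *		e = TAILQ_FIRST(&head);
 *		TAILQ_REMOVE(&head, e, link);
 *		free(e);
 *	}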
* * * SLIST LIST STAILQ TAILQ CIRCLEQ * _HEAD + + + + + * _HEAD_INITIALIZER + + + + + * _ENTRY + + + + + * _INIT + + + + + * _EMPTY + + + + + * _FIRST + + + + + * _NEXT + + + + + * _PREV - - - + + * _LAST - - + + + * _FOREACH + + + + + * _FOREACH_REVERSE - - - + + * _INSERT_HEAD + + + + + * _INSERT_BEFORE - + - + + * _INSERT_AFTER + + + + + * _INSERT_TAIL - - + + + * _REMOVE_HEAD + - + - - * _REMOVE + + + + + * */ /* * Singly-linked List declarations. */ #define SLIST_HEAD(name, type) \ struct name { \ struct type *slh_first; /* first element */ \ } #define SLIST_HEAD_INITIALIZER(head) \ { NULL } #define SLIST_ENTRY(type) \ struct { \ struct type *sle_next; /* next element */ \ } /* * Singly-linked List functions. */ #define SLIST_EMPTY(head) ((head)->slh_first == NULL) #define SLIST_FIRST(head) ((head)->slh_first) #define SLIST_FOREACH(var, head, field) \ for ((var) = SLIST_FIRST((head)); \ (var); \ (var) = SLIST_NEXT((var), field)) #define SLIST_INIT(head) do { \ SLIST_FIRST((head)) = NULL; \ } while (0) #define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ SLIST_NEXT((slistelm), field) = (elm); \ } while (0) #define SLIST_INSERT_HEAD(head, elm, field) do { \ SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ SLIST_FIRST((head)) = (elm); \ } while (0) #define SLIST_NEXT(elm, field) ((elm)->field.sle_next) #define SLIST_REMOVE(head, elm, type, field) do { \ if (SLIST_FIRST((head)) == (elm)) { \ SLIST_REMOVE_HEAD((head), field); \ } \ else { \ struct type *curelm = SLIST_FIRST((head)); \ while (SLIST_NEXT(curelm, field) != (elm)) \ curelm = SLIST_NEXT(curelm, field); \ SLIST_NEXT(curelm, field) = \ SLIST_NEXT(SLIST_NEXT(curelm, field), field); \ } \ } while (0) #define SLIST_REMOVE_HEAD(head, field) do { \ SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ } while (0) /* * Singly-linked Tail queue declarations. */ #define STAILQ_HEAD(name, type) \ struct name { \ struct type *stqh_first;/* first element */ \ struct type **stqh_last;/* addr of last next element */ \ } #define STAILQ_HEAD_INITIALIZER(head) \ { NULL, &(head).stqh_first } #define STAILQ_ENTRY(type) \ struct { \ struct type *stqe_next; /* next element */ \ } /* * Singly-linked Tail queue functions. */ #define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) #define STAILQ_FIRST(head) ((head)->stqh_first) #define STAILQ_FOREACH(var, head, field) \ for ((var) = STAILQ_FIRST((head)); \ (var); \ (var) = STAILQ_NEXT((var), field)) #define STAILQ_INIT(head) do { \ STAILQ_FIRST((head)) = NULL; \ (head)->stqh_last = &STAILQ_FIRST((head)); \ } while (0) #define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\ (head)->stqh_last = &STAILQ_NEXT((elm), field); \ STAILQ_NEXT((tqelm), field) = (elm); \ } while (0) #define STAILQ_INSERT_HEAD(head, elm, field) do { \ if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ (head)->stqh_last = &STAILQ_NEXT((elm), field); \ STAILQ_FIRST((head)) = (elm); \ } while (0) #define STAILQ_INSERT_TAIL(head, elm, field) do { \ STAILQ_NEXT((elm), field) = NULL; \ *(head)->stqh_last = (elm); \ (head)->stqh_last = &STAILQ_NEXT((elm), field); \ } while (0) #define STAILQ_LAST(head, type, field) \ (STAILQ_EMPTY(head) ? 
\ NULL : \ ((struct type *) \ ((char *)((head)->stqh_last) - offsetof(struct type, field)))) #define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) #define STAILQ_REMOVE(head, elm, type, field) do { \ if (STAILQ_FIRST((head)) == (elm)) { \ STAILQ_REMOVE_HEAD(head, field); \ } \ else { \ struct type *curelm = STAILQ_FIRST((head)); \ while (STAILQ_NEXT(curelm, field) != (elm)) \ curelm = STAILQ_NEXT(curelm, field); \ if ((STAILQ_NEXT(curelm, field) = \ STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\ (head)->stqh_last = &STAILQ_NEXT((curelm), field);\ } \ } while (0) #define STAILQ_REMOVE_HEAD(head, field) do { \ if ((STAILQ_FIRST((head)) = \ STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \ (head)->stqh_last = &STAILQ_FIRST((head)); \ } while (0) #define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \ if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \ (head)->stqh_last = &STAILQ_FIRST((head)); \ } while (0) /* * List declarations. */ #define LIST_HEAD(name, type) \ struct name { \ struct type *lh_first; /* first element */ \ } #define LIST_HEAD_INITIALIZER(head) \ { NULL } #define LIST_ENTRY(type) \ struct { \ struct type *le_next; /* next element */ \ struct type **le_prev; /* address of previous next element */ \ } /* * List functions. */ #define LIST_EMPTY(head) ((head)->lh_first == NULL) #define LIST_FIRST(head) ((head)->lh_first) #define LIST_FOREACH(var, head, field) \ for ((var) = LIST_FIRST((head)); \ (var); \ (var) = LIST_NEXT((var), field)) #define LIST_INIT(head) do { \ LIST_FIRST((head)) = NULL; \ } while (0) #define LIST_INSERT_AFTER(listelm, elm, field) do { \ if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ LIST_NEXT((listelm), field)->field.le_prev = \ &LIST_NEXT((elm), field); \ LIST_NEXT((listelm), field) = (elm); \ (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ } while (0) #define LIST_INSERT_BEFORE(listelm, elm, field) do { \ (elm)->field.le_prev = (listelm)->field.le_prev; \ LIST_NEXT((elm), field) = (listelm); \ *(listelm)->field.le_prev = (elm); \ (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ } while (0) #define LIST_INSERT_HEAD(head, elm, field) do { \ if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\ LIST_FIRST((head)) = (elm); \ (elm)->field.le_prev = &LIST_FIRST((head)); \ } while (0) #define LIST_NEXT(elm, field) ((elm)->field.le_next) #define LIST_REMOVE(elm, field) do { \ if (LIST_NEXT((elm), field) != NULL) \ LIST_NEXT((elm), field)->field.le_prev = \ (elm)->field.le_prev; \ *(elm)->field.le_prev = LIST_NEXT((elm), field); \ } while (0) /* * Tail queue declarations. */ #define TAILQ_HEAD(name, type) \ struct name { \ struct type *tqh_first; /* first element */ \ struct type **tqh_last; /* addr of last next element */ \ } #define TAILQ_HEAD_INITIALIZER(head) \ { NULL, &(head).tqh_first } #define TAILQ_ENTRY(type) \ struct { \ struct type *tqe_next; /* next element */ \ struct type **tqe_prev; /* address of previous next element */ \ } /* * Tail queue functions. 
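 *
 * A note on the representation used below: tqe_prev (and tqh_last) hold
 * the address of the previous element's tqe_next pointer (or of
 * tqh_first), not a pointer to the previous element itself. This is what
 * lets TAILQ_REMOVE unlink in O(1) with no special case for the first
 * element, and why TAILQ_LAST and TAILQ_PREV cast through the head
 * structure to recover an element pointer.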
*/ #define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) #define TAILQ_FIRST(head) ((head)->tqh_first) #define TAILQ_FOREACH(var, head, field) \ for ((var) = TAILQ_FIRST((head)); \ (var); \ (var) = TAILQ_NEXT((var), field)) #define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ for ((var) = TAILQ_LAST((head), headname); \ (var); \ (var) = TAILQ_PREV((var), headname, field)) #define TAILQ_INIT(head) do { \ TAILQ_FIRST((head)) = NULL; \ (head)->tqh_last = &TAILQ_FIRST((head)); \ } while (0) #define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\ TAILQ_NEXT((elm), field)->field.tqe_prev = \ &TAILQ_NEXT((elm), field); \ else \ (head)->tqh_last = &TAILQ_NEXT((elm), field); \ TAILQ_NEXT((listelm), field) = (elm); \ (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ } while (0) #define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ TAILQ_NEXT((elm), field) = (listelm); \ *(listelm)->field.tqe_prev = (elm); \ (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ } while (0) #define TAILQ_INSERT_HEAD(head, elm, field) do { \ if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ TAILQ_FIRST((head))->field.tqe_prev = \ &TAILQ_NEXT((elm), field); \ else \ (head)->tqh_last = &TAILQ_NEXT((elm), field); \ TAILQ_FIRST((head)) = (elm); \ (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ } while (0) #define TAILQ_INSERT_TAIL(head, elm, field) do { \ TAILQ_NEXT((elm), field) = NULL; \ (elm)->field.tqe_prev = (head)->tqh_last; \ *(head)->tqh_last = (elm); \ (head)->tqh_last = &TAILQ_NEXT((elm), field); \ } while (0) #define TAILQ_LAST(head, headname) \ (*(((struct headname *)((head)->tqh_last))->tqh_last)) #define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) #define TAILQ_PREV(elm, headname, field) \ (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) #define TAILQ_REMOVE(head, elm, field) do { \ if ((TAILQ_NEXT((elm), field)) != NULL) \ TAILQ_NEXT((elm), field)->field.tqe_prev = \ (elm)->field.tqe_prev; \ else \ (head)->tqh_last = (elm)->field.tqe_prev; \ *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ } while (0) /* * Circular queue declarations. */ #define CIRCLEQ_HEAD(name, type) \ struct name { \ struct type *cqh_first; /* first element */ \ struct type *cqh_last; /* last element */ \ } #define CIRCLEQ_HEAD_INITIALIZER(head) \ { (void *)&(head), (void *)&(head) } #define CIRCLEQ_ENTRY(type) \ struct { \ struct type *cqe_next; /* next element */ \ struct type *cqe_prev; /* previous element */ \ } /* * Circular queue functions. 
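 *
 * Unlike the NULL-terminated structures above, a circular queue uses the
 * head itself as its sentinel: an empty queue has cqh_first equal to
 * (void *)(head), and CIRCLEQ_FOREACH stops when the cursor wraps back
 * around to the head rather than when it reaches NULL.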
*/ #define CIRCLEQ_EMPTY(head) ((head)->cqh_first == (void *)(head)) #define CIRCLEQ_FIRST(head) ((head)->cqh_first) #define CIRCLEQ_FOREACH(var, head, field) \ for ((var) = CIRCLEQ_FIRST((head)); \ (var) != (void *)(head) || ((var) = NULL); \ (var) = CIRCLEQ_NEXT((var), field)) #define CIRCLEQ_FOREACH_REVERSE(var, head, field) \ for ((var) = CIRCLEQ_LAST((head)); \ (var) != (void *)(head) || ((var) = NULL); \ (var) = CIRCLEQ_PREV((var), field)) #define CIRCLEQ_INIT(head) do { \ CIRCLEQ_FIRST((head)) = (void *)(head); \ CIRCLEQ_LAST((head)) = (void *)(head); \ } while (0) #define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do { \ CIRCLEQ_NEXT((elm), field) = CIRCLEQ_NEXT((listelm), field); \ CIRCLEQ_PREV((elm), field) = (listelm); \ if (CIRCLEQ_NEXT((listelm), field) == (void *)(head)) \ CIRCLEQ_LAST((head)) = (elm); \ else \ CIRCLEQ_PREV(CIRCLEQ_NEXT((listelm), field), field) = (elm);\ CIRCLEQ_NEXT((listelm), field) = (elm); \ } while (0) #define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do { \ CIRCLEQ_NEXT((elm), field) = (listelm); \ CIRCLEQ_PREV((elm), field) = CIRCLEQ_PREV((listelm), field); \ if (CIRCLEQ_PREV((listelm), field) == (void *)(head)) \ CIRCLEQ_FIRST((head)) = (elm); \ else \ CIRCLEQ_NEXT(CIRCLEQ_PREV((listelm), field), field) = (elm);\ CIRCLEQ_PREV((listelm), field) = (elm); \ } while (0) #define CIRCLEQ_INSERT_HEAD(head, elm, field) do { \ CIRCLEQ_NEXT((elm), field) = CIRCLEQ_FIRST((head)); \ CIRCLEQ_PREV((elm), field) = (void *)(head); \ if (CIRCLEQ_LAST((head)) == (void *)(head)) \ CIRCLEQ_LAST((head)) = (elm); \ else \ CIRCLEQ_PREV(CIRCLEQ_FIRST((head)), field) = (elm); \ CIRCLEQ_FIRST((head)) = (elm); \ } while (0) #define CIRCLEQ_INSERT_TAIL(head, elm, field) do { \ CIRCLEQ_NEXT((elm), field) = (void *)(head); \ CIRCLEQ_PREV((elm), field) = CIRCLEQ_LAST((head)); \ if (CIRCLEQ_FIRST((head)) == (void *)(head)) \ CIRCLEQ_FIRST((head)) = (elm); \ else \ CIRCLEQ_NEXT(CIRCLEQ_LAST((head)), field) = (elm); \ CIRCLEQ_LAST((head)) = (elm); \ } while (0) #define CIRCLEQ_LAST(head) ((head)->cqh_last) #define CIRCLEQ_NEXT(elm, field) ((elm)->field.cqe_next) #define CIRCLEQ_PREV(elm, field) ((elm)->field.cqe_prev) #define CIRCLEQ_REMOVE(head, elm, field) do { \ if (CIRCLEQ_NEXT((elm), field) == (void *)(head)) \ CIRCLEQ_LAST((head)) = CIRCLEQ_PREV((elm), field); \ else \ CIRCLEQ_PREV(CIRCLEQ_NEXT((elm), field), field) = \ CIRCLEQ_PREV((elm), field); \ if (CIRCLEQ_PREV((elm), field) == (void *)(head)) \ CIRCLEQ_FIRST((head)) = CIRCLEQ_NEXT((elm), field); \ else \ CIRCLEQ_NEXT(CIRCLEQ_PREV((elm), field), field) = \ CIRCLEQ_NEXT((elm), field); \ } while (0) #endif /* !OPA_QUEUE_H_ */ opa-psm2-PSM2_11.2.185/include/opa_revision.h000066400000000000000000000050461370564314600205170ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef OPA_REVISION_H #define OPA_REVISION_H /* These variables are defined in the _revision.c file, which is generated dynamically while the library is being built */ extern char psmi_hfi_IFS_version[]; extern char psmi_hfi_build_timestamp[]; extern char psmi_hfi_sources_checksum[]; extern char psmi_hfi_git_checksum[]; #endif /* OPA_REVISION_H */ opa-psm2-PSM2_11.2.185/include/opa_service.h000066400000000000000000000100761370564314600203200ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef OPA_SERVICE_H #define OPA_SERVICE_H /* This file contains all the lowest level routines calling into sysfs */ /* and qib driver. All other calls are based on these routines. */ #include <stdint.h> /* reconstructed; fixed-width types in the prototypes below */ #include "opa_intf.h" #include "opa_udebug.h" #include "opa_byteorder.h" /* upper and lower bounds for HFI port numbers */ #define HFI_MIN_PORT 1 #define HFI_MAX_PORT 1 /* any unit id to match. */ #define HFI_UNIT_ID_ANY ((long)-1) /* any port num to match. */ #define HFI_PORT_NUM_ANY ((long)0) /* Statistics maintained by the driver */ int hfi_get_stats(uint64_t *, int); int hfi_get_stats_names(char **namep); /* Counters maintained in the chip, globally, and per-port */ int hfi_get_ctrs_unit(int unitno, uint64_t *, int); int hfi_get_ctrs_unit_names(int unitno, char **namep); int hfi_get_ctrs_port(int unitno, int port, uint64_t *, int); int hfi_get_ctrs_port_names(int unitno, char **namep); /* sysfs helper routines (only those currently used are exported; * try to avoid using others) */ /* Initializes the following sysfs helper routines. sysfs_init() returns 0 on success, non-zero on an error: */ int sysfs_init(const char *dflt_hfi_class_path); /* Complementary teardown for sysfs_init() */ void sysfs_fini(void); /* read a string value into buff, no more than size bytes. returns the number of bytes read */ size_t hfi_sysfs_unit_port_read(uint32_t unit, uint32_t port, const char *attr, char *buff, size_t size); /* read up to one page of malloc'ed data (caller must free), returning number of bytes read or -1 */ int hfi_hfifs_read(const char *attr, char **datap); int hfi_hfifs_unit_read(uint32_t unit, const char *attr, char **data); int64_t hfi_sysfs_unit_read_node_s64(uint32_t unit); /* these read directly into supplied buffer and take a count */ int hfi_hfifs_rd(const char *, void *, int); int hfi_hfifs_unit_rd(uint32_t unit, const char *, void *, int); #endif /* OPA_SERVICE_H */ opa-psm2-PSM2_11.2.185/include/opa_udebug.h000066400000000000000000000141671370564314600201360ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef OPA_UDEBUG_H #define OPA_UDEBUG_H #include #include "opa_debug.h" extern unsigned hfi_debug; const char *hfi_get_unit_name(int unit); extern char *__progname; static const char hfi_ident_tag[] = "PSM2_IDENTIFY"; char *hfi_get_mylabel(); #if _HFI_DEBUGGING extern char *__hfi_mylabel; void hfi_set_mylabel(char *); extern FILE *__hfi_dbgout; #define _HFI_UNIT_ERROR(unit, fmt, ...) \ do { \ _Pragma_unlikely \ printf("%s%s: " fmt, __hfi_mylabel, __progname, \ ##__VA_ARGS__); \ } while (0) #define _HFI_ERROR(fmt, ...) \ do { \ _Pragma_unlikely \ printf("%s%s: " fmt, __hfi_mylabel, __progname, \ ##__VA_ARGS__); \ } while (0) #define _HFI_INFO(fmt, ...) \ do { \ _Pragma_unlikely \ if (unlikely(hfi_debug&__HFI_INFO)) \ printf("%s%s: " fmt, __hfi_mylabel, __func__, \ ##__VA_ARGS__); \ } while (0) #define __HFI_PKTDBG_ON unlikely(hfi_debug & __HFI_PKTDBG) #define __HFI_DBG_WHICH(which, fmt, ...) \ do { \ _Pragma_unlikely \ if (unlikely(hfi_debug&(which))) \ fprintf(__hfi_dbgout, "%s%s: " fmt, __hfi_mylabel, __func__, \ ##__VA_ARGS__); \ } while (0) #define __HFI_DBG_WHICH_NOFUNC(which, fmt, ...) \ do { \ _Pragma_unlikely \ if (unlikely(hfi_debug&(which))) \ fprintf(__hfi_dbgout, "%s" fmt, __hfi_mylabel, \ ##__VA_ARGS__); \ } while (0) #define _HFI_DBG(fmt, ...) __HFI_DBG_WHICH(__HFI_DBG, fmt, ##__VA_ARGS__) #define _HFI_VDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_VERBDBG, fmt, ##__VA_ARGS__) #define _HFI_PDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_PKTDBG, fmt, ##__VA_ARGS__) #define _HFI_EPDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_EPKTDBG, fmt, ##__VA_ARGS__) #define _HFI_PRDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_PROCDBG, fmt, ##__VA_ARGS__) #define _HFI_ENVDBG(lev, fmt, ...) \ __HFI_DBG_WHICH_NOFUNC( \ (lev == 0) ? __HFI_INFO : \ (lev > 1 ? __HFI_ENVDBG : (__HFI_PROCDBG|__HFI_ENVDBG)),\ "env " fmt, ##__VA_ARGS__) #define _HFI_MMDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_MMDBG, fmt, ##__VA_ARGS__) #define _HFI_CCADBG(fmt, ...) __HFI_DBG_WHICH(__HFI_CCADBG, fmt, ##__VA_ARGS__) /* * Use these macros (_HFI_DBG_ON and _HFI_DBG_ALWAYS) together * for a scope of code preparing debug info for printing; e.g. * if (_HFI_DBG_ON) { * // put your code here * _HFI_DBG_ALWAYS(print your results here); * } */ #define _HFI_DBG_ON unlikely(hfi_debug & __HFI_DBG) #define _HFI_DBG_ALWAYS(fmt, ...) 
\ do { \ _Pragma_unlikely \ fprintf(__hfi_dbgout, "%s" fmt, __hfi_mylabel, \ ##__VA_ARGS__); \ } while (0) #define _HFI_VDBG_ON unlikely(hfi_debug & __HFI_VERBDBG) #define _HFI_VDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) #define _HFI_PRDBG_ON unlikely(hfi_debug & __HFI_PROCDBG) #define _HFI_PRDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) #define _HFI_CCADBG_ON unlikely(hfi_debug & __HFI_CCADBG) #define _HFI_CCADBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) #define _HFI_INFO_ON unlikely(hfi_debug & __HFI_INFO) #define _HFI_INFO_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) #else /* ! _HFI_DEBUGGING */ #define _HFI_UNIT_ERROR(unit, fmt, ...) \ do { \ printf("%s" fmt, "", ##__VA_ARGS__); \ } while (0) #define _HFI_ERROR(fmt, ...) \ do { \ printf("%s" fmt, "", ##__VA_ARGS__); \ } while (0) #define _HFI_INFO(fmt, ...) #define __HFI_PKTDBG_ON 0 #define _HFI_DBG(fmt, ...) #define _HFI_PDBG(fmt, ...) #define _HFI_EPDBG(fmt, ...) #define _HFI_PRDBG(fmt, ...) #define _HFI_ENVDBG(lev, fmt, ...) #define _HFI_VDBG(fmt, ...) #define _HFI_MMDBG(fmt, ...) #define _HFI_CCADBG(fmt, ...) #define _HFI_DBG_ON 0 #define _HFI_DBG_ALWAYS(fmt, ...) #define _HFI_VDBG_ON 0 #define _HFI_VDBG_ALWAYS(fmt, ...) #define _HFI_PRDBG_ON 0 #define _HFI_PRDBG_ALWAYS(fmt, ...) #define _HFI_CCADBG_ON 0 #define _HFI_CCADBG_ALWAYS(fmt, ...) #define _HFI_INFO_ON 0 #define _HFI_INFO_ALWAYS(fmt, ...) #endif /* _HFI_DEBUGGING */ #endif /* OPA_UDEBUG_H */ opa-psm2-PSM2_11.2.185/include/opa_user.h000066400000000000000000000174771370564314600176520ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef OPA_USER_H #define OPA_USER_H /* This file contains all of the data structures and routines that are publicly visible and usable (to low level infrastructure code; it is not expected that any application, or even normal application-level library, will ever need to use any of this). Additional entry points and data structures that are used by these routines may be referenced in this file, but they should not be generally available; they are visible here only to allow use in inlined functions. Any variable, data structure, or function that starts with a leading "_" is in this category. */ /* Include header files we need that are unlikely to otherwise be needed by */ /* programs. */ #include #include #include #include #include #include #include #include #include #include "opa_intf.h" #include "opa_byteorder.h" #include "opa_udebug.h" #include "opa_service.h" #define HFI_TF_NFLOWS 32 /* IB - LRH header consts */ #define HFI_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */ #define HFI_LRH_SC_SHIFT 12 #define HFI_LRH_SC_MASK 0xf #define HFI_LRH_SL_SHIFT 4 #define HFI_LRH_SL_MASK 0xf #define HFI_LRH_PKTLEN_MASK 0xfff /* IB - BTH header consts */ #define HFI_BTH_OPCODE_SHIFT 24 #define HFI_BTH_OPCODE_MASK 0xff #define HFI_BTH_BECN_SHIFT 30 #define HFI_BTH_FECN_SHIFT 31 #define HFI_BTH_QP_SHIFT 16 #define HFI_BTH_QP_MASK 0xff #define HFI_BTH_FLOWID_SHIFT 11 #define HFI_BTH_FLOWID_MASK 0x1f #define HFI_BTH_SUBCTXT_SHIFT 8 #define HFI_BTH_SUBCTXT_MASK 0x7 #define HFI_BTH_SEQ_SHIFT 0 #define HFI_BTH_SEQ_MASK 0x7ff #define HFI_BTH_GEN_SHIFT 11 #define HFI_BTH_GEN_MASK 0xfffff #define HFI_BTH_ACK_SHIFT 31 /* KDETH header consts */ #define HFI_KHDR_OFFSET_MASK 0x7fff #define HFI_KHDR_OM_SHIFT 15 #define HFI_KHDR_TID_SHIFT 16 #define HFI_KHDR_TID_MASK 0x3ff #define HFI_KHDR_TIDCTRL_SHIFT 26 #define HFI_KHDR_TIDCTRL_MASK 0x3 #define HFI_KHDR_INTR_SHIFT 28 #define HFI_KHDR_SH_SHIFT 29 #define HFI_KHDR_KVER_SHIFT 30 #define HFI_KHDR_KVER_MASK 0x3 #define HFI_KHDR_MSGSEQ_MASK 0xffff #define HFI_KHDR_TINYLEN_MASK 0xf #define HFI_KHDR_TINYLEN_SHIFT 16 #define GET_HFI_KHDR_TIDCTRL(val) \ (((val) >> HFI_KHDR_TIDCTRL_SHIFT) & \ HFI_KHDR_TIDCTRL_MASK) #ifdef PSM_CUDA extern int is_driver_gpudirect_enabled; #define PSMI_IS_DRIVER_GPUDIRECT_ENABLED likely(is_driver_gpudirect_enabled) #define PSMI_IS_DRIVER_GPUDIRECT_DISABLED unlikely(!is_driver_gpudirect_enabled) #endif /* hfi kdeth header format */ struct hfi_kdeth { __u32 kdeth0; union { struct { __u16 job_key; __u16 hcrc; }; __u32 kdeth1; }; }; /* misc. 
*/ #define HFI_CRC_SIZE_IN_BYTES 4 #define HFI_DEFAULT_SERVICE_ID 0x1000117500000000ULL #define HFI_DEFAULT_P_KEY 0x8001 /* fabric default pkey for app traffic */ #if 0 #define HFI_PERMISSIVE_LID 0xFFFF #define HFI_AETH_CREDIT_SHIFT 24 #define HFI_AETH_CREDIT_MASK 0x1F #define HFI_AETH_CREDIT_INVAL 0x1F #define HFI_PSN_MASK 0xFFFFFF #define HFI_MSN_MASK 0xFFFFFF #define HFI_QPN_MASK 0xFFFFFF #define HFI_MULTICAST_LID_BASE 0xC000 #define HFI_MULTICAST_QPN 0xFFFFFF #endif /* Receive Header Queue: receive type (from hfi) */ #define RCVHQ_RCV_TYPE_EXPECTED 0 #define RCVHQ_RCV_TYPE_EAGER 1 #define RCVHQ_RCV_TYPE_NON_KD 2 #define RCVHQ_RCV_TYPE_ERROR 3 /* OPA PSM assumes that the message header is always 56 bytes. */ #define HFI_MESSAGE_HDR_SIZE 56 /* interval timing routines */ /* Convert a count of cycles to elapsed nanoseconds */ /* this is only accurate for reasonably large numbers of cycles (at least tens) */ static __inline__ uint64_t cycles_to_nanosecs(uint64_t) __attribute__ ((always_inline)); /* convert elapsed nanoseconds to elapsed cycles */ /* this is only accurate for reasonably large numbers of nsecs (at least tens) */ static __inline__ uint64_t nanosecs_to_cycles(uint64_t) __attribute__ ((always_inline)); /* Statistics maintained by the driver */ const char *hfi_get_next_name(char **names); int hfi_get_stats_names_count(void); /* Counters maintained in the chip, globally, and per-port */ int hfi_get_ctrs_unit_names_count(int unitno); int hfi_get_ctrs_port_names_count(int unitno); uint64_t hfi_get_single_unitctr(int unit, const char *attr, uint64_t *s); int hfi_get_single_portctr(int unit, int port, const char *attr, uint64_t *c); void hfi_release_names(char *namep); /* Syslog wrapper level is one of LOG_EMERG, LOG_ALERT, LOG_CRIT, LOG_ERR, LOG_WARNING, LOG_NOTICE, LOG_INFO, LOG_DEBUG. prefix should be a short string to describe which part of the software stack is using syslog, e.g. "PSM", "mpi", "mpirun". */ void hfi_syslog(const char *prefix, int to_console, int level, const char *format, ...) __attribute__((format(printf, 4, 5))); void hfi_vsyslog(const char *prefix, int to_console, int level, const char *format, va_list ap); /* * Copy routine that may copy a byte multiple times but is optimized for throughput. * This is not safe to use for PIO routines where we want a guarantee that a * byte is only copied/moved across the bus once. */ void hfi_dwordcpy(volatile uint32_t *dest, const uint32_t *src, uint32_t ndwords); extern uint32_t __hfi_pico_per_cycle; /* only for use in these functions */ /* this is only accurate for reasonably large numbers of cycles (at least tens) */ static __inline__ uint64_t cycles_to_nanosecs(uint64_t cycs) { return (__hfi_pico_per_cycle * cycs) / 1000ULL; } /* this is only accurate for reasonably large numbers of nsecs (at least tens) */ static __inline__ uint64_t nanosecs_to_cycles(uint64_t ns) { return (ns * 1000ULL) / __hfi_pico_per_cycle; } #endif /* OPA_USER_H */ opa-psm2-PSM2_11.2.185/include/psm2_mock_testing.h000066400000000000000000000143211370564314600214450ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef PSM2_MOCK_TESTING_H #define PSM2_MOCK_TESTING_H /* PSM2_MOCK_TESTING being defined flips a couple of switches so that a * testable version of libpsm2.so is built. It'll make properly annotated * static functions be non-static, visible to the outside. Also, all mockable * functions will be replaced with function pointers which will originally * point to the actual implementation. However, those function pointers might * be reset by the test code, thus allowing for mocking selected PSM2 functions * for the purpose of the test. * * So far the following utilities have been introduced for enabling a * conditional compilation of the testable vs. 
production version of the library: * - ustatic: toggles function visibility * - MOCKABLE(): decorates function name so that it is visible after being mocked * - MOCK_DCL_EPILOGUE(): declares a function pointer which will be the seam * for mocking a function * - MOCK_DEF_EPILOGUE(): defines a function pointer which will be the seam * for mocking a function * * If the declaration and definition of a static function @c foo reside in * different files, this would be the common use case: * * @code * // somefile.c: * int MOCKABLE(foo)(); * MOCK_DCL_EPILOGUE(foo); * * // otherfile.c: * int MOCKABLE(foo)() { * printf("I am the original foo!\n"); * } * MOCK_DEF_EPILOGUE(foo); * @endcode * * If the production version of the library is being built, the following code * would result: * @code * // somefile.c: * int foo(); * * // otherfile.c: * int foo() { * printf("I am the original foo!\n"); * } * @endcode * * On the other hand, if a testable version of the library is being built, it * would produce the following code: * @code * // somefile.c: * int foo_original_(); * extern typeof(& foo_original_) foo; * * // otherfile.c: * int foo_original_() { * printf("I am the original foo!\n"); * } * typeof(& foo_original_) foo = foo_original_; * @endcode * * If the function to be mocked is a static function residing in the header, * the following syntax would be used: * @code * // somefile.c: * ustatic int MOCKABLE(foo)() { * printf("I am the original foo!\n"); * } * MOCK_DCL_EPILOGUE(foo); * MOCK_DEF_EPILOGUE(foo); * @endcode * * If the production version of the library is being built, the following code * would result: * @code * // somefile.c: * static int foo() { * printf("I am the original foo!\n"); * } * @endcode * * Similarly, if a testable version of the library is being built, it would * produce the following code: * @code * // somefile.c: * int foo_original_(); * extern typeof(& foo_original_) foo; * typeof(& foo_original_) foo = foo_original_; * @endcode */ #ifndef PSM2_MOCK_TESTING /* If no testing is being done, ustatic resolves to regular "static" */ #define ustatic static /* If no testing is being done, no indirection is introduced */ #define MOCKABLE(fname) fname /* If no testing is being done, no declaration epilogue is needed */ #define MOCK_DCL_EPILOGUE(fname) /* If no testing is being done, no definition epilogue is needed */ #define MOCK_DEF_EPILOGUE(fname) #else /* ndef PSM2_MOCK_TESTING */ /* For the testable version, all _ustatic_ functions will NOT be static */ #define ustatic /* TODO override inline directives in the same fashion as static */ /* For the testable version, the actual implementation function is renamed */ #define MOCKABLE(x) x ## _original_ /* For the testable version, we declare the function pointer which will be the * point of indirection for calls to that function. It must be declared after * the declaration of the actual function happens. */ #define MOCK_DCL_EPILOGUE(x) extern typeof(& x ## _original_) x; /* For the testable version, we define the function pointer which will be the * point of indirection for calls to that function. It must be declared after * the definition of the actual function happens. */ #define MOCK_DEF_EPILOGUE(x) typeof(& x ## _original_) x = x ## _original_; #endif /* ndef PSM2_MOCK_TESTING */ #endif /* PSM2_MOCK_TESTING_H */ opa-psm2-PSM2_11.2.185/include/rbtree.c000066400000000000000000000474301370564314600173030ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license.
When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ /* * Abstract: * Implementation of quick map, a binary tree where the caller always provides * all necessary storage. * * Environment: * All * * $Revision$ */ /***************************************************************************** * * Map * * Map is an associative array. By providing a key, the caller can retrieve * an object from the map. All objects in the map have an associated key, * as specified by the caller when the object was inserted into the map. * In addition to random access, the caller can traverse the map much like * a linked list, either forwards from the first object or backwards from * the last object. The objects in the map are always traversed in * order since the nodes are stored sorted. * * This implementation of Map uses a red black tree verified against * Cormen-Leiserson-Rivest text, McGraw-Hill Edition, fourteenth * printing, 1994. * *****************************************************************************/ #include <string.h> /* for memset declaration */ // RBTREE_CMP should be a comparator, i.e. RBTREE_CMP(a, b) should evaluate to // -1, 0, or 1 depending on if a < b, a == b, or a > b, respectively. #ifdef RBTREE_CMP #if defined(RBTREE_GET_LEFTMOST) || defined(RBTREE_GET_RIGHTMOST) #error Cannot define both RBTREE_CMP and RBTREE_GET_(LEFT|RIGHT)MOST #endif #elif !defined ( RBTREE_GET_LEFTMOST ) || \ ! defined ( RBTREE_GET_RIGHTMOST ) || \ !
defined ( RBTREE_MAP_COUNT ) || \ ! defined ( RBTREE_ASSERT ) #error "You must define RBTREE_GET_LEFTMOST and RBTREE_GET_RIGHTMOST and \ RBTREE_MAP_COUNT and RBTREE_ASSERT before including rbtree.c" #endif /* RBTREE_CMP */ #define IN /* nothing */ /****************************************************************************** ******************************************************************************* ************** ************ ************** IMPLEMENTATION OF QUICK MAP ************ ************** ************ ******************************************************************************* ******************************************************************************/ /* Forward declarations: */ static void ips_cl_qmap_init( IN cl_qmap_t *p_map, IN cl_map_item_t* const root, IN cl_map_item_t* const nil); static void ips_cl_qmap_insert_item( IN cl_qmap_t* const p_map, IN cl_map_item_t* const p_item); static void ips_cl_qmap_remove_item( IN cl_qmap_t* const p_map, IN cl_map_item_t* const p_item); static cl_map_item_t* ips_cl_qmap_successor( IN cl_qmap_t* const p_map, IN const cl_map_item_t* p_item); #ifndef RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR static cl_map_item_t* ips_cl_qmap_predecessor( IN cl_qmap_t* const p_map, IN const cl_map_item_t* p_item); #endif #if defined(RBTREE_GET_LEFTMOST) static cl_map_item_t* ips_cl_qmap_search( IN cl_qmap_t* const p_map, IN unsigned long start, IN unsigned long end); #else static cl_map_item_t* ips_cl_qmap_searchv( cl_qmap_t* const p_map, const RBTREE_MI_PL *key); #endif /* * Get the root. */ static inline cl_map_item_t* __cl_map_root( IN const cl_qmap_t* const p_map ) { RBTREE_ASSERT( p_map ); return( p_map->root->p_left ); } /* * Returns whether a given item is on the left of its parent. */ static int __cl_map_is_left_child( IN const cl_map_item_t* const p_item ) { RBTREE_ASSERT( p_item ); RBTREE_ASSERT( p_item->p_up ); RBTREE_ASSERT( p_item->p_up != p_item ); return( p_item->p_up->p_left == p_item ); } /* * Retrieve the pointer to the parent's pointer to an item. */ static cl_map_item_t** __cl_map_get_parent_ptr_to_item( IN cl_map_item_t* const p_item ) { RBTREE_ASSERT( p_item ); RBTREE_ASSERT( p_item->p_up ); RBTREE_ASSERT( p_item->p_up != p_item ); if( __cl_map_is_left_child( p_item ) ) return( &p_item->p_up->p_left ); RBTREE_ASSERT( p_item->p_up->p_right == p_item ); return( &p_item->p_up->p_right ); } /* * Rotate a node to the left. This rotation affects the least number of links * between nodes and brings the level of C up by one while increasing the depth * of A one. Note that the links to/from W, X, Y, and Z are not affected. * * R R * | | * A C * / \ / \ * W C A Z * / \ / \ * B Z W B * / \ / \ * X Y X Y */ static void __cl_map_rot_left( IN cl_qmap_t* const p_map, IN cl_map_item_t* const p_item ) { cl_map_item_t **pp_root; RBTREE_ASSERT( p_map ); RBTREE_ASSERT( p_item ); RBTREE_ASSERT( p_item->p_right != p_map->nil_item ); pp_root = __cl_map_get_parent_ptr_to_item( p_item ); /* Point R to C instead of A. */ *pp_root = p_item->p_right; /* Set C's parent to R. */ (*pp_root)->p_up = p_item->p_up; /* Set A's right to B */ p_item->p_right = (*pp_root)->p_left; /* * Set B's parent to A. We trap for B being NIL since the * caller may depend on NIL not changing. */ if( (*pp_root)->p_left != p_map->nil_item ) (*pp_root)->p_left->p_up = p_item; /* Set C's left to A. */ (*pp_root)->p_left = p_item; /* Set A's parent to C. */ p_item->p_up = *pp_root; } /* * Rotate a node to the right. 
This rotation affects the least number of links * between nodes and brings the level of A up by one while increasing the depth * of C one. Note that the links to/from W, X, Y, and Z are not affected. * * R R * | | * C A * / \ / \ * A Z W C * / \ / \ * W B B Z * / \ / \ * X Y X Y */ static void __cl_map_rot_right( IN cl_qmap_t* const p_map, IN cl_map_item_t* const p_item ) { cl_map_item_t **pp_root; RBTREE_ASSERT( p_map ); RBTREE_ASSERT( p_item ); RBTREE_ASSERT( p_item->p_left != p_map->nil_item ); /* Point R to A instead of C. */ pp_root = __cl_map_get_parent_ptr_to_item( p_item ); (*pp_root) = p_item->p_left; /* Set A's parent to R. */ (*pp_root)->p_up = p_item->p_up; /* Set C's left to B */ p_item->p_left = (*pp_root)->p_right; /* * Set B's parent to C. We trap for B being NIL since the * caller may depend on NIL not changing. */ if( (*pp_root)->p_right != p_map->nil_item ) (*pp_root)->p_right->p_up = p_item; /* Set A's right to C. */ (*pp_root)->p_right = p_item; /* Set C's parent to A. */ p_item->p_up = *pp_root; } /* * Balance a tree starting at a given item back to the root. */ static void __cl_map_ins_bal( IN cl_qmap_t* const p_map, IN cl_map_item_t* p_item ) { cl_map_item_t* p_grand_uncle; RBTREE_ASSERT( p_map ); RBTREE_ASSERT( p_item ); RBTREE_ASSERT( p_item != p_map->root ); while( p_item->p_up->color == CL_MAP_RED ) { if( __cl_map_is_left_child( p_item->p_up ) ) { p_grand_uncle = p_item->p_up->p_up->p_right; RBTREE_ASSERT( p_grand_uncle ); if( p_grand_uncle->color == CL_MAP_RED ) { p_grand_uncle->color = CL_MAP_BLACK; p_item->p_up->color = CL_MAP_BLACK; p_item->p_up->p_up->color = CL_MAP_RED; p_item = p_item->p_up->p_up; continue; } if( !__cl_map_is_left_child( p_item ) ) { p_item = p_item->p_up; __cl_map_rot_left( p_map, p_item ); } p_item->p_up->color = CL_MAP_BLACK; p_item->p_up->p_up->color = CL_MAP_RED; __cl_map_rot_right( p_map, p_item->p_up->p_up ); } else { p_grand_uncle = p_item->p_up->p_up->p_left; RBTREE_ASSERT( p_grand_uncle ); if( p_grand_uncle->color == CL_MAP_RED ) { p_grand_uncle->color = CL_MAP_BLACK; p_item->p_up->color = CL_MAP_BLACK; p_item->p_up->p_up->color = CL_MAP_RED; p_item = p_item->p_up->p_up; continue; } if( __cl_map_is_left_child( p_item ) ) { p_item = p_item->p_up; __cl_map_rot_right( p_map, p_item ); } p_item->p_up->color = CL_MAP_BLACK; p_item->p_up->p_up->color = CL_MAP_RED; __cl_map_rot_left( p_map, p_item->p_up->p_up ); } } } static void ips_cl_qmap_init( IN cl_qmap_t *p_map, IN cl_map_item_t* const root, IN cl_map_item_t* const nil_item) { RBTREE_ASSERT( p_map ); RBTREE_ASSERT( root ); RBTREE_ASSERT( nil_item ); memset(p_map,0,sizeof(cl_qmap_t)); p_map->root = root; /* setup the RB tree map */ p_map->nil_item = nil_item; p_map->root->p_up = p_map->root; p_map->root->p_left = p_map->nil_item; p_map->root->p_right = p_map->nil_item; p_map->root->color = CL_MAP_BLACK; p_map->nil_item->p_up = p_map->nil_item; p_map->nil_item->p_left = p_map->nil_item; p_map->nil_item->p_right = p_map->nil_item; p_map->nil_item->color = CL_MAP_BLACK; } static void ips_cl_qmap_insert_item( IN cl_qmap_t* const p_map, IN cl_map_item_t* const p_item ) { cl_map_item_t *p_insert_at, *p_comp_item; int compare_res = 0; RBTREE_ASSERT( p_map ); RBTREE_ASSERT( p_item ); RBTREE_ASSERT( p_map->root->p_up == p_map->root ); RBTREE_ASSERT( p_map->root->color != CL_MAP_RED ); RBTREE_ASSERT( p_map->nil_item->color != CL_MAP_RED ); /* Find the insertion location. 
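 * The loop below walks from the root toward a leaf, comparing the new
 * item's key at each step; p_insert_at trails one node behind p_comp_item
 * and ends up as the node to which the new item is attached.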
*/ p_insert_at = p_map->root; p_comp_item = __cl_map_root( p_map ); while( p_comp_item != p_map->nil_item ) { p_insert_at = p_comp_item; /* Traverse the tree until the correct insertion point is found. */ #ifdef RBTREE_GET_LEFTMOST if( RBTREE_GET_LEFTMOST(&p_item->payload) < RBTREE_GET_LEFTMOST(&p_insert_at->payload) ) #else if(RBTREE_CMP(&p_item->payload, &p_insert_at->payload) < 0) #endif { p_comp_item = p_insert_at->p_left; compare_res = 1; } else { p_comp_item = p_insert_at->p_right; compare_res = -1; } } RBTREE_ASSERT( p_insert_at != p_map->nil_item ); RBTREE_ASSERT( p_comp_item == p_map->nil_item ); /* Insert the item. */ p_item->p_left = p_map->nil_item; p_item->p_right = p_map->nil_item; p_item->color = CL_MAP_RED; if( p_insert_at == p_map->root ) { p_insert_at->p_left = p_item; } else if( compare_res > 0 ) /* key < p_insert_at->key */ { p_insert_at->p_left = p_item; } else { p_insert_at->p_right = p_item; } /* Increase the count. */ RBTREE_MAP_COUNT(&p_map->payload)++; p_item->p_up = p_insert_at; /* * We have added depth to this section of the tree. * Rebalance as necessary as we retrace our path through the tree * and update colors. */ __cl_map_ins_bal( p_map, p_item ); __cl_map_root( p_map )->color = CL_MAP_BLACK; /* * Note that it is not necessary to re-color the nil node black because all * red color assignments are made via the p_up pointer, and nil is never * set as the value of a p_up pointer. */ } static void __cl_map_del_bal( IN cl_qmap_t* const p_map, IN cl_map_item_t* p_item ) { cl_map_item_t *p_uncle; while( (p_item->color != CL_MAP_RED) && (p_item->p_up != p_map->root) ) { if( __cl_map_is_left_child( p_item ) ) { p_uncle = p_item->p_up->p_right; if( p_uncle->color == CL_MAP_RED ) { p_uncle->color = CL_MAP_BLACK; p_item->p_up->color = CL_MAP_RED; __cl_map_rot_left( p_map, p_item->p_up ); p_uncle = p_item->p_up->p_right; } if( p_uncle->p_right->color != CL_MAP_RED ) { if( p_uncle->p_left->color != CL_MAP_RED ) { p_uncle->color = CL_MAP_RED; p_item = p_item->p_up; continue; } p_uncle->p_left->color = CL_MAP_BLACK; p_uncle->color = CL_MAP_RED; __cl_map_rot_right( p_map, p_uncle ); p_uncle = p_item->p_up->p_right; } p_uncle->color = p_item->p_up->color; p_item->p_up->color = CL_MAP_BLACK; p_uncle->p_right->color = CL_MAP_BLACK; __cl_map_rot_left( p_map, p_item->p_up ); break; } else { p_uncle = p_item->p_up->p_left; if( p_uncle->color == CL_MAP_RED ) { p_uncle->color = CL_MAP_BLACK; p_item->p_up->color = CL_MAP_RED; __cl_map_rot_right( p_map, p_item->p_up ); p_uncle = p_item->p_up->p_left; } if( p_uncle->p_left->color != CL_MAP_RED ) { if( p_uncle->p_right->color != CL_MAP_RED ) { p_uncle->color = CL_MAP_RED; p_item = p_item->p_up; continue; } p_uncle->p_right->color = CL_MAP_BLACK; p_uncle->color = CL_MAP_RED; __cl_map_rot_left( p_map, p_uncle ); p_uncle = p_item->p_up->p_left; } p_uncle->color = p_item->p_up->color; p_item->p_up->color = CL_MAP_BLACK; p_uncle->p_left->color = CL_MAP_BLACK; __cl_map_rot_right( p_map, p_item->p_up ); break; } } p_item->color = CL_MAP_BLACK; } static void ips_cl_qmap_remove_item( IN cl_qmap_t* const p_map, IN cl_map_item_t* const p_item ) { cl_map_item_t *p_child, *p_del_item; RBTREE_ASSERT( p_map ); RBTREE_ASSERT( p_item ); if( p_item == p_map->nil_item ) return; if( (p_item->p_right == p_map->nil_item) || (p_item->p_left == p_map->nil_item ) ) { /* The item being removed has children on at most one side. */ p_del_item = p_item; } else { /* * The item being removed has children on both sides. * We select the item that will replace it.
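 * (The substitute chosen below is the in-order successor, which by
 * construction has no left child.)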
After removing * the substitute item and rebalancing, the tree will have the * correct topology. Exchanging the substitute for the item * will finalize the removal. */ p_del_item = ips_cl_qmap_successor(p_map, p_item); RBTREE_ASSERT( p_del_item != p_map->nil_item ); } RBTREE_MAP_COUNT(&p_map->payload)--; /* Get the pointer to the new root's child, if any. */ if( p_del_item->p_left != p_map->nil_item ) p_child = p_del_item->p_left; else p_child = p_del_item->p_right; /* * This assignment may modify the parent pointer of the nil node. * This is inconsequential. */ p_child->p_up = p_del_item->p_up; (*__cl_map_get_parent_ptr_to_item( p_del_item )) = p_child; if( p_del_item->color != CL_MAP_RED ) __cl_map_del_bal( p_map, p_child ); /* * Note that the splicing done below does not need to occur before * the tree is balanced, since the actual topology changes are made by the * preceding code. The topology is preserved by the color assignment made * below (reader should be reminded that p_del_item == p_item in some cases). */ if( p_del_item != p_item ) { /* * Finalize the removal of the specified item by exchanging it with * the substitute which we removed above. */ p_del_item->p_up = p_item->p_up; p_del_item->p_left = p_item->p_left; p_del_item->p_right = p_item->p_right; (*__cl_map_get_parent_ptr_to_item( p_item )) = p_del_item; p_item->p_right->p_up = p_del_item; p_item->p_left->p_up = p_del_item; p_del_item->color = p_item->color; } RBTREE_ASSERT( p_map->nil_item->color != CL_MAP_RED ); } static cl_map_item_t * ips_cl_qmap_successor( IN cl_qmap_t* const p_map, IN const cl_map_item_t* p_item ) { cl_map_item_t *p_tmp; p_tmp = p_item->p_right; if (p_tmp != p_map->nil_item) { while (p_tmp->p_left != p_map->nil_item) p_tmp = p_tmp->p_left; return p_tmp; } else { p_tmp = p_item->p_up; while (p_tmp->p_right == p_item) { p_item = p_tmp; p_tmp = p_tmp->p_up; } if (p_tmp == p_map->root) return p_map->nil_item; return p_tmp; } } // When includer defines RBTREE_CMP, ips_cl_qmap_search() is not emitted. // When this happens, ips_cl_qmap_predecessor() may not be called. // Combined with -Werror -Wunused-function, libpsm2 fails to build. // So provide macro to control emitting this function #ifndef RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR static cl_map_item_t * ips_cl_qmap_predecessor( IN cl_qmap_t* const p_map, IN const cl_map_item_t* p_item ) { cl_map_item_t *p_tmp; p_tmp = p_item->p_left; if (p_tmp != p_map->nil_item) { while (p_tmp->p_right != p_map->nil_item) p_tmp = p_tmp->p_right; return p_tmp; } else { p_tmp = p_item->p_up; while (p_tmp->p_left == p_item) { p_item = p_tmp; p_tmp = p_tmp->p_up; } if (p_tmp == p_map->root) return p_map->nil_item; return p_tmp; } } #endif /* RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR */ #if defined(RBTREE_GET_LEFTMOST) /* * return the first node with buffer overlapping or zero. */ static cl_map_item_t * ips_cl_qmap_search(cl_qmap_t * const p_map, unsigned long start, unsigned long end) { cl_map_item_t *p_item, *p_tmp; RBTREE_ASSERT( p_map ); p_item = __cl_map_root(p_map); while (p_item != p_map->nil_item) { if (start > RBTREE_GET_LEFTMOST(&p_item->payload)) { p_tmp = p_item->p_right; if (p_tmp != p_map->nil_item) { p_item = p_tmp; continue; } /* * p_item is on immediate left side of 'start'. */ if (start >= RBTREE_GET_RIGHTMOST(&p_item->payload)) { /* * p_item is on immediate right * side of 'start'. 
*/ p_item = ips_cl_qmap_successor(p_map, p_item); if (p_item != p_map->nil_item && end <= RBTREE_GET_LEFTMOST(&p_item->payload)) p_item = p_map->nil_item; } } else if (start < RBTREE_GET_LEFTMOST(&p_item->payload)) { p_tmp = p_item->p_left; if (p_tmp != p_map->nil_item) { p_item = p_tmp; continue; } /* * p_tmp is on immediate left side of 'start'. */ p_tmp = ips_cl_qmap_predecessor(p_map, p_item); if (p_tmp == p_map->nil_item || (start >= RBTREE_GET_RIGHTMOST(&p_tmp->payload))) { /* * p_item is on immediate right * side of 'start'. */ if (end <= RBTREE_GET_LEFTMOST(&p_item->payload)) p_item = p_map->nil_item; } else p_item = p_tmp; } break; } return p_item; } #else /* defined(...LEFTMOST) || defined(...RIGHTMOST) */ static cl_map_item_t * ips_cl_qmap_searchv(cl_qmap_t * const p_map, const RBTREE_MI_PL *key) { RBTREE_ASSERT( p_map ); cl_map_item_t *p_item = __cl_map_root(p_map); while (p_item != p_map->nil_item) { if (RBTREE_CMP(key, &p_item->payload) > 0) { p_item = p_item->p_right; } else if (RBTREE_CMP(key, &p_item->payload) < 0) { p_item = p_item->p_left; } else { break; } } return p_item; } #endif /* defined(...LEFTMOST) || defined(...RIGHTMOST) */ opa-psm2-PSM2_11.2.185/include/rbtree.h000066400000000000000000000057501370564314600173070ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef __RBTREE_H__ #define __RBTREE_H__ #include #ifndef RBTREE_MAP_PL #error "You must define RBTREE_MAP_PL before including rbtree.h" #endif #ifndef RBTREE_MI_PL #error "You must define RBTREE_MI_PL before including rbtree.h" #endif /* * Red-Black tid cache definition. */ typedef struct _cl_map_item { struct _cl_map_item *p_left; /* left pointer */ struct _cl_map_item *p_right; /* right pointer */ struct _cl_map_item *p_up; /* up pointer */ uint16_t color; /* red-black color */ RBTREE_MI_PL payload; } cl_map_item_t; typedef struct _cl_qmap { cl_map_item_t *root; /* root node pointer */ cl_map_item_t *nil_item; /* terminator node pointer */ RBTREE_MAP_PL payload; } cl_qmap_t; #define CL_MAP_RED 0 #define CL_MAP_BLACK 1 #endif opa-psm2-PSM2_11.2.185/libpsm2.spec.in000066400000000000000000000133211370564314600170520ustar00rootroot00000000000000# # This file is provided under a dual BSD/GPLv2 license. When using or # redistributing this file, you may do so under either license. # # GPL LICENSE SUMMARY # # Copyright(c) 2017 Intel Corporation. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # Contact Information: # Intel Corporation, www.intel.com # # BSD LICENSE # # Copyright(c) 2017 Intel Corporation. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # * Neither the name of Intel Corporation nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# Summary: Intel PSM2 Libraries Name: @RPM_NAME@ Version: @VERSION@ Release: 1@SPEC_FILE_RELEASE_DIST@ License: BSD or GPLv2 URL: https://github.com/intel/opa-psm2/ # The tarball can be created by: # git clone https://github.com/intel/opa-psm2 # cd opa-psm2 # git checkout @DIST_SHA@ # make dist Source0: @RPM_NAME@-%{version}.tar.gz # The OPA product is supported on x86_64 only: ExclusiveArch: x86_64 BuildRequires: gcc Provides: hfi1-psm Obsoletes: hfi1-psm < 1.0.0 %if "@RPM_NAME_BASEEXT@" %package -n @RPM_NAME@@RPM_NAME_BASEEXT@ Summary: Intel PSM2 Libraries %endif Provides: @RPM_NAME@ = %{version}-%{release} Provides: @RPM_NAME@%{_isa} = %{version}-%{release} %if 0%{?suse_version} BuildRequires: libnuma-devel Requires: libnuma1 %else %if 0%{?rhel}==0 || 0%{?rhel} > 6 BuildRequires: systemd BuildRequires: numactl-devel Requires: numactl-libs %endif %endif %package -n @RPM_NAME@-devel Summary: Development files for Intel PSM2 Requires: %{name}%{?_isa} = %{version}-%{release} Provides: hfi1-psm-devel Obsoletes: hfi1-psm-devel < 1.0.0 %package -n @RPM_NAME@-compat Summary: Compat library for Intel PSM2 Requires: %{name}%{?_isa} = %{version}-%{release} %if 0%{?fedora} Requires: systemd-udev %endif Provides: hfi1-psm-compat Obsoletes: hfi1-psm-compat < 1.0.0 # If an alternate basename is defined, like in SLES >=12.3 # Then we generate a different base src.rpm, so use this # description instead. %if "@RPM_NAME_BASEEXT@" %description The source code for the PSM2 messaging API, libpsm2. A low-level user-level communications interface for the Intel(R) OPA family of products. PSM2 users are enabled with mechanisms necessary to implement higher level communications interfaces in parallel environments. %endif # In distro's other than SLES >=12.3 we use a single description # for both the .src.rpm and the base binary rpm. As the # RPM_NAME_BASEEXT defaults to empty contents. %description -n @RPM_NAME@@RPM_NAME_BASEEXT@ PSM2 Messaging API, or PSM2 API, is the low-level user-level communications interface for the Intel(R) OPA family of products. PSM2 users are enabled with mechanisms necessary to implement higher level communications interfaces in parallel environments. %description -n @RPM_NAME@-devel Intel(R) PSM2, psm2*.h, headers and libpsm2.so files necessary for developing software using libpsm2. %description -n @RPM_NAME@-compat Support for MPIs linked with PSM versions < 2. This will allow software compiled to use Intel(R) Truescale PSM, libinfinipath, to run with Intel(R) OPA PSM2, libpsm2. %prep %setup -q -n @RPM_NAME@-%{version} %build make %{?_smp_mflags} %install %make_install %post -p /sbin/ldconfig %postun -p /sbin/ldconfig %files -n @RPM_NAME@@RPM_NAME_BASEEXT@ %if 0%{?rhel} && 0%{?rhel} < 7 %{!?_licensedir:%global license %doc} %endif %license COPYING %{_libdir}/@TARGLIB@.so.@MAJOR@.@MINOR@ %{_libdir}/@TARGLIB@.so.@MAJOR@ @40_PSM_RULES@ %files -n @RPM_NAME@-devel %{_libdir}/@TARGLIB@.so %{_libdir}/@TARGLIB@.a %{_includedir}/psm2.h %{_includedir}/psm2_mq.h %{_includedir}/psm2_am.h %{_includedir}/hfi1diag %files -n @RPM_NAME@-compat %{_libdir}/psm2-compat %if 0%{?rhel} && 0%{?rhel} < 7 @UDEVDIR@/rules.d/40-psm-compat.rules %else %{_udevrulesdir}/40-psm-compat.rules %endif @LIBPSM2_COMPAT_SYM_CONF_DIR@/modprobe.d/libpsm2-compat.conf %{_prefix}/lib/libpsm2 %changelog * Wed Aug 30 2017 Rusell McGuire - Adjust RPM names to match SLES 12.3 distro names * Tue Apr 05 2016 Paul Reger - Upstream PSM2 source code for Fedora. 
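A note on the file above: libpsm2.spec.in is a template, not a finished spec. The @RPM_NAME@, @VERSION@, @TARGLIB@, @MAJOR@ and @MINOR@ tokens are substituted by the top-level Makefile's specfile target (invoked from makesrpm.sh, further below) before rpmbuild ever sees the file. Purely as an illustration, with assumed values RPM_NAME=libpsm2, TARGLIB=libpsm2, MAJOR=2 and MINOR=2, the base %files stanza would render roughly as:

%files -n libpsm2
%license COPYING
%{_libdir}/libpsm2.so.2.2
%{_libdir}/libpsm2.so.2

The real values are computed at build time; the ones shown here are only a plausible example.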
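The red-black tree template that precedes this spec file (include/rbtree.h plus the implementation code) is only ever compiled through its includers: a user defines the payload types and a comparator, then includes both files. A minimal sketch of such an instantiation follows; the type names, field names and the demo_ prefix are invented for illustration, and this may not be the complete set of macros the implementation expects:

/*
 * Hypothetical instantiation of the rbtree template. With RBTREE_CMP
 * defined, ips_cl_qmap_searchv() is emitted instead of the
 * LEFTMOST/RIGHTMOST range search shown above.
 */
#include <stdint.h>

typedef struct { unsigned long start; unsigned long length; } demo_item_pl;
typedef struct { uint32_t nitems; } demo_map_pl;

#define RBTREE_MI_PL  demo_item_pl
#define RBTREE_MAP_PL demo_map_pl
#define RBTREE_CMP(a, b) (((a)->start < (b)->start) ? -1 : \
                         (((a)->start > (b)->start) ?  1 : 0))
#define RBTREE_ASSERT(x)
#define RBTREE_MAP_COUNT(pl) ((pl)->nitems)
/* Avoid -Wunused-function on the predecessor helper, as the comment in
 * the implementation explains. */
#define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR

#include "rbtree.h"
#include "rbtree.c"	/* the implementation whose tail appears above */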
opa-psm2-PSM2_11.2.185/libuuid/000077500000000000000000000000001370564314600156565ustar00rootroot00000000000000opa-psm2-PSM2_11.2.185/libuuid/Makefile000066400000000000000000000064051370564314600173230ustar00rootroot00000000000000# # This file is provided under a dual BSD/GPLv2 license. When using or # redistributing this file, you may do so under either license. # # GPL LICENSE SUMMARY # # Copyright(c) 2015 Intel Corporation. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # Contact Information: # Intel Corporation, www.intel.com # # BSD LICENSE # # Copyright(c) 2015 Intel Corporation. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # * Neither the name of Intel Corporation nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # Copyright (c) 2003-2014 Intel Corporation. All rights reserved. # OUTDIR = . this_srcdir := $(shell readlink -m .) top_srcdir := $(this_srcdir)/.. BASECFLAGS += -DPSM_UUID=1 -Wno-unused-function INCLUDES += -I$(top_srcdir) ${TARGLIB}-objs := psm_uuid.o parse.o pack.o unpack.o unparse.o ${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs}) DEPS := $(${TARGLIB}-objs:.o=.d) .PHONY: all clean IGNORE_DEP_TARGETS = clean all .DEFAULT: ${${TARGLIB}-objs} $(OUTDIR)/%.d: $(this_srcdir)/%.c $(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o) $(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS} $(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -c $< -o $@ clean: @if [ -d $(OUTDIR) ]; then \ cd $(OUTDIR); \ rm -f *.o *.d *.gcda *.gcno; \ cd -; \ fi #ifeq prevents the deps from being included during clean #-include line is required to pull in auto-dependecies during 2nd pass ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),) -include ${DEPS} endif install: @echo "Nothing to do for install." 
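One detail of the Makefile above worth calling out: the %.d pattern rule makes the compiler itself produce a per-source dependency fragment (-MM omits system headers, -MF names the output file, -MQ sets the rule's target to the matching object), and the ifeq guard keeps 'make clean' from regenerating fragments it is about to delete. Assuming pack.c pulls in psm_uuid.h, a generated pack.d would look something like:

./pack.o: /path/to/libuuid/pack.c /path/to/libuuid/psm_uuid.h

The paths here are placeholders; the real ones come from $(this_srcdir) at build time.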
opa-psm2-PSM2_11.2.185/libuuid/compare.c000066400000000000000000000041321370564314600174500ustar00rootroot00000000000000/* * compare.c --- compare whether or not two UUID's are the same * * Returns 0 if the two UUID's are different, and 1 if they are the same. * * Copyright (C) 1996, 1997 Theodore Ts'o. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include "psm_uuid.h" #include #define UUCMP(u1,u2) if (u1 != u2) return((u1 < u2) ? -1 : 1); int uuid_compare(const uuid_t uu1, const uuid_t uu2) { struct uuid uuid1, uuid2; uuid_unpack(uu1, &uuid1); uuid_unpack(uu2, &uuid2); UUCMP(uuid1.time_low, uuid2.time_low); UUCMP(uuid1.time_mid, uuid2.time_mid); UUCMP(uuid1.time_hi_and_version, uuid2.time_hi_and_version); UUCMP(uuid1.clock_seq, uuid2.clock_seq); return memcmp(uuid1.node, uuid2.node, 6); } opa-psm2-PSM2_11.2.185/libuuid/pack.c000066400000000000000000000043201370564314600167370ustar00rootroot00000000000000/* * Internal routine for packing UUID's * * Copyright (C) 1996, 1997 Theodore Ts'o. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include #include #include "psm_user.h" #include "psm_uuid.h" void uuid_pack(const struct uuid *uu, uuid_t ptr) { uint32_t tmp; unsigned char *out = ptr; tmp = uu->time_low; out[3] = (unsigned char) tmp; tmp >>= 8; out[2] = (unsigned char) tmp; tmp >>= 8; out[1] = (unsigned char) tmp; tmp >>= 8; out[0] = (unsigned char) tmp; tmp = uu->time_mid; out[5] = (unsigned char) tmp; tmp >>= 8; out[4] = (unsigned char) tmp; tmp = uu->time_hi_and_version; out[7] = (unsigned char) tmp; tmp >>= 8; out[6] = (unsigned char) tmp; tmp = uu->clock_seq; out[9] = (unsigned char) tmp; tmp >>= 8; out[8] = (unsigned char) tmp; memcpy(out+10, uu->node, 6); } opa-psm2-PSM2_11.2.185/libuuid/parse.c000066400000000000000000000045541370564314600171440ustar00rootroot00000000000000/* * parse.c --- UUID parsing * * Copyright (C) 1996, 1997 Theodore Ts'o. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. 
*/ #include <stdio.h> #include <ctype.h> #include <stdlib.h> #include <string.h> #include "psm_user.h" #include "psm_uuid.h" int uuid_parse(const char *in, uuid_t uu) { struct uuid uuid; int i; const char *cp; char buf[3]; if (strlen(in) != 36) return -1; for (i=0, cp = in; i <= 36; i++,cp++) { if ((i == 8) || (i == 13) || (i == 18) || (i == 23)) { if (*cp == '-') continue; else return -1; } if (i == 36) if (*cp == 0) continue; if (!isxdigit(*cp)) return -1; } uuid.time_low = strtoul(in, NULL, 16); uuid.time_mid = strtoul(in+9, NULL, 16); uuid.time_hi_and_version = strtoul(in+14, NULL, 16); uuid.clock_seq = strtoul(in+19, NULL, 16); cp = in+24; buf[2] = 0; for (i=0; i < 6; i++) { buf[0] = *cp++; buf[1] = *cp++; uuid.node[i] = strtoul(buf, NULL, 16); } uuid_pack(&uuid, uu); return 0; } opa-psm2-PSM2_11.2.185/libuuid/psm_uuid.c000066400000000000000000000066611370564314600176600ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved.
*/ #include #include #include #include "psm_user.h" #include "psm_uuid.h" static void psmi_make_drand_uuid(psm2_uuid_t uuid_out) { struct drand48_data drand48_data; int i; long int rnum; srand48_r((get_cycles() + getpid()) % LONG_MAX, &drand48_data); for(i=0; i < 16; i++) { lrand48_r(&drand48_data, &rnum); uuid_out[i] = rnum % UCHAR_MAX; } } /* Since libuuid can call srand, we will generate our own uuids */ void __psm2_uuid_generate(psm2_uuid_t uuid_out) { PSM2_LOG_MSG("entering"); /* Prefer using urandom, fallback to drand48_r */ struct stat urandom_stat; size_t nbytes; int fd; if(stat("/dev/urandom", &urandom_stat) != 0) { psmi_make_drand_uuid(uuid_out); return; } fd = open("/dev/urandom", O_RDONLY); if(fd == -1) { psmi_make_drand_uuid(uuid_out); } else { nbytes = read(fd, (char *) uuid_out, 16); if(nbytes != 16) { psmi_make_drand_uuid(uuid_out); } close(fd); } PSM2_LOG_MSG("leaving"); return; } PSMI_API_DECL(psm2_uuid_generate) void psmi_uuid_unparse(const uuid_t uu, char *out) { uuid_unparse_lower(uu, out); } int psmi_uuid_parse(const char *in, uuid_t uu) { return uuid_parse(in, uu); } opa-psm2-PSM2_11.2.185/libuuid/psm_uuid.h000066400000000000000000000060031370564314600176530ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ #ifndef _PSM_UUID_H #define _PSM_UUID_H struct uuid { uint32_t time_low; uint16_t time_mid; uint16_t time_hi_and_version; uint16_t clock_seq; uint8_t node[6]; }; typedef unsigned char uuid_t[16]; int psmi_uuid_parse(const char *in, psm2_uuid_t uu); void psmi_uuid_unparse(const psm2_uuid_t uuid, char *out); int psmi_uuid_compare(const psm2_uuid_t uuA, const psm2_uuid_t uuB); int uuid_compare(const uuid_t uu1, const uuid_t uu2); void uuid_pack(const struct uuid *uu, uuid_t ptr); void uuid_unparse(const uuid_t uu, char *out); void uuid_unparse_upper(const uuid_t uu, char *out); void uuid_unparse_lower(const uuid_t uu, char *out); void uuid_unpack(const uuid_t in, struct uuid *uu); int uuid_parse(const char *in, uuid_t uu); #endif opa-psm2-PSM2_11.2.185/libuuid/unpack.c000066400000000000000000000040641370564314600173070ustar00rootroot00000000000000/* * Internal routine for unpacking UUID * * Copyright (C) 1996, 1997 Theodore Ts'o. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include #include #include "psm_user.h" #include "psm_uuid.h" void uuid_unpack(const uuid_t in, struct uuid *uu) { const uint8_t *ptr = in; uint32_t tmp; tmp = *ptr++; tmp = (tmp << 8) | *ptr++; tmp = (tmp << 8) | *ptr++; tmp = (tmp << 8) | *ptr++; uu->time_low = tmp; tmp = *ptr++; tmp = (tmp << 8) | *ptr++; uu->time_mid = tmp; tmp = *ptr++; tmp = (tmp << 8) | *ptr++; uu->time_hi_and_version = tmp; tmp = *ptr++; tmp = (tmp << 8) | *ptr++; uu->clock_seq = tmp; memcpy(uu->node, ptr, 6); } opa-psm2-PSM2_11.2.185/libuuid/unparse.c000066400000000000000000000047251370564314600175070ustar00rootroot00000000000000/* * unparse.c -- convert a UUID to string * * Copyright (C) 1996, 1997 Theodore Ts'o. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include #include "psm_user.h" #include "psm_uuid.h" static const char *fmt_lower = "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x"; static const char *fmt_upper = "%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X"; #ifdef UUID_UNPARSE_DEFAULT_UPPER #define FMT_DEFAULT fmt_upper #else #define FMT_DEFAULT fmt_lower #endif static void uuid_unparse_x(const uuid_t uu, char *out, const char *fmt) { struct uuid uuid; uuid_unpack(uu, &uuid); sprintf(out, fmt, uuid.time_low, uuid.time_mid, uuid.time_hi_and_version, uuid.clock_seq >> 8, uuid.clock_seq & 0xFF, uuid.node[0], uuid.node[1], uuid.node[2], uuid.node[3], uuid.node[4], uuid.node[5]); } void uuid_unparse_lower(const uuid_t uu, char *out) { uuid_unparse_x(uu, out, fmt_lower); } void uuid_unparse_upper(const uuid_t uu, char *out) { uuid_unparse_x(uu, out, fmt_upper); } void uuid_unparse(const uuid_t uu, char *out) { uuid_unparse_x(uu, out, FMT_DEFAULT); } opa-psm2-PSM2_11.2.185/makesdeb.sh000077500000000000000000000126501370564314600163370ustar00rootroot00000000000000#!/bin/bash # # This file is provided under a dual BSD/GPLv2 license. When using or # redistributing this file, you may do so under either license. # # GPL LICENSE SUMMARY # # Copyright(c) 2017 Intel Corporation. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # Contact Information: # Intel Corporation, www.intel.com # # BSD LICENSE # # Copyright(c) 2017 Intel Corporation. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. 
# * Neither the name of Intel Corporation nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # Stop on error set -e BUILD_OPTS="gGbBAS" BUILD_OPT= DEB_NAME=libpsm2 # OUT_DIR is where the Makefile places its meta-data OUT_DIR=build_release # Set BUILD_DIR first, so user control can override the value # This is where this script places deb(s) and uses its build meta-data. # It can be set the same as OUT_DIR, and work just fine if desired. BUILD_DIR=temp.$$ function literate() { echo $(sed "s/\B/&$2/g" <<< "$1") } function usage() { SCRIPT=${0##*/} echo "Usage: $SCRIPT [OPTIONS]" echo echo "Creates tar ball of source and source rpms by default." echo "Optionally generates binary rpm(s) " echo echo " $(literate $BUILD_OPTS ',')" echo " Optional, default is full build (source and binary)" echo " Set single extension letter for dpkg-buildpackage argument" echo " -r " echo " Optional, set the output deb name" echo " -e " echo " Optional, set a base name extension" echo " This only appends an extra string onto the base DEB name" echo " Does not affect supporting DEBs" echo " -c" echo " Optional, default is unset" echo " Sets PSM_CUDA=1, creating -cuda based manifest and debs" echo " -d " echo " Optionally sets output folder for dpkg-buildpackage to use" echo " -h" echo " Shows this screen" echo " Examples:" echo " $SCRIPT b" echo " $SCRIPT s -c" echo " $SCRIPT -" echo " $SCRIPT -d ./temp" echo " $SCRIPT b -c -d output" exit $1 } while getopts "r:e:cd:h$BUILD_OPTS" OPT; do case $OPT in r) DEB_NAME=$OPTARG ;; e) BASE_EXT=$OPTARG ;; c) export PSM_CUDA=1 DEB_EXT="-cuda" ;; d) BUILD_DIR=$OPTARG ;; h) usage 0 ;; \?) usage 1 ;; *) BUILD_OPT=-$OPT ;; esac done # Remove parsed options shift $((OPTIND-1)) # Check if we have any non-option parameters test ! $# -eq 0 && usage # Generic cleanup, build, and tmp folder creation make distclean OUTDIR=$OUT_DIR make RPM_NAME=$DEB_NAME RPM_NAME_BASEEXT=$BASE_EXT dist OUTDIR=$OUT_DIR # Prepare build area mkdir -p $BUILD_DIR/{build,binary,sources,dists} # Differnet paths based on DEB_EXT cp $OUT_DIR/$DEB_NAME-*.tar.gz $BUILD_DIR/dists/ FILE_BASE=$(basename $BUILD_DIR/dists/$DEB_NAME-*.tar.gz .tar.gz) VERSION=${FILE_BASE##$DEB_NAME-} echo Building $DEB_NAME version $VERSION... 
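# The steps below unpack the dist tarball created above, stamp
# debian/changelog with the version string just derived, and hand the tree
# to dpkg-buildpackage (-us -uc skip signing, -tc cleans the tree after the
# build). As a purely illustrative invocation, './makesdeb.sh b -d output'
# would leave the binary packages under output/binary.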
tar xzf $BUILD_DIR/dists/$DEB_NAME-$VERSION.tar.gz -C $BUILD_DIR/build (cd $BUILD_DIR/build/$DEB_NAME-$VERSION # Annotate changelog mv debian/changelog.in debian/changelog debchange --newversion=$VERSION "Bump up version to $VERSION" # Build package dpkg-buildpackage $BUILD_OPT -us -uc -tc) mv $BUILD_DIR/build/$DEB_NAME*{.tar.xz,.dsc,.changes} $BUILD_DIR/sources/ mv $BUILD_DIR/build/$DEB_NAME*{.deb,.ddeb} $BUILD_DIR/binary/ echo "The deb package(s) is (are) in $BUILD_DIR/binary/$(ls $BUILD_DIR/binary)" opa-psm2-PSM2_11.2.185/makesrpm.sh000077500000000000000000000136741370564314600164120ustar00rootroot00000000000000#!/bin/bash # # This file is provided under a dual BSD/GPLv2 license. When using or # redistributing this file, you may do so under either license. # # GPL LICENSE SUMMARY # # Copyright(c) 2015 Intel Corporation. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # Contact Information: # Intel Corporation, www.intel.com # # BSD LICENSE # # Copyright(c) 2015 Intel Corporation. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # * Neither the name of Intel Corporation nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # #It makes no sense to have both CUDA and non-CUDA in the same invocation #as they require different versions of the hfi1_user.h at this point in time. #Limiting this script to only build CUDA if requested #default BUILDARG to build source RPM only BUILDARG=s RPM_NAME=libpsm2 function usage() { echo "Usage: $0 [OPTION] [OPTION] [OPTION]" echo " " echo "Creates tar ball of source and source rpms by default." 
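# The single letters accepted below are passed straight through as
# rpmbuild -b<letter>: 's' builds just the source RPM, 'b' just the
# binaries, 'a' both, while p/c/i/l stop after the %prep, %build, or
# %install stage or the %files list check, respectively.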
echo "Optionally generates binary rpm(s) " echo " " echo " s,a,b,p,c,i,l" echo " Optional, default is s (sourcerpm)" echo " Set single extension letter for rpmbuild -b argument" echo " -r , -rpmname " echo " Optional, set the output rpm name" echo " -e , -baseext " echo " Optional, set a base name extension" echo " This only appends an extra string onto the base RPM name" echo " Does not affect supporting RPMs" echo " -c, -cuda" echo " Optional, default is unset" echo " Sets PSM_CUDA=1, creating -cuda based spec and rpms" echo " -d , -dir " echo " Optionally sets output folder for rpmbuild to use" echo " -h , -hal_gen " echo " Optional, default is to build gen1" echo " Sets hal generations for rpmbuild to use" echo " Examples:" echo " $0 b" echo " $0 s -cuda" echo " $0 -cuda" echo " $0 -d ./temp" echo " $0 b -cuda -dir output" echo " $0 -h gen1" exit 1 } err=0 # OUTDIR is where the Makefile places its meta-data OUTDIR=build_release # Set TEMPDIR first, so user control can override the value # This is where rpmbuild places rpm(s) and uses its build meta-data. # It can be set the same as OUTDIR, and work just fine if desired. TEMPDIR=temp.$$ HAL_GENS="" while [ "$1" != "" ]; do case $1 in -d | -dir) shift if [ -z "$1" ]; then usage fi TEMPDIR=$1 ;; -c | -cuda) export PSM_CUDA=1 RPM_EXT="-cuda" ;; -e | -baseext) shift if [ -z "$1" ]; then usage fi RPM_NAME_BASEEXT="$1" export RPM_NAME_BASEEXT="$1" ;; -h | -halgen) shift HAL_GENS="$1 $HAL_GENS" ;; -r | -rpmname) shift if [ -z "$1" ]; then usage fi $RPM_NAME="$1" export RPM_NAME="$1" ;; s|a|b|p|c|i|l) BUILDARG=$1 ;; * ) err=1 usage ;; esac shift done if [ "$HAL_GENS" = "" ]; then HAL_GENS="gen1" fi # Generic cleanup, build, and tmp folder creation make distclean OUTDIR=$OUTDIR make RPM_NAME=$RPM_NAME RPM_NAME_BASEEXT=$RPM_NAME_BASEEXT "PSM_HAL_ENABLE=$HAL_GENS" dist OUTDIR=$OUTDIR mkdir -p ./$TEMPDIR/{BUILD,RPMS,SOURCES,SPECS,SRPMS,BUILDROOT} # Different paths based on RPM_EXT cp ${OUTDIR}/$RPM_NAME-*.tar.gz $TEMPDIR/SOURCES make RPM_NAME=$RPM_NAME RPM_NAME_BASEEXT=$RPM_NAME_BASEEXT specfile OUTDIR=$OUTDIR cp ${OUTDIR}/$RPM_NAME.spec $TEMPDIR/SPECS rpmbuild -b$BUILDARG --define "_topdir $PWD/$TEMPDIR" --nodeps $TEMPDIR/SPECS/$RPM_NAME.spec echo "The SRPM(s) are in $TEMPDIR/SRPMS/`ls $TEMPDIR/SRPMS`" opa-psm2-PSM2_11.2.185/mpspawn/000077500000000000000000000000001370564314600157065ustar00rootroot00000000000000opa-psm2-PSM2_11.2.185/mpspawn/mpspawn_stats.h000066400000000000000000000113721370564314600207660ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef _MPSPAWN_STATS_H #define _MPSPAWN_STATS_H #include #define MPSPAWN_STATS_VERSION 1 typedef enum { MPSPAWN_STATS_TYPE_DOUBLE = 0x1, #define MPSPAWN_STATS_TYPE_DOUBLE 0x1 MPSPAWN_STATS_TYPE_HEADER = 0x2, #define MPSPAWN_STATS_TYPE_HEADER 0x2 MPSPAWN_STATS_REDUCTION_MAX = 0x1000, #define MPSPAWN_STATS_REDUCTION_MAX 0x1000 MPSPAWN_STATS_REDUCTION_MIN = 0x2000, #define MPSPAWN_STATS_REDUCTION_MIN 0x2000 MPSPAWN_STATS_REDUCTION_MEDIAN = 0x4000, #define MPSPAWN_STATS_REDUCTION_MEDIAN 0x4000 MPSPAWN_STATS_SKIP_IF_ZERO = 0x8000 #define MPSPAWN_STATS_SKIP_IF_ZERO 0x8000 } mpspawn_stats_flags; #define MPSPAWN_STATS_REDUCTION_ALL (MPSPAWN_STATS_REDUCTION_MAX | \ MPSPAWN_STATS_REDUCTION_MIN | MPSPAWN_STATS_REDUCTION_MEDIAN) #define MPSPAWN_STATS_DOUBLE_TO_U64(arg) (*((uint64_t *) &(arg))) #define MPSPAWN_NAN_U64 ((uint64_t) ~0ULL) #define MPSPAWN_ISNAN_U64(x) (((uint64_t)(x)) == MPSPAWN_NAN_U64) #define MPSPAWN_NAN ((uint64_t) ~0ULL) /* NAN) */ #define MPSPAWN_ISNAN(x) (isnan(x)) struct mpspawn_stats_add_args; /* client->mpspawn stats registration */ struct mpspawn_stats_req_args; /* mpspawn->client fn callback stats request */ struct mpspawn_stats_init_args; /* mpspawn->client "downcall" to register */ /* Clients implement this function to fill in mpspawn request for stats */ typedef void (*mpspawn_stats_req_fn) (struct mpspawn_stats_req_args *); /* mpspawn implements this function to allow clients to register new stats */ typedef void (*mpspawn_stats_add_fn) (struct mpspawn_stats_add_args *); /* mpspawn implements this function to map rank indexes into epaddr structs */ struct psm2_epaddr; typedef struct psm2_epaddr *(*mpspawn_map_epaddr_fn) (int rank); typedef struct mpspawn_stats_req_args { int version; int num; uint64_t *stats; uint16_t *flags; void *context; } mpspawn_stats_req_args_t; typedef struct mpspawn_stats_add_args { int version; int num; char *header; char **desc; uint16_t *flags; mpspawn_stats_req_fn req_fn; void *context; } mpspawn_stats_add_args_t; typedef struct mpspawn_stats_init_args { int version; psm2_mq_t mq; /* initialized mq endpoint */ int num_epaddr; /* number of endpoints in job */ mpspawn_stats_add_fn add_fn; /* function for client to add stats */ mpspawn_map_epaddr_fn epaddr_map_fn; const char *stats_types; /* stats type string mpirun -M */ } 
mpspawn_stats_init_args_t; /* Function in psm exposed to register stats */ void *psmi_stats_register(struct mpspawn_stats_init_args *args); #endif opa-psm2-PSM2_11.2.185/opa/000077500000000000000000000000001370564314600150005ustar00rootroot00000000000000opa-psm2-PSM2_11.2.185/opa/Makefile000066400000000000000000000073031370564314600164430ustar00rootroot00000000000000# # This file is provided under a dual BSD/GPLv2 license. When using or # redistributing this file, you may do so under either license. # # GPL LICENSE SUMMARY # # Copyright(c) 2015 Intel Corporation. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # Contact Information: # Intel Corporation, www.intel.com # # BSD LICENSE # # Copyright(c) 2015 Intel Corporation. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # * Neither the name of Intel Corporation nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # Copyright (c) 2003-2014 Intel Corporation. All rights reserved. # OUTDIR = . TARGLIB := libopa MAJOR := $(OPA_LIB_MAJOR) MINOR := $(OPA_LIB_MINOR) this_srcdir := $(shell readlink -m .) top_srcdir := $(this_srcdir)/.. INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips ifeq (${arch},x86_64) PLATFORM_OBJ=opa_dwordcpy-x86_64-fast.o else PLATFORM_OBJ= endif ${TARGLIB}-objs := opa_debug.o opa_time.o \ opa_service.o opa_utils.o \ opa_dwordcpy-$(arch).o opa_sysfs.o opa_syslog.o \ $(PLATFORM_OBJ) ${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs}) DEPS := $(${TARGLIB}-objs:.o=.d) .PHONY: all clean IGNORE_DEP_TARGETS = clean all .DEFAULT: ${${TARGLIB}-objs} install: all @echo "Nothing to do for install." 
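# As in the libuuid Makefile, the pattern rules below generate one .d
# dependency fragment per .c/.S source (via -MM/-MF/-MQ) and -include them
# on make's second pass; IGNORE_DEP_TARGETS keeps the 'clean' and 'all'
# goals from forcing that regeneration. Note also that on x86_64 the object
# list above adds the hand-written opa_dwordcpy-x86_64-fast.o through
# PLATFORM_OBJ.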
$(OUTDIR)/%.d: $(this_srcdir)/%.c $(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o) $(OUTDIR)/%.d: $(this_srcdir)/%.S $(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o) $(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS} $(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -c $< -o $@ $(OUTDIR)/%.o: $(this_srcdir)/%.S | ${DEPS} $(CC) $(ASFLAGS) -c $< -o $@ clean: @rm -f $(OUTDIR)/_revision.c @if [ -d $(OUTDIR) ]; then \ cd $(OUTDIR); \ rm -f *.o *.d *.gcda *.gcno ${TARGLIB}.*; \ cd -; \ fi #ifeq prevents the deps from being included during clean #-include line is required to pull in auto-dependecies during 2nd pass ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),) -include ${DEPS} endif opa-psm2-PSM2_11.2.185/opa/opa_debug.c000066400000000000000000000304161370564314600170750ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include "opa_user.h" #include "../psm_log.h" unsigned hfi_debug = 1; char *__hfi_mylabel = NULL; FILE *__hfi_dbgout; static void init_hfi_mylabel(void) __attribute__ ((constructor)); static void init_hfi_backtrace(void) __attribute__ ((constructor)); static void init_hfi_dbgfile(void) __attribute__ ((constructor)); static void fini_hfi_backtrace(void) __attribute__ ((destructor)); static void fini_hfi_mylabel(void) __attribute__ ((destructor)); static struct sigaction SIGSEGV_old_act; static struct sigaction SIGBUS_old_act; static struct sigaction SIGILL_old_act; static struct sigaction SIGABRT_old_act; static struct sigaction SIGINT_old_act; static struct sigaction SIGTERM_old_act; #ifdef HFI_BRAKE_DEBUG static void hfi_brake_debug(void) __attribute__ ((constructor)); /* How to use hfi_break_debug code: 1. Build psm with HFI_BRAKE_DEBUG set in the environment. 2. Create a script for your test case (e.g. mpistress?). In the script make sure to choose a HFI brake file that corresponds to a network file system that is common to all hosts where you will run your code. Also, in the script, make sure to propagate the "HFI_BRAKE_FILE_NAME" env var to all hosts. 3. Bring up 3 putty sessions to one of the hosts that your script uses. 4. In putty session number 1, touch the HFI_BRAKE_FILE and sync. 5. In putty session number 1, start the script. You should see messages of the form: -bash-4.2$ ./mpistress.0304.sc :5716 remove the file: "/nfs/user/HFI_BRAKE" to continue :5717 remove the file: "/nfs/user/HFI_BRAKE" to continue :3456 remove the file: "/nfs/user/HFI_BRAKE" to continue :3457 remove the file: "/nfs/user/HFI_BRAKE" to continue Note that the hostname and process id are shown for all of the processes that are started by your script. 6. In putty session 2, bring up gdb, and debug the program that is referenced in your script. For example: /usr/mpi/gcc/openmpi-1.10.2-hfi/tests/intel/mpi_stress 7. In putty session 2 / gdb, attach to one of the processes that is shown in putty session 1. 8. Note, at this point, you have only one gdb session. I leave it as an exercise to the reader to determine how to bring up multiple gdb sessions. 9. In putty session 3, rm the HFI_BRAKE_FILE. 10. You are now debugging a live session of psm. */ static void hfi_brake_debug(void) { struct stat buff; char hostname[80]; const char *hfi_brake_file_name = getenv("HFI_BRAKE_FILE_NAME"); gethostname(hostname, 80); hostname[sizeof(hostname) - 1] = '\0'; if (!hfi_brake_file_name) hfi_brake_file_name = "/tmp/HFI_BRAKE_FILE"; printf("%s:%d remove the file: \"%s\" to continue\n",hostname,getpid(),hfi_brake_file_name); while (0 == stat(hfi_brake_file_name, &buff)) { printf("%s:%d remove the file: \"%s\" to continue\n",hostname,getpid(),hfi_brake_file_name); sleep(10); } printf("%s:%d continuing.\n",hostname,getpid()); } #endif static void init_hfi_mylabel(void) { char lbl[1024]; char hostname[80]; char *e; /* By default, try to come up with a decent default label, it will be * overridden later. Try getting rank, if that's not available revert to * pid. 
*/ gethostname(hostname, 80); lbl[0] = '\0'; hostname[sizeof(hostname) - 1] = '\0'; if ((((e = getenv("PSC_MPI_RANK")) && *e)) || (((e = getenv("MPI_RANKID")) && *e)) || (((e = getenv("MPIRUN_RANK")) && *e))) { char *ep; unsigned long val; val = strtoul(e, &ep, 10); if (ep != e) /* valid conversion */ snprintf(lbl, 1024, "%s.%lu", hostname, val); } if (lbl[0] == '\0') snprintf(lbl, 1024, "%s.%u", hostname, getpid()); __hfi_mylabel = strdup(lbl); } static void fini_hfi_mylabel(void) { if(__hfi_mylabel != NULL) free(__hfi_mylabel); } /* FIXME: This signal handler does not conform to the posix standards described in 'man 7 signal' due to it calling unsafe functions. See 'CALLS UNSAFE FUNCTION' notes below for examples. */ static void hfi_sighdlr(int sig, siginfo_t *p1, void *ucv) { /* we make these static to try and avoid issues caused by stack overflow that might have gotten us here. */ static void *backaddr[128]; /* avoid stack usage */ static char buf[150], hname[64], fname[128]; static int i, j, fd, id; extern char *__progname; PSM2_LOG_DECLARE_BT_BUFFER(); /* CALLS UNSAFE FUNCTION when PSM_LOG is defined. */ PSM2_LOG_BT(100,__FUNCTION__); /* If this is a SIGINT do not display backtrace. Just invoke exit handlers */ if ((sig == SIGINT) || (sig == SIGTERM)) /* CALLS UNSAFE FUNCTION (exit) */ exit(1); /* CALLS UNSAFE FUNCTION (snprintf) */ id = snprintf(buf, sizeof(buf), "\n%.60s:%u terminated with signal %d", __progname, getpid(), sig); if (ucv) { static ucontext_t *uc; uc = (ucontext_t *) ucv; id += snprintf(buf + id, sizeof(buf) - id, " at PC=%lx SP=%lx", #if defined(__x86_64__) (unsigned long)uc->uc_mcontext.gregs[REG_RIP], (unsigned long)uc->uc_mcontext.gregs[REG_RSP]); #elif defined(__i386__) (unsigned long)uc->uc_mcontext.gregs[REG_EIP], (unsigned long)uc->uc_mcontext.gregs[REG_ESP]); #else 0ul, 0ul); #warning No stack pointer or instruction pointer for this arch #endif } id += snprintf(buf + id, sizeof(buf) - id, ". Backtrace:\n"); /* CALLS UNSAFE FUNCTION (fprintf) */ fprintf(stderr, "%.*s", id, buf); i = backtrace(backaddr, sizeof(backaddr) / sizeof(backaddr[0])); if (i > 2) /* skip ourselves and backtrace */ j = 2, i -= j; else j = 0; backtrace_symbols_fd(backaddr + j, i, 2); (void)fsync(2); /* Try to write it to a file as well, in case the rest doesn't make it out. Do it second, in case we get a second failure (more likely). We might eventually want to print some more of the registers to the btr file, to aid debugging, but not for now. Truncate the program name if overly long, so we always get pid and (at least part of) hostname. 
*/
	/* CALLS UNSAFE FUNCTION (gethostname) */
	(void)gethostname(hname, sizeof(hname));
	hname[sizeof(hname) - 1] = '\0';
	snprintf(fname, sizeof(fname), "%.80s-%u,%.32s.btr",
		 __progname, getpid(), hname);
	if ((fd = open(fname, O_CREAT | O_WRONLY, 0644)) >= 0) {
		/* CALLS UNSAFE FUNCTION (fdopen) */
		FILE *fp = fdopen(fd, "w");
		if (fp)
			fprintf(fp, "%.*s", id, buf);
		backtrace_symbols_fd(backaddr + j, i, fd);
		if (fp)
			/* CALLS UNSAFE FUNCTION (fclose) */
			fclose(fp);
	}
	switch (sig) {
	case SIGSEGV:
		(*SIGSEGV_old_act.sa_sigaction)(sig, p1, ucv);
		break;
	case SIGBUS:
		(*SIGBUS_old_act.sa_sigaction)(sig, p1, ucv);
		break;
	case SIGILL:
		(*SIGILL_old_act.sa_sigaction)(sig, p1, ucv);
		break;
	case SIGABRT:
		(*SIGABRT_old_act.sa_sigaction)(sig, p1, ucv);
		break;
	default:
		break;
	}
	exit(1);	/* not _exit(), want atexit handlers to get run */
}

/* We do this as a constructor so any user program that sets signal
   handlers for these will override our settings, but we still get
   backtraces if they don't. */
static void init_hfi_backtrace(void)
{
	/* we need to track memory corruption */
	static struct sigaction act;	/* easier than memset */
	act.sa_sigaction = hfi_sighdlr;
	act.sa_flags = SA_SIGINFO;
	if (!getenv("HFI_BACKTRACE")) {	/* permanent, although probably
					   undocumented way to disable
					   backtraces. */
		(void)sigaction(SIGSEGV, &act, &SIGSEGV_old_act);
		(void)sigaction(SIGBUS, &act, &SIGBUS_old_act);
		(void)sigaction(SIGILL, &act, &SIGILL_old_act);
		(void)sigaction(SIGABRT, &act, &SIGABRT_old_act);
		(void)sigaction(SIGINT, &act, &SIGINT_old_act);
		(void)sigaction(SIGTERM, &act, &SIGTERM_old_act);
	}
}

/* if HFI_DEBUG_FILENAME is set in the environment, then all the debug
   prints (not info and error) will go to that file. %h is expanded to
   the hostname, and %p to the pid, if present. */
static void init_hfi_dbgfile(void)
{
	char *fname = getenv("HFI_DEBUG_FILENAME");
	char *exph, *expp, tbuf[1024];
	FILE *newf;

	if (!fname) {
		__hfi_dbgout = stdout;
		return;
	}
	exph = strstr(fname, "%h");	/* hostname */
	expp = strstr(fname, "%p");	/* pid */
	if (exph || expp) {
		int baselen;
		char hname[256], pid[12];
		if (exph) {
			*hname = hname[sizeof(hname) - 1] = 0;
			gethostname(hname, sizeof(hname) - 1);
			if (!*hname)
				strcpy(hname, "[unknown]");
		}
		if (expp)
			snprintf(pid, sizeof(pid), "%d", getpid());
		if (exph && expp) {
			if (exph < expp) {
				baselen = exph - fname;
				snprintf(tbuf, sizeof(tbuf), "%.*s%s%.*s%s%s",
					 baselen, fname, hname,
					 (int)(expp - (exph + 2)), exph + 2,
					 pid, expp + 2);
			} else {
				baselen = expp - fname;
				snprintf(tbuf, sizeof(tbuf), "%.*s%s%.*s%s%s",
					 baselen, fname, pid,
					 (int)(exph - (expp + 2)), expp + 2,
					 hname, exph + 2);
			}
		} else if (exph) {
			baselen = exph - fname;
			snprintf(tbuf, sizeof(tbuf), "%.*s%s%s",
				 baselen, fname, hname, exph + 2);
		} else {
			baselen = expp - fname;
			snprintf(tbuf, sizeof(tbuf), "%.*s%s%s",
				 baselen, fname, pid, expp + 2);
		}
		fname = tbuf;
	}
	newf = fopen(fname, "a");
	if (!newf) {
		_HFI_ERROR("Unable to open \"%s\" for debug output, using stdout: %s\n",
			   fname, strerror(errno));
		__hfi_dbgout = stdout;
	} else {
		__hfi_dbgout = newf;
		setlinebuf(__hfi_dbgout);
	}
}

void hfi_set_mylabel(char *label)
{
	__hfi_mylabel = label;
}

char *hfi_get_mylabel()
{
	return __hfi_mylabel;
}

static void fini_hfi_backtrace(void)
{
	if (!getenv("HFI_BACKTRACE")) {
		(void)sigaction(SIGSEGV, &SIGSEGV_old_act, NULL);
		(void)sigaction(SIGBUS, &SIGBUS_old_act, NULL);
		(void)sigaction(SIGILL, &SIGILL_old_act, NULL);
		(void)sigaction(SIGABRT, &SIGABRT_old_act, NULL);
		(void)sigaction(SIGINT, &SIGINT_old_act, NULL);
		(void)sigaction(SIGTERM, &SIGTERM_old_act, NULL);
	}
}
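/*
 * A minimal standalone sketch (not part of the library) of the "%h"/"%p"
 * substitution that init_hfi_dbgfile() applies to HFI_DEBUG_FILENAME.
 * The helper name expand_dbg_filename() and the sample pattern are
 * invented for illustration, and this simplified version expands every
 * occurrence, whereas the library code substitutes only the first "%h"
 * and "%p".
 */
#include <stdio.h>
#include <unistd.h>

static void expand_dbg_filename(const char *pat, char *out, size_t outsz)
{
	char hname[256];
	size_t n = 0;

	gethostname(hname, sizeof(hname));
	hname[sizeof(hname) - 1] = '\0';
	while (*pat && n + 1 < outsz) {
		if (pat[0] == '%' && pat[1] == 'h') {	/* expand hostname */
			n += snprintf(out + n, outsz - n, "%s", hname);
			pat += 2;
		} else if (pat[0] == '%' && pat[1] == 'p') {	/* expand pid */
			n += snprintf(out + n, outsz - n, "%d", (int)getpid());
			pat += 2;
		} else {
			out[n++] = *pat++;	/* ordinary character */
		}
	}
	out[n < outsz ? n : outsz - 1] = '\0';
}

int main(void)
{
	char buf[512];

	/* equivalent to running an application with
	 * HFI_DEBUG_FILENAME=/tmp/psm_dbg.%h.%p in the environment */
	expand_dbg_filename("/tmp/psm_dbg.%h.%p", buf, sizeof(buf));
	printf("%s\n", buf);	/* e.g. /tmp/psm_dbg.myhost.12345 */
	return 0;
}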
opa-psm2-PSM2_11.2.185/opa/opa_dwordcpy-generic.c000066400000000000000000000214461370564314600212570ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ #include #include #include "opa_intf.h" #include "psm_user.h" #if defined(__x86_64__) #define hfi_dwordcpy hfi_dwordcpy_safe #define hfi_qwordcpy hfi_qwordcpy_safe #endif void hfi_dwordcpy(volatile uint32_t *dest, const uint32_t *src, uint32_t ndwords) { uint_fast32_t ndw = ndwords; const uint64_t *src64[4]; volatile uint64_t *dst64[4]; src64[0] = (const uint64_t *) src; dst64[0] = (volatile uint64_t *) dest; while (ndw >= 8) { *dst64[0] = *src64[0]; src64[1] = src64[0] + 1; src64[2] = src64[0] + 2; src64[3] = src64[0] + 3; ndw -= 8; dst64[1] = dst64[0] + 1; dst64[2] = dst64[0] + 2; dst64[3] = dst64[0] + 3; *dst64[1] = *src64[1]; *dst64[2] = *src64[2]; *dst64[3] = *src64[3]; src64[0] += 4; dst64[0] += 4; } if (ndw) { src = (const uint32_t *) src64[0]; dest = (volatile uint32_t *) dst64[0]; switch (ndw) { case 7: *dest++ = *src++; case 6: *dest++ = *src++; case 5: *dest++ = *src++; case 4: *dest++ = *src++; case 3: *dest++ = *src++; case 2: *dest++ = *src++; case 1: *dest++ = *src++; } } } void hfi_qwordcpy(volatile uint64_t *dest, const uint64_t *src, uint32_t nqwords) { uint_fast32_t nqw = nqwords; const uint64_t *src64[4]; volatile uint64_t *dst64[4]; src64[0] = src; dst64[0] = dest; while (nqw >= 8) { *dst64[0] = *src64[0]; src64[1] = src64[0] + 1; src64[2] = src64[0] + 2; src64[3] = src64[0] + 3; dst64[1] = dst64[0] + 1; dst64[2] = dst64[0] + 2; dst64[3] = dst64[0] + 3; *dst64[1] = *src64[1]; *dst64[2] = *src64[2]; *dst64[3] = *src64[3]; src64[0] += 4; dst64[0] += 4; *dst64[0] = *src64[0]; src64[1] = src64[0] + 1; src64[2] = src64[0] + 2; src64[3] = src64[0] + 3; dst64[1] = dst64[0] + 1; dst64[2] = dst64[0] + 2; dst64[3] = dst64[0] + 3; *dst64[1] = *src64[1]; *dst64[2] = *src64[2]; *dst64[3] = *src64[3]; src64[0] += 4; dst64[0] += 4; nqw -= 8; } if (nqw) { switch (nqw) { case 7: *(dst64[0])++ = *(src64[0])++; case 6: *(dst64[0])++ = *(src64[0])++; case 5: *(dst64[0])++ = *(src64[0])++; case 4: *(dst64[0])++ = *(src64[0])++; case 3: *(dst64[0])++ = *(src64[0])++; case 2: *(dst64[0])++ = *(src64[0])++; case 1: *(dst64[0])++ = *(src64[0])++; } } } #ifdef PSM_AVX512 void hfi_pio_blockcpy_512(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) { volatile __m512i *dp = (volatile __m512i *) dest; const __m512i *sp = (const __m512i *) src; psmi_assert((dp != NULL) && (sp != NULL)); psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0); if ((((uintptr_t) sp) & 0x3f) == 0x0) { /* source and destination are both 64 byte aligned */ do { __m512i tmp0 = _mm512_load_si512(sp); _mm512_store_si512((__m512i *)dp, tmp0); } while ((--nblock) && (++dp) && (++sp)); } else { /* only destination is 64 byte aligned - use unaligned loads */ do { __m512i tmp0 = _mm512_loadu_si512(sp); _mm512_store_si512((__m512i *)dp, tmp0); } while ((--nblock) && (++dp) && (++sp)); } } #endif void hfi_pio_blockcpy_256(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) { volatile __m256i *dp = (volatile __m256i *) dest; const __m256i *sp = (const __m256i *) src; psmi_assert((dp != NULL) && (sp != NULL)); psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0); if ((((uintptr_t) sp) & 0x1f) == 0x0) { /* source and destination are both 32 byte aligned */ do { __m256i tmp0 = _mm256_load_si256(sp); __m256i tmp1 = _mm256_load_si256(sp + 1); _mm256_store_si256((__m256i *)dp, tmp0); _mm256_store_si256((__m256i *)(dp + 1), tmp1); } while ((--nblock) && (dp = dp+2) && (sp = sp+2)); } else { /* only destination is 32 byte aligned - use unaligned loads */ do { __m256i tmp0 = _mm256_loadu_si256(sp); __m256i tmp1 = 
_mm256_loadu_si256(sp + 1); _mm256_store_si256((__m256i *)dp, tmp0); _mm256_store_si256((__m256i *)(dp + 1), tmp1); } while ((--nblock) && (dp = dp+2) && (sp = sp+2)); } } void hfi_pio_blockcpy_128(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) { volatile __m128i *dp = (volatile __m128i *) dest; const __m128i *sp = (const __m128i *) src; psmi_assert((dp != NULL) && (sp != NULL)); psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0); if ((((uintptr_t) sp) & 0xf) == 0x0) { /* source and destination are both 16 byte aligned */ do { __m128i tmp0 = _mm_load_si128(sp); __m128i tmp1 = _mm_load_si128(sp + 1); __m128i tmp2 = _mm_load_si128(sp + 2); __m128i tmp3 = _mm_load_si128(sp + 3); _mm_store_si128((__m128i *)dp, tmp0); _mm_store_si128((__m128i *)(dp + 1), tmp1); _mm_store_si128((__m128i *)(dp + 2), tmp2); _mm_store_si128((__m128i *)(dp + 3), tmp3); } while ((--nblock) && (dp = dp+4) && (sp = sp+4)); } else { /* only destination is 16 byte aligned - use unaligned loads */ do { __m128i tmp0 = _mm_loadu_si128(sp); __m128i tmp1 = _mm_loadu_si128(sp + 1); __m128i tmp2 = _mm_loadu_si128(sp + 2); __m128i tmp3 = _mm_loadu_si128(sp + 3); _mm_store_si128((__m128i *)dp, tmp0); _mm_store_si128((__m128i *)(dp + 1), tmp1); _mm_store_si128((__m128i *)(dp + 2), tmp2); _mm_store_si128((__m128i *)(dp + 3), tmp3); } while ((--nblock) && (dp = dp+4) && (sp = sp+4)); } } void hfi_pio_blockcpy_64(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) { const uint64_t *src64[4]; volatile uint64_t *dst64[4]; src64[0] = src; dst64[0] = dest; psmi_assert((dst64[0] != NULL) && (src64[0] != NULL)); psmi_assert((((uintptr_t) dest) & 0x3f) == 0x0); do { *dst64[0] = *src64[0]; src64[1] = src64[0] + 1; src64[2] = src64[0] + 2; src64[3] = src64[0] + 3; dst64[1] = dst64[0] + 1; dst64[2] = dst64[0] + 2; dst64[3] = dst64[0] + 3; *dst64[1] = *src64[1]; *dst64[2] = *src64[2]; *dst64[3] = *src64[3]; src64[0] += 4; dst64[0] += 4; *dst64[0] = *src64[0]; src64[1] = src64[0] + 1; src64[2] = src64[0] + 2; src64[3] = src64[0] + 3; dst64[1] = dst64[0] + 1; dst64[2] = dst64[0] + 2; dst64[3] = dst64[0] + 3; *dst64[1] = *src64[1]; *dst64[2] = *src64[2]; *dst64[3] = *src64[3]; src64[0] += 4; dst64[0] += 4; } while (--nblock); } void MOCKABLE(psmi_mq_mtucpy)(void *vdest, const void *vsrc, uint32_t nchars) { #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(vdest) || PSMI_IS_CUDA_MEM((void *) vsrc))) { PSMI_CUDA_CALL(cuMemcpy, (CUdeviceptr)vdest, (CUdeviceptr)vsrc, nchars); return; } #endif memcpy(vdest, vsrc, nchars); return; } MOCK_DEF_EPILOGUE(psmi_mq_mtucpy); void psmi_mq_mtucpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars) { memcpy(vdest, vsrc, nchars); return; } opa-psm2-PSM2_11.2.185/opa/opa_dwordcpy-i386.S000066400000000000000000000055571370564314600203210ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifdef __CET__ #include #endif .globl hfi_dwordcpy .file "opa_dword32cpy.S" .text .p2align 4,,15 hfi_dwordcpy: // standard C calling convention, args on stack // does not return any value .type hfi_dwordcpy, @function #ifdef _CET_ENDBR _CET_ENDBR #endif // save caller-saved regs mov %edi,%eax mov %esi,%edx // setup regs mov 0xc(%esp,1),%ecx mov 0x4(%esp,1),%edi mov 0x8(%esp,1),%esi // and do it cld rep movsd // restore mov %eax,%edi mov %edx,%esi ret #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif opa-psm2-PSM2_11.2.185/opa/opa_dwordcpy-x86_64-fast.S000066400000000000000000000054111370564314600215060ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifdef __CET__ #include #endif .globl hfi_dwordcpy .file "opa_dwordcpy-x86_64-fast.S" .text .p2align 4,,15 // standard C calling convention, rdi is dest, rsi is source, rdx is count // does not return any value hfi_dwordcpy: .type hfi_dwordcpy, @function #ifdef _CET_ENDBR _CET_ENDBR #endif movl %edx,%ecx shrl $1,%ecx andl $1,%edx cld rep movsq movl %edx,%ecx rep movsd ret #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif opa-psm2-PSM2_11.2.185/opa/opa_dwordcpy-x86_64.c000066400000000000000000000214461370564314600206010ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include #include #include "opa_intf.h" #include "psm_user.h" #if defined(__x86_64__) #define hfi_dwordcpy hfi_dwordcpy_safe #define hfi_qwordcpy hfi_qwordcpy_safe #endif void hfi_dwordcpy(volatile uint32_t *dest, const uint32_t *src, uint32_t ndwords) { uint_fast32_t ndw = ndwords; const uint64_t *src64[4]; volatile uint64_t *dst64[4]; src64[0] = (const uint64_t *) src; dst64[0] = (volatile uint64_t *) dest; while (ndw >= 8) { *dst64[0] = *src64[0]; src64[1] = src64[0] + 1; src64[2] = src64[0] + 2; src64[3] = src64[0] + 3; ndw -= 8; dst64[1] = dst64[0] + 1; dst64[2] = dst64[0] + 2; dst64[3] = dst64[0] + 3; *dst64[1] = *src64[1]; *dst64[2] = *src64[2]; *dst64[3] = *src64[3]; src64[0] += 4; dst64[0] += 4; } if (ndw) { src = (const uint32_t *) src64[0]; dest = (volatile uint32_t *) dst64[0]; switch (ndw) { case 7: *dest++ = *src++; case 6: *dest++ = *src++; case 5: *dest++ = *src++; case 4: *dest++ = *src++; case 3: *dest++ = *src++; case 2: *dest++ = *src++; case 1: *dest++ = *src++; } } } void hfi_qwordcpy(volatile uint64_t *dest, const uint64_t *src, uint32_t nqwords) { uint_fast32_t nqw = nqwords; const uint64_t *src64[4]; volatile uint64_t *dst64[4]; src64[0] = src; dst64[0] = dest; while (nqw >= 8) { *dst64[0] = *src64[0]; src64[1] = src64[0] + 1; src64[2] = src64[0] + 2; src64[3] = src64[0] + 3; dst64[1] = dst64[0] + 1; dst64[2] = dst64[0] + 2; dst64[3] = dst64[0] + 3; *dst64[1] = *src64[1]; *dst64[2] = *src64[2]; *dst64[3] = *src64[3]; src64[0] += 4; dst64[0] += 4; *dst64[0] = *src64[0]; src64[1] = src64[0] + 1; src64[2] = src64[0] + 2; src64[3] = src64[0] + 3; dst64[1] = dst64[0] + 1; dst64[2] = dst64[0] + 2; dst64[3] = dst64[0] + 3; *dst64[1] = *src64[1]; *dst64[2] = *src64[2]; *dst64[3] = *src64[3]; src64[0] += 4; dst64[0] += 4; nqw -= 8; } if (nqw) { switch (nqw) { case 7: *(dst64[0])++ = *(src64[0])++; case 6: *(dst64[0])++ = *(src64[0])++; case 5: *(dst64[0])++ = *(src64[0])++; case 4: *(dst64[0])++ = *(src64[0])++; case 3: *(dst64[0])++ = *(src64[0])++; case 2: *(dst64[0])++ = *(src64[0])++; case 1: *(dst64[0])++ = *(src64[0])++; } } } #ifdef PSM_AVX512 void hfi_pio_blockcpy_512(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) { volatile __m512i *dp = (volatile __m512i *) dest; const __m512i *sp = (const __m512i *) src; psmi_assert((dp != NULL) && (sp != NULL)); psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0); if ((((uintptr_t) sp) & 0x3f) == 0x0) { /* source and destination are both 64 byte aligned */ do { __m512i tmp0 = _mm512_load_si512(sp); _mm512_store_si512((__m512i *)dp, tmp0); } while ((--nblock) && (++dp) && (++sp)); } else { /* only destination is 64 byte aligned - use unaligned loads */ do { __m512i tmp0 = _mm512_loadu_si512(sp); _mm512_store_si512((__m512i *)dp, tmp0); } while ((--nblock) && (++dp) && (++sp)); } } #endif void hfi_pio_blockcpy_256(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) { volatile __m256i *dp = (volatile __m256i *) dest; const __m256i *sp = (const __m256i *) src; psmi_assert((dp != NULL) && (sp != NULL)); psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0); if ((((uintptr_t) sp) & 0x1f) == 0x0) { /* source and destination are both 32 byte aligned */ do { __m256i tmp0 = _mm256_load_si256(sp); __m256i tmp1 = _mm256_load_si256(sp + 1); _mm256_store_si256((__m256i *)dp, tmp0); _mm256_store_si256((__m256i *)(dp + 1), tmp1); } while ((--nblock) && (dp = dp+2) && (sp = sp+2)); } else { /* only destination is 32 byte aligned - use unaligned loads */ do { 
__m256i tmp0 = _mm256_loadu_si256(sp); __m256i tmp1 = _mm256_loadu_si256(sp + 1); _mm256_store_si256((__m256i *)dp, tmp0); _mm256_store_si256((__m256i *)(dp + 1), tmp1); } while ((--nblock) && (dp = dp+2) && (sp = sp+2)); } } void hfi_pio_blockcpy_128(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) { volatile __m128i *dp = (volatile __m128i *) dest; const __m128i *sp = (const __m128i *) src; psmi_assert((dp != NULL) && (sp != NULL)); psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0); if ((((uintptr_t) sp) & 0xf) == 0x0) { /* source and destination are both 16 byte aligned */ do { __m128i tmp0 = _mm_load_si128(sp); __m128i tmp1 = _mm_load_si128(sp + 1); __m128i tmp2 = _mm_load_si128(sp + 2); __m128i tmp3 = _mm_load_si128(sp + 3); _mm_store_si128((__m128i *)dp, tmp0); _mm_store_si128((__m128i *)(dp + 1), tmp1); _mm_store_si128((__m128i *)(dp + 2), tmp2); _mm_store_si128((__m128i *)(dp + 3), tmp3); } while ((--nblock) && (dp = dp+4) && (sp = sp+4)); } else { /* only destination is 16 byte aligned - use unaligned loads */ do { __m128i tmp0 = _mm_loadu_si128(sp); __m128i tmp1 = _mm_loadu_si128(sp + 1); __m128i tmp2 = _mm_loadu_si128(sp + 2); __m128i tmp3 = _mm_loadu_si128(sp + 3); _mm_store_si128((__m128i *)dp, tmp0); _mm_store_si128((__m128i *)(dp + 1), tmp1); _mm_store_si128((__m128i *)(dp + 2), tmp2); _mm_store_si128((__m128i *)(dp + 3), tmp3); } while ((--nblock) && (dp = dp+4) && (sp = sp+4)); } } void hfi_pio_blockcpy_64(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) { const uint64_t *src64[4]; volatile uint64_t *dst64[4]; src64[0] = src; dst64[0] = dest; psmi_assert((dst64[0] != NULL) && (src64[0] != NULL)); psmi_assert((((uintptr_t) dest) & 0x3f) == 0x0); do { *dst64[0] = *src64[0]; src64[1] = src64[0] + 1; src64[2] = src64[0] + 2; src64[3] = src64[0] + 3; dst64[1] = dst64[0] + 1; dst64[2] = dst64[0] + 2; dst64[3] = dst64[0] + 3; *dst64[1] = *src64[1]; *dst64[2] = *src64[2]; *dst64[3] = *src64[3]; src64[0] += 4; dst64[0] += 4; *dst64[0] = *src64[0]; src64[1] = src64[0] + 1; src64[2] = src64[0] + 2; src64[3] = src64[0] + 3; dst64[1] = dst64[0] + 1; dst64[2] = dst64[0] + 2; dst64[3] = dst64[0] + 3; *dst64[1] = *src64[1]; *dst64[2] = *src64[2]; *dst64[3] = *src64[3]; src64[0] += 4; dst64[0] += 4; } while (--nblock); } void MOCKABLE(psmi_mq_mtucpy)(void *vdest, const void *vsrc, uint32_t nchars) { #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(vdest) || PSMI_IS_CUDA_MEM((void *) vsrc))) { PSMI_CUDA_CALL(cuMemcpy, (CUdeviceptr)vdest, (CUdeviceptr)vsrc, nchars); return; } #endif memcpy(vdest, vsrc, nchars); return; } MOCK_DEF_EPILOGUE(psmi_mq_mtucpy); void psmi_mq_mtucpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars) { memcpy(vdest, vsrc, nchars); return; } opa-psm2-PSM2_11.2.185/opa/opa_service.c000066400000000000000000000074331370564314600174520ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* This file contains hfi service routine interface used by the low level hfi protocol code. */ #include "opa_service.h" #include "psmi_wrappers.h" /* These have been fixed to read the values, but they are not * compatible with the hfi driver, they return new info with * the qib driver */ static int hfi_count_names(const char *namep) { int n = 0; while (*namep != '\0') { if (*namep == '\n') n++; namep++; } return n; } int hfi_get_ctrs_unit_names(int unitno, char **namep) { int i; i = hfi_hfifs_unit_read(unitno, "counter_names", namep); if (i < 0) return -1; else return hfi_count_names(*namep); } int hfi_get_ctrs_unit(int unitno, uint64_t *c, int nelem) { int i; i = hfi_hfifs_unit_rd(unitno, "counters", c, nelem * sizeof(*c)); if (i < 0) return -1; else return i / sizeof(*c); } int hfi_get_ctrs_port_names(int unitno, char **namep) { int i; i = hfi_hfifs_unit_read(unitno, "portcounter_names", namep); if (i < 0) return -1; else return hfi_count_names(*namep); } int hfi_get_ctrs_port(int unitno, int port, uint64_t *c, int nelem) { int i; char buf[32]; snprintf(buf, sizeof(buf), "port%dcounters", port); i = hfi_hfifs_unit_rd(unitno, buf, c, nelem * sizeof(*c)); if (i < 0) return -1; else return i / sizeof(*c); } int hfi_get_stats_names(char **namep) { int i; i = hfi_hfifs_read("driver_stats_names", namep); if (i < 0) return -1; else return hfi_count_names(*namep); } int hfi_get_stats(uint64_t *s, int nelem) { int i; i = hfi_hfifs_rd("driver_stats", s, nelem * sizeof(*s)); if (i < 0) return -1; else return i / sizeof(*s); } opa-psm2-PSM2_11.2.185/opa/opa_sysfs.c000066400000000000000000000266621370564314600171660ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ /* This file contains a simple sysfs interface used by the low level hfi protocol code. It also implements the interface to hfifs. 
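   The sysfs root may be overridden with the HFI_SYSFS_PATH environment
   variable, and the hfifs mount point with HFI_HFIFS_PATH (the default
   is "/hfifs").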
*/ #include #include #include #include #include #include #include #include #include "opa_service.h" static char *sysfs_path; static size_t sysfs_path_len; static char *hfifs_path; static long sysfs_page_size; int sysfs_init(const char *dflt_hfi_class_path) { int rv = 0; if (NULL != (sysfs_path = getenv("HFI_SYSFS_PATH"))) { char *syspath = strdup(sysfs_path); if (!syspath) { _HFI_DBG("Failed to strdup(\"%s\") for syspath.\n", sysfs_path); rv = -1; } else sysfs_path = syspath; } if (sysfs_path == NULL) { unsigned len = strlen(dflt_hfi_class_path) + 4; char *syspath = malloc(len); if (!syspath) { _HFI_DBG("Failed to alloc %u bytes for syspath.\n",len); rv = -1; } else { snprintf(syspath, len, "%s_0", dflt_hfi_class_path); sysfs_path = syspath; } } if (sysfs_path != NULL) { struct stat s; if (stat(sysfs_path, &s) || !S_ISDIR(s.st_mode)) { _HFI_DBG("Did not find sysfs directory %s, using anyway\n", sysfs_path); rv = -1; } else { /* Remove the unit number from the sysfs path: */ char *lastUS = strrchr(sysfs_path, '_'); if ((NULL != lastUS) && (isdigit(lastUS[1]))) lastUS[1] = 0; } } if (sysfs_path != NULL) sysfs_path_len = strlen(sysfs_path); if (hfifs_path == NULL) hfifs_path = getenv("HFI_HFIFS_PATH"); if (hfifs_path == NULL) hfifs_path = "/hfifs"; if (!sysfs_page_size) sysfs_page_size = sysconf(_SC_PAGESIZE); return rv; } void sysfs_fini(void) { free(sysfs_path); } const char *hfi_sysfs_path(void) { return sysfs_path; } size_t hfi_sysfs_path_len(void) { return sysfs_path_len; } const char *hfi_hfifs_path(void) { return hfifs_path; } int hfi_hfifs_open(const char *attr, int flags) { char buf[1024]; int saved_errno; int fd; snprintf(buf, sizeof(buf), "%s/%s", hfi_hfifs_path(), attr); fd = open(buf, flags); saved_errno = errno; if (fd == -1) { _HFI_DBG("Failed to open driver attribute '%s': %s\n", attr, strerror(errno)); _HFI_DBG("Offending file name: %s\n", buf); } errno = saved_errno; return fd; } int hfi_sysfs_unit_open(uint32_t unit, const char *attr, int flags) { int saved_errno; char buf[1024]; int fd; snprintf(buf, sizeof(buf), "%s%u/%s", hfi_sysfs_path(), unit, attr); fd = open(buf, flags); saved_errno = errno; if (fd == -1) { _HFI_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr, unit, strerror(errno)); _HFI_DBG("Offending file name: %s\n", buf); } errno = saved_errno; return fd; } static int hfi_sysfs_unit_open_for_node(uint32_t unit, int flags) { int saved_errno; char buf[1024]; int fd; snprintf(buf, sizeof(buf), "%s%u/device/numa_node", hfi_sysfs_path(), unit); fd = open(buf, flags); saved_errno = errno; if (fd == -1) { _HFI_DBG("Failed to open attribute numa_node of unit %d: %s\n", unit, strerror(errno)); _HFI_DBG("Offending file name: %s\n", buf); } errno = saved_errno; return fd; } int hfi_sysfs_port_open(uint32_t unit, uint32_t port, const char *attr, int flags) { int saved_errno; char buf[1024]; int fd; snprintf(buf, sizeof(buf), "%s%u/ports/%u/%s", hfi_sysfs_path(), unit, port, attr); fd = open(buf, flags); saved_errno = errno; if (fd == -1) { _HFI_DBG("Failed to open attribute '%s' of unit %d:%d: %s\n", attr, unit, port, strerror(errno)); _HFI_DBG("Offending file name: %s\n", buf); } errno = saved_errno; return fd; } int hfi_hfifs_unit_open(uint32_t unit, const char *attr, int flags) { int saved_errno; char buf[1024]; int fd; snprintf(buf, sizeof(buf), "%s/%u/%s", hfi_hfifs_path(), unit, attr); fd = open(buf, flags); saved_errno = errno; if (fd == -1) { _HFI_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr, unit, strerror(errno)); _HFI_DBG("Offending file 
name: %s\n", buf); } errno = saved_errno; return fd; } static int read_page(int fd, char **datap) { char *data = NULL; int saved_errno; int ret = -1; data = malloc(sysfs_page_size); saved_errno = errno; if (!data) { _HFI_DBG("Could not allocate memory: %s\n", strerror(errno)); goto bail; } ret = read(fd, data, sysfs_page_size); saved_errno = errno; if (ret == -1) { _HFI_DBG("Read of attribute failed: %s\n", strerror(errno)); goto bail; } bail: if (ret == -1) { free(data); } else { if (ret < sysfs_page_size) data[ret] = 0; else data[sysfs_page_size-1] = 0; *datap = data; } errno = saved_errno; return ret; } /* * On return, caller must free *datap. */ int hfi_sysfs_unit_read(uint32_t unit, const char *attr, char **datap) { int fd = -1, ret = -1; int saved_errno; fd = hfi_sysfs_unit_open(unit, attr, O_RDONLY); saved_errno = errno; if (fd == -1) goto bail; ret = read_page(fd, datap); saved_errno = errno; bail: if (ret == -1) *datap = NULL; if (fd != -1) { close(fd); } errno = saved_errno; return ret; } /* read a string value into buff, no more than size bytes. returns the number of bytes read */ size_t hfi_sysfs_unit_port_read(uint32_t unit, uint32_t port, const char *attr, char *buff, size_t size) { int fd = -1; size_t rv = 0; fd = hfi_sysfs_port_open(unit, port, attr, O_RDONLY); if (fd == -1) return rv; rv = read(fd, buff, size); close(fd); if (rv < size) buff[rv] = 0; else buff[size-1] = 0; return rv; } /* * On return, caller must free *datap. */ int hfi_sysfs_port_read(uint32_t unit, uint32_t port, const char *attr, char **datap) { int fd = -1, ret = -1; int saved_errno; fd = hfi_sysfs_port_open(unit, port, attr, O_RDONLY); saved_errno = errno; if (fd == -1) goto bail; ret = read_page(fd, datap); saved_errno = errno; bail: if (ret == -1) *datap = NULL; if (fd != -1) { close(fd); } errno = saved_errno; return ret; } /* * On return, caller must free *datap. */ int hfi_hfifs_read(const char *attr, char **datap) { int fd = -1, ret = -1; int saved_errno; fd = hfi_hfifs_open(attr, O_RDONLY); saved_errno = errno; if (fd == -1) goto bail; ret = read_page(fd, datap); saved_errno = errno; bail: if (ret == -1) *datap = NULL; if (fd != -1) { close(fd); } errno = saved_errno; return ret; } /* * On return, caller must free *datap. */ int hfi_hfifs_unit_read(uint32_t unit, const char *attr, char **datap) { int fd = -1, ret = -1; int saved_errno; fd = hfi_hfifs_unit_open(unit, attr, O_RDONLY); saved_errno = errno; if (fd == -1) goto bail; ret = read_page(fd, datap); saved_errno = errno; bail: if (ret == -1) *datap = NULL; if (fd != -1) { close(fd); } errno = saved_errno; return ret; } /* * The _rd routines jread directly into a supplied buffer, * unlike the _read routines. 
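 * They return the byte count from read() (or -1 on error) and preserve
 * errno from the failing call for the caller.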
*/ int hfi_hfifs_rd(const char *attr, void *buf, int n) { int fd = -1, ret = -1; int saved_errno; fd = hfi_hfifs_open(attr, O_RDONLY); saved_errno = errno; if (fd == -1) goto bail; ret = read(fd, buf, n); saved_errno = errno; bail: if (fd != -1) { close(fd); } errno = saved_errno; return ret; } int hfi_hfifs_unit_rd(uint32_t unit, const char *attr, void *buf, int n) { int fd = -1, ret = -1; int saved_errno; fd = hfi_hfifs_unit_open(unit, attr, O_RDONLY); saved_errno = errno; if (fd == -1) goto bail; ret = read(fd, buf, n); saved_errno = errno; bail: if (fd != -1) { close(fd); } errno = saved_errno; return ret; } int hfi_sysfs_unit_read_s64(uint32_t unit, const char *attr, int64_t *valp, int base) { char *data=NULL, *end; int saved_errno; long long val; int ret; ret = hfi_sysfs_unit_read(unit, attr, &data); saved_errno = errno; if (ret == -1) { goto bail; } val = strtoll(data, &end, base); saved_errno = errno; if (!*data || !(*end == '\0' || isspace(*end))) { ret = -1; goto bail; } *valp = val; ret = 0; bail: if (data) free(data); errno = saved_errno; return ret; } static int hfi_sysfs_unit_read_node(uint32_t unit, char **datap) { int fd = -1, ret = -1; int saved_errno; fd = hfi_sysfs_unit_open_for_node(unit, O_RDONLY); saved_errno = errno; if (fd == -1) goto bail; ret = read_page(fd, datap); if (ret == -1) *datap = NULL; saved_errno = errno; close(fd); bail: errno = saved_errno; return ret; } int64_t hfi_sysfs_unit_read_node_s64(uint32_t unit) { char *data=NULL, *end; int saved_errno; long long val; int64_t ret = -1; saved_errno = errno; if (hfi_sysfs_unit_read_node(unit, &data) == -1) { goto bail; } val = strtoll(data, &end, 0); saved_errno = errno; if (!*data || !(*end == '\0' || isspace(*end))) { ret = -1; goto bail; } ret = (int64_t) val; bail: free(data); errno = saved_errno; return ret; } int hfi_sysfs_port_read_s64(uint32_t unit, uint32_t port, const char *attr, int64_t *valp, int base) { char *data, *end; int saved_errno; long long val; int ret; ret = hfi_sysfs_port_read(unit, port, attr, &data); saved_errno = errno; if (ret == -1) { goto bail; } val = strtoll(data, &end, base); saved_errno = errno; if (!*data || !(*end == '\0' || isspace(*end))) { ret = -1; goto bail; } *valp = val; ret = 0; bail: free(data); errno = saved_errno; return ret; } opa-psm2-PSM2_11.2.185/opa/opa_syslog.c000066400000000000000000000066341370564314600173340ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #define __USE_GNU #include #include #include #include #include #include "opa_user.h" #define SYSLOG_MAXLEN 512 extern char *__hfi_mylabel; void hfi_vsyslog(const char *prefix, int to_console, int level, const char *format, va_list ap) { char logprefix[SYSLOG_MAXLEN]; size_t len; if (to_console) { char hostname[80]; va_list ap_cons; va_copy(ap_cons, ap); len = strlen(format); gethostname(hostname, sizeof(hostname)); hostname[sizeof(hostname) - 1] = '\0'; if (__hfi_mylabel) fprintf(stderr, "%s", __hfi_mylabel); else fprintf(stderr, "%s: ", hostname); vfprintf(stderr, format, ap_cons); if (format[len] != '\n') fprintf(stderr, "\n"); fflush(stderr); va_end(ap_cons); } len = snprintf(logprefix, sizeof(logprefix), "(hfi/%s)[%d]: %s", prefix ? prefix : "hfi", (int)getpid(), format); vsyslog(level | LOG_USER, logprefix, ap); return; } void hfi_syslog(const char *prefix, int to_console, int level, const char *format, ...) { va_list ap; va_start(ap, format); hfi_vsyslog(prefix, to_console, level, format, ap); va_end(ap); } opa-psm2-PSM2_11.2.185/opa/opa_time.c000066400000000000000000000220011370564314600167340ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #define __USE_GNU #include #include #include #include #include #include #include #include #include #include #include #include #include "opa_user.h" #ifdef min #undef min #endif #define min(a, b) ((a) < (b) ? (a) : (b)) #ifdef max #undef max #endif #define max(a, b) ((a) > (b) ? (a) : (b)) /* init the cycle counter to picosecs/cycle conversion automatically */ /* at program startup, if it's using timing functions. */ static void init_picos_per_cycle(void) __attribute__ ((constructor)); static int hfi_timebase_isvalid(uint32_t pico_per_cycle); static uint32_t hfi_timebase_from_cpuinfo(uint32_t old_pico_per_cycle); /* in case two of our mechanisms fail */ #define SAFEDEFAULT_PICOS_PER_CYCLE 500 uint32_t __hfi_pico_per_cycle = SAFEDEFAULT_PICOS_PER_CYCLE; /* This isn't perfect, but it's close enough for rough timing. We want this to work on systems where the cycle counter isn't the same as the clock frequency. __hfi_pico_per_cycle isn't going to lead to completely accurate conversions from timestamps to nanoseconds, but it's close enough for our purposes, which is mainly to allow people to show events with nsecs or usecs if desired, rather than cycles. We use it in some performance analysis, but it has to be done with care, since cpuspeed can change, different cpu's can have different speeds, etc. Some architectures don't have their TSC-equivalent running at anything related to the processor speed (e.g. G5 Power systems use a fixed 33 MHz frequency). */ #define MIN_TEST_TIME_IN_PICOS (100000000000LL) /* 100 milliseconds */ static int timebase_debug; /* off by default */ #define timebase_warn_always(fmt, ...) \ hfi_syslog("timebase", 1, LOG_ERR, fmt, ##__VA_ARGS__) #define timebase_warn(fmt, ...) if (timebase_debug) \ timebase_warn_always(fmt, ##__VA_ARGS__) static int hfi_timebase_isvalid(uint32_t pico_per_cycle) { #if defined(__x86_64__) || defined(__i386__) /* If pico-per-cycle is less than 200, the clock speed would be greater * than 5 GHz. Similarly, we minimally support a 1GHz clock. * Allow some slop, because newer kernels with HPET can be a few * units off, and we don't want to spend the startup time needlessly */ if (pico_per_cycle >= 198 && pico_per_cycle <= 1005) return 1; #endif else return 0; } /* * Method #1: * * Derive the pico-per-cycle by trying to correlate the difference between two * reads of the tsc counter to gettimeofday. 
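 * The measurement runs pinned to CPU 0 when the affinity calls succeed,
 * requires at least 100 milliseconds of elapsed time per attempt, and
 * gives up and uses the /proc/cpuinfo method below after ten passes.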
*/ static void init_picos_per_cycle() { struct timeval tvs, tve; int64_t usec = 0; uint64_t ts, te; int64_t delta; uint32_t picos = 0; int trials = 0; int retry = 0; cpu_set_t cpuset, cpuset_saved; int have_cpuset = 1; /* * Make sure we try to calculate the cycle time without being migrated. */ CPU_ZERO(&cpuset_saved); if (sched_getaffinity(0, sizeof(cpuset), &cpuset_saved)) have_cpuset = 0; CPU_ZERO(&cpuset); CPU_SET(0, &cpuset); if (have_cpuset && sched_setaffinity(0, sizeof(cpuset), &cpuset)) have_cpuset = 0; /* * If we set affinity correctly, give the scheduler another change to put * us on processor 0 */ if (have_cpuset) sched_yield(); retry_pico_test: if (++retry == 10) { __hfi_pico_per_cycle = hfi_timebase_from_cpuinfo(picos); goto reset_cpu_mask; /* Reset CPU mask before exiting */ } usec = 0; gettimeofday(&tvs, NULL); ts = get_cycles(); while (usec < MIN_TEST_TIME_IN_PICOS) { /* wait for at least 100 millisecs */ trials++; usleep(125); gettimeofday(&tve, NULL); usec = 1000000LL * (tve.tv_usec - tvs.tv_usec) + 1000000000000LL * (tve.tv_sec - tvs.tv_sec); if (usec < 0) { timebase_warn ("RTC timebase, gettimeofday is negative (!) %lld\n", (long long)usec); goto retry_pico_test; } } te = get_cycles(); delta = te - ts; picos = (uint32_t) (usec / delta); if (!hfi_timebase_isvalid(picos)) { cpu_set_t cpuget; int affinity_valid = !sched_getaffinity(0, sizeof(cpuget), &cpuget); if (affinity_valid && !CPU_ISSET(0, &cpuget)) affinity_valid = 0; timebase_warn ("Failed to get valid RTC timebase, gettimeofday delta=%lld, " "rtc delta=%lld, picos_per_cycle=%d affinity_valid=%s (trial %d/10)\n", (long long)usec, (long long)delta, picos, affinity_valid ? "YES" : "NO", retry); goto retry_pico_test; } /* If we've had to retry even once, let that be known */ if (retry > 1) timebase_warn("Clock is %d picos/cycle found in %d trials and " "%.3f seconds (retry=%d)\n", picos, trials, (double)usec / 1.0e12, retry); __hfi_pico_per_cycle = picos; reset_cpu_mask: /* Restore affinity */ if (have_cpuset) { sched_setaffinity(0, sizeof(cpuset), &cpuset_saved); /* * Give a chance to other processes that also set affinity to 0 for * doing this test. */ sched_yield(); } } /* * Method #2: * * Derive the pico-per-cycle from /proc instead of using sleep trick * that relies on scheduler. */ static uint32_t hfi_timebase_from_cpuinfo(uint32_t old_pico_per_cycle) { /* we only validate once */ uint32_t new_pico_per_cycle = old_pico_per_cycle; uint32_t max_bet_new_old_pico, min_bet_new_old_pico; char hostname[80]; gethostname(hostname, 80); hostname[sizeof(hostname) - 1] = '\0'; if (getenv("HFI_DEBUG_TIMEBASE")) timebase_debug = 1; /* If the old one is valid, don't bother with this mechanism */ if (hfi_timebase_isvalid(old_pico_per_cycle)) return old_pico_per_cycle; #if defined(__x86_64__) || defined(__i386__) { FILE *fp = fopen("/proc/cpuinfo", "r"); char input[255]; char *p = NULL; if (!fp) goto fail; while (!feof(fp) && fgets(input, 255, fp)) { if (strstr(input, "cpu MHz")) { p = strchr(input, ':'); if (p) { double MHz = atof(p + 1); if (MHz != 0.0) new_pico_per_cycle = (uint32_t) (1000000. 
/ MHz); } break; } } fclose(fp); if (!p) goto fail; } #endif max_bet_new_old_pico = max(new_pico_per_cycle, old_pico_per_cycle); min_bet_new_old_pico = min(new_pico_per_cycle, old_pico_per_cycle); /* If there's no change (within a small range), just return the old one */ if ((max_bet_new_old_pico - min_bet_new_old_pico) < 5) return old_pico_per_cycle; if (hfi_timebase_isvalid(new_pico_per_cycle)) { timebase_warn_always ("RTC timebase, using %d picos/cycle from /proc " "instead of the detected %d picos/cycle\n", new_pico_per_cycle, old_pico_per_cycle); return new_pico_per_cycle; } fail: new_pico_per_cycle = SAFEDEFAULT_PICOS_PER_CYCLE; timebase_warn_always ("Problem obtaining CPU time base, detected to be %d " "pico/cycle, adjusted to safe default %d picos/cycle", old_pico_per_cycle, new_pico_per_cycle); return new_pico_per_cycle; } opa-psm2-PSM2_11.2.185/opa/opa_utils.c000066400000000000000000000100531370564314600171420ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* This file contains hfi service routine interface used by the low */ /* level hfi protocol code. 
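It also installs a constructor that disables mmap-based malloc (via
mallopt) when HFI_DISABLE_MMAP_MALLOC is set in the environment.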
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opa_user.h" /* keep track whether we disabled mmap in malloc */ int __hfi_malloc_no_mmap = 0; const char *hfi_get_next_name(char **names) { char *p, *start; p = start = *names; while (*p != '\0' && *p != '\n') { p++; } if (*p == '\n') { *p = '\0'; p++; *names = p; return start; } else return NULL; } void hfi_release_names(char *namep) { /* names were initialised in the data section before. Now * they are allocated when hfi_hfifs_read() is called. Allocation * for names is done only once at init time. Should we eventually * have an "stats_type_unregister" type of routine to explicitly * deallocate memory and free resources ? */ #if 0 if (namep != NULL) free(namep); #endif } int hfi_get_stats_names_count() { char *namep; int c; c = hfi_get_stats_names(&namep); free(namep); return c; } int hfi_get_ctrs_unit_names_count(int unitno) { char *namep; int c; c = hfi_get_ctrs_unit_names(unitno, &namep); free(namep); return c; } int hfi_get_ctrs_port_names_count(int unitno) { char *namep; int c; c = hfi_get_ctrs_port_names(unitno, &namep); free(namep); return c; } /* * Add a constructor function to disable mmap if asked to do so by the user */ static void init_mallopt_disable_mmap(void) __attribute__ ((constructor)); static void init_mallopt_disable_mmap(void) { char *env = getenv("HFI_DISABLE_MMAP_MALLOC"); if (env && *env) { if (mallopt(M_MMAP_MAX, 0) && mallopt(M_TRIM_THRESHOLD, -1)) { __hfi_malloc_no_mmap = 1; } } return; } opa-psm2-PSM2_11.2.185/opa/opa_write_pio-i386.c000066400000000000000000000222671370564314600205040ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ /* This file contains the initialization functions used by the low level hfi protocol code. */ #include #include #include #include #include #include #include #include #include #include #include #include "hfi_user.h" /* * These pio copy routines are here so they can be used by test code, as well * as by MPI, and can change independently of MPI */ /* * for processors that may not write store buffers in the order filled, * and when the store buffer is not completely filled (partial at end, or * interrupted and flushed) may write the partial buffer in * "random" order. requires additional serialization */ void hfi_write_pio_force_order(volatile uint32_t *piob, const struct hfi_pio_params *pioparm, void *hdr, void *bdata) { union hfi_pbc buf = {.qword = 0 }; uint32_t cksum_len = pioparm->cksum_is_valid ? HFI_CRC_SIZE_IN_BYTES : 0; buf.length = __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); if (pioparm->port > 1) buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | __PBC_IBPORT | pioparm->rate); else buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | pioparm->rate); *piob++ = buf.dword; /* 32 bit programs require fence after first 32 bits of pbc write */ /* Can't do as uint64_t store, or compiler could reorder */ ips_wmb(); *piob++ = buf.pbcflags; if (!pioparm->length) { uint32_t *dhdr, dcpywords; dcpywords = (HFI_MESSAGE_HDR_SIZE >> 2) - 1; hfi_dwordcpy_safe(piob, hdr, dcpywords); ips_wmb(); dhdr = hdr; piob += dcpywords; dhdr += dcpywords; *piob++ = *dhdr; } else { uint32_t *pay2 = bdata, j; uint32_t len = pioparm->length; hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2); piob += HFI_MESSAGE_HDR_SIZE >> 2; len >>= 2; if (len > 16) { uint32_t pay_words = 16 * ((len - 1) / 16); hfi_dwordcpy_safe(piob, pay2, pay_words); piob += pay_words; pay2 += pay_words; len -= pay_words; } /* now write the final chunk a word at a time, fence before trigger */ for (j = 0; j < (len - 1); j++) *piob++ = *pay2++; ips_wmb(); /* flush the buffer out now, so */ *piob++ = *pay2; } /* If checksum is enabled insert CRC at end of packet */ if_pf(pioparm->cksum_is_valid) { int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2; int nCRC = 0; while (nCRC < (nCRCopies - 1)) { *piob = pioparm->cksum; piob++; nCRC++; } ips_wmb(); *piob = pioparm->cksum; } /* send it on it's way, now, rather than waiting for processor to * get around to flushing it */ ips_wmb(); } /* * for processors that always write store buffers in the order filled, * and if store buffer not completely filled (partial at end, or * interrupted and flushed) always write the partial buffer in * address order. Avoids serializing and flush instructions * where possible. */ void hfi_write_pio(volatile uint32_t *piob, const struct hfi_pio_params *pioparm, void *hdr, void *bdata) { union hfi_pbc buf = { 0 }; uint32_t cksum_len = pioparm->cksum_is_valid ? 
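/*
 * A note on the PBC store sequence above (a summary of the existing
 * behavior, not a change): a 32-bit build cannot issue the 8-byte PBC
 * as one store, so it is emitted as two 32-bit stores with a hardware
 * fence between them:
 *
 *     *piob++ = buf.dword;        low half, must reach the chip first
 *     ips_wmb();
 *     *piob++ = buf.pbcflags;     flags half
 *
 * The x86_64 variant of this routine (opa_write_pio-x86_64.c) instead
 * stores the whole PBC at once as a volatile uint64_t and then issues
 * a single fence.
 */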
HFI_CRC_SIZE_IN_BYTES : 0; buf.length = __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); if (pioparm->port > 1) buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | __PBC_IBPORT | pioparm->rate); else buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | pioparm->rate); *piob++ = buf.dword; /* 32 bit programs needs compiler fence to prevent compiler reordering the two 32 bit stores in a uint64_t, but on inorder wc systems, does not need a memory fence. */ asm volatile ("" : : : "memory"); *piob++ = buf.pbcflags; hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2); piob += HFI_MESSAGE_HDR_SIZE >> 2; asm volatile ("" : : : "memory"); if (pioparm->length) hfi_dwordcpy_safe(piob, (uint32_t *) bdata, pioparm->length >> 2); /* If checksum is enabled insert CRC at end of packet */ if_pf(pioparm->cksum_is_valid) { int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2; int nCRC = 0; piob += pioparm->length >> 2; while (nCRC < (nCRCopies - 1)) { *piob = pioparm->cksum; piob++; nCRC++; } asm volatile ("" : : : "memory"); *piob = pioparm->cksum; } /* send it on it's way, now, rather than waiting for processor to * get around to flushing it */ ips_wmb(); } /* * for processors that always write store buffers in the order filled, * and if store buffer not completely filled (partial at end, or * interrupted and flushed) always write the partial buffer in * address order. Avoids serializing and flush instructions * where possible. */ static void hfi_write_pio_special_trigger(volatile uint32_t *piob, const struct hfi_pio_params *pioparm, void *hdr, void *bdata, unsigned offset) __attribute__ ((always_inline)); static void hfi_write_pio_special_trigger(volatile uint32_t *piob, const struct hfi_pio_params *pioparm, void *hdr, void *bdata, unsigned offset) { union hfi_pbc buf = { 0 }; volatile uint32_t *piobs = piob; uint32_t cksum_len = pioparm->cksum_is_valid ? HFI_CRC_SIZE_IN_BYTES : 0; buf.length = __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); if (pioparm->port > 1) buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | __PBC_IBPORT | pioparm->rate); else buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | pioparm->rate); *piob++ = buf.dword; /* 32 bit programs needs compiler fence to prevent compiler reordering the two 32 bit stores in a uint64_t, but on inorder wc systems, does not need a memory fence. 
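Put differently, the empty asm statement that follows is a
compiler-only barrier: it keeps the compiler from merging or reordering
the two 32-bit stores but emits no fence instruction, which suffices on
in-order write-combining hardware; the force-order variant above uses
ips_wmb() where hardware-level store ordering is also required:

    asm volatile ("" : : : "memory");    compiler barrier only
    ips_wmb();                           compiler barrier + store fence

(Illustrative contrast; both spellings appear verbatim in this file.)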
*/ asm volatile ("" : : : "memory"); *piob++ = buf.pbcflags; hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2); piob += HFI_MESSAGE_HDR_SIZE >> 2; asm volatile ("" : : : "memory"); if (pioparm->length) hfi_dwordcpy_safe(piob, (uint32_t *) bdata, pioparm->length >> 2); /* If checksum is enabled insert CRC at end of packet */ if_pf(pioparm->cksum_is_valid) { int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2; int nCRC = 0; piob += pioparm->length >> 2; while (nCRC < (nCRCopies - 1)) { *piob = pioparm->cksum; piob++; nCRC++; } asm volatile ("" : : : "memory"); *piob = pioparm->cksum; } /* send it on it's way, now, rather than waiting for processor to * get around to flushing it */ ips_wmb(); *(piobs + offset) = HFI_SPECIAL_TRIGGER_MAGIC; ips_wmb(); } void hfi_write_pio_special_trigger2k(volatile uint32_t *piob, const struct hfi_pio_params *pioparm, void *hdr, void *bdata) { hfi_write_pio_special_trigger(piob, pioparm, hdr, bdata, 1023); } void hfi_write_pio_special_trigger4k(volatile uint32_t *piob, const struct hfi_pio_params *pioparm, void *hdr, void *bdata) { hfi_write_pio_special_trigger(piob, pioparm, hdr, bdata, 2047); } opa-psm2-PSM2_11.2.185/opa/opa_write_pio-x86_64.c000066400000000000000000000210621370564314600207410ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
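(Note on the special-trigger variants defined above: the 2k and 4k
entry points share one always_inline body and differ only in the dword
offset of the trigger word, 1023 versus 2047; once the packet and any
CRC have been written and fenced, HFI_SPECIAL_TRIGGER_MAGIC is stored
at that offset and fenced again, so the hardware only ever sees a
complete buffer before the trigger fires.)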
*/ /* This file contains the initialization functions used by the low level hfi protocol code. */ #include #include #include #include #include #include #include #include #include #include #include #include "opa_user.h" /* * These pio copy routines are here so they can be used by test code, as well * as by MPI, and can change independently of MPI */ /* * for processors that may not write store buffers in the order filled, * and when the store buffer is not completely filled (partial at end, or * interrupted and flushed) may write the partial buffer in * "random" order. requires additional serialization */ void hfi_write_pio_force_order(volatile uint32_t *piob, const struct hfi_pio_params *pioparm, void *hdr, void *bdata) { union hfi_pbc buf = {.qword = 0 }; uint32_t cksum_len = pioparm->cksum_is_valid ? HFI_CRC_SIZE_IN_BYTES : 0; buf.length = __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); if (pioparm->port > 1) buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | __PBC_IBPORT | pioparm->rate); else buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | pioparm->rate); *(volatile uint64_t *)piob = buf.qword; ips_wmb(); /* pbc must be forced to be first write to chip buffer */ piob += 2; if (!pioparm->length) { uint32_t *dhdr, dcpywords; dcpywords = (HFI_MESSAGE_HDR_SIZE >> 2) - 1; hfi_dwordcpy_safe(piob, hdr, dcpywords); ips_wmb(); dhdr = hdr; piob += dcpywords; dhdr += dcpywords; *piob++ = *dhdr; } else { uint32_t *pay2 = bdata, j; uint32_t len = pioparm->length; hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2); piob += HFI_MESSAGE_HDR_SIZE >> 2; len >>= 2; if (len > 16) { uint32_t pay_words = 16 * ((len - 1) / 16); hfi_dwordcpy_safe(piob, pay2, pay_words); piob += pay_words; pay2 += pay_words; len -= pay_words; } /* now write the final chunk a word at a time, fence before trigger */ for (j = 0; j < (len - 1); j++) *piob++ = *pay2++; ips_wmb(); /* flush the buffer out now, so */ *piob++ = *pay2; } /* If checksum is enabled insert CRC at end of packet */ if_pf(pioparm->cksum_is_valid) { int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2; int nCRC = 0; while (nCRC < (nCRCopies - 1)) { *piob = pioparm->cksum; piob++; nCRC++; } ips_wmb(); *piob = pioparm->cksum; } /* send it on it's way, now, rather than waiting for processor to * get around to flushing it */ ips_wmb(); } /* * for processors that always write store buffers in the order filled, * and if store buffer not completely filled (partial at end, or * interrupted and flushed) always write the partial buffer in * address order. Avoids serializing and flush instructions * where possible. */ void hfi_write_pio(volatile uint32_t *piob, const struct hfi_pio_params *pioparm, void *hdr, void *bdata) { union hfi_pbc buf = { 0 }; uint32_t cksum_len = pioparm->cksum_is_valid ? 
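/*
 * Worked example for the force-order payload copy above (the numbers
 * are illustrative): for a 40-dword payload, pay_words =
 * 16 * ((40 - 1) / 16) = 32 dwords go through hfi_dwordcpy_safe(),
 * 7 of the remaining 8 dwords are stored one at a time, ips_wmb()
 * drains the store buffer, and only then is the final dword written,
 * so a partially filled PIO buffer is never exposed as complete.
 */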
HFI_CRC_SIZE_IN_BYTES : 0; buf.length = __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); if (pioparm->port > 1) buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | __PBC_IBPORT | pioparm->rate); else buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | pioparm->rate); *(volatile uint64_t *)piob = buf.qword; piob += 2; asm volatile ("" : : : "memory"); hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2); asm volatile ("" : : : "memory"); piob += HFI_MESSAGE_HDR_SIZE >> 2; if (pioparm->length) hfi_dwordcpy_safe(piob, (uint32_t *) bdata, pioparm->length >> 2); /* If checksum is enabled insert CRC at end of packet */ if_pf(pioparm->cksum_is_valid) { int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2; int nCRC = 0; piob += pioparm->length >> 2; while (nCRC < (nCRCopies - 1)) { *piob = pioparm->cksum; piob++; nCRC++; } asm volatile ("" : : : "memory"); *piob = pioparm->cksum; } /* send it on it's way, now, rather than waiting for processor to * get around to flushing it */ ips_wmb(); } /* * here we trigger on a "special" address, so just bang it out * as fast as possible... */ static void hfi_write_pio_special_trigger(volatile uint32_t *piob, const struct hfi_pio_params *pioparm, void *hdr, void *bdata, unsigned offset) __attribute__ ((always_inline)); static void hfi_write_pio_special_trigger(volatile uint32_t *piob, const struct hfi_pio_params *pioparm, void *hdr, void *bdata, unsigned offset) { union hfi_pbc buf = { 0 }; volatile uint32_t *piobs = piob; uint32_t cksum_len = pioparm->cksum_is_valid ? HFI_CRC_SIZE_IN_BYTES : 0; buf.length = __cpu_to_le16(((HFI_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); if (pioparm->port > 1) buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | __PBC_IBPORT | pioparm->rate); else buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | pioparm->rate); *(volatile uint64_t *)piob = buf.qword; piob += 2; asm volatile ("" : : : "memory"); hfi_dwordcpy_safe(piob, hdr, HFI_MESSAGE_HDR_SIZE >> 2); piob += HFI_MESSAGE_HDR_SIZE >> 2; asm volatile ("" : : : "memory"); if (pioparm->length) hfi_dwordcpy_safe(piob, (uint32_t *) bdata, pioparm->length >> 2); /* If checksum is enabled insert CRC at end of packet */ if_pf(pioparm->cksum_is_valid) { int nCRCopies = HFI_CRC_SIZE_IN_BYTES >> 2; int nCRC = 0; piob += pioparm->length >> 2; while (nCRC < (nCRCopies - 1)) { *piob = pioparm->cksum; piob++; nCRC++; } asm volatile ("" : : : "memory"); *piob = pioparm->cksum; } /* * flush then write "special" then flush... */ ips_wmb(); *(piobs + offset) = HFI_SPECIAL_TRIGGER_MAGIC; ips_wmb(); } void hfi_write_pio_special_trigger2k(volatile uint32_t *piob, const struct hfi_pio_params *pioparm, void *hdr, void *bdata) { hfi_write_pio_special_trigger(piob, pioparm, hdr, bdata, 1023); } void hfi_write_pio_special_trigger4k(volatile uint32_t *piob, const struct hfi_pio_params *pioparm, void *hdr, void *bdata) { hfi_write_pio_special_trigger(piob, pioparm, hdr, bdata, 2047); } opa-psm2-PSM2_11.2.185/psm.c000066400000000000000000001050451370564314600151710ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ #include #include "psm_user.h" #include "psm2_hal.h" #include "opa_revision.h" #include "psm_mq_internal.h" static int psmi_verno_major = PSM2_VERNO_MAJOR; static int psmi_verno_minor = PSM2_VERNO_MINOR; static int psmi_verno = PSMI_VERNO_MAKE(PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR); static int psmi_verno_client_val; int psmi_epid_ver; // Special psmi_refcount values #define PSMI_NOT_INITIALIZED 0 #define PSMI_FINALIZED -1 // PSM2 doesn't support transitioning out of the PSMI_FINALIZED state // once psmi_refcount is set to PSMI_FINALIZED, any further attempts to change // psmi_refcount should be treated as an error static int psmi_refcount = PSMI_NOT_INITIALIZED; /* Global lock used for endpoint creation and destroy * (in functions psm2_ep_open and psm2_ep_close) and also * for synchronization with recv_thread (so that recv_thread * will not work on an endpoint which is in a middle of closing). 
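A sketch of the discipline this lock implies (illustrative only;
PSMI_LOCK/PSMI_UNLOCK are the locking helpers this library already
uses elsewhere, e.g. around the progress lock):

    PSMI_LOCK(psmi_creation_lock);
    ... endpoint open or close runs here ...
    PSMI_UNLOCK(psmi_creation_lock);

The receiver thread acquires the same lock before touching an
endpoint, which is what keeps it away from endpoints that are
mid-close.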
*/ psmi_lock_t psmi_creation_lock; sem_t *sem_affinity_shm_rw = NULL; int psmi_affinity_shared_file_opened = 0; int psmi_affinity_semaphore_open = 0; uint64_t *shared_affinity_ptr; char *sem_affinity_shm_rw_name; char *affinity_shm_name; uint32_t psmi_cpu_model; #ifdef PSM_CUDA int is_cuda_enabled; int is_gdr_copy_enabled; int device_support_gpudirect; int gpu_p2p_supported = 0; int my_gpu_device = 0; int cuda_lib_version; int is_driver_gpudirect_enabled; int is_cuda_primary_context_retain = 0; uint32_t cuda_thresh_rndv; uint32_t gdr_copy_threshold_send; uint32_t gdr_copy_threshold_recv; void *psmi_cuda_lib; CUresult (*psmi_cuInit)(unsigned int Flags ); CUresult (*psmi_cuCtxDetach)(CUcontext c); CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c); CUresult (*psmi_cuCtxSetCurrent)(CUcontext c); CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); CUresult (*psmi_cuDeviceCanAccessPeer)(int *canAccessPeer, CUdevice dev, CUdevice peerDev); CUresult (*psmi_cuDeviceGet)(CUdevice* device, int ordinal); CUresult (*psmi_cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev); CUresult (*psmi_cuDriverGetVersion)(int* driverVersion); CUresult (*psmi_cuDeviceGetCount)(int* count); CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags); CUresult (*psmi_cuStreamDestroy)(CUstream phStream); CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags); CUresult (*psmi_cuEventDestroy)(CUevent hEvent); CUresult (*psmi_cuEventQuery)(CUevent hEvent); CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); CUresult (*psmi_cuMemFreeHost)(void* p); CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); CUresult (*psmi_cuMemcpyHtoD)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); CUresult (*psmi_cuMemcpyDtoHAsync)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); CUresult (*psmi_cuMemcpyHtoDAsync)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream); CUresult (*psmi_cuIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr); CUresult (*psmi_cuIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags); CUresult (*psmi_cuIpcCloseMemHandle)(CUdeviceptr dptr); CUresult (*psmi_cuMemGetAddressRange)(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr); CUresult (*psmi_cuDevicePrimaryCtxGetState)(CUdevice dev, unsigned int* flags, int* active); CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev); CUresult (*psmi_cuCtxGetDevice)(CUdevice* device); CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); #endif /* * Bit field that contains capability set. * Each bit represents different capability. * It is supposed to be filled with logical OR * on conditional compilation basis * along with future features/capabilities. 
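Callers test for a capability by passing a request mask to
psm2_get_capability_mask(), which ANDs it against this bitset; an
illustrative check:

    if (psm2_get_capability_mask(PSM2_MULTI_EP_CAP))
        ... multiple endpoints per process are available ...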
*/ uint64_t psm2_capabilities_bitset = PSM2_MULTI_EP_CAP | PSM2_LIB_REFCOUNT_CAP; int psmi_verno_client() { return psmi_verno_client_val; } /* This function is used to determine whether the current library build can * successfully communicate with another library that claims to be version * 'verno'. * * PSM 2.x is always ABI compatible, but this checks to see if two different * versions of the library can coexist. */ int psmi_verno_isinteroperable(uint16_t verno) { if (PSMI_VERNO_GET_MAJOR(verno) != PSM2_VERNO_MAJOR) return 0; return 1; } int MOCKABLE(psmi_isinitialized)() { return (psmi_refcount > 0); } MOCK_DEF_EPILOGUE(psmi_isinitialized); #ifdef PSM_CUDA int psmi_cuda_lib_load() { psm2_error_t err = PSM2_OK; char *dlerr; PSM2_LOG_MSG("entering"); _HFI_VDBG("Loading CUDA library.\n"); psmi_cuda_lib = dlopen("libcuda.so.1", RTLD_LAZY); if (!psmi_cuda_lib) { dlerr = dlerror(); _HFI_ERROR("Unable to open libcuda.so. Error %s\n", dlerr ? dlerr : "no dlerror()"); goto fail; } psmi_cuDriverGetVersion = dlsym(psmi_cuda_lib, "cuDriverGetVersion"); if (!psmi_cuDriverGetVersion) { _HFI_ERROR ("Unable to resolve symbols in CUDA libraries.\n"); goto fail; } PSMI_CUDA_CALL(cuDriverGetVersion, &cuda_lib_version); if (cuda_lib_version < 7000) { _HFI_ERROR("Please update CUDA driver, required minimum version is 7.0\n"); goto fail; } PSMI_CUDA_DLSYM(psmi_cuda_lib, cuInit); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxGetCurrent); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxDetach); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxSetCurrent); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuPointerGetAttribute); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuPointerSetAttribute); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceCanAccessPeer); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetAttribute); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGet); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetCount); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamCreate); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamDestroy); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventCreate); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventDestroy); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventQuery); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventRecord); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventSynchronize); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostAlloc); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemFreeHost); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpy); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoD); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoH); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyHtoD); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoHAsync); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyHtoDAsync); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuIpcGetMemHandle); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuIpcOpenMemHandle); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuIpcCloseMemHandle); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemGetAddressRange); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDevicePrimaryCtxGetState); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDevicePrimaryCtxRetain); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDevicePrimaryCtxRelease); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxGetDevice); PSM2_LOG_MSG("leaving"); return err; fail: if (psmi_cuda_lib) dlclose(psmi_cuda_lib); err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to load CUDA library.\n"); return err; } int psmi_cuda_initialize() { psm2_error_t err = PSM2_OK; int num_devices, dev; PSM2_LOG_MSG("entering"); _HFI_VDBG("Enabling CUDA support.\n"); err = psmi_cuda_lib_load(); if (err != PSM2_OK) goto fail; PSMI_CUDA_CALL(cuInit, 0); /* Check if CUDA context is available. 
If not, we are not allowed to * launch any CUDA API calls */ PSMI_CUDA_CALL(cuCtxGetCurrent, &ctxt); if (ctxt == NULL) { _HFI_INFO("Unable to find active CUDA context\n"); is_cuda_enabled = 0; err = PSM2_OK; return err; } CUdevice current_device; CUcontext primary_ctx; PSMI_CUDA_CALL(cuCtxGetDevice, ¤t_device); int is_ctx_active; unsigned ctx_flags; PSMI_CUDA_CALL(cuDevicePrimaryCtxGetState, current_device, &ctx_flags, &is_ctx_active); if (!is_ctx_active) { /* There is an issue where certain CUDA API calls create * contexts but does not make it active which cause the * driver API call to fail with error 709 */ PSMI_CUDA_CALL(cuDevicePrimaryCtxRetain, &primary_ctx, current_device); is_cuda_primary_context_retain = 1; } /* Check if all devices support Unified Virtual Addressing. */ PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); device_support_gpudirect = 1; for (dev = 0; dev < num_devices; dev++) { CUdevice device; PSMI_CUDA_CALL(cuDeviceGet, &device, dev); int unifiedAddressing; PSMI_CUDA_CALL(cuDeviceGetAttribute, &unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, device); if (unifiedAddressing !=1) { _HFI_ERROR("CUDA device %d does not support Unified Virtual Addressing.\n", dev); goto fail; } int major; PSMI_CUDA_CALL(cuDeviceGetAttribute, &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device); if (major < 3) { device_support_gpudirect = 0; _HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev); } if (device != current_device) { int canAccessPeer = 0; PSMI_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer, current_device, device); if (canAccessPeer != 1) _HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev); else gpu_p2p_supported |= (1 << device); } else { /* Always support p2p on the same GPU */ my_gpu_device = device; gpu_p2p_supported |= (1 << device); } } union psmi_envvar_val env_enable_gdr_copy; psmi_getenv("PSM2_GDRCOPY", "Enable (set envvar to 1) for gdr copy support in PSM (Enabled by default)", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)1, &env_enable_gdr_copy); is_gdr_copy_enabled = env_enable_gdr_copy.e_int; union psmi_envvar_val env_cuda_thresh_rndv; psmi_getenv("PSM2_CUDA_THRESH_RNDV", "RNDV protocol is used for message sizes greater than the threshold \n", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)CUDA_THRESH_RNDV, &env_cuda_thresh_rndv); cuda_thresh_rndv = env_cuda_thresh_rndv.e_int; if (cuda_thresh_rndv < 0 || cuda_thresh_rndv > CUDA_THRESH_RNDV) cuda_thresh_rndv = CUDA_THRESH_RNDV; union psmi_envvar_val env_gdr_copy_thresh_send; psmi_getenv("PSM2_GDRCOPY_THRESH_SEND", "GDR Copy is turned off on the send side" " for message sizes greater than the threshold \n", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)GDR_COPY_THRESH_SEND, &env_gdr_copy_thresh_send); gdr_copy_threshold_send = env_gdr_copy_thresh_send.e_int; if (gdr_copy_threshold_send < 8 || gdr_copy_threshold_send > cuda_thresh_rndv) gdr_copy_threshold_send = GDR_COPY_THRESH_SEND; union psmi_envvar_val env_gdr_copy_thresh_recv; psmi_getenv("PSM2_GDRCOPY_THRESH_RECV", "GDR Copy is turned off on the recv side" " for message sizes greater than the threshold \n", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)GDR_COPY_THRESH_RECV, &env_gdr_copy_thresh_recv); gdr_copy_threshold_recv = env_gdr_copy_thresh_recv.e_int; if (gdr_copy_threshold_recv < 8) gdr_copy_threshold_recv = GDR_COPY_THRESH_RECV; PSM2_LOG_MSG("leaving"); return err; 
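/*
 * Summary of the clamping just above (existing behavior): a
 * PSM2_CUDA_THRESH_RNDV outside [0, CUDA_THRESH_RNDV] reverts to
 * CUDA_THRESH_RNDV; a PSM2_GDRCOPY_THRESH_SEND below 8 bytes or above
 * the rendezvous threshold reverts to GDR_COPY_THRESH_SEND; and a
 * PSM2_GDRCOPY_THRESH_RECV below 8 bytes reverts to
 * GDR_COPY_THRESH_RECV. Misconfigured environments therefore degrade
 * to the build-time defaults instead of failing.
 */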
fail: err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to initialize PSM2 CUDA support.\n"); return err; } #endif psm2_error_t __psm2_init(int *major, int *minor) { psm2_error_t err = PSM2_OK; union psmi_envvar_val env_tmask; psmi_log_initialize(); PSM2_LOG_MSG("entering"); /* When PSM_PERF is enabled, the following code causes the PMU to be programmed to measure instruction cycles of the TX/RX speedpaths of PSM. */ GENERIC_PERF_INIT(); GENERIC_PERF_SET_SLOT_NAME(PSM_TX_SPEEDPATH_CTR, "TX"); GENERIC_PERF_SET_SLOT_NAME(PSM_RX_SPEEDPATH_CTR, "RX"); if (psmi_refcount > 0) { psmi_refcount++; goto update; } if (psmi_refcount == PSMI_FINALIZED) { err = PSM2_IS_FINALIZED; goto fail; } if (major == NULL || minor == NULL) { err = PSM2_PARAM_ERR; goto fail; } psmi_init_lock(&psmi_creation_lock); #ifdef PSM_DEBUG if (!getenv("PSM2_NO_WARN")) fprintf(stderr, "!!! WARNING !!! You are running an internal-only PSM *DEBUG* build.\n"); #endif #ifdef PSM_PROFILE if (!getenv("PSM2_NO_WARN")) fprintf(stderr, "!!! WARNING !!! You are running an internal-only PSM *PROFILE* build.\n"); #endif #ifdef PSM_FI /* Make sure we complain if fault injection is enabled */ if (getenv("PSM2_FI") && !getenv("PSM2_NO_WARN")) fprintf(stderr, "!!! WARNING !!! You are running with fault injection enabled!\n"); #endif /* #ifdef PSM_FI */ /* Make sure, as an internal check, that this version knows how to detect * compatibility with other library versions it may communicate with */ if (psmi_verno_isinteroperable(psmi_verno) != 1) { err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "psmi_verno_isinteroperable() not updated for current version!"); goto fail; } /* The only way to not support a client is if the major number doesn't * match */ if (*major != PSM2_VERNO_MAJOR && *major != PSM2_VERNO_COMPAT_MAJOR) { err = psmi_handle_error(NULL, PSM2_INIT_BAD_API_VERSION, "This library does not implement version %d.%d", *major, *minor); goto fail; } /* Make sure we don't keep track of a client that claims a higher version * number than we are */ psmi_verno_client_val = min(PSMI_VERNO_MAKE(*major, *minor), psmi_verno); /* Check to see if we need to set Architecture flags to something * besides big core Xeons */ cpuid_t id; psmi_cpu_model = CPUID_MODEL_UNDEFINED; /* First check to ensure Genuine Intel */ get_cpuid(0x0, 0, &id); if(id.ebx == CPUID_GENUINE_INTEL_EBX && id.ecx == CPUID_GENUINE_INTEL_ECX && id.edx == CPUID_GENUINE_INTEL_EDX) { /* Use cpuid with EAX=1 to get processor info */ get_cpuid(0x1, 0, &id); psmi_cpu_model = CPUID_GENUINE_INTEL; } if( (psmi_cpu_model == CPUID_GENUINE_INTEL) && (id.eax & CPUID_FAMILY_MASK) == CPUID_FAMILY_XEON) { psmi_cpu_model = ((id.eax & CPUID_MODEL_MASK) >> 4) | ((id.eax & CPUID_EXMODEL_MASK) >> 12); } psmi_refcount++; /* hfi_debug lives in libhfi.so */ psmi_getenv("PSM2_TRACEMASK", "Mask flags for tracing", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_ULONG_FLAGS, (union psmi_envvar_val)hfi_debug, &env_tmask); hfi_debug = (long)env_tmask.e_ulong; /* The "real thing" is done in hfi_proto.c as a constructor function, but * we getenv it here to report what we're doing with the setting */ { extern int __hfi_malloc_no_mmap; union psmi_envvar_val env_mmap; char *env = getenv("HFI_DISABLE_MMAP_MALLOC"); int broken = (env && *env && !__hfi_malloc_no_mmap); psmi_getenv("HFI_DISABLE_MMAP_MALLOC", broken ? 
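/*
 * Background for the model computation above (a hedged sketch: the
 * mask values are inferred from the standard CPUID layout, not shown
 * in this file): CPUID leaf 1 reports the model in EAX bits 7:4 and
 * the extended model in bits 19:16, so OR-ing (eax & 0xF0) >> 4 with
 * (eax & 0xF0000) >> 12 yields the conventional display model with
 * the extended model in the high nibble. Example: eax = 0x000406F0
 * gives model 0xF, extended model 0x4, display model 0x4F.
 */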
"Skipping mmap disable for malloc()" : "Disable mmap for malloc()", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_YESNO, (union psmi_envvar_val)0, &env_mmap); if (broken) _HFI_ERROR ("Couldn't successfully disable mmap in mallocs " "with mallopt()\n"); } { union psmi_envvar_val env_epid_ver; psmi_getenv("PSM2_ADDR_FMT", "Used to force PSM2 to use a particular version of EPID", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)PSMI_EPID_VERNO_DEFAULT, &env_epid_ver); psmi_epid_ver = env_epid_ver.e_int; if (psmi_epid_ver > PSMI_MAX_EPID_VERNO_SUPPORTED) { psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, " The max epid version supported in this version of PSM2 is %d \n" "Please upgrade PSM2 \n", PSMI_MAX_EPID_VERNO_SUPPORTED); goto fail; } else if (psmi_epid_ver < PSMI_MIN_EPID_VERNO_SUPPORTED) { psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, " Invalid value provided through PSM2_ADDR_FMT \n"); goto fail; } } if (getenv("PSM2_DIAGS")) { _HFI_INFO("Running diags...\n"); psmi_diags(); } psmi_multi_ep_init(); #ifdef PSM_FI psmi_faultinj_init(); #endif /* #ifdef PSM_FI */ psmi_epid_init(); int rc = psmi_hal_initialize(); if (rc) { err = PSM2_INTERNAL_ERR; goto fail; } #ifdef PSM_CUDA union psmi_envvar_val env_enable_cuda; psmi_getenv("PSM2_CUDA", "Enable (set envvar to 1) for cuda support in PSM (Disabled by default)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)0, &env_enable_cuda); is_cuda_enabled = env_enable_cuda.e_int; if (PSMI_IS_CUDA_ENABLED) { err = psmi_cuda_initialize(); if (err != PSM2_OK) goto fail; } #endif update: if (getenv("PSM2_IDENTIFY")) { Dl_info info_psm; char ofed_delta[100] = ""; strcat(strcat(ofed_delta," built for OFED DELTA "),psmi_hfi_IFS_version); printf("%s %s PSM2 v%d.%d%s\n" "%s %s location %s\n" "%s %s build date %s\n" "%s %s src checksum %s\n" "%s %s git checksum %s\n" "%s %s built against driver interface v%d.%d\n" "%s %s HAL instance code: %d, HAL description: \"%s\"\n", hfi_get_mylabel(), hfi_ident_tag, PSM2_VERNO_MAJOR,PSM2_VERNO_MINOR, (strcmp(psmi_hfi_IFS_version,"") != 0) ? ofed_delta #ifdef PSM_CUDA : "-cuda", #else : "", #endif hfi_get_mylabel(), hfi_ident_tag, dladdr(psm2_init, &info_psm) ? info_psm.dli_fname : "libpsm2 not available", hfi_get_mylabel(), hfi_ident_tag, psmi_hfi_build_timestamp, hfi_get_mylabel(), hfi_ident_tag, psmi_hfi_sources_checksum, hfi_get_mylabel(), hfi_ident_tag, (strcmp(psmi_hfi_git_checksum,"") != 0) ? 
psmi_hfi_git_checksum : "", hfi_get_mylabel(), hfi_ident_tag, psmi_hal_get_user_major_bldtime_version(), psmi_hal_get_user_minor_bldtime_version(), hfi_get_mylabel(), hfi_ident_tag, psmi_hal_get_hal_instance_type(), psmi_hal_get_hal_instance_description()); } *major = (int)psmi_verno_major; *minor = (int)psmi_verno_minor; fail: _HFI_DBG("psmi_refcount=%d,err=%u\n", psmi_refcount, err); PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_init) static psm2_error_t psmi_get_psm2_config(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t *out) { psm2_error_t rv = PSM2_INTERNAL_ERR; *out = 0; if (&mq->ep->ptl_ips == epaddr->ptlctl) { rv = PSM2_OK; *out |= PSM2_INFO_QUERY_CONFIG_IPS; #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED) { *out |= PSM2_INFO_QUERY_CONFIG_CUDA; if (PSMI_IS_GDR_COPY_ENABLED) *out |= PSM2_INFO_QUERY_CONFIG_GDR_COPY; } #endif { union psmi_envvar_val env_sdma; psmi_getenv("PSM2_SDMA", "hfi send dma flags (0 disables send dma, 2 disables send pio, " "1 for both sdma/spio, default 1)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)1, &env_sdma); if (env_sdma.e_uint == 0) *out |= PSM2_INFO_QUERY_CONFIG_PIO; else if (env_sdma.e_uint == 1) *out |= (PSM2_INFO_QUERY_CONFIG_PIO | PSM2_INFO_QUERY_CONFIG_DMA); else if (env_sdma.e_uint == 2) *out |= PSM2_INFO_QUERY_CONFIG_DMA; } } else if (&mq->ep->ptl_amsh == epaddr->ptlctl) { *out |= PSM2_INFO_QUERY_CONFIG_AMSH; rv = PSM2_OK; } else if (&mq->ep->ptl_self == epaddr->ptlctl) { *out |= PSM2_INFO_QUERY_CONFIG_SELF; rv = PSM2_OK; } return rv; } psm2_error_t __psm2_info_query(psm2_info_query_t q, void *out, size_t nargs, psm2_info_query_arg_t args[]) { static const size_t expected_arg_cnt[PSM2_INFO_QUERY_LAST] = { 0, /* PSM2_INFO_QUERY_NUM_UNITS */ 0, /* PSM2_INFO_QUERY_NUM_PORTS */ 1, /* PSM2_INFO_QUERY_UNIT_STATUS */ 2, /* PSM2_INFO_QUERY_UNIT_PORT_STATUS */ 1, /* PSM2_INFO_QUERY_NUM_FREE_CONTEXTS */ 1, /* PSM2_INFO_QUERY_NUM_CONTEXTS */ 2, /* PSM2_INFO_QUERY_CONFIG */ 3, /* PSM2_INFO_QUERY_THRESH */ 3, /* PSM2_INFO_QUERY_DEVICE_NAME */ 2, /* PSM2_INFO_QUERY_MTU */ 2, /* PSM2_INFO_QUERY_LINK_SPEED */ 1, /* PSM2_INFO_QUERY_NETWORK_TYPE */ 0, /* PSM2_INFO_QUERY_FEATURE_MASK */ }; psm2_error_t rv = PSM2_INTERNAL_ERR; if ((q < 0) || (q >= PSM2_INFO_QUERY_LAST)) return PSM2_IQ_INVALID_QUERY; if (nargs != expected_arg_cnt[q]) return PSM2_PARAM_ERR; switch (q) { case PSM2_INFO_QUERY_NUM_UNITS: *((uint32_t*)out) = psmi_hal_get_num_units_(); rv = PSM2_OK; break; case PSM2_INFO_QUERY_NUM_PORTS: *((uint32_t*)out) = psmi_hal_get_num_ports_(); rv = PSM2_OK; break; case PSM2_INFO_QUERY_UNIT_STATUS: *((uint32_t*)out) = psmi_hal_get_unit_active(args[0].unit); rv = PSM2_OK; break; case PSM2_INFO_QUERY_UNIT_PORT_STATUS: *((uint32_t*)out) = psmi_hal_get_port_active(args[0].unit, args[1].port); rv = PSM2_OK; break; case PSM2_INFO_QUERY_NUM_FREE_CONTEXTS: *((uint32_t*)out) = psmi_hal_get_num_free_contexts(args[0].unit); rv = PSM2_OK; break; case PSM2_INFO_QUERY_NUM_CONTEXTS: *((uint32_t*)out) = psmi_hal_get_num_contexts(args[0].unit); rv = PSM2_OK; break; case PSM2_INFO_QUERY_CONFIG: { psm2_mq_t mq = args[0].mq; psm2_epaddr_t epaddr = args[1].epaddr; rv = psmi_get_psm2_config(mq, epaddr, (uint32_t*)out); } break; case PSM2_INFO_QUERY_THRESH: { psm2_mq_t mq = args[0].mq; psm2_epaddr_t epaddr = args[1].epaddr; enum psm2_info_query_thresh_et iqt = args[2].mstq; uint32_t config; rv = psmi_get_psm2_config(mq, epaddr, &config); if (rv == PSM2_OK) { *((uint32_t*)out) = 0; /* Delegate the call to the ptl member function: */ rv = 
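/*
 * Caller-side sketch for this query interface (illustrative; the
 * argument counts follow the expected_arg_cnt table above):
 *
 *     uint32_t num_units, active;
 *     psm2_info_query_arg_t args[1];
 *
 *     psm2_info_query(PSM2_INFO_QUERY_NUM_UNITS, &num_units, 0, NULL);
 *     args[0].unit = 0;
 *     psm2_info_query(PSM2_INFO_QUERY_UNIT_STATUS, &active, 1, args);
 */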
epaddr->ptlctl->msg_size_thresh_query(iqt, (uint32_t*)out, mq, epaddr); } } break; case PSM2_INFO_QUERY_DEVICE_NAME: { char *hfiName = (char*)out; psm2_mq_t mq = args[0].mq; psm2_epaddr_t epaddr = args[1].epaddr; size_t hfiNameLength = args[2].length; uint32_t config; rv = psmi_get_psm2_config(mq, epaddr, &config); if (rv == PSM2_OK) { if (snprintf(hfiName, hfiNameLength, "%s_%d", psmi_hal_get_hfi_name(), psmi_hal_get_unit_id(mq->ep->context.psm_hw_ctxt)) < hfiNameLength) rv = PSM2_OK; } } break; case PSM2_INFO_QUERY_MTU: { psm2_mq_t mq = args[0].mq; psm2_epaddr_t epaddr = args[1].epaddr; uint32_t config; rv = psmi_get_psm2_config(mq, epaddr, &config); if (rv == PSM2_OK) { *((uint32_t*)out) = mq->ep->mtu; } } break; case PSM2_INFO_QUERY_LINK_SPEED: { psm2_mq_t mq = args[0].mq; psm2_epaddr_t epaddr = args[1].epaddr; uint32_t config; rv = psmi_get_psm2_config(mq, epaddr, &config); if (rv == PSM2_OK) { *((uint32_t*)out) = psmi_hal_get_port_rate(psmi_hal_get_unit_id(mq->ep->context.psm_hw_ctxt), psmi_hal_get_port_num(mq->ep->context.psm_hw_ctxt)); } } break; case PSM2_INFO_QUERY_NETWORK_TYPE: { char *networkType = (char*)out; size_t networkTypeLength = args[0].length; const char *const intelopa = "Intel(R) OPA"; if (networkTypeLength >= strlen(intelopa)+1) { strcpy(networkType,intelopa); rv = PSM2_OK; } } break; case PSM2_INFO_QUERY_FEATURE_MASK: { #ifdef PSM_CUDA *((uint32_t*)out) = PSM2_INFO_QUERY_FEATURE_CUDA; #else *((uint32_t*)out) = 0; #endif /* #ifdef PSM_CUDA */ } rv = PSM2_OK; break; default: break; } return rv; } PSMI_API_DECL(psm2_info_query) uint64_t __psm2_get_capability_mask(uint64_t req_cap_mask) { return (psm2_capabilities_bitset & req_cap_mask); } PSMI_API_DECL(psm2_get_capability_mask) psm2_error_t __psm2_finalize(void) { struct psmi_eptab_iterator itor; char *hostname; psm2_ep_t ep; PSM2_LOG_MSG("entering"); _HFI_DBG("psmi_refcount=%d\n", psmi_refcount); PSMI_ERR_UNLESS_INITIALIZED(NULL); psmi_assert(psmi_refcount > 0); psmi_refcount--; if (psmi_refcount > 0) { return PSM2_OK; } /* When PSM_PERF is enabled, the following line causes the instruction cycles gathered in the current run to be dumped to stderr. */ GENERIC_PERF_DUMP(stderr); ep = psmi_opened_endpoint; while (ep != NULL) { psm2_ep_t saved_ep = ep->user_ep_next; psm2_ep_close(ep, PSM2_EP_CLOSE_GRACEFUL, 2 * PSMI_MIN_EP_CLOSE_TIMEOUT); psmi_opened_endpoint = ep = saved_ep; } #ifdef PSM_FI psmi_faultinj_fini(); #endif /* #ifdef PSM_FI */ /* De-allocate memory for any allocated space to store hostnames */ psmi_epid_itor_init(&itor, PSMI_EP_HOSTNAME); while ((hostname = psmi_epid_itor_next(&itor))) psmi_free(hostname); psmi_epid_itor_fini(&itor); psmi_epid_fini(); /* unmap shared mem object for affinity */ if (psmi_affinity_shared_file_opened) { /* * Start critical section to decrement ref count and unlink * affinity shm file. 
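 * The pattern below is: timed wait on the shared semaphore, decrement
 * the reference count kept in the shared page, unlink the shm file if
 * this was the last user, msync, post the semaphore, then munmap, so
 * the cleanup is race-free across processes.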
*/ psmi_sem_timedwait(sem_affinity_shm_rw, sem_affinity_shm_rw_name); shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] -= 1; if (shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] <= 0) { _HFI_VDBG("Unlink shm file for HFI affinity as there are no more users\n"); shm_unlink(affinity_shm_name); } else { _HFI_VDBG("Number of affinity shared memory users left=%ld\n", shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION]); } msync(shared_affinity_ptr, AFFINITY_SHMEMSIZE, MS_SYNC); /* End critical section */ psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name); munmap(shared_affinity_ptr, AFFINITY_SHMEMSIZE); psmi_free(affinity_shm_name); affinity_shm_name = NULL; psmi_affinity_shared_file_opened = 0; } if (psmi_affinity_semaphore_open) { _HFI_VDBG("Closing and Unlinking Semaphore: %s.\n", sem_affinity_shm_rw_name); sem_close(sem_affinity_shm_rw); sem_unlink(sem_affinity_shm_rw_name); psmi_free(sem_affinity_shm_rw_name); sem_affinity_shm_rw_name = NULL; psmi_affinity_semaphore_open = 0; } psmi_hal_finalize(); #ifdef PSM_CUDA if (is_cuda_primary_context_retain) { /* * This code will be called during deinitialization, and if * CUDA is deinitialized before PSM, then * CUDA_ERROR_DEINITIALIZED will happen here */ CUdevice device; if (psmi_cuCtxGetDevice(&device) == CUDA_SUCCESS) PSMI_CUDA_CALL(cuDevicePrimaryCtxRelease, device); } #endif psmi_refcount = PSMI_FINALIZED; PSM2_LOG_MSG("leaving"); psmi_log_fini(); psmi_stats_deregister_all(); psmi_heapdebug_finalize(); return PSM2_OK; } PSMI_API_DECL(psm2_finalize) /* * Function exposed in >= 1.05 */ psm2_error_t __psm2_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames) { int i; psm2_error_t err = PSM2_OK; PSM2_LOG_MSG("entering"); PSMI_ERR_UNLESS_INITIALIZED(NULL); if (nids == NULL || hostnames == NULL) { err = PSM2_PARAM_ERR; goto fail; } for (i = 0; i < num; i++) { if ((err = psmi_epid_set_hostname(nids[i], hostnames[i], 1))) break; } fail: PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_map_nid_hostname) void __psm2_epaddr_setlabel(psm2_epaddr_t epaddr, char const *epaddr_label) { PSM2_LOG_MSG("entering"); PSM2_LOG_MSG("leaving"); return; /* ignore this function */ } PSMI_API_DECL(psm2_epaddr_setlabel) void __psm2_epaddr_setctxt(psm2_epaddr_t epaddr, void *ctxt) { /* Eventually deprecate this API to use set/get opt as this is unsafe. */ PSM2_LOG_MSG("entering"); psm2_setopt(PSM2_COMPONENT_CORE, (const void *)epaddr, PSM2_CORE_OPT_EP_CTXT, (const void *)ctxt, sizeof(void *)); PSM2_LOG_MSG("leaving"); } PSMI_API_DECL(psm2_epaddr_setctxt) void *__psm2_epaddr_getctxt(psm2_epaddr_t epaddr) { psm2_error_t err; uint64_t optlen = sizeof(void *); void *result = NULL; PSM2_LOG_MSG("entering"); /* Eventually deprecate this API to use set/get opt as this is unsafe. 
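The setopt/getopt pair defined later in this file is the preferred
replacement; an illustrative equivalent of the setctxt/getctxt calls,
mirroring the wrappers' own internal usage (my_ctxt is a placeholder
void pointer):

    void *out = NULL;
    uint64_t len = sizeof(void *);
    psm2_setopt(PSM2_COMPONENT_CORE, epaddr, PSM2_CORE_OPT_EP_CTXT,
                my_ctxt, sizeof(void *));
    psm2_getopt(PSM2_COMPONENT_CORE, epaddr, PSM2_CORE_OPT_EP_CTXT,
                &out, &len);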
*/ err = psm2_getopt(PSM2_COMPONENT_CORE, (const void *)epaddr, PSM2_CORE_OPT_EP_CTXT, (void *)&result, &optlen); PSM2_LOG_MSG("leaving"); if (err == PSM2_OK) return result; else return NULL; } PSMI_API_DECL(psm2_epaddr_getctxt) psm2_error_t __psm2_setopt(psm2_component_t component, const void *component_obj, int optname, const void *optval, uint64_t optlen) { psm2_error_t rv; PSM2_LOG_MSG("entering"); switch (component) { case PSM2_COMPONENT_CORE: rv = psmi_core_setopt(component_obj, optname, optval, optlen); PSM2_LOG_MSG("leaving"); return rv; break; case PSM2_COMPONENT_MQ: /* Use the deprecated MQ set/get opt for now which does not use optlen */ rv = psm2_mq_setopt((psm2_mq_t) component_obj, optname, optval); PSM2_LOG_MSG("leaving"); return rv; break; case PSM2_COMPONENT_AM: /* Hand off to active messages */ rv = psmi_am_setopt(component_obj, optname, optval, optlen); PSM2_LOG_MSG("leaving"); return rv; break; case PSM2_COMPONENT_IB: /* Hand off to IPS ptl to set option */ rv = psmi_ptl_ips.setopt(component_obj, optname, optval, optlen); PSM2_LOG_MSG("leaving"); return rv; break; } /* Unrecognized/unknown component */ rv = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown component %u", component); PSM2_LOG_MSG("leaving"); return rv; } PSMI_API_DECL(psm2_setopt); psm2_error_t __psm2_getopt(psm2_component_t component, const void *component_obj, int optname, void *optval, uint64_t *optlen) { psm2_error_t rv; PSM2_LOG_MSG("entering"); switch (component) { case PSM2_COMPONENT_CORE: rv = psmi_core_getopt(component_obj, optname, optval, optlen); PSM2_LOG_MSG("leaving"); return rv; break; case PSM2_COMPONENT_MQ: /* Use the deprecated MQ set/get opt for now which does not use optlen */ rv = psm2_mq_getopt((psm2_mq_t) component_obj, optname, optval); PSM2_LOG_MSG("leaving"); return rv; break; case PSM2_COMPONENT_AM: /* Hand off to active messages */ rv = psmi_am_getopt(component_obj, optname, optval, optlen); PSM2_LOG_MSG("leaving"); return rv; break; case PSM2_COMPONENT_IB: /* Hand off to IPS ptl to set option */ rv = psmi_ptl_ips.getopt(component_obj, optname, optval, optlen); PSM2_LOG_MSG("leaving"); return rv; break; } /* Unrecognized/unknown component */ rv = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown component %u", component); PSM2_LOG_MSG("leaving"); return rv; } PSMI_API_DECL(psm2_getopt); psm2_error_t __psmi_poll_noop(ptl_t *ptl, int replyonly) { PSM2_LOG_MSG("entering"); PSM2_LOG_MSG("leaving"); return PSM2_OK_NO_PROGRESS; } PSMI_API_DECL(psmi_poll_noop) psm2_error_t __psm2_poll(psm2_ep_t ep) { psm2_error_t err1 = PSM2_OK, err2 = PSM2_OK; psm2_ep_t tmp; PSM2_LOG_MSG("entering"); PSMI_ASSERT_INITIALIZED(); PSMI_LOCK(ep->mq->progress_lock); tmp = ep; do { err1 = ep->ptl_amsh.ep_poll(ep->ptl_amsh.ptl, 0); /* poll reqs & reps */ if (err1 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */ PSMI_UNLOCK(ep->mq->progress_lock); PSM2_LOG_MSG("leaving"); return err1; } err2 = ep->ptl_ips.ep_poll(ep->ptl_ips.ptl, 0); /* get into ips_do_work */ if (err2 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */ PSMI_UNLOCK(ep->mq->progress_lock); PSM2_LOG_MSG("leaving"); return err2; } ep = ep->mctxt_next; } while (ep != tmp); /* This is valid because.. 
* PSM2_OK & PSM2_OK_NO_PROGRESS => PSM2_OK * PSM2_OK & PSM2_OK => PSM2_OK * PSM2_OK_NO_PROGRESS & PSM2_OK => PSM2_OK * PSM2_OK_NO_PROGRESS & PSM2_OK_NO_PROGRESS => PSM2_OK_NO_PROGRESS */ PSMI_UNLOCK(ep->mq->progress_lock); PSM2_LOG_MSG("leaving"); return (err1 & err2); } PSMI_API_DECL(psm2_poll) psm2_error_t __psmi_poll_internal(psm2_ep_t ep, int poll_amsh) { psm2_error_t err1 = PSM2_OK_NO_PROGRESS; psm2_error_t err2; psm2_ep_t tmp; PSM2_LOG_MSG("entering"); PSMI_LOCK_ASSERT(ep->mq->progress_lock); tmp = ep; do { if (poll_amsh) { err1 = ep->ptl_amsh.ep_poll(ep->ptl_amsh.ptl, 0); /* poll reqs & reps */ if (err1 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */ PSM2_LOG_MSG("leaving"); return err1; } } err2 = ep->ptl_ips.ep_poll(ep->ptl_ips.ptl, 0); /* get into ips_do_work */ if (err2 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */ PSM2_LOG_MSG("leaving"); return err2; } ep = ep->mctxt_next; } while (ep != tmp); PSM2_LOG_MSG("leaving"); return (err1 & err2); } PSMI_API_DECL(psmi_poll_internal) #ifdef PSM_PROFILE /* These functions each have weak symbols */ void psmi_profile_block() { ; /* empty for profiler */ } void psmi_profile_unblock() { ; /* empty for profiler */ } void psmi_profile_reblock(int did_no_progress) { ; /* empty for profiler */ } #endif opa-psm2-PSM2_11.2.185/psm2.h000066400000000000000000002277061370564314600152710ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2017 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2017 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef PSM2_H #define PSM2_H #include #include #ifdef __cplusplus extern "C" { #endif /*! * @file psm2.h * @page psm2_main PSM2 API * * @brief PSM2 OPA Messaging Library * * The PSM2 OPA Messaging API, or PSM2 API, is Intel's low-level * user-level communications interface for the OPA family of products. * PSM2 users are enabled with mechanisms necessary to implement higher level * communications interfaces in parallel environments. * * Since PSM2 targets clusters of multicore processors, it internally implements * two levels of communication: intra-node shared memory communication and * inter-node OPA communication. Both of these levels are encapsulated * below the interface and the user is free to assume that intra-node and * inter-node communication is transparently handled within PSM. * * @section compat Compatibility * * PSM2 can coexist with other QLogic/Pathscale software distributions, such as * OpenIB/OpenFabrics, which allows applications to simultaneously target * PSM-based and non PSM-based applications on a single node without changing * any system-level configuration. However, PSM2 does not support running * PSM-based and non PSM-based communication within the same user process. * * Except where noted, PSM2 does not assume an SPMD (single program, multiple * data) parallel model and extends to MPMD (multiple program, multiple data) * environments in specific areas. However, PSM2 assumes the runtime environment * to be homogeneous on all nodes in bit width (32-bit or 64-bit) and endianness * (little or big) and will fail at startup if any of these assumptions do not * hold. For homogeneous systems PSM2 can run either in 32-bit or 64-bit * environments. Even though both environments should expect similar * performance from the API, PSM2 has chosen to favor 64-bit environments in * some minor areas. * * @section ep_model Endpoint Communication Model * * PSM2 follows an endpoint communication model where an endpoint is defined as * an object (or handle) instantiated to support sending and receiving messages * to other endpoints. In order to prevent PSM2 from being tied to a particular * parallel model (such as SPMD), control over the parallel layout of endpoints * is retained by the user. Opening endpoints (@ref psm2_ep_open) and * connecting endpoints to enable communication (@ref psm2_ep_connect) are two * decoupled mechanisms. Users that do not dynamically change the number of * endpoints beyond parallel startup will probably lump both mechanisms * together at startup. Users that wish to manipulate the location and number * of endpoints at runtime can do so by explicitly connecting sets or subsets * of endpoints. * * As a side effect, this greater flexibility forces the user to cope with a * two-stage initialization process. In the first stage of opening an endpoint * (@ref psm2_ep_open), a user obtains an opaque handle to the endpoint and a * globally distributable endpoint identifier (@ref psm2_epid_t). Prior to the * second stage of connecting endpoints (@ref psm2_ep_connect), a user must * distribute all relevent endpoint identifiers through an out-of-band * mechanism. Once the endpoint identifiers are successfully distributed to * all processes that wish to communicate, the user * connects all endpoint identifiers to the locally opened endpoint * (@ref psm2_ep_connect). In connecting the endpoints, the user obtains an * opaque endpoint address (@ref psm2_epaddr_t), which is required for all PSM * communication primitives. 
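 *
 * A minimal sketch of this two-stage sequence (illustrative only:
 * job_key and the out-of-band exchange_epids() helper are
 * application-specific placeholders, and error checking is omitted):
 *
 * @code
 * psm2_ep_t ep;
 * psm2_epid_t my_epid, peer_epid;
 * psm2_epaddr_t peer_addr;
 * psm2_error_t epid_err;
 * struct psm2_ep_open_opts opts;
 *
 * psm2_ep_open_opts_get_defaults(&opts);
 * psm2_ep_open(job_key, &opts, &ep, &my_epid);    // stage 1: open
 * exchange_epids(my_epid, &peer_epid);            // out-of-band step
 * psm2_ep_connect(ep, 1, &peer_epid, NULL, &epid_err,
 *                 &peer_addr, 0);                 // stage 2: connect
 * @endcode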
* * * @section components PSM2 Components * * PSM2 exposes a single endpoint initialization model, but enables various * levels of communication functionality and semantics through @e components. * The first major component available in PSM2 is PSM2 Matched Queues * (@ref psm2_mq), and the second is PSM2 Active Message (@ref psm2_am). * * Matched Queues (MQ) present a queue-based communication model with the * distinction that queue consumers use a 3-tuple of metadata to match incoming * messages against a list of preposted receive buffers. The MQ semantics are * sufficiently akin to MPI to cover the entire MPI-1.2 standard. * * The Active Message (AM) component presents a request/reply model where * the arrival of a message triggers the execution of consumer-provided * handler code. This can be used to implement many one-sided and two-sided * communications paradigms. * * With future releases of the PSM2 interface, more components will * be exposed to accommodate users that implement parallel communication * models that deviate from the Matched Queue semantics. For example, PSM * plans to expose a connection management component to make it easier to * handle endpoint management for clients without their own connection * managers. * * * @section progress PSM2 Communication Progress Guarantees * * PSM2 internally ensures progress of both intra-node and inter-node messages, * but not autonomously. This means that while performance does not depend * greatly on how the user decides to schedule communication progress, * explicit progress calls are required for correctness. The @ref psm2_poll * function is available to make progress over all PSM2 components in a generic * manner. For more information on making progress over many communication * operations in the MQ component, see the @ref mq_progress documentation. * * * @section completion PSM2 Completion semantics * * PSM2 implements the MQ component, which documents its own * message completion semantics (@ref mq_completion). * * * @section error_handling PSM2 Error handling * * PSM2 exposes a list of user and runtime errors enumerated in @ref psm2_error. * While most errors are fatal in that the user is not expected to be able to * recover from them, PSM2 still allows some level of control. By * default, PSM2 returns all errors to the user but as a convenience, allows * users to either defer errors internally to PSM2 or to have PSM2 return all * errors to the user (callers to PSM2 functions). PSM2 attempts to deallocate * its resources as a best effort, but exits are always non-collective with * respect to endpoints opened in other processes. The user is expected to be * able to handle non-collective exits from any endpoint and in turn cleanly * and independently terminate the parallel environment. Local error handling * can be handled in three modes: * * Errors and error handling can be individually registered either globally or * per-endpoint: * @li @b Per-endpoint error handling captures errors for functions where the * error scoping is determined to be over an endpoint. This includes all * communication functions that include an EP or MQ handle as the first * parameter. * * @li @b Global error handling captures errors for functions where a * particular endpoint cannot be identified or for @ref psm2_ep_open, where * errors (if any) occur before the endpoint is opened. * * Error handling is controlled by registering error handlers (@ref * psm2_error_register_handler). 
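 *
 * For example (a hedged sketch; any custom handler matching the
 * psm2_ep_errhandler_t signature can stand in for the predefined
 * handlers):
 *
 * @code
 * // Return every error to the caller, process-wide:
 * psm2_error_register_handler(NULL, PSM2_ERRHANDLER_NO_HANDLER);
 * // Let PSM2's default handler deal with errors on one endpoint:
 * psm2_error_register_handler(ep, PSM2_ERRHANDLER_PSM_HANDLER);
 * @endcode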
The global error handler can * be set at any time (even before @ref psm2_init), whereas a per-endpoint error * handler can be set as soon as a new endpoint is successfully created. If a * per-endpoint handler is not registered, the per-endpoint handler inherits * from the global error handler at time of open. * * PSM2 predefines two different mechanisms for handling errors: * * @li PSM-internal error handler (@ref PSM2_ERRHANDLER_PSM_HANDLER) * @li No-op PSM2 error handler where errors are returned * (@ref PSM2_ERRHANDLER_NO_HANDLER) * * The default PSM-internal error handler effectively frees the user from * explicitly handling the return values of every PSM2 function but may not * return to the user in a function determined to have caused a fatal error. * * The No-op PSM2 error handler bypasses all error handling functionality and * always returns the error to the user. The user can then use @ref * psm2_error_get_string to obtain a generic string from an error code (compared * to a more detailed error message available through registering of error * handlers). * * For even more control, users can register their own error handlers to have * access to more precise error strings and selectively control when and when * not to return to callers of PSM2 functions. All error handlers shown defer * error handling to PSM2, using @ref psm2_error_defer, for errors that are not * recognized. Deferring an error from a custom error handler is * equivalent to relying on the default error handler. * * @section env_var Environment variables * * Some PSM2 behaviour can be controlled via environment variables. * * @li @b PSM2_DEVICES. PSM2 implements three devices for communication which * are, in order, @c self, @c shm and @c hfi. For PSM2 jobs that do not * require shared-memory communications, @b PSM2_DEVICES can be specified as @c * self, @c hfi. Similarly, for shared-memory only jobs, the @c hfi device * can be disabled. It is up to the user to ensure that the endpoint ids * passed in @ref psm2_ep_connect do not require a device that has been * explicitly disabled by the user. In some instances, enabling only the * devices that are required may improve performance. * * @li @b PSM2_TRACEMASK. Depending on the value of the tracemask, various parts * of PSM2 will output debugging information. With a default value of @c 0x1, * informative messages will be printed (this value should be considered a * minimum). At @c 0x101, startup and finalization messages are added to the * output. At @c 0x1c3, every communication event is logged and should hence * be used for extreme debugging only. * * @li @b PSM2_MULTI_EP. By default, only one PSM2 endpoint may be opened in * a process. With the correct setting of this environment variable, a process * may open more than one PSM2 endpoint. In order to enable multiple endpoints * per process support, the value of this environment variable should be set * to "1" or "yes". * * @section thr_sfty Thread safety and reentrancy * Unless specifically noted otherwise, all PSM2 functions should not be considered * to be thread safe or reentrant. */ /** @brief Local endpoint handle (opaque) * @ingroup ep * * Handle returned to the user when a new local endpoint is created. The * handle is a local handle to be used in all communication functions and is * not intended to globally identify the opened endpoint in any way.
* * All open endpoint handles can be globally identified using the endpoint id * integral type (@ref psm2_epid_t) and all communication must use an endpoint * address (@ref psm2_epaddr_t) that can be obtained by connecting a local * endpoint to one or more endpoint identifiers. * * @remark The local endpoint handle is opaque to the user. */ typedef struct psm2_ep *psm2_ep_t; /** @brief MQ handle (opaque) * @ingroup mq * * Handle returned to the user when a new Matched queue is created (@ref * psm2_mq_init). */ typedef struct psm2_mq *psm2_mq_t; /*! @defgroup init PSM2 Initialization and Maintenance * @{ */ #define PSM2_VERNO 0x0202 /*!< Header-defined Version number */ #define PSM2_VERNO_MAJOR 0x02 /*!< Header-defined Major Version Number */ #define PSM2_VERNO_MINOR 0x02 /*!< Header-defined Minor Version Number */ #define PSM2_VERNO_COMPAT_MAJOR 0x01 /*!< Header-defined Compat Major Version Number */ /** @brief Initialize the PSM2 interface * * Call to initialize the PSM2 library for a desired API revision number. * * @param[in,out] api_verno_major As input, the major API version number the * caller was compiled against (@ref PSM2_VERNO_MAJOR); as output, updated * with the major version number of the loaded library. * @param[in,out] api_verno_minor As input, the minor API version number the * caller was compiled against (@ref PSM2_VERNO_MINOR); as output, updated * with the minor version number of the loaded library. * * @returns PSM2_OK The PSM2 interface could be opened and the desired API * revision can be provided. * * @code{.c}
   int try_to_initialize_psm2() {
       int verno_major = PSM2_VERNO_MAJOR;
       int verno_minor = PSM2_VERNO_MINOR;

       int err = psm2_init(&verno_major, &verno_minor);
       if (err || verno_major > PSM2_VERNO_MAJOR) {
           if (err)
               fprintf(stderr, "PSM2 initialization failure: %s\n",
                       psm2_error_get_string(err));
           else
               fprintf(stderr, "PSM2 loaded an unexpected/unsupported "
                               "version (%d.%d)\n", verno_major, verno_minor);
           return -1;
       }

       // We were able to initialize PSM2 but will defer all further error
       // handling since most of the errors beyond this point will be fatal.
       err = psm2_error_register_handler(NULL, // Global handler
                                         PSM2_ERRHANDLER_PSM_HANDLER);
       if (err) {
           fprintf(stderr, "Couldn't register global errhandler: %s\n",
                   psm2_error_get_string(err));
           return -1;
       }
       return 1;
   }
   @endcode */ psm2_error_t psm2_init(int *api_verno_major, int *api_verno_minor); /*! @brief PSM2 capabilities definitions * * Each capability is defined as a separate bit, * i.e. next capabilities must be defined as * consecutive bits: 0x2, 0x4 ... and so on. */ #define PSM2_MULTI_EP_CAP 0x1 /* Multiple Endpoints capability */ #define PSM2_LIB_REFCOUNT_CAP 0x2 /* Library finalization is managed with reference count */ /** @brief PSM2 capabilities provider * * @param[in] req_cap_mask Requested capabilities are given as bit field. * * @returns internal capabilities bit field ANDed with a requested bit mask */ uint64_t psm2_get_capability_mask(uint64_t req_cap_mask); /** @brief Finalize PSM2 interface * * Single call to finalize PSM2 and close all unclosed endpoints * * @post The user guarantees not to make any further PSM2 calls, including @ref * psm2_init. * * @returns PSM2_OK Always returns @c PSM2_OK */ psm2_error_t psm2_finalize(void); /** @brief Error handling opaque token * * A token is required for users that register their own handlers and wish to * defer further error handling to PSM. */ typedef struct psm2_error_token *psm2_error_token_t; /** @brief Error handling function * * Users can handle errors explicitly instead of relying on PSM's own error * handler. There is one global error handler and error handlers that can be * individually set for each opened endpoint. By default, endpoints will * inherit the global handler registered at the time of open. * * @param[in] ep Handle associated to the endpoint over which the error occurred * or @c NULL if the error is being handled by the global error * handler. * @param[in] error PSM2 error identifier * @param[in] error_string A descriptive error string of maximum length @ref * PSM2_ERRSTRING_MAXLEN. * @param[in] token Opaque PSM2 token associated with the particular event that * generated the error. The token can be used to extract the * error string and can be passed to @ref psm2_error_defer to * defer any remaining or unhandled error handling to PSM.
* * @post If the error handler returns, the error returned is propagated to the * caller. */ typedef psm2_error_t(*psm2_ep_errhandler_t) (psm2_ep_t ep, const psm2_error_t error, const char *error_string, psm2_error_token_t token); #define PSM2_ERRHANDLER_DEFAULT ((psm2_ep_errhandler_t)-1) /**< Obsolete names, only here for backwards compatibility */ #define PSM2_ERRHANDLER_NOP ((psm2_ep_errhandler_t)-2) /**< Obsolete names, only here for backwards compatibility */ #define PSM2_ERRHANDLER_PSM_HANDLER ((psm2_ep_errhandler_t)-1) /**< PSM2 error handler as explained in @ref error_handling */ #define PSM2_ERRHANDLER_NO_HANDLER ((psm2_ep_errhandler_t)-2) /**< Bypasses the default PSM2 error handler and returns all errors to the user * (this is the default) */ #define PSM2_ERRSTRING_MAXLEN 512 /**< Maximum error string length. */ /** @brief PSM2 error handler registration * * Function to register error handlers on a global basis and on a per-endpoint * basis. PSM2_ERRHANDLER_PSM_HANDLER and PSM2_ERRHANDLER_NO_HANDLER are special * pre-defined handlers to respectively enable use of the default PSM-internal * handler or the no-handler that disables registered error handling and * returns all errors to the caller (both are documented in @ref * error_handling). * * @param[in] ep Handle of the endpoint over which the error handler should be * registered. With ep set to @c NULL, the behavior of the * global error handler can be controlled. * @param[in] errhandler Handler to register. Can be a user-specific error * handling function or PSM2_ERRHANDLER_PSM_HANDLER or * PSM2_ERRHANDLER_NO_HANDLER. * * @remark When ep is set to @c NULL, this is the only function that can be * called before @ref psm2_init */ psm2_error_t psm2_error_register_handler(psm2_ep_t ep, const psm2_ep_errhandler_t errhandler); /** @brief PSM2 deferred error handler * * Function to handle fatal PSM2 errors if no error handler is installed or if * the user wishes to defer further error handling to PSM. Depending on the * type of error, PSM2 may or may not return from the function call. * * @param[in] err_token Error token initially passed to error handler * * @pre The user is calling into the function because it has decided that PSM * should handle an error case. * * @post The function may or may not return depending on the error */ psm2_error_t psm2_error_defer(psm2_error_token_t err_token); /** @brief Get generic error string from error * * Function to return the default error string associated to a PSM2 error. * * While a more detailed and precise error string is usually available within * error handlers, this function is available to obtain an error string out of * an error handler context or when a no-op error handler is registered. * * @param[in] error PSM2 error */ const char *psm2_error_get_string(psm2_error_t error); /** @brief Option key/pair structure * * Currently only used in MQ. */ struct psm2_optkey { uint32_t key; /**< Option key */ void *value; /**< Option value */ }; /*! @} */ /*! @defgroup ep PSM2 Device Endpoint Management * @{ */ /** @brief Endpoint ID * * Integral type of size 8 bytes that can be used by the user to globally * identify a successfully opened endpoint. Although the contents of the * endpoint id integral type remains opaque to the user, unique network id and * OPA port number can be extracted using @ref psm2_epid_nid and @ref * psm2_epid_context. 
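 *
 * For illustration, a hedged sketch (printing is arbitrary; requires
 * <stdio.h> and <inttypes.h>) of decomposing an endpoint id with the
 * accessor functions declared below:
 * @code{.c}
   void print_epid(psm2_epid_t epid)
   {
       printf("epid=0x%" PRIx64 " nid=0x%" PRIx64 " context=%" PRIu64 "\n",
              (uint64_t)epid, psm2_epid_nid(epid), psm2_epid_context(epid));
   }
   @endcode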
*/ typedef uint64_t psm2_epid_t; /** @brief Endpoint Address (opaque) * * Remote endpoint addresses are created when the user binds an endpoint ID * to a particular endpoint handle using @ref psm2_ep_connect. A given endpoint * address is only guaranteed to be valid over a single endpoint. */ typedef struct psm2_epaddr *psm2_epaddr_t; /** @brief PSM2 Unique UID * * PSM2 type equivalent to the DCE-1 uuid_t, used to uniquely identify an * endpoint within a particular job. Since PSM2 does not participate in job * allocation and management, users are expected to generate a unique ID to * associate endpoints to a particular parallel or collective job. * @see psm2_uuid_generate */ typedef uint8_t psm2_uuid_t[16]; /** @brief Get Endpoint identifier's Unique Network ID */ uint64_t psm2_epid_nid(psm2_epid_t epid); /** @brief Get Endpoint identifier's OPA context number */ uint64_t psm2_epid_context(psm2_epid_t epid); /** @brief Get Endpoint identifier's OPA port (deprecated, use * @ref psm2_epid_context instead) */ uint64_t psm2_epid_port(psm2_epid_t epid); /** @brief List the number of available OPA units * * Function used to determine the number of locally available OPA units. * For @c N units, valid unit numbers in @ref psm2_ep_open are @c 0 to @c N-1. * * @returns PSM2_OK unless the user has not called @ref psm2_init */ psm2_error_t psm2_ep_num_devunits(uint32_t *num_units); /** @brief Utility to generate UUIDs for @ref psm2_ep_open * * This function is available as a utility for generating unique job-wide ids. * See discussion in @ref psm2_ep_open for further information. * * @remark This function does not require PSM2 to be initialized. */ void psm2_uuid_generate(psm2_uuid_t uuid_out); /* Affinity modes for the affinity member of struct psm2_ep_open_opts */ #define PSM2_EP_OPEN_AFFINITY_SKIP 0 /**< Disable setting affinity */ #define PSM2_EP_OPEN_AFFINITY_SET 1 /**< Enable setting affinity unless already set */ #define PSM2_EP_OPEN_AFFINITY_FORCE 2 /**< Enable setting affinity regardless of current affinity setting */ /* Default values for some constants */ #define PSM2_EP_OPEN_PKEY_DEFAULT 0xffffffffffffffffULL /**< Default protection key */ /** @brief Endpoint Open Options * * These options are available for opening a PSM2 endpoint. Each is * individually documented and setting each option to -1 or passing NULL as the * options parameter in @ref psm2_ep_open instructs PSM2 to use * implementation-defined defaults. * * Each option is documented in @ref psm2_ep_open */ struct psm2_ep_open_opts { int64_t timeout; /**< timeout in nanoseconds to open device */ int unit; /**< OPA Unit ID to open on */ int affinity; /**< How PSM2 should set affinity */ int shm_mbytes; /**< Megabytes used for intra-node, deprecated */ int sendbufs_num; /**< Preallocated send buffers */ uint64_t network_pkey; /**< Network Protection Key (v1.01) */ int port; /**< IB port to use (1 to N) */ int outsl; /**< IB SL to use when sending pkts */ uint64_t service_id; /* IB Service ID to use for endpoint */ psm2_path_res_t path_res_type; /* Path resolution type */ int senddesc_num; /* Preallocated send descriptors */ int imm_size; /* Immediate data size for endpoint */ }; /** @brief OPA endpoint creation * * Function used to create a new local communication endpoint on an OPA * adapter. The returned endpoint handle is required in all PSM2 communication * operations, as PSM2 can manage communication over multiple endpoints. 
An * opened endpoint has no global context until the user connects the endpoint * to other global endpoints by way of @ref psm2_ep_connect. All local endpoint * handles are globally identified by endpoint IDs (@ref psm2_epid_t) which are * also returned when an endpoint is opened. It is assumed that the user can * provide an out-of-band mechanism to distribute the endpoint IDs in order to * establish connections between endpoints (@ref psm2_ep_connect for more * information). * * @param[in] unique_job_key Endpoint key, to uniquely identify the endpoint in * a parallel job. It is up to the user to ensure * that the key is globally unique over a period long * enough to prevent duplicate keys over the same set * of endpoints (see comments below). * * @param[in] opts Open options of type @ref psm2_ep_open_opts * (see @ref psm2_ep_open_opts_get_defaults). * * @param[out] ep User-supplied storage to return a pointer to the newly * created endpoint. The returned pointer of type @ref psm2_ep_t * is a local handle and cannot be used to globally identify the * endpoint. * @param[out] epid User-supplied storage to return the endpoint ID associated * to the newly created local endpoint returned in the @c ep * handle. The endpoint ID is an integral type suitable for * uniquely identifying the local endpoint. * * PSM2 does not internally verify the consistency of the uuid; it is up to the * user to ensure that the uuid is unique enough not to collide with other * currently-running jobs. Users can employ three mechanisms to obtain a uuid. * * 1. Use the supplied @ref psm2_uuid_generate utility * * 2. Use an OS or library-specific uuid generation utility, that complies with * OSF DCE 1.1, such as @c uuid_generate on Linux or @c uuid_create on * FreeBSD. * (see http://www.opengroup.org/onlinepubs/009629399/uuid_create.htm) * * 3. Manually pack a 16-byte string using a utility such as /dev/random or * other source with enough entropy and proper seeding to prevent two nodes * from generating the same uuid_t. * * The following options are relevant when opening an endpoint: * @li @c timeout establishes the number of nanoseconds to wait before * failing to open a port (with -1, defaults to 15 seconds). * @li @c unit sets the OPA unit number to use to open a port (with * -1, PSM2 determines the best unit to open the port). If @c * HFI_UNIT is set in the environment, this setting is ignored. * @li @c affinity enables or disables PSM2 setting processor affinity. The * option can be controlled to either disable (@ref * PSM2_EP_OPEN_AFFINITY_SKIP) or enable the affinity setting * only if it is already unset (@ref * PSM2_EP_OPEN_AFFINITY_SET) or regardless of affinity being * set or not (@ref PSM2_EP_OPEN_AFFINITY_FORCE). * If @c HFI_NO_CPUAFFINITY is set in the environment, this * setting is ignored. * @li @c shm_mbytes sets a maximum number of megabytes that can be allocated * to each local endpoint ID connected through this * endpoint (with -1, defaults to 10 MB). * @li @c sendbufs_num sets the number of send buffers that can be * pre-allocated for communication (with -1, defaults to * 512 buffers of MTU size). * @li @c network_pkey sets the protection key to employ for point-to-point * PSM2 communication. Unless a specific value is used, * this parameter should be set to * PSM2_EP_OPEN_PKEY_DEFAULT. * * @warning By default, PSM2 limits the user to calling @ref psm2_ep_open only * once per process and subsequent calls will fail.
In order to enable creation * of multiple endpoints per process, one must properly set the environment variable * @ref PSM2_MULTI_EP before calling @ref psm2_init. * * @code{.c}
   // In order to open an endpoint and participate in a job, each endpoint has
   // to be distributed a unique 16-byte UUID key from an out-of-band source.
   // Presumably this can come from the parallel spawning utility either
   // indirectly through an implementor's own spawning interface or, as in this
   // example, the UUID is set as a string in an environment variable
   // propagated to all endpoints in the job.
   int try_to_open_psm2_endpoint(psm2_ep_t *ep, // output endpoint handle
                                 psm2_epid_t *epid, // output endpoint identifier
                                 int unit,  // unit of our choice
                                 int port)  // port of our choice
   {
       struct psm2_ep_open_opts epopts;
       psm2_uuid_t job_uuid;
       char *c;

       // Let PSM2 assign its default values to the endpoint options.
       psm2_ep_open_opts_get_defaults(&epopts);

       // We want a stricter timeout and a specific unit
       epopts.timeout = 15*1e9;  // 15 second timeout
       epopts.unit = unit;       // We want a specific unit, -1 would let PSM
                                 // choose the unit for us.
       epopts.port = port;       // We want a specific port, <= 0 would let PSM
                                 // choose the port for us.

       // We've already set affinity, don't let PSM2 do so if it wants to.
       if (epopts.affinity == PSM2_EP_OPEN_AFFINITY_SET)
           epopts.affinity = PSM2_EP_OPEN_AFFINITY_SKIP;

       // ENDPOINT_UUID is set to the same value in the environment of all the
       // processes that wish to communicate over PSM2 and was generated by
       // the process spawning utility
       c = getenv("ENDPOINT_UUID");
       if (c && *c)
           implementor_string_to_16byte_packing(c, job_uuid);
       else {
           fprintf(stderr, "Can't find UUID for endpoint\n");
           return -1;
       }

       // Assume we don't want to handle errors here.
       psm2_ep_open(job_uuid, &epopts, ep, epid);
       return 1;
   }
   @endcode */ psm2_error_t psm2_ep_open(const psm2_uuid_t unique_job_key, const struct psm2_ep_open_opts *opts, psm2_ep_t *ep, psm2_epid_t *epid); /** @brief Endpoint open default options. * * Function used to initialize the set of endpoint options to their default * values for use in @ref psm2_ep_open. * * @param[out] opts Endpoint Open options. * * @warning For portable operation, users should always call this function * prior to calling @ref psm2_ep_open. * * @return PSM2_OK If result could be updated * @return PSM2_INIT_NOT_INIT If psm has not been initialized. */ psm2_error_t psm2_ep_open_opts_get_defaults(struct psm2_ep_open_opts *opts); /** @brief Endpoint shared memory query * * Function used to determine if a remote endpoint shares memory with a * currently opened local endpoint. * * @param[in] ep Endpoint handle * @param[in] epid Endpoint ID * * @param[out] result Result is non-zero if the remote endpoint shares memory with the local * endpoint @c ep, or zero otherwise. * * @return PSM2_OK If result could be updated * @return PSM2_EPID_UNKNOWN If the epid is not recognized */ psm2_error_t psm2_ep_epid_share_memory(psm2_ep_t ep, psm2_epid_t epid, int *result); /** @brief Close endpoint * @param[in] ep PSM2 endpoint handle * @param[in] mode One of @ref PSM2_EP_CLOSE_GRACEFUL or @ref PSM2_EP_CLOSE_FORCE * @param[in] timeout How long to wait in nanoseconds if mode is * PSM2_EP_CLOSE_GRACEFUL, 0 waits forever. If @c mode is * @ref PSM2_EP_CLOSE_FORCE, this parameter is ignored. * * The following errors are returned, others are handled by the per-endpoint * error handler: * * @return PSM2_OK Endpoint was successfully closed without force or * successfully closed with force within the supplied timeout.
* @return PSM2_EP_CLOSE_TIMEOUT Endpoint could not be successfully closed * within timeout. */ psm2_error_t psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout); #define PSM2_EP_CLOSE_GRACEFUL 0 /**< Graceful mode in @ref psm2_ep_close */ #define PSM2_EP_CLOSE_FORCE 1 /**< Forceful mode in @ref psm2_ep_close */ /** @brief Provide mappings for network id to hostname * * Since PSM2 does not assume or rely on the availability of an external * networkid-to-hostname mapping service, users can provide one or more of * these mappings. The @ref psm2_map_nid_hostname function allows a list of * network ids to be associated to hostnames. * * This function is not mandatory for correct operation but may allow PSM2 to * provide better diagnostics when remote endpoints are unavailable and can * otherwise only be identified by their network id. * * @param[in] num Number of elements in the @c nids and @c hostnames arrays * @param[in] nids User-provided array of network ids (i.e. OPA LIDs), * should be obtained by calling @ref psm2_epid_nid on each * epid. * @param[in] hostnames User-provided array of hostnames (array of * NUL-terminated strings) where each hostname index * maps to the provided nid hostname. * * @warning Duplicate nids may be provided in the input @c nids array; only * the first corresponding hostname will be remembered. * * @pre The user may or may not have already provided hostname mappings. * @post The user may free any dynamically allocated memory passed to the * function. * */ psm2_error_t psm2_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames); /** @brief Connect one or more remote endpoints to a local endpoint * * Function to non-collectively establish a connection to a set of endpoint IDs * and translate endpoint IDs into endpoint addresses. Establishing a remote * connection with a set of remote endpoint IDs does not imply a collective * operation and the user is free to connect unequal sets on each process. * Similarly, a given endpoint address does not imply that a pairwise * communication context exists between the local endpoint and remote endpoint. * * @param[in] ep PSM2 endpoint handle * * @param[in] num_of_epid The number of endpoints to connect to, which * also establishes the number of elements contained in * all of the function's array-based parameters. * * @param[in] array_of_epid User-allocated array that contains @c num_of_epid * valid endpoint identifiers. Each endpoint id (or * epid) has been obtained through an out-of-band * mechanism and each endpoint must have been opened * with the same uuid key. * * @param[in] array_of_epid_mask User-allocated array that contains * @c num_of_epid integers. This array of masks * allows users to select which of the epids in @c * array_of_epid should be connected. If the integer * at index i is zero, psm does not attempt to connect * to the epid at index i in @c array_of_epid. If * this parameter is NULL, psm will try to connect to * each epid. * * @param[out] array_of_errors User-allocated array of at least @c num_of_epid * elements. If the function does not return * PSM2_OK, this array can be consulted for each * endpoint not masked off by @c array_of_epid_mask * to know why the endpoint could not be connected. * Endpoints that could not be connected because of * an unrelated failure will be marked as @ref * PSM2_EPID_UNKNOWN. If the function returns * PSM2_OK, the errors for all endpoints will also * contain PSM2_OK.
* * @param[out] array_of_epaddr User-allocated array of at least @c num_of_epid * elements of type psm2_epaddr_t. Each * successfully connected endpoint is updated with * an endpoint address handle that corresponds to * the endpoint id at the same index in @c * array_of_epid. Handles are only updated if the * endpoint could be connected and if its error in * array_of_errors is PSM2_OK. * * @param[in] timeout Timeout in nanoseconds after which connection attempts * will be abandoned. Setting this value to 0 disables * timeout and waits until all endpoints have been * successfully connected or until an error is detected. * * @pre The user has opened a local endpoint and obtained a list of endpoint * IDs to connect to a given endpoint handle using an out-of-band * mechanism not provided by PSM. * * @post If the connect is successful, @c array_of_epaddr is updated with valid * endpoint addresses. * * @post If unsuccessful, the user can query the return status of each * individual remote endpoint in @c array_of_errors. * * @post The user can call into @ref psm2_ep_connect many times with the same * endpoint ID and the function is guaranteed to return the same output * parameters. * * @post PSM2 does not keep any reference to the arrays passed into the * function and the caller is free to deallocate them. * * The error value with the highest importance is returned by * the function if some portion of the communication failed. Users should * always refer to individual errors in @c array_of_errors whenever the * function cannot return PSM2_OK. * * @returns PSM2_OK The entire set of endpoint IDs were successfully connected * and endpoint addresses are available for all endpoint IDs. * * @code{.c}
   int connect_endpoints(psm2_ep_t ep, int numep,
                         const psm2_epid_t *array_of_epid,
                         psm2_epaddr_t **array_of_epaddr_out)
   {
       psm2_error_t *errors =
           (psm2_error_t *) calloc(numep, sizeof(psm2_error_t));
       if (errors == NULL)
           return -1;

       psm2_epaddr_t *all_epaddrs =
           (psm2_epaddr_t *) calloc(numep, sizeof(psm2_epaddr_t));
       if (all_epaddrs == NULL) {
           free(errors);
           return -1;
       }

       psm2_ep_connect(ep, numep, array_of_epid,
                       NULL, // We want to connect all epids, no mask needed
                       errors, all_epaddrs,
                       30LL * 1000000000LL); // 30 second timeout, 0 waits forever
       *array_of_epaddr_out = all_epaddrs;
       free(errors);
       return 1;
   }
   @endcode */ psm2_error_t psm2_ep_connect(psm2_ep_t ep, int num_of_epid, const psm2_epid_t *array_of_epid, const int *array_of_epid_mask, psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr, int64_t timeout); /** @brief Disconnect one or more remote endpoints from a local endpoint. * * Function to non-collectively disconnect a connection to a set of endpoint * addresses and free the endpoint addresses. After disconnecting, the * application cannot send messages to the remote processes and PSM2 is * restored back to the state before calling psm2_ep_connect. The application * must call psm2_ep_connect to establish the connections again. * * This function is equivalent to calling psm2_ep_disconnect2() with mode == * PSM2_EP_DISCONNECT_GRACEFUL. * * @param[in] ep PSM2 endpoint handle * * @param[in] num_of_epaddr The number of endpoint addresses to disconnect from, * which also indicates the number of elements contained * in all of the function's array-based parameters. * * @param[in] array_of_epaddr User-allocated array that contains num_of_epaddr * valid endpoint addresses. Each endpoint address (or * epaddr) has been obtained through a previous * psm2_ep_connect call.
* * @param[in] array_of_epaddr_mask User-allocated array that contains * num_of_epaddr integers. This array of masks * allows users to select which of the * epaddresses in array_of_epaddr should be * disconnected. If the integer at index i is * zero, PSM2 does not attempt to disconnect * the epaddr at index i in array_of_epaddr. If * this parameter is NULL, PSM2 tries to * disconnect all epaddr in array_of_epaddr. * * @param[out] array_of_errors User-allocated array of at least num_of_epaddr * elements. If the function does not return PSM2_OK, * this array can be consulted for each endpoint * address not masked off by array_of_epaddr_mask to * know why the endpoint could not be disconnected. * Any endpoint address that could not be * disconnected because of an unrelated failure is * marked as PSM2_EPID_UNKNOWN. If the function * returns PSM2_OK, the errors for all endpoint * addresses also contain PSM2_OK. * * @param[in] timeout Timeout in nanoseconds after which disconnection attempts * are abandoned. Setting this value to 0 disables timeout and * waits until all endpoints have been successfully * disconnected or until an error is detected. * * @pre You have established the connections with previous psm2_ep_connect calls. * * @post If the disconnect is successful, the corresponding epaddr in * array_of_epaddr is reset to a NULL pointer. * * @post If unsuccessful, you can query the return status of each individual * remote endpoint in array_of_errors. * * @post PSM2 does not keep any reference to the arrays passed into the function * and the caller is free to deallocate them. * * @post The error value with the highest importance is returned by the function * if some portion of the communication failed. Refer to individual errors * in array_of_errors whenever the function cannot return PSM2_OK. * * @returns PSM2_OK The entire set of endpoint IDs were successfully disconnected * and endpoint addresses are freed by PSM2. * * @code{.c}
   int disconnect_endpoints(psm2_ep_t ep, int num_epaddr,
                            const psm2_epaddr_t *array_of_epaddr)
   {
       psm2_error_t *errors =
           (psm2_error_t *)calloc(num_epaddr, sizeof(psm2_error_t));
       if (errors == NULL)
           return -1;

       psm2_ep_disconnect(ep, num_epaddr, array_of_epaddr,
                          NULL, // We want to disconnect all epaddrs, no mask needed
                          errors,
                          30LL * 1000000000LL); // 30 second timeout, 0 waits forever
       free(errors);
       return 1;
   }
   @endcode */ psm2_error_t psm2_ep_disconnect(psm2_ep_t ep, int num_of_epaddr, psm2_epaddr_t *array_of_epaddr, const int *array_of_epaddr_mask, psm2_error_t *array_of_errors, int64_t timeout); /** @brief Disconnect one or more remote endpoints from a local endpoint. * * Function to non-collectively disconnect a connection to a set of endpoint * addresses and free the endpoint addresses. After disconnecting, the * application cannot send messages to the remote processes and PSM2 is * restored back to the state before calling psm2_ep_connect. The application * must call psm2_ep_connect to establish the connections again. * * @param[in] ep PSM2 endpoint handle * * @param[in] num_of_epaddr The number of endpoint addresses to disconnect from, * which also indicates the number of elements contained * in all of the function's array-based parameters. * * @param[in] array_of_epaddr User-allocated array that contains num_of_epaddr * valid endpoint addresses. Each endpoint address (or * epaddr) has been obtained through a previous * psm2_ep_connect call. * * @param[in] array_of_epaddr_mask User-allocated array that contains * num_of_epaddr integers.
This array of masks * allows users to select which of the * epaddresses in array_of_epaddr should be * disconnected. If the integer at index i is * zero, PSM2 does not attempt to disconnect * the epaddr at index i in array_of_epaddr. If * this parameter is NULL, PSM2 tries to * disconnect all epaddr in array_of_epaddr. * * @param[out] array_of_errors User-allocated array of at least num_of_epaddr * elements. If the function does not return PSM2_OK, * this array can be consulted for each endpoint * address not masked off by array_of_epaddr_mask to * know why the endpoint could not be disconnected. * Any endpoint address that could not be * disconnected because of an unrelated failure is * marked as PSM2_EPID_UNKNOWN. If the function * returns PSM2_OK, the errors for all endpoint * addresses also contain PSM2_OK. * * @param[in] mode One of @ref PSM2_EP_DISCONNECT_GRACEFUL or @ref PSM2_EP_DISCONNECT_FORCE * * @param[in] timeout Timeout in nanoseconds after which disconnection attempts * are abandoned. Setting this value to 0 disables timeout and * waits until all endpoints have been successfully * disconnected or until an error is detected. Supplying a * negative value here sets the disconnection mode to "force". * * @pre You have established the connections with previous psm2_ep_connect calls. * * @post If the disconnect is successful, the corresponding epaddr in * array_of_epaddr is reset to a NULL pointer. * * @post If unsuccessful, you can query the return status of each individual * remote endpoint in array_of_errors. * * @post PSM2 does not keep any reference to the arrays passed into the function * and the caller is free to deallocate them. * * @post The error value with the highest importance is returned by the function * if some portion of the communication failed. Refer to individual errors * in array_of_errors whenever the function cannot return PSM2_OK. * * @returns PSM2_OK The entire set of endpoint IDs were successfully disconnected * and endpoint addresses are freed by PSM2. * * @code{.c}
   int disconnect_endpoints(psm2_ep_t ep, int num_epaddr,
                            const psm2_epaddr_t *array_of_epaddr)
   {
       psm2_error_t *errors =
           (psm2_error_t *)calloc(num_epaddr, sizeof(psm2_error_t));
       if (errors == NULL)
           return -1;

       psm2_ep_disconnect2(ep, num_epaddr, array_of_epaddr,
                           NULL, // We want to disconnect all epaddrs, no mask needed
                           errors,
                           PSM2_EP_DISCONNECT_GRACEFUL,
                           30LL * 1000000000LL); // 30 second timeout, 0 waits forever
       free(errors);
       return 1;
   }
   @endcode */ psm2_error_t psm2_ep_disconnect2(psm2_ep_t ep, int num_of_epaddr, psm2_epaddr_t *array_of_epaddr, const int *array_of_epaddr_mask, psm2_error_t *array_of_errors, int mode, int64_t timeout); #define PSM2_EP_DISCONNECT_GRACEFUL PSM2_EP_CLOSE_GRACEFUL /**< Graceful mode in @ref psm2_ep_disconnect2 */ #define PSM2_EP_DISCONNECT_FORCE PSM2_EP_CLOSE_FORCE /**< Forceful mode in @ref psm2_ep_disconnect2 */ /** @brief Ensure endpoint communication progress * * Function to ensure progress for all PSM2 components instantiated on an * endpoint (currently, this only includes the MQ component). The function * never blocks and is typically required in two cases: * * @li Allowing all PSM2 components instantiated over a given endpoint to make * communication progress. Refer to @ref mq_progress for a detailed * discussion on MQ-level progress issues. * * @li Cases where users write their own synchronization primitives that * depend on remote communication (such as spinning on a memory location * whose new value depends on ongoing communication).
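 *
 * For illustration, a hedged sketch of a simple wait loop built on this
 * function (the yield step is optional and shown with sched_yield() from
 * <sched.h> as one possibility):
 * @code{.c}
   // Poll until PSM2 reports that some communication event progressed.
   while (psm2_poll(ep) == PSM2_OK_NO_PROGRESS)
       sched_yield(); // be polite when CPUs are oversubscribed
   @endcode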
* * The poll function doesn't block, but the user can rely on the @ref * PSM2_OK_NO_PROGRESS return value to control polling behaviour in terms of * frequency (poll until an event happens) or execution environment (poll for a * while but yield to other threads if CPUs are oversubscribed). * * @returns PSM2_OK Some communication events were progressed * @returns PSM2_OK_NO_PROGRESS Polling did not yield any communication progress * */ psm2_error_t psm2_poll(psm2_ep_t ep); /** @brief Set a user-determined ep address label. * * @param[in] epaddr Endpoint address, obtained from @ref psm2_ep_connect * @param[in] epaddr_label_string User-allocated string to print when * identifying endpoint in error handling or other verbose * printing. The NULL-terminated string must be allocated by * the user since PSM2 only keeps a pointer to the label. If * users do not explicitly set a label for each endpoint, * endpoints will identify themselves as hostname:port. */ void psm2_epaddr_setlabel(psm2_epaddr_t epaddr, const char *epaddr_label_string); /** @brief Set a user-determined ep address context. * * @param[in] epaddr Endpoint address, obtained from @ref psm2_ep_connect * @param[in] ctxt Opaque user defined state to associate with an endpoint * address. This state can be retrieved via * @ref psm2_epaddr_getctxt. */ void psm2_epaddr_setctxt(psm2_epaddr_t epaddr, void *ctxt); /** @brief Get the user-determined ep address context. Users can associate an * opaque context with each endpoint via @ref psm2_epaddr_setctxt. * * @param[in] epaddr Endpoint address, obtained from @ref psm2_ep_connect. */ void *psm2_epaddr_getctxt(psm2_epaddr_t epaddr); /* Below are all component specific options. The component object for each of * the options is also specified. */ /* PSM2_COMPONENT_CORE options */ /* PSM2 debug level */ #define PSM2_CORE_OPT_DEBUG 0x101 /**< [@b uint32_t ] Set/Get the PSM2 debug level. This option can be set * before initializing the PSM2 library. * * component object: (null) * option value: PSM2 Debug mask to set or currently active debug level. */ /* PSM2 endpoint address context */ #define PSM2_CORE_OPT_EP_CTXT 0x102 /**< [@b uint32_t ] Set/Get the context associated with a PSM2 endpoint * address (psm2_epaddr_t). * * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. * option value: Context associated with PSM2 endpoint address. */ /* PSM2_COMPONENT_IB options */ /* Default service level to use to communicate with remote endpoints */ #define PSM2_IB_OPT_DF_SL 0x201 /**< [@b uint32_t ] Default OPA SL to use for all remote communication. * If unset defaults to Service Level 0. * * component object: Opened PSM2 endpoint id (@ref psm2_ep_t). * option value: Default IB SL to use for endpoint. (0 <= SL < 15) */ /* Set IB service level to use for communication to an endpoint */ #define PSM2_IB_OPT_EP_SL 0x202 /**< [@b uint32_t ] OPA SL to use for communication to specified * remote endpoint. * * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. * option value: SL used to communicate with remote endpoint. (0 <= SL < 15) */ /* PSM2_COMPONENT_MQ options (deprecates psm2_mq_set|getopt) */ /* MQ options that can be set in psm2_mq_init and psm2_{set,get}_opt */ #define PSM2_MQ_OPT_RNDV_IB_SZ 0x301 /**< [@b uint32_t ] Size at which to start enabling rendezvous * messaging for OPA messages (if unset, defaults to values * between 56000 and 72000 depending on the system configuration) * * component object: PSM2 Matched Queue (@ref psm2_mq_t).
* option value: Size at which to switch to rendezvous protocol. */ #define PSM2_MQ_RNDV_HFI_SZ PSM2_MQ_OPT_RNDV_IB_SZ #define PSM2_MQ_RNDV_IPATH_SZ PSM2_MQ_OPT_RNDV_IB_SZ #define PSM2_MQ_OPT_RNDV_SHM_SZ 0x302 #define PSM2_MQ_RNDV_SHM_SZ PSM2_MQ_OPT_RNDV_SHM_SZ /**< [@b uint32_t ] Size at which to start enabling * rendezvous messaging for shared memory (intra-node) messages (If * unset, defaults to 64000 bytes). * * component object: PSM2 Matched Queue (@ref psm2_mq_t). * option value: Size at which to switch to rendezvous protocol. */ #define PSM2_MQ_OPT_SYSBUF_MYBYTES 0x303 #define PSM2_MQ_MAX_SYSBUF_MBYTES PSM2_MQ_OPT_SYSBUF_MYBYTES /**< [@b uint32_t ] Maximum number of bytes to allocate for unexpected * messages. * * component object: PSM2 Matched Queue (@ref psm2_mq_t). * option value: Deprecated; this option has no effect. */ /* PSM2_COMPONENT_AM options */ #define PSM2_AM_OPT_FRAG_SZ 0x401 #define PSM2_AM_MAX_FRAG_SZ PSM2_AM_OPT_FRAG_SZ /*!< [@b uint32_t ] Maximum active message fragment size that can be sent * for a given endpoint or across all endpoints. This value can only be * queried. * * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. If NULL then * option value is the smallest fragment size across all * active endpoints. * option value: Maximum active message fragment size in bytes. */ #define PSM2_AM_OPT_NARGS 0x402 #define PSM2_AM_MAX_NARGS PSM2_AM_OPT_NARGS /*!< [@b uint32_t ] Maximum number of message arguments that can be sent * for a given endpoint or across all endpoints. This value can only be * queried. * * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. If NULL then * option value is the smallest maximum number of arguments * across all active endpoints. * option value: Maximum number of active message arguments. */ #define PSM2_AM_OPT_HANDLERS 0x403 #define PSM2_AM_MAX_HANDLERS PSM2_AM_OPT_HANDLERS /*!< [@b uint32_t ] Maximum number of message handlers that can be registered * for a given endpoint or across all endpoints. This value can only be * queried. * * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. If NULL then * option value is the smallest maximum number of handlers * across all active endpoints. * option value: Maximum number of active message handlers. */ /** @brief Set an option for a PSM2 component * * Function to set the value of a PSM2 component option * * @param[in] component Type of PSM2 component for which to set the option * @param[in] component_obj Opaque component-specific object to apply the set * operation on. These are passed uninterpreted to the * appropriate component for interpretation. * @param[in] optname Name of component option to set. These are component * specific and passed uninterpreted to the appropriate * component for interpretation. * @param[in] optval Pointer to storage that contains the value to be updated * for the supplied option. It is up to the user to * ensure that the pointer points to a memory location with a * correct size and format. * @param[in] optlen Size of the memory region pointed to by optval. * * @returns PSM2_OK if option could be set. * @returns PSM2_PARAM_ERR if the component or optname are not valid. * @returns PSM2_OPT_READONLY if the option to be set is a read-only option.
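 *
 * For illustration, a hedged sketch (the option and value are chosen
 * arbitrarily) of setting the default service level on an opened endpoint:
 * @code{.c}
   uint32_t sl = 2; // desired default SL (0 <= SL < 15)
   psm2_error_t err = psm2_setopt(PSM2_COMPONENT_IB, ep,
                                  PSM2_IB_OPT_DF_SL, &sl, sizeof(sl));
   @endcode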
* */ psm2_error_t psm2_setopt(psm2_component_t component, const void *component_obj, int optname, const void *optval, uint64_t optlen); /** @brief Get an option for a PSM2 component * * Function to get the value of a PSM2 component option * * @param[in] component Type of PSM2 component for which to get the option * @param[in] component_obj Opaque component-specific object to apply the get * operation on. These are passed uninterpreted to the * appropriate component for interpretation. * @param[in] optname Name of component option to get. These are component * specific and passed uninterpreted to the appropriate * component for interpretation. * @param[out] optval Pointer to storage that contains the value to be updated * for the supplied option. It is up to the user to * ensure that the pointer points to a valid memory region. * @param[in,out] optlen This is a value-result parameter initially containing * the size of the memory region pointed to by optval and * modified to return the actual size of optval. * * @returns PSM2_OK if option value could be retrieved successfully. * @returns PSM2_PARAM_ERR if the component or optname are not valid. * @returns PSM2_NO_MEMORY if the memory region optval is of insufficient size. * optlen contains the required memory region size for * optname value. * */ psm2_error_t psm2_getopt(psm2_component_t component, const void *component_obj, int optname, void *optval, uint64_t *optlen); /** @brief Datatype for end-point information */ typedef struct psm2_epinfo { psm2_ep_t ep; /**< The ep for this end-point*/ psm2_epid_t epid; /**< The epid for this end-point */ psm2_uuid_t uuid; /**< The UUID for this end-point */ uint16_t jkey; /**< The job key for this end-point */ char uuid_str[64]; /**< String representation of the UUID for this end-point */ } psm2_epinfo_t; /** @brief Datatype for end-point connection */ typedef struct psm2_epconn { psm2_epaddr_t addr; /**< The epaddr for this connection */ psm2_ep_t ep; /**< The ep for this connection */ psm2_mq_t mq; /**< The mq for this connection */ } psm2_epconn_t; /** @brief Query PSM2 for end-point information. * * Function to query PSM2 for end-point information. This allows retrieval of * end-point information in cases where the caller does not have access to the * results of psm2_ep_open(). In the default single-rail mode PSM2 will use * a single endpoint. If either multi-rail mode or multi-endpoint mode is * enabled, PSM2 will use multiple endpoints. * * @param[in,out] num_of_epinfo On input, sizes the available number of entries * in array_of_epinfo. On output, specifies the * returned number of entries in array_of_epinfo. * @param[out] array_of_epinfo Returns end-point information structures. * * @pre PSM2 is initialized and the end-point has been opened. * * @returns PSM2_OK indicates success. * @returns PSM2_PARAM_ERR if input num_of_epinfo is less than or equal to zero. * @returns PSM2_EP_WAS_CLOSED if PSM2 end-point is closed or does not exist. */ psm2_error_t psm2_ep_query(int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo); /** @brief Query PSM2 for end-point connections. * * Function to query PSM2 for end-point connections. This allows retrieval of * end-point connections in cases where the caller does not have access to the * results of psm2_ep_connect(). The epid values can be found using * psm2_ep_query() so that each PSM2 process can determine its own epid. These * values can then be distributed across the PSM2 processes so that each PSM2 * process knows the epid for all other PSM2 processes.
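 *
 * For illustration, a hedged sketch (remote_epid and use_epaddr are
 * placeholders, not part of the API) of looking up a previously connected
 * epid:
 * @code{.c}
   psm2_epconn_t conn;
   // remote_epid was received out-of-band and previously connected.
   if (psm2_ep_epid_lookup(remote_epid, &conn) == PSM2_OK)
       use_epaddr(conn.addr); // placeholder consumer of the epaddr
   @endcode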
* * @param[in] epid The epid of a PSM2 process. * @param[out] epconn The connection information for that PSM2 process. * * @pre PSM2 is initialized and the end-point has been connected to this epid. * * @returns PSM2_OK indicates success. * @returns PSM2_EP_WAS_CLOSED if PSM2 end-point is closed or does not exist. * @returns PSM2_EPID_UNKNOWN if the epid value is not known to PSM. */ psm2_error_t psm2_ep_epid_lookup(psm2_epid_t epid, psm2_epconn_t *epconn); /** @brief Query given PSM2 end-point for its connections. * * The need for this function comes with 'multi-ep' feature. * Function is similar to (@ref psm2_ep_epid_lookup). * It differs in that an extra parameter which identifies * the end-point [ep] must be provided which limits the lookup to that single ep. * * @returns PSM2_OK indicates success. * @returns PSM2_EP_WAS_CLOSED if PSM2 end-point [ep] is closed or does not exist. * @returns PSM2_EPID_UNKNOWN if the [epid] value is not known to PSM. * @returns PSM2_PARAM_ERR if output [epconn] is NULL. */ psm2_error_t psm2_ep_epid_lookup2(psm2_ep_t ep, psm2_epid_t epid, psm2_epconn_t *epconn); /** @brief Get PSM2 epid for given epaddr. * * @param[in] epaddr The endpoint address. * @param[out] epid The epid of a PSM2 process. * * @returns PSM2_OK indicates success. * @returns PSM2_PARAM_ERR if input [epaddr] or output [epid] is NULL. */ psm2_error_t psm2_epaddr_to_epid(psm2_epaddr_t epaddr, psm2_epid_t *epid); /*! @} */ /*! @addtogroup init PSM2 Information Query * @{ */ /** @brief Enumeration for info query APIs * * Note that calling the function: * @code{.c}
   psm2_error_t psm2_info_query(psm2_info_query_t, void *out, size_t nargs,
                                psm2_info_query_arg_t []);
   @endcode * * takes a variable number of input arguments, per the initial psm2_info_query_t * * Below, there is an explanation of the number, type and order of the * required input arguments, as well as a definition of the type of the output. */ typedef enum psm2_info_query_et { /*! Required input arguments: 0 Output parameter: uint32_t*, description: the number of units */ PSM2_INFO_QUERY_NUM_UNITS, /*! Required input arguments: 0 Output parameter: uint32_t*, description: the number of ports */ PSM2_INFO_QUERY_NUM_PORTS, /*! Required input arguments: 1 1. type: uint32_t, description: the unit for which status is desired (use: psm2_info_query_arg_t.unit). Output parameter: uint32_t, description: zero, when the unit is not active, non-zero when the unit is active. */ PSM2_INFO_QUERY_UNIT_STATUS, /*! Required input arguments: 2 1. type: uint32_t, description: the unit for which status is desired (use: psm2_info_query_arg_t.unit). 2. type: uint32_t, description: the port for which status is desired (use: psm2_info_query_arg_t.port). Output parameter: uint32_t, description: zero, when the unit is not active, non-zero when the unit is active. */ PSM2_INFO_QUERY_UNIT_PORT_STATUS, /*! Required input arguments: 1 1. type: uint32_t, description: the unit for which the number of free contexts is desired (use: psm2_info_query_arg_t.unit). Output parameter: uint32_t, description: the number of free contexts. */ PSM2_INFO_QUERY_NUM_FREE_CONTEXTS, /*! Required input arguments: 1 1. type: uint32_t, description: the unit for which the number of contexts is desired (use: psm2_info_query_arg_t.unit). Output parameter: uint32_t, description: the number of contexts. */ PSM2_INFO_QUERY_NUM_CONTEXTS, /*! Required input arguments: 2 1. type: psm2_mq_t, description: the mq that is associated with the connection for which configuration information is wanted.
(use: psm2_info_query_arg_t.mq). 2. type: psm2_epaddr_t, description: the ep address that is associated with the connection for which configuration information is wanted (use: psm2_info_query_arg_t.epaddr). Output parameter: uint32_t, description: a bit mask containing bits defining the configuration. see psm2_info_query_config for a description of the bits. */ PSM2_INFO_QUERY_CONFIG, /*! Required input arguments: 3 1. type: psm2_mq_t, description: the mq that is associated with the connection for which the msg size query information is wanted. (use: psm2_info_query_arg_t.mq). 2. type: psm2_epaddr_t, description: the ep address that is associated with the connection for which the msg size query information is wanted (use: psm2_info_query_arg_t.epaddr). 3. type: enum psm2_info_query_thresh_et, the specific msg size query. (use: psm2_info_query_arg_t.mstq). Output parameter: uint32_t, description: the message size threshold. */ PSM2_INFO_QUERY_THRESH, /*! Required input arguments: 3 1. type: psm2_mq_t, description: the mq that is associated with the connection for which the device name is wanted. (use: psm2_info_query_arg_t.mq). 2. type: psm2_epaddr_t, description: the ep address that is associated with the connection for which device name is wanted. (use: psm2_info_query_arg_t.epaddr). 3. type: size_t, the length of the output buffer that will receive the device name (use: psm2_info_query_arg_t.length). Output parameter: char *, description: the device name. */ PSM2_INFO_QUERY_DEVICE_NAME, /*! Required input arguments: 2 1. type: psm2_mq_t, description: the mq that is associated with the connection for which the mtu is wanted (use: psm2_info_query_arg_t.mq). 2. type: psm2_epaddr_t, description: the ep address that is associated with the connection for which mtu is wanted. (use: psm2_info_query_arg_t.epaddr). Output parameter: uint32_t, description: the mtu. */ PSM2_INFO_QUERY_MTU, /*! Required input arguments: 2 1. type: psm2_mq_t, description: the mq that is associated with the connection for which the link speed is wanted (use: psm2_info_query_arg_t.mq). 2. type: psm2_epaddr_t, description: the ep address that is associated with the connection for which link speed is wanted. (use: psm2_info_query_arg_t.epaddr). Output parameter: uint32_t, description: the link speed. */ PSM2_INFO_QUERY_LINK_SPEED, /*! Required input arguments: 1 1. type: size_t, description: the length of the output buffer to receive the network type (use: psm2_info_query_arg_t.length). Output parameter: char*, description: the network type. */ PSM2_INFO_QUERY_NETWORK_TYPE, /*! Required input arguments: 0 Output parameter: uint32_t*, description: a bit mask of the features in libpsm2. See psm2_info_query_feature_mask below for bit mask definition. */ PSM2_INFO_QUERY_FEATURE_MASK, PSM2_INFO_QUERY_LAST, /* must appear last, and the info query constants are used as an index. */ } psm2_info_query_t; /** @brief Enumeration for info query config */ enum psm2_info_query_config { /*! The following three are 'main configs': */ PSM2_INFO_QUERY_CONFIG_IPS = (1 << 0), PSM2_INFO_QUERY_CONFIG_AMSH = (1 << 1), PSM2_INFO_QUERY_CONFIG_SELF = (1 << 2), /*! The following three are sub-configs of the IPS main config: */ PSM2_INFO_QUERY_CONFIG_CUDA = (1 << 3), PSM2_INFO_QUERY_CONFIG_PIO = (1 << 4), PSM2_INFO_QUERY_CONFIG_DMA = (1 << 5), /*! The following is a sub-config of IPS & CUDA main config: */ PSM2_INFO_QUERY_CONFIG_GDR_COPY = (1 << 6), }; /** @brief Enumeration for info query thresholds */ enum psm2_info_query_thresh_et { /*!
This is the start of the thresh queries for the IPS config: */ PSM2_INFO_QUERY_THRESH_IPS_START, /*! Not shown here are the specific queries supported by the CUDA and GDR_COPY sub-configs. But those configs will need to include threshold queries in case the config includes them. Note that in the case of gdr_copy, the thresholds vary depending on whether the memory is GPU memory or not. */ /*! The following threshold queries are supported for the IPS config only. */ /*! The PSM2_INFO_QUERY_THRESH_IPS_PIO_DMA threshold query indicates at what message size the send transport transitions from PIO to DMA. Note that this threshold query may be meaningless if PIO or DMA is disabled. */ PSM2_INFO_QUERY_THRESH_IPS_PIO_DMA = PSM2_INFO_QUERY_THRESH_IPS_START, /*! Messages with message sizes less than or equal to the tiny threshold will be sent by tiny message. */ PSM2_INFO_QUERY_THRESH_IPS_TINY, /*! Messages with message sizes greater than tiny, but less than or equal to frag size will be sent by short message. */ PSM2_INFO_QUERY_THRESH_IPS_PIO_FRAG_SIZE, PSM2_INFO_QUERY_THRESH_IPS_DMA_FRAG_SIZE, /*! Messages that are greater than the frag_size, but less than RNDV will be sent by eager message. Messages with message sizes greater than or equal to RNDV will be sent by the rendezvous protocol message. */ PSM2_INFO_QUERY_THRESH_IPS_RNDV, PSM2_INFO_QUERY_THRESH_IPS_END = PSM2_INFO_QUERY_THRESH_IPS_RNDV, /*! Not shown here are the specific thresh queries supported by AMSH and SELF configs: */ PSM2_INFO_QUERY_THRESH_AMSH_START, PSM2_INFO_QUERY_THRESH_AMSH_END = PSM2_INFO_QUERY_THRESH_AMSH_START, PSM2_INFO_QUERY_THRESH_SELF_START, PSM2_INFO_QUERY_THRESH_SELF_END = PSM2_INFO_QUERY_THRESH_SELF_START, }; enum psm2_info_query_feature_mask { /*! The following bit means that the libpsm2 _can_ support cuda. If the PSM2_INFO_QUERY_FEATURE_MASK request is made and the PSM2_INFO_QUERY_FEATURE_CUDA bit is not present, then cuda is not supported. */ PSM2_INFO_QUERY_FEATURE_CUDA = (1 << 0), }; /** @brief Union for info query arg type */ typedef union psm2_info_query_arg { uint32_t unit; uint32_t port; size_t length; psm2_mq_t mq; psm2_epaddr_t epaddr; enum psm2_info_query_thresh_et mstq; } psm2_info_query_arg_t; /** @brief PSM2 info query * * Function that allows a client to interrogate PSM2 for various information. * * @param[in] psm2_info_query_t What information is requested. * @param[out] void * out, where the information will be delivered on a * PSM2_OK return. * @param[in] size_t nargs, the number of following arguments. * @param[in] psm2_info_query_arg_t [], The arguments that are required for * certain queries. See documentation * at @ref psm2_info_query_t for what * arguments are required for what * queries as well as what the type * the output is expected to be. * * @retval PSM2_OK The out buffer has successfully been written with the * result of the query. */ psm2_error_t psm2_info_query(psm2_info_query_t, void *out, size_t nargs, psm2_info_query_arg_t []); /*! @} */ #ifdef __cplusplus } /* extern "C" */ #endif #endif opa-psm2-PSM2_11.2.185/psm2_am.h000066400000000000000000000515011370564314600157320ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef PSM2_AM_H #define PSM2_AM_H #include #include #include #ifdef __cplusplus extern "C" { #endif /*! * @file psm2_am.h * @brief PSM2 Active Message. * * @page psm2_am Active Message Interface * * PSM2 implements an Active Message (AM) component that lives alongside the * Matched Queues (MQ) component. The active message interface essentially * provides a remote procedure call mechanism. A PSM2 process can generate a * request to run an active message handler on a remote PSM2 process * identified by its end-point address (epaddr). End-point address values * are returned by PSM2 when connecting end-points using the psm2_ep_connect() * function. * * An AM handler may make local state updates, and may generate at most * one reply to be returned to the original requestor. This reply will cause * a handler to be run on that requestor. The requestor handler may make * local state updates but is not allowed to reply nor request in that handler * context. A request or reply can convey a small number of in-line arguments * plus a short amount of data. A tight bound is placed on the number of * in-line arguments to allow them to be packed into a header. A bound is * placed on the size of the data payload so that the request or reply can * be sent as a single packet within the MTU of the underlying communication * transport. Longer payloads must be synthesized on top of the provided * short request/reply mechanism by fragmentation and reassembly, or * transported by some other means. * * Handlers are run in the process context of the targeted PSM2 process, * either in its main thread of execution or in a progress thread. 
A handler * may therefore be executed concurrently with the main thread of execution * of the PSM2 process. PSM2 ensures that its own state is protected against this * concurrent execution. However, a handler must make its own arrangements to * protect its own state. Alternatively, the PSM2 progress thread can be * disabled using the PSM2_RCVTHREAD environment variable if this is too * onerous for the handler. * * PSM2 has an active progress model and requires that the PSM2 library is * called in order to make progress. This can be achieved using the psm2_poll() * function. A PSM2 implementation may provide passive progress through some * other mechanism (e.g. a receive thread), but a PSM2 consumer must not assume * this and must arrange to make active progress through calls into the PSM * library. Note that the PSM2 AM interface is not MTsafe, same as the other PSM * interfaces, and that MTsafety must be provided by the consumer if required. * * The order in which AM requests are issued by an initiator to a particular * target defines the order in which those AM requests will be executed on * that target. Therefore the AM implementation will maintain the order * of handler executions on a flow, and this also applies when progress * threads are used. For multiple initiators issuing requests to a particular * target, the handler executions will be interleaved in some sequentially * consistent ordering. */ /*! @defgroup am PSM2 Active Message * * @{ */ /** @brief Datatype for an index representing an active message handler */ typedef uint32_t psm2_handler_t; /** @brief Datatype for a token for an active message handler.*/ typedef void *psm2_am_token_t; /* PSM2 AM flags * These flags may be combined using bitwise-or. */ #define PSM2_AM_FLAG_NONE 0 /**< No other PSM2 AM flags are needed. */ #define PSM2_AM_FLAG_ASYNC 1 /**< No need to copy source data. */ #define PSM2_AM_FLAG_NOREPLY 2 /**< The handler for this AM request is guaranteed not to generate a reply. */ /** @brief The psm2_amarg type represents the type of an AM argument. This is * a 64-bit type and is broken down into four 16-bit fields, two 32-bit * fields or one 64-bit field for the convenience of code using the PSM2 AM * interface. */ typedef struct psm2_amarg { union { struct { uint16_t u16w3; uint16_t u16w2; uint16_t u16w1; uint16_t u16w0; }; struct { uint32_t u32w1; uint32_t u32w0; }; uint64_t u64w0; uint64_t u64; }; } psm2_amarg_t; /** @brief The AM handler function type * * psm2_am_handler_fn_t is the datatype for an AM handler. PSM2 AM will call-back * into an AM handler using this function prototype. The parameters and result * of these handler functions are described here. * * @param[in] token This is an opaque token value passed into a handler. * A request handler may send at most one reply back to the * original requestor, and must pass this value as the token * parameter to the psm2_am_reply_short() function. A reply * handler is also passed a token value, but must not attempt * to reply. * @param[in] args A pointer to the arguments provided to this handler. * @param[in] nargs The number of arguments. * @param[in] src A pointer to the data payload provided to this handler. * @param[in] len The length of the data payload in bytes. * * @returns 0 The handler should always return a result of 0.
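 *
 * A minimal request handler is sketched below (the handler name, its body,
 * and reply_handler_idx are illustrative only; reply_handler_idx would have
 * been obtained from psm2_am_register_handlers()):
 *
 * @code
 * int my_req_handler(psm2_am_token_t token, psm2_amarg_t *args, int nargs,
 *                    void *src, uint32_t len)
 * {
 *     psm2_amarg_t rep;
 *     rep.u32w0 = 1;  // illustrative status value returned to the requestor
 *     psm2_am_reply_short(token, reply_handler_idx, &rep, 1, NULL, 0,
 *                         PSM2_AM_FLAG_NONE, NULL, NULL);
 *     return 0;       // handlers must always return 0
 * }
 * @endcode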
*/ typedef int (*psm2_am_handler_fn_t) (psm2_am_token_t token, psm2_amarg_t *args, int nargs, void *src, uint32_t len); /** @brief The AM handler function type with caller context * * psm2_am_handler_2_fn_t is the datatype for an AM handler that * includes a user context. PSM2 AM will call-back into an AM handler using * this function prototype. The parameters and result * of these handler functions are described here. * * @param[in] token This is an opaque token value passed into a handler. * A request handler may send at most one reply back to the * original requestor, and must pass this value as the token * parameter to the psm2_am_reply_short() function. * A reply handler is also passed a token value, but must not attempt * to reply. * @param[in] args A pointer to the arguments provided to this handler. * @param[in] nargs The number of arguments. * @param[in] src A pointer to the data payload provided to this handler. * @param[in] len The length of the data payload in bytes. * @param[in] hctx The user context pointer provided at handler registration. * * @returns 0 The handler should always return a result of 0. */ typedef int (*psm2_am_handler_2_fn_t) (psm2_am_token_t token, psm2_amarg_t *args, int nargs, void *src, uint32_t len, void *hctx); /** @brief Type for a completion call-back handler. * * A completion handler can be specified to give a call-back on the initiation * side that an AM request or reply has completed on the target side. The * call-back has a context pointer which is provided along with the call-back * function pointer when the initiator generates the request or reply. This * approach will typically give higher performance than using an AM request or * reply to achieve the same effect, though note that no additional information * can be passed from the target side back to the initiator side with the * completion handler approach. * * @param[in] context A context pointer. * @returns void This handler has no return result. */ typedef void (*psm2_am_completion_fn_t) (void *context); /** @brief Register AM call-back handlers at the specified end-point. * * This function is used to register an array of handlers, and may be called * multiple times to register additional handlers. The maximum number of * handlers that can be registered is limited to the max_handlers value * returned by psm2_am_get_parameters(). Handlers are associated with a PSM * end-point. The handlers are allocated index numbers in the handler table * for that end-point. The allocated index for the handler function in * handlers[i] is returned in handlers_idx[i] for i in [0, num_handlers). These * handler index values are used in the psm2_am_request_short() and * psm2_am_reply_short() functions. * * @param[in] ep End-point value * @param[in] handlers Array of handler functions * @param[in] num_handlers Number of handlers (sizes the handlers and * handlers_idx arrays) * @param[out] handlers_idx Used to return handler index mapping table * * @returns PSM2_OK Indicates success * @returns PSM2_EP_NO_RESOURCES Insufficient slots in the AM handler table */ psm2_error_t psm2_am_register_handlers(psm2_ep_t ep, const psm2_am_handler_fn_t * handlers, int num_handlers, int *handlers_idx); /** @brief Register AM call-back handlers at the specified end-point. * * This function is used to register an array of handlers, and may be called * multiple times to register additional handlers.
The maximum number of * handlers that can be registered is limited to the max_handlers value * returned by psm2_am_get_parameters(). Handlers are associated with a PSM * end-point. The handlers are allocated index numbers in the handler table * for that end-point. The allocated index for the handler function in * handlers[i] is returned in handlers_idx[i] for i in [0, num_handlers). These * handler index values are used in the psm2_am_request_short() and * psm2_am_reply_short() functions. * * @param[in] ep End-point value * @param[in] handlers Array of handler functions * @param[in] num_handlers Number of handlers (sizes the handlers and * handlers_idx arrays) * @param[in] hctx Array of void* pointers to user contexts for identifying the * target ep that registered these handlers. * @param[out] handlers_idx Used to return handler index mapping table * * @returns PSM2_OK Indicates success * @returns PSM2_EP_NO_RESOURCES Insufficient slots in the AM handler table */ psm2_error_t psm2_am_register_handlers_2(psm2_ep_t ep, const psm2_am_handler_2_fn_t * handlers, int num_handlers, void **hctx, int *handlers_idx); /** @brief Unregister all AM call-back handlers for the specified end-point. * * This function is used to unregister all AM handlers registered to the * specified end-point. * * @param[in] ep End-point value * */ void psm2_am_unregister_handlers(psm2_ep_t ep); /** @brief Generate an AM request. * * This function generates an AM request causing an AM handler function to be * called in the PSM2 process associated with the specified end-point address. * The number of arguments is limited to max_nargs and the payload length in * bytes to max_request_short returned by the psm2_am_get_parameters() function. * If arguments are not required, set the number of arguments to 0 and the * argument pointer will not be dereferenced. If payload is not required, set * the payload size to 0 and the payload pointer will not be dereferenced. * * Optionally a completion function and completion context pointer can be * provided, and a local call-back will be made to that function passing in * that context pointer once remote execution of the handler has completed. If * the completion call-back is not required, the handler should be specified as * NULL and the pointer value will not be used. * * The allowed flags are any combination of the following combined with * bitwise-or: * PSM2_AM_FLAG_NONE - No flags * PSM2_AM_FLAG_ASYNC - Indicates no need to copy source data * PSM2_AM_FLAG_NOREPLY - The handler for this AM request is guaranteed not to * generate a reply * * The PSM2 AM implementation will not dereference the args pointer after return * from this function. If PSM2_AM_FLAG_ASYNC is not provided, the PSM2 AM * implementation will not dereference the src pointer after return from this * function. This may require the implementation to take a copy of the payload * if the request cannot be issued immediately. However, if PSM2_AM_FLAG_ASYNC * is provided then a copy will not be taken and the PSM2 AM implementation * retains ownership of the payload src memory until the request is locally * complete. Local completion can be determined using the completion handler * call-back, or through an AM handler associated with an AM reply. * * The PSM2_AM_FLAG_NOREPLY flag indicates ahead of time to the AM handler that * a reply will not be generated. Use of this flag is optional, but it may * enable a performance optimization in this case by indicating that reply * state is not required.
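 *
 * A short request is sketched below (epaddr, handler_idx, payload, and
 * payload_len are illustrative; handler_idx would have been returned by
 * psm2_am_register_handlers()):
 *
 * @code
 * psm2_amarg_t arg;
 * arg.u64w0 = 42;  // application-defined in-line argument
 * psm2_am_request_short(epaddr, handler_idx, &arg, 1, payload, payload_len,
 *                       PSM2_AM_FLAG_NONE, NULL, NULL);
 * @endcode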
* * @param[in] epaddr End-point address to run handler on * @param[in] handler Index of handler to run * @param[in] args Array of arguments to be provided to the handler * @param[in] nargs Number of arguments to be provided to the handler * @param[in] src Pointer to the payload to be delivered to the handler * @param[in] len Length of the payload in bytes * @param[in] flags These are PSM2 AM flags and may be combined together with * bitwise-or * @param[in] completion_fn The completion function to be called locally when * remote handler is complete * @param[in] completion_ctxt User-provided context pointer to be passed to the * completion handler * * @returns PSM2_OK indicates success. */ psm2_error_t psm2_am_request_short(psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, int flags, psm2_am_completion_fn_t completion_fn, void *completion_ctxt); /** @brief Generate an AM reply. * * This function may only be called from an AM handler called due to an AM * request. If the AM request uses the PSM2_AM_FLAG_NOREPLY flag, the AM * handler must not call this function. Otherwise, the AM request handler may * call psm2_am_reply_short() at most once, and must pass in the token value * that it received in its own handler call-back. * * This function generates an AM reply causing an AM handler function to be * called in the PSM2 process associated with the specified end-point address. * The number of arguments is limited to max_nargs and the payload length in * bytes to max_reply_short returned by the psm2_am_get_parameters() function. * If arguments are not required, set the number of arguments to 0 and the * argument pointer will not be dereferenced. If payload is not required, set * the payload size to 0 and the payload pointer will not be dereferenced. * * Optionally a completion function and completion context pointer can be * provided, and a local call-back will be made to that function passing in * that context pointer once remote execution of the handler has completed. If * the completion call-back is not required, the handler should be specified as * NULL and the pointer value will not be used. * * The allowed flags are any combination of the following combined with * bitwise-or: * PSM2_AM_FLAG_NONE - No flags * PSM2_AM_FLAG_ASYNC - Indicates no need to copy source data * * The PSM2 AM implementation will not dereference the args pointer after return * from this function. If PSM2_AM_FLAG_ASYNC is not provided, the PSM2 AM * implementation will not dereference the src pointer after return from this * function. This may require the implementation to take a copy of the payload * if the reply cannot be issued immediately. However, if PSM2_AM_FLAG_ASYNC is * provided then a copy will not be taken and the PSM2 AM implementation retains * ownership of the payload src memory until the reply is locally complete. * Local completion can be determined using the completion handler call-back. * * @param[in] token Token value provided to the AM handler that is generating * the reply.
* @param[in] handler Index of handler to run * @param[in] args Array of arguments to be provided to the handler * @param[in] nargs Number of arguments to be provided to the handler * @param[in] src Pointer to the payload to be delivered to the handler * @param[in] len Length of the payload in bytes * @param[in] flags These are PSM2 AM flags and may be combined together with * bitwise-or * @param[in] completion_fn The completion function to be called locally when * remote handler is complete * @param[in] completion_ctxt User-provided context pointer to be passed to the * completion handler * * @returns PSM2_OK indicates success. */ psm2_error_t psm2_am_reply_short(psm2_am_token_t token, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, int flags, psm2_am_completion_fn_t completion_fn, void *completion_ctxt); /** @brief Return the source end-point address for a token. * * This function is used to obtain the epaddr object representing the message * initiator from a token passed by PSM2 to a message handler. * * @param[in] token Token value provided to the AM handler that is generating * the reply. * @param[out] epaddr_out Pointer to where the epaddr should be returned. * * @returns PSM2_OK indicates success. * @returns PSM2_PARAM_ERR token is invalid or epaddr_out is NULL. */ psm2_error_t psm2_am_get_source(psm2_am_token_t token, psm2_epaddr_t *epaddr_out); /** @brief AM parameters * * This structure is used to return PSM2 AM implementation-specific parameter * values back to the caller of the psm2_am_get_parameters() function. This * API also specifies the minimum values for these parameters that an * implementation must at least provide: * max_handlers >= 64, * max_nargs >= 2, * max_request_short >= 256 and * max_reply_short >= 256. */ struct psm2_am_parameters { /** Maximum number of handlers that can be registered. */ uint32_t max_handlers; /** Maximum number of arguments to an AM handler. */ uint32_t max_nargs; /** Maximum number of bytes in a request payload. */ uint32_t max_request_short; /** Maximum number of bytes in a reply payload. */ uint32_t max_reply_short; }; /** @brief Get the AM parameter values * * This function retrieves the implementation-specific AM parameter values for * the specified end-point. * * @param[in] ep The end-point value returned by psm2_ep_open(). * @param[out] parameters Pointer to the struct where the parameters will be * returned. * @param[in] sizeof_parameters_in The size in bytes of the struct provided by * the caller. * @param[out] sizeof_parameters_out The size in bytes of the struct returned * by PSM. * * @returns PSM2_OK indicates success. */ psm2_error_t psm2_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters, size_t sizeof_parameters_in, size_t *sizeof_parameters_out); /*! @} */ #ifdef __cplusplus } /* extern "C" */ #endif #endif opa-psm2-PSM2_11.2.185/psm2_hal.c000066400000000000000000000350761370564314600161040ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2017 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2017 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "psm_user.h" #include "psm2_hal.h" #include "ptl_ips/ips_scb.h" static SLIST_HEAD(, _psmi_hal_instance) head_hi; /* define the current hal instance pointer */ psmi_hal_instance_t *psmi_hal_current_hal_instance = NULL; /* psmi_hal_register_instance */ void psmi_hal_register_instance(psmi_hal_instance_t *psm_hi) { #define REJECT_IMPROPER_HI(MEMBER) if (!psm_hi->MEMBER) return /* If an attempt to register a hal instance contains a NULL func ptr, reject it. 
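For example, REJECT_IMPROPER_HI(hfp_get_num_units) expands to: if (!psm_hi->hfp_get_num_units) return; so an instance that is missing a required function pointer is dropped before it can reach the SLIST_INSERT_HEAD() call below.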
*/ /* To allow fast lookups, please keep this code segment alphabetized by hfp_* func ptr member name: */ #if PSMI_HAL_INST_CNT > 1 REJECT_IMPROPER_HI(hfp_ack_hfi_event); REJECT_IMPROPER_HI(hfp_check_rhf_sequence_number); REJECT_IMPROPER_HI(hfp_cl_q_empty); REJECT_IMPROPER_HI(hfp_close_context); REJECT_IMPROPER_HI(hfp_context_open); REJECT_IMPROPER_HI(hfp_dma_slot_available); REJECT_IMPROPER_HI(hfp_finalize_); REJECT_IMPROPER_HI(hfp_forward_packet_to_subcontext); REJECT_IMPROPER_HI(hfp_free_tid); REJECT_IMPROPER_HI(hfp_get_bthqp); REJECT_IMPROPER_HI(hfp_get_cc_settings_bin); REJECT_IMPROPER_HI(hfp_get_cc_table_bin); REJECT_IMPROPER_HI(hfp_get_cl_q_head_index); REJECT_IMPROPER_HI(hfp_get_cl_q_tail_index); REJECT_IMPROPER_HI(hfp_get_context); REJECT_IMPROPER_HI(hfp_get_egr_buff); REJECT_IMPROPER_HI(hfp_get_fd); REJECT_IMPROPER_HI(hfp_get_gid_hi); REJECT_IMPROPER_HI(hfp_get_gid_lo); REJECT_IMPROPER_HI(hfp_get_hfi_event_bits); REJECT_IMPROPER_HI(hfp_get_hfi_type); REJECT_IMPROPER_HI(hfp_get_hw_status); REJECT_IMPROPER_HI(hfp_get_hw_status_freezemsg); REJECT_IMPROPER_HI(hfp_get_jkey); REJECT_IMPROPER_HI(hfp_get_lid); REJECT_IMPROPER_HI(hfp_get_node_id); REJECT_IMPROPER_HI(hfp_get_num_contexts); REJECT_IMPROPER_HI(hfp_get_num_free_contexts); REJECT_IMPROPER_HI(hfp_get_pio_size); REJECT_IMPROPER_HI(hfp_get_pio_stall_cnt); REJECT_IMPROPER_HI(hfp_get_port_active); REJECT_IMPROPER_HI(hfp_get_port_gid); REJECT_IMPROPER_HI(hfp_get_port_index2pkey); REJECT_IMPROPER_HI(hfp_get_port_lid); REJECT_IMPROPER_HI(hfp_get_port_lmc); REJECT_IMPROPER_HI(hfp_get_port_num); REJECT_IMPROPER_HI(hfp_get_port_rate); REJECT_IMPROPER_HI(hfp_get_sc2vl_map); REJECT_IMPROPER_HI(hfp_get_port_sl2sc); REJECT_IMPROPER_HI(hfp_get_receive_event); REJECT_IMPROPER_HI(hfp_get_rhf_expected_sequence_number); REJECT_IMPROPER_HI(hfp_get_rx_egr_tid_cnt); REJECT_IMPROPER_HI(hfp_get_rx_hdr_q_cnt); REJECT_IMPROPER_HI(hfp_get_rx_hdr_q_ent_size); REJECT_IMPROPER_HI(hfp_get_sdma_req_size); REJECT_IMPROPER_HI(hfp_get_sdma_ring_size); REJECT_IMPROPER_HI(hfp_get_sdma_ring_slot_status); REJECT_IMPROPER_HI(hfp_get_subctxt); REJECT_IMPROPER_HI(hfp_get_subctxt_cnt); REJECT_IMPROPER_HI(hfp_get_tid_exp_cnt); REJECT_IMPROPER_HI(hfp_get_tidcache_invalidation); REJECT_IMPROPER_HI(hfp_get_unit_active); REJECT_IMPROPER_HI(hfp_get_unit_id); REJECT_IMPROPER_HI(hfp_get_user_major_bldtime_version); REJECT_IMPROPER_HI(hfp_get_user_major_runtime_version); REJECT_IMPROPER_HI(hfp_get_user_minor_bldtime_version); REJECT_IMPROPER_HI(hfp_get_user_minor_runtime_version); REJECT_IMPROPER_HI(hfp_hfi_reset_context); REJECT_IMPROPER_HI(hfp_poll_type); REJECT_IMPROPER_HI(hfp_retire_hdr_q_entry); REJECT_IMPROPER_HI(hfp_set_cl_q_head_index); REJECT_IMPROPER_HI(hfp_set_cl_q_tail_index); REJECT_IMPROPER_HI(hfp_set_effective_mtu); REJECT_IMPROPER_HI(hfp_set_pbc); REJECT_IMPROPER_HI(hfp_set_pio_size); REJECT_IMPROPER_HI(hfp_set_pkey); REJECT_IMPROPER_HI(hfp_set_rhf_expected_sequence_number); REJECT_IMPROPER_HI(hfp_set_tf_valid); REJECT_IMPROPER_HI(hfp_spio_fini); REJECT_IMPROPER_HI(hfp_spio_init); REJECT_IMPROPER_HI(hfp_spio_process_events); REJECT_IMPROPER_HI(hfp_spio_transfer_frame); REJECT_IMPROPER_HI(hfp_subcontext_ureg_get); REJECT_IMPROPER_HI(hfp_tidflow_check_update_pkt_seq); REJECT_IMPROPER_HI(hfp_tidflow_get); REJECT_IMPROPER_HI(hfp_tidflow_get_enabled); REJECT_IMPROPER_HI(hfp_tidflow_get_flowvalid); REJECT_IMPROPER_HI(hfp_tidflow_get_genmismatch); REJECT_IMPROPER_HI(hfp_tidflow_get_genval); REJECT_IMPROPER_HI(hfp_tidflow_get_hw); 
REJECT_IMPROPER_HI(hfp_tidflow_get_keep_after_seqerr); REJECT_IMPROPER_HI(hfp_tidflow_get_keep_on_generr); REJECT_IMPROPER_HI(hfp_tidflow_get_keep_payload_on_generr); REJECT_IMPROPER_HI(hfp_tidflow_get_seqmismatch); REJECT_IMPROPER_HI(hfp_tidflow_get_seqnum); REJECT_IMPROPER_HI(hfp_tidflow_reset); REJECT_IMPROPER_HI(hfp_tidflow_set_entry); REJECT_IMPROPER_HI(hfp_update_tid); REJECT_IMPROPER_HI(hfp_writev); #endif REJECT_IMPROPER_HI(hfp_get_default_pkey); REJECT_IMPROPER_HI(hfp_get_num_ports); REJECT_IMPROPER_HI(hfp_get_num_units); REJECT_IMPROPER_HI(hfp_initialize); #ifndef PSM2_MOCK_TESTING if (!sysfs_init(psm_hi->hfi_sys_class_path)) #endif SLIST_INSERT_HEAD(&head_hi, psm_hi, next_hi); } static struct _psmi_hal_instance *psmi_hal_get_pi_inst(void); int psmi_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) { va_list ap; va_start(ap, k); int rv = 0; struct _psmi_hal_instance *p = psmi_hal_get_pi_inst(); if (!p) rv = -1; else { switch(k) { case psmi_hal_pre_init_cache_func_get_num_units: rv = p->params.num_units; break; case psmi_hal_pre_init_cache_func_get_num_ports: rv = p->params.num_ports; break; case psmi_hal_pre_init_cache_func_get_unit_active: { int unit = va_arg(ap,int); if ((unit >= 0) && (unit < p->params.num_units)) { if (!p->params.unit_active_valid[unit]) { p->params.unit_active_valid[unit] = 1; p->params.unit_active[unit] = p->hfp_get_unit_active(unit); } rv = p->params.unit_active[unit]; } else rv = -1; } break; case psmi_hal_pre_init_cache_func_get_port_active: { int unit = va_arg(ap,int); if ((unit >= 0) && (unit < p->params.num_units)) { int port = va_arg(ap,int); if ((port >= 1) && (port <= p->params.num_ports)) { if (!p->params.port_active_valid[unit*port]) { p->params.port_active_valid[unit*port] = 1; p->params.port_active[unit*port] = p->hfp_get_port_active(unit,port); } rv = p->params.port_active[unit*port]; } else rv = -1; } else rv = -1; } break; case psmi_hal_pre_init_cache_func_get_num_contexts: { int unit = va_arg(ap,int); if ((unit >= 0) && (unit < p->params.num_units)) { if (!p->params.num_contexts_valid[unit]) { p->params.num_contexts_valid[unit] = 1; p->params.num_contexts[unit] = p->hfp_get_num_contexts(unit); } rv = p->params.num_contexts[unit]; } else rv = -1; } break; case psmi_hal_pre_init_cache_func_get_num_free_contexts: { int unit = va_arg(ap,int); if ((unit >= 0) && (unit < p->params.num_units)) { if (!p->params.num_free_contexts_valid[unit]) { p->params.num_free_contexts_valid[unit] = 1; p->params.num_free_contexts[unit] = p->hfp_get_num_free_contexts(unit); } rv = p->params.num_free_contexts[unit]; } else rv = -1; } break; case psmi_hal_pre_init_cache_func_get_default_pkey: rv = p->params.default_pkey; break; default: rv = -1; break; } } va_end(ap); return rv; } static struct _psmi_hal_instance *psmi_hal_get_pi_inst(void) { if (psmi_hal_current_hal_instance) return psmi_hal_current_hal_instance; if (SLIST_EMPTY(&head_hi)) return NULL; /* At this point, assuming there are multiple HAL INSTANCES that are registered, and two or more of the HAL INSTANCES are capable of initialization on a host, the environment variable PSM2_HAL_PREF allows the user to identify the one HAL INSTANCE that is desired to be used. The default policy is, when the PSM2_HAL_PREF is not set, the first hal instance that successfully initializes is used. 
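For example, launching an application with PSM2_HAL_PREF=1 in the environment (1 == PSM_HAL_INSTANCE_GEN1 in psmi_hal_instance_type) restricts the search below to the gen1 instance, while leaving the variable unset keeps the first-to-initialize policy.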
*/ union psmi_envvar_val env_hi_pref; /* HAL instance preference */ psmi_getenv("PSM2_HAL_PREF", "Indicate preference for HAL instance (Default is use first HAL" " instance to successfully initialize)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)PSM_HAL_INSTANCE_ANY_GEN, &env_hi_pref); /* The hfp_get_num_units() call below will not wait for the HFI driver to come up and create device nodes in /dev/. */ struct _psmi_hal_instance *p; SLIST_FOREACH(p, &head_hi, next_hi) { if ((env_hi_pref.e_int == PSM_HAL_INSTANCE_ANY_GEN) || (p->type == env_hi_pref.e_int)) { const int valid_flags = PSM_HAL_PARAMS_VALID_DEFAULT_PKEY | PSM_HAL_PARAMS_VALID_NUM_UNITS | PSM_HAL_PARAMS_VALID_NUM_PORTS; if ((p->params.sw_status & valid_flags) == valid_flags) return p; int nunits = p->hfp_get_num_units(); int nports = p->hfp_get_num_ports(); int dflt_pkey = p->hfp_get_default_pkey(); if (nunits > 0 && nports > 0 && dflt_pkey > 0 #ifndef PSM2_MOCK_TESTING && (0 == sysfs_init(p->hfi_sys_class_path)) #endif ) { p->params.num_units = nunits; p->params.num_ports = nports; p->params.default_pkey = dflt_pkey; p->params.sw_status |= valid_flags; p->params.unit_active = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, sizeof(int8_t)); p->params.unit_active_valid = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, sizeof(int8_t)); p->params.port_active = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits*nports, sizeof(int8_t)); p->params.port_active_valid = (int8_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits*nports, sizeof(int8_t)); p->params.num_contexts = (uint16_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, sizeof(uint16_t)); p->params.num_contexts_valid = (uint16_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, sizeof(uint16_t)); p->params.num_free_contexts = (uint16_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, sizeof(uint16_t)); p->params.num_free_contexts_valid = (uint16_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, nunits, sizeof(uint16_t)); return p; } } } return NULL; } /* psmi_hal_initialize */ int psmi_hal_initialize(void) { struct _psmi_hal_instance *p = psmi_hal_get_pi_inst(); if (!p) return -PSM_HAL_ERROR_INIT_FAILED; int rv = p->hfp_initialize(p); if (!rv) { psmi_hal_current_hal_instance = p; if (psmi_hal_has_cap(PSM_HAL_CAP_HDRSUPP)) { union psmi_envvar_val env_hdrsupp; psmi_getenv("PSM2_HDRSUPP", "Receive header suppression.
Default is 1 (enabled)," " 0 to disable.\n", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)1, &env_hdrsupp); if (env_hdrsupp.e_uint) psmi_hal_add_sw_status(PSM_HAL_HDRSUPP_ENABLED); else /* user wants to disable header suppression */ psmi_hal_set_tf_valid(0, p); } return rv; } return -PSM_HAL_ERROR_INIT_FAILED; } int psmi_hal_finalize(void) { struct _psmi_hal_instance *p = psmi_hal_current_hal_instance; int rv = psmi_hal_finalize_(); psmi_free(p->params.unit_active); psmi_free(p->params.unit_active_valid); psmi_free(p->params.port_active); psmi_free(p->params.port_active_valid); psmi_free(p->params.num_contexts); psmi_free(p->params.num_contexts_valid); psmi_free(p->params.num_free_contexts); psmi_free(p->params.num_free_contexts_valid); p->params.unit_active = NULL; p->params.unit_active_valid = NULL; p->params.port_active = NULL; p->params.port_active_valid = NULL; p->params.num_contexts = NULL; p->params.num_contexts_valid = NULL; p->params.num_free_contexts = NULL; p->params.num_free_contexts_valid = NULL; psmi_hal_current_hal_instance = NULL; sysfs_fini(); return rv; } #ifdef PSM2_MOCK_TESTING #include "psm_hal_gen1/opa_user_gen1.h" void ips_ptl_non_dw_mul_sdma_init(void) { uint16_t major_version = hfi_get_user_major_version(); uint16_t minor_version = hfi_get_user_minor_version(); int allow_non_dw_mul = 0; if ((major_version > HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED) || ((major_version == HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED) && (minor_version >= HFI1_USER_SWMINOR_NON_DW_MUL_MSG_SIZE_ALLOWED))) { allow_non_dw_mul = 1; } psmi_hal_current_hal_instance->params.cap_mask = 0; if (allow_non_dw_mul) psmi_hal_current_hal_instance->params.cap_mask |= PSM_HAL_CAP_NON_DW_MULTIPLE_MSG_SIZE; } void set_sdma_ring_size_in_MOCK_HAL_instance(int sdma_ring_size) { extern int __psm_hal_mock_sdma_ring_size; __psm_hal_mock_sdma_ring_size = sdma_ring_size; } void set_comp_entry(struct hfi1_sdma_comp_entry *pce) { extern struct hfi1_sdma_comp_entry * __psm_hal_mock_hfi1_sdma_comp_entry; __psm_hal_mock_hfi1_sdma_comp_entry = pce; } #endif opa-psm2-PSM2_11.2.185/psm2_hal.h000066400000000000000000001147611370564314600161110ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2017 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2017 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __PSM2_HAL_H__ #define __PSM2_HAL_H__ #include "psm_user.h" /* Forward declaration of PSM structs: */ struct ips_subcontext_ureg; struct ips_recvhdrq_event; struct ips_writehdrq; struct ips_flow; struct ips_scb; struct ips_tid_session_list_tag; struct ips_epinfo; struct ips_message_header; /* Declare types: */ typedef enum { PSM_HAL_INSTANCE_ANY_GEN = 0, PSM_HAL_INSTANCE_GEN1 = 1, PSM_HAL_INSTANCE_GEN2 = 2, PSM_HAL_INSTANCE_GEN3 = 3, #ifdef PSM2_MOCK_TESTING PSM_HAL_INSTANCE_MOCK = 99, #endif } psmi_hal_instance_type; typedef enum { /* Operation was successful. No error occurred. */ PSM_HAL_ERROR_OK = 0, /* The operation can not be done unless HAL is initialized first. */ PSM_HAL_ERROR_NOT_INITIALIZED = 1, /* No HAL INSTANCE has been registered. Initialization is impossible. */ PSM_HAL_ERROR_NO_HI_REGISTERED = 2, /* Initialization failure. */ PSM_HAL_ERROR_INIT_FAILED = 3, /* Can't open device file. */ PSM_HAL_ERROR_CANNOT_OPEN_DEVICE = 4, /* Can't open context. */ PSM_HAL_ERROR_CANNOT_OPEN_CONTEXT = 5, /* Context is not open. */ PSM_HAL_ERROR_CONTEXT_IS_NOT_OPEN = 6, /* General error. */ PSM_HAL_ERROR_GENERAL_ERROR = 7, /* Not implemented. */ PSM_HAL_ERROR_NOT_IMPLEMENTED = 8, /* Internal error. */ PSM_HAL_ERROR_INTERNAL_ERROR = 9, /* HAL instances should not return errors less than the value PSM_HAL_ERROR_RESERVED_BY_HAL_API. These errors are reserved by the HAL API layer. */ PSM_HAL_ERROR_RESERVED_BY_HAL_API = 1000, } psmi_hal_errors; typedef enum { PSM_HAL_HW_STATUS_INITTED = (1UL << 0), PSM_HAL_HW_STATUS_CHIP_PRESENT = (1UL << 1), PSM_HAL_HW_STATUS_IB_READY = (1UL << 2), PSM_HAL_HW_STATUS_IB_CONF = (1UL << 3), PSM_HAL_HW_STATUS_HWERROR = (1UL << 4) } psmi_hal_hw_status; typedef enum { PSM_HAL_HFI_EVENT_FROZEN = (1UL << 0), PSM_HAL_HFI_EVENT_LINKDOWN = (1UL << 1), PSM_HAL_HFI_EVENT_LID_CHANGE = (1UL << 2), PSM_HAL_HFI_EVENT_LMC_CHANGE = (1UL << 3), PSM_HAL_HFI_EVENT_SL2VL_CHANGE = (1UL << 4), PSM_HAL_HFI_EVENT_TID_MMU_NOTIFY = (1UL << 5) } psmi_hal_hfi_events; /* The following enum constants correspond to the bits in the cap_mask member of the psmi_hal_params_t. 
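A capability test is then a simple bit check against cap_mask, e.g. (sketch; psmi_hal_has_cap() is the accessor used elsewhere in this library): if (psmi_hal_has_cap(PSM_HAL_CAP_SDMA)) ... take the send-DMA path ...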
*/ typedef enum { PSM_HAL_CAP_SDMA = (1UL << 0), PSM_HAL_CAP_SDMA_AHG = (1UL << 1), PSM_HAL_CAP_EXTENDED_PSN = (1UL << 2), PSM_HAL_CAP_HDRSUPP = (1UL << 3), PSM_HAL_CAP_USE_SDMA_HEAD = (1UL << 4), PSM_HAL_CAP_MULTI_PKT_EGR = (1UL << 5), PSM_HAL_CAP_NODROP_RHQ_FULL = (1UL << 6), PSM_HAL_CAP_NODROP_EGR_FULL = (1UL << 7), PSM_HAL_CAP_TID_UNMAP = (1UL << 8), PSM_HAL_CAP_PRINT_UNIMPL = (1UL << 9), PSM_HAL_CAP_ALLOW_PERM_JKEY = (1UL << 10), PSM_HAL_CAP_NO_INTEGRITY = (1UL << 11), PSM_HAL_CAP_PKEY_CHECK = (1UL << 12), PSM_HAL_CAP_STATIC_RATE_CTRL = (1UL << 13), PSM_HAL_CAP_SDMA_HEAD_CHECK = (1UL << 14), PSM_HAL_CAP_EARLY_CREDIT_RETURN = (1UL << 15), PSM_HAL_CAP_GPUDIRECT_OT = (1UL << 16), PSM_HAL_CAP_DMA_HSUPP_FOR_32B_MSGS = (1UL << 17), PSM_HAL_CAP_RSM_FECN_SUPP = (1UL << 18), PSM_HAL_CAP_MERGED_TID_CTRLS = (1UL << 19), PSM_HAL_CAP_NON_DW_MULTIPLE_MSG_SIZE = (1UL << 20), } psmi_hal_capability_bits; /* The following enum constants correspond to the bits in the sw_status member of the psmi_hal_params_t. */ typedef enum { /* Request to start rx thread. */ PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD = (1UL << 0), /* Rx thread is started. */ PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED = (1UL << 1), PSM_HAL_PSMI_RUNTIME_INTR_ENABLED = (1UL << 2), /* Header suppression is enabled: */ PSM_HAL_HDRSUPP_ENABLED = (1UL << 3), PSM_HAL_PARAMS_VALID_NUM_UNITS = (1UL << 4), PSM_HAL_PARAMS_VALID_NUM_PORTS = (1UL << 5), PSM_HAL_PARAMS_VALID_DEFAULT_PKEY = (1UL << 6), } psmi_hal_sw_status; /* The _psmi_hal_params structure stores values that remain constant for the entire life of the process and this structure resides in the hal instance structure (below). The values are settled after the context is opened. */ typedef struct _psmi_hal_params { uint32_t cap_mask; uint32_t sw_status; /* start cached members */ uint16_t num_units; uint16_t num_ports; uint16_t default_pkey; int8_t *unit_active,*unit_active_valid; int8_t *port_active,*port_active_valid; uint16_t *num_contexts,*num_contexts_valid; uint16_t *num_free_contexts,*num_free_contexts_valid; } psmi_hal_params_t; /* HAL assumes that the rx hdr q and the egr buff q are circular lists with two important indexes: head - software takes from this side of the circular list tail - hardware deposits new content here The indexes advance in the list 0, 1, 2, 3, ... until they reach the value: (number_of_entries_in_the_q-1), then the next value they take is 0. And, so, that is why these are called circular lists. When the head idx == tail idx, that represents an empty circular list. A completely full circular list is when: head_idx == (tail_idx + 1) % number_of_entries_in_the_q Both indexes will always be in the range: 0 <= index < number_of_entries_in_the_q After software receives the packet in the slot corresponding to the head idx, and processes it completely, software will signal to the hardware that the slot is available for re-use by retiring it - see api below for details. Note that these are simplified assumptions for the benefit of the hardware independent layer of PSM. The actual implementation details are hidden in the hal instances. Note that subcontexts have a collection of head / tail indexes for their use. So, HAL supports the use of the following circular lists dealing with the following entities: 1. Rx Hdr q - corresponding to hardware (software modifies head index, hardware modifies tail index). 2. Rx egr q - corresponding to hardware (software modifies head index, hardware modifies tail index). 3. 
Rx Hdr q - corresponding to a subcontext (software modifies both head and tail indexes). 4. Rx egr q - corresponding to a subcontext (software modifies both head and tail indexes). Declare a type to indicate a circular list index: */ typedef uint32_t psmi_hal_cl_idx; typedef enum { PSM_HAL_CL_Q_RX_HDR_Q = 0, /* HW context for the rx hdr q. */ PSM_HAL_CL_Q_RX_EGR_Q = 1, /* HW context for the rx eager q. */ /* Start of subcontexts (This is subcontext 0) */ PSM_HAL_CL_Q_RX_HDR_Q_SC_0 = 2, /* Subcontext 0's rx hdr q. */ PSM_HAL_CL_Q_RX_EGR_Q_SC_0 = 3, /* Subcontext 0's rx eager q. */ /* Following SC 0's CL_Q's are the circular list q for subcontexts 1-7, two per subcontext. Even values are the rx hdr q for the subcontext. Odd values are for the eager q. */ /* Given a subcontext number (0-7), return the CL_Q for the RX HDR_Q: */ #define PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(SC) ((SC)*2 + PSM_HAL_CL_Q_RX_HDR_Q_SC_0) /* Given a subcontext number (0-7), return the CL_Q for the RX EGR_Q: */ #define PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(SC) ((SC)*2 + PSM_HAL_CL_Q_RX_EGR_Q_SC_0) } psmi_hal_cl_q; #define PSM_HAL_MAX_SHARED_CTXTS 8 #define PSM_HAL_ALG_ACROSS 0 #define PSM_HAL_ALG_WITHIN 1 #define PSM_HAL_ALG_ACROSS_ALL 2 typedef enum { PSM_HAL_EXP = 0, PSM_HAL_EGR = 1, } psmi_hal_set_sdma_req_type; #define PSM_HAL_SDMA_REQ_VERSION_MASK 0xF #define PSM_HAL_SDMA_REQ_VERSION_SHIFT 0x0 #define PSM_HAL_SDMA_REQ_OPCODE_MASK 0xF #define PSM_HAL_SDMA_REQ_OPCODE_SHIFT 0x4 #define PSM_HAL_SDMA_REQ_IOVCNT_MASK 0xFF #define PSM_HAL_SDMA_REQ_IOVCNT_SHIFT 0x8 #ifdef PSM_CUDA #define PSM_HAL_BUF_GPU_MEM 1 #endif struct psm_hal_sdma_req_info { /* * bits 0-3 - version (currently used only for GPU direct) * 1 - user space is NOT using flags field * 2 - user space is using flags field * bits 4-7 - opcode (enum sdma_req_opcode) * bits 8-15 - io vector count */ __u16 ctrl; /* * Number of fragments contained in this request. * User-space has already computed how many * fragment-sized packets the user buffer will be * split into. */ __u16 npkts; /* * Size of each fragment the user buffer will be * split into. */ __u16 fragsize; /* * Index of the slot in the SDMA completion ring * this request should be using. User-space is * in charge of managing its own ring. */ __u16 comp_idx; #ifdef PSM_CUDA /* * Buffer flags for this request. See HFI1_BUF_* */ __u16 flags; /* The extra bytes for the PSM_CUDA version of the sdma req info * struct is the size of the flags member. */ #define PSM_HAL_CUDA_SDMA_REQ_INFO_EXTRA sizeof(__u16) #endif } __attribute__((packed)); typedef enum { PSM_HAL_SDMA_RING_AVAILABLE = 0, PSM_HAL_SDMA_RING_QUEUED = 1, PSM_HAL_SDMA_RING_COMPLETE = 2, PSM_HAL_SDMA_RING_ERROR = 3, } psmi_hal_sdma_ring_slot_status; typedef uint64_t psmi_hal_raw_rhf_t; typedef struct psmi_hal_rhf_ { /* The first entity in rhf is the decomposed rhf. Each HAL instance, in hfp_get_receive_event(), will decompose the raw rhf obtained from the hardware and deposit the data into this common decomposed rhf, so the upper layers of psm can find the data in one uniform place. */ uint64_t decomposed_rhf; /* The second entry is the raw rhf that comes from the h/w. The upper layers of psm should not use the raw rhf, instead use the decomposed rhf above. The raw rhf is intended for use by the HAL instance only.
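For example, once hfp_get_receive_event() has filled in decomposed_rhf, an upper layer can extract fields with the PSMI_HAL_RHF_UNPACK() macro defined later in this header, e.g. (sketch, where rhf is a psmi_hal_rhf_t): uint32_t seq = PSMI_HAL_RHF_UNPACK(rhf, SEQ); uint32_t pktlen = PSMI_HAL_RHF_UNPACK(rhf, PKT_LEN);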
*/ uint64_t raw_rhf; } psmi_hal_rhf_t; #define PSMI_HAL_RHF_ERR_ICRC_NBITS 1 #define PSMI_HAL_RHF_ERR_ICRC_SHFTC 63 #define PSMI_HAL_RHF_ERR_RSRV_NBITS 1 #define PSMI_HAL_RHF_ERR_RSRV_SHFTC 62 #define PSMI_HAL_RHF_ERR_ECC_NBITS 1 #define PSMI_HAL_RHF_ERR_ECC_SHFTC 61 #define PSMI_HAL_RHF_ERR_LEN_NBITS 1 #define PSMI_HAL_RHF_ERR_LEN_SHFTC 60 #define PSMI_HAL_RHF_ERR_TID_NBITS 1 #define PSMI_HAL_RHF_ERR_TID_SHFTC 59 #define PSMI_HAL_RHF_ERR_TFGEN_NBITS 1 #define PSMI_HAL_RHF_ERR_TFGEN_SHFTC 58 #define PSMI_HAL_RHF_ERR_TFSEQ_NBITS 1 #define PSMI_HAL_RHF_ERR_TFSEQ_SHFTC 57 #define PSMI_HAL_RHF_ERR_RTE_NBITS 3 #define PSMI_HAL_RHF_ERR_RTE_SHFTC 56 #define PSMI_HAL_RHF_ERR_DC_NBITS 1 #define PSMI_HAL_RHF_ERR_DC_SHFTC 55 #define PSMI_HAL_RHF_ERR_DCUN_NBITS 1 #define PSMI_HAL_RHF_ERR_DCUN_SHFTC 54 #define PSMI_HAL_RHF_ERR_KHDRLEN_NBITS 1 #define PSMI_HAL_RHF_ERR_KHDRLEN_SHFTC 53 #define PSMI_HAL_RHF_ALL_ERR_FLAGS_NBITS (PSMI_HAL_RHF_ERR_ICRC_NBITS + PSMI_HAL_RHF_ERR_RSRV_NBITS \ + PSMI_HAL_RHF_ERR_ECC_NBITS \ + PSMI_HAL_RHF_ERR_LEN_NBITS + PSMI_HAL_RHF_ERR_TID_NBITS \ + PSMI_HAL_RHF_ERR_TFGEN_NBITS + PSMI_HAL_RHF_ERR_TFSEQ_NBITS \ + PSMI_HAL_RHF_ERR_RTE_NBITS + PSMI_HAL_RHF_ERR_DC_NBITS \ + PSMI_HAL_RHF_ERR_DCUN_NBITS + PSMI_HAL_RHF_ERR_KHDRLEN_NBITS) #define PSMI_HAL_RHF_ALL_ERR_FLAGS_SHFTC 53 #define PSMI_HAL_RHF_EGR_BUFF_OFF_NBITS 12 #define PSMI_HAL_RHF_EGR_BUFF_OFF_SHFTC 32 #define PSMI_HAL_RHF_SEQ_NBITS 4 #define PSMI_HAL_RHF_SEQ_SHFTC 28 #define PSMI_HAL_RHF_EGR_BUFF_IDX_NBITS 11 #define PSMI_HAL_RHF_EGR_BUFF_IDX_SHFTC 16 #define PSMI_HAL_RHF_USE_EGR_BUFF_NBITS 1 #define PSMI_HAL_RHF_USE_EGR_BUFF_SHFTC 15 #define PSMI_HAL_RHF_RX_TYPE_NBITS 3 #define PSMI_HAL_RHF_RX_TYPE_SHFTC 12 #define PSMI_HAL_RHF_PKT_LEN_NBITS 12 #define PSMI_HAL_RHF_PKT_LEN_SHFTC 0 typedef enum { PSM_HAL_RHF_RX_TYPE_EXPECTED = 0, PSM_HAL_RHF_RX_TYPE_EAGER = 1, PSM_HAL_RHF_RX_TYPE_NON_KD = 2, PSM_HAL_RHF_RX_TYPE_ERROR = 3 } psmi_hal_rhf_rx_type; struct psm_hal_pbc { __u32 pbc0; __u16 PbcStaticRateControlCnt; __u16 fill1; }; typedef enum { PSMI_HAL_POLL_TYPE_URGENT = 1 } psmi_hal_poll_type; /* Forward declaration of incomplete struct type _psmi_hal_instance and * psmi_hal_instance_t typedef: */ struct _psmi_hal_instance; typedef struct _psmi_hal_instance psmi_hal_instance_t; struct _psmi_hal_instance { SLIST_ENTRY(_psmi_hal_instance) next_hi; psmi_hal_instance_type type; const char *description; const char *hfi_name; const char *hfi_sys_class_path; /* The params member should be read-only for HIC, and written only by the HAL instance. */ psmi_hal_params_t params; /* Initialize the HAL INSTANCE. */ int (*hfp_initialize)(psmi_hal_instance_t *); /* Finalize the HAL INSTANCE. */ int (*hfp_finalize_)(void); /* Returns the number of hfi units installed on this host: NOTE: hfp_get_num_units is a function that must be callable before the hal instance is initialized. */ int (*hfp_get_num_units)(void); /* Returns the number of ports on each hfi unit installed on this host. NOTE: hfp_get_num_ports is a function that must be callable before the hal instance is initialized. */ int (*hfp_get_num_ports)(void); /* Returns the default pkey: NOTE: hfp_get_default_pkey is a function that must be callable before the hal instance is initialized. */ int (*hfp_get_default_pkey)(void); /* Given a unit number, returns 1 if any port on the unit is active. returns 0 if no port on the unit is active. returns -1 when an error occurred. NOTE: hfp_get_unit_active is a function that must be callable before the hal instance is initialized.
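In practice these pre-init entry points are reached through the PSMI_HAL_DISPATCH_PI() macros declared later in this header (e.g. psmi_hal_get_unit_active()), which route through psmi_hal_pre_init_cache_func() and its cached copies of these values rather than requiring an initialized HAL instance.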
*/ int (*hfp_get_unit_active)(int unit); int (*hfp_get_port_active)(int unit,int port); /* NOTE: hfp_get_num_contexts is a function that must be callable before the hal instance is initialized. */ int (*hfp_get_num_contexts)(int unit); /* NOTE: hfp_get_num_free_contexts is a function that must be callable before the hal instance is initialized. */ int (*hfp_get_num_free_contexts)(int unit); /* Context open includes opening the device file, and getting hw params. */ int (*hfp_context_open)(int unit, int port, uint64_t open_timeout, psm2_ep_t ep, psm2_uuid_t const job_key, psmi_context_t *psm_ctxt, uint32_t cap_mask, unsigned retryCnt); /* Close the context, including the device file. */ int (*hfp_close_context)(psmi_hal_hw_context *); /* Given a unit, port and index, return an error, or the corresponding pkey for the index as programmed by the SM */ /* Returns an int, so -1 indicates an error. */ int (*hfp_get_port_index2pkey)(int unit, int port, int index); int (*hfp_get_cc_settings_bin)(int unit, int port, char *ccabuf, size_t len_ccabuf); int (*hfp_get_cc_table_bin)(int unit, int port, uint16_t **cctp); int (*hfp_get_port_lmc)(int unit, int port); int (*hfp_get_port_rate)(int unit, int port); int (*hfp_get_port_sl2sc)(int unit, int port,int sl); int (*hfp_get_sc2vl_map)(struct ips_proto *proto); int (*hfp_set_pkey)(psmi_hal_hw_context, uint16_t); int (*hfp_poll_type)(uint16_t poll_type, psmi_hal_hw_context); int (*hfp_get_port_lid)(int unit, int port); int (*hfp_get_port_gid)(int unit, int port, uint64_t *hi, uint64_t *lo); int (*hfp_free_tid)(psmi_hal_hw_context, uint64_t tidlist, uint32_t tidcnt); int (*hfp_get_tidcache_invalidation)(psmi_hal_hw_context, uint64_t tidlist, uint32_t *tidcnt); int (*hfp_update_tid)(psmi_hal_hw_context, uint64_t vaddr, uint32_t *length, uint64_t tidlist, uint32_t *tidcnt, uint16_t flags); /* Initiate a DMA. Intrinsically specifies a DMA slot to use. */ int (*hfp_writev)(const struct iovec *iov, int iovcnt, struct ips_epinfo *, psmi_hal_hw_context); /* Updates PSM from h/w on DMA completions: */ int (*hfp_get_sdma_ring_slot_status)(int slotIdx, psmi_hal_sdma_ring_slot_status *, uint32_t *errorCode, psmi_hal_hw_context); /* Returns > 0 if the specified slot is available, 0 if not available, and a negative value if an error occurred. */ int (*hfp_dma_slot_available)(int slotidx, psmi_hal_hw_context); /* Start of receive packet functions. */ /* Getter for cl q head indexes: */ psmi_hal_cl_idx (*hfp_get_cl_q_head_index)(psmi_hal_cl_q, psmi_hal_hw_context); /* Getter for cl q tail indexes: */ psmi_hal_cl_idx (*hfp_get_cl_q_tail_index)(psmi_hal_cl_q, psmi_hal_hw_context); /* Setter for cl q head indexes: */ void (*hfp_set_cl_q_head_index)(psmi_hal_cl_idx, psmi_hal_cl_q, psmi_hal_hw_context); /* Setter for cl q tail indexes: */ void (*hfp_set_cl_q_tail_index)(psmi_hal_cl_idx, psmi_hal_cl_q, psmi_hal_hw_context); /* Indicate whether the cl q is empty. When this returns > 0 the cl q is empty. When this returns == 0, the cl q is NOT empty (there are packets in the circular list that are available to receive). When this returns < 0, an error occurred. The parameter should correspond to the head index of the cl q circular list. */ int (*hfp_cl_q_empty)(psmi_hal_cl_idx head_idx, psmi_hal_cl_q, psmi_hal_hw_context); /* Receive the raw rhf, decompose it, and then receive the ips_message_hdr. */ int (*hfp_get_receive_event)(psmi_hal_cl_idx head_idx, psmi_hal_hw_context, struct ips_recvhdrq_event *); /* Deliver an eager buffer given the index.
If the index does not refer to a current egr buffer, hfp_get_egr_buff() returns NULL. */ void *(*hfp_get_egr_buff)(psmi_hal_cl_idx, psmi_hal_cl_q, psmi_hal_hw_context); /* Retire the given head idx of the header q, and change *head_idx to point to the next entry, lastly set *empty to indicate whether the headerq is empty at the new head_idx. */ int (*hfp_retire_hdr_q_entry)(psmi_hal_cl_idx *head_idx, psmi_hal_cl_q, psmi_hal_hw_context, uint32_t elemsz, uint32_t elemlast, int *emptyp); /* Returns expected sequence number for RHF. */ int (*hfp_get_rhf_expected_sequence_number)(unsigned int *, psmi_hal_cl_q, psmi_hal_hw_context); /* Sets expected sequence number for RHF. */ int (*hfp_set_rhf_expected_sequence_number)(unsigned int, psmi_hal_cl_q, psmi_hal_hw_context); /* Checks sequence number from RHF. Returns PSM_HAL_ERROR_OK if the sequence number is good, returns something else if the sequence number is bad. */ int (*hfp_check_rhf_sequence_number)(unsigned int); /* Set PBC struct that lies within the extended memory region of SCB */ int (*hfp_set_pbc)(struct ips_proto *proto, struct ips_flow *flow, uint32_t isCtrlMsg, struct psm_hal_pbc *dest, uint32_t hdrlen, uint32_t paylen); /* Start of tid flow functions. */ int (*hfp_set_tf_valid)(uint32_t, psmi_hal_hw_context); int (*hfp_tidflow_set_entry)(uint32_t flowid, uint32_t genval, uint32_t seqnum, psmi_hal_hw_context); int (*hfp_tidflow_reset)(psmi_hal_hw_context, uint32_t flowid, uint32_t genval, uint32_t seqnum); int (*hfp_tidflow_get)(uint32_t flowid, uint64_t *ptf, psmi_hal_hw_context); /* hfp_tidflow_get_hw is identical to hfp_tidflow_get(), but guarantees to get its information from h/w, and not from cached values, but may be significantly slower than hfp_tidflow_get(), so should be used for debug only. */ int (*hfp_tidflow_get_hw)(uint32_t flowid, uint64_t *ptf, psmi_hal_hw_context); int (*hfp_tidflow_get_seqnum)(uint64_t val, uint32_t *pseqn); int (*hfp_tidflow_get_genval)(uint64_t val, uint32_t *pgv); int (*hfp_tidflow_check_update_pkt_seq)(void *vpprotoexp /* actually a: struct ips_protoexp *protoexp */, psmi_seqnum_t sequence_num, void *vptidrecvc /* actually a: struct ips_tid_recv_desc *tidrecvc */, struct ips_message_header *p_hdr, void (*ips_protoexp_do_tf_generr) (void *vpprotoexp /* actually a: struct ips_protoexp *protoexp */, void *vptidrecvc /* actually a: struct ips_tid_recv_desc *tidrecvc */, struct ips_message_header *p_hdr), void (*ips_protoexp_do_tf_seqerr) (void *vpprotoexp /* actually a: struct ips_protoexp *protoexp */, void *vptidrecvc /* actually a: struct ips_tid_recv_desc *tidrecvc */, struct ips_message_header *p_hdr) ); int (*hfp_tidflow_get_flowvalid)(uint64_t val, uint32_t *pfv); int (*hfp_tidflow_get_enabled)(uint64_t val, uint32_t *penabled); int (*hfp_tidflow_get_keep_after_seqerr)(uint64_t val, uint32_t *pkase); int (*hfp_tidflow_get_keep_on_generr)(uint64_t val, uint32_t *pkoge); int (*hfp_tidflow_get_keep_payload_on_generr)(uint64_t val, uint32_t *pkpoge); /* For hfp_tidflow_get_seqmismatch and hfp_tidflow_get_genmismatch, if val was obtained from hfp_tidflow_get_hw(), then these will be valid, but if val was obtained from hfp_tidflow_get(), then these will always return 0. */ int (*hfp_tidflow_get_seqmismatch)(uint64_t val, uint32_t *psmm); int (*hfp_tidflow_get_genmismatch)(uint64_t val, uint32_t *pgmm); /* End of tid flow functions. */ /* End of receive functions.
*/ int (*hfp_forward_packet_to_subcontext)(struct ips_writehdrq *writeq, struct ips_recvhdrq_event *rcv_ev, uint32_t subcontext, psmi_hal_hw_context); int (*hfp_subcontext_ureg_get)(ptl_t *ptl, struct ips_subcontext_ureg **uregp, psmi_hal_hw_context); int (*hfp_get_hfi_event_bits) (uint64_t *event_bits, psmi_hal_hw_context); int (*hfp_ack_hfi_event) (uint64_t ack_bits, psmi_hal_hw_context); int (*hfp_hfi_reset_context) (psmi_hal_hw_context); uint64_t (*hfp_get_hw_status) (psmi_hal_hw_context); int (*hfp_get_hw_status_freezemsg) (volatile char** msg, psmi_hal_hw_context); uint16_t (*hfp_get_user_major_bldtime_version) (void); uint16_t (*hfp_get_user_minor_bldtime_version) (void); uint16_t (*hfp_get_user_major_runtime_version) (psmi_hal_hw_context); uint16_t (*hfp_get_user_minor_runtime_version) (psmi_hal_hw_context); int (*hfp_set_pio_size)(uint32_t, psmi_hal_hw_context); int (*hfp_set_effective_mtu)(uint32_t, psmi_hal_hw_context); int (*hfp_spio_init)(const psmi_context_t *context, struct ptl *ptl, void **ctrl); int (*hfp_spio_fini)(void **ctrl, psmi_hal_hw_context); int (*hfp_spio_transfer_frame)(struct ips_proto *proto, struct ips_flow *flow, struct psm_hal_pbc *pbc, uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum, psmi_hal_hw_context #ifdef PSM_CUDA , uint32_t is_cuda_payload #endif ); int (*hfp_spio_process_events)(const struct ptl *ptl); int (*hfp_get_node_id)(int unit, int *nodep); int (*hfp_get_bthqp)(psmi_hal_hw_context); int (*hfp_get_context)(psmi_hal_hw_context); uint64_t (*hfp_get_gid_lo)(psmi_hal_hw_context); uint64_t (*hfp_get_gid_hi)(psmi_hal_hw_context); int (*hfp_get_hfi_type)(psmi_hal_hw_context); int (*hfp_get_jkey)(psmi_hal_hw_context); int (*hfp_get_lid)(psmi_hal_hw_context); int (*hfp_get_pio_size)(psmi_hal_hw_context); int (*hfp_get_port_num)(psmi_hal_hw_context); int (*hfp_get_rx_egr_tid_cnt)(psmi_hal_hw_context); int (*hfp_get_rx_hdr_q_cnt)(psmi_hal_hw_context); int (*hfp_get_rx_hdr_q_ent_size)(psmi_hal_hw_context); int (*hfp_get_sdma_req_size)(psmi_hal_hw_context); int (*hfp_get_sdma_ring_size)(psmi_hal_hw_context); int (*hfp_get_subctxt)(psmi_hal_hw_context); int (*hfp_get_subctxt_cnt)(psmi_hal_hw_context); int (*hfp_get_tid_exp_cnt)(psmi_hal_hw_context); int (*hfp_get_unit_id)(psmi_hal_hw_context); int (*hfp_get_fd)(psmi_hal_hw_context); int (*hfp_get_pio_stall_cnt)(psmi_hal_hw_context, uint64_t **); }; /* This is the current psmi_hal_instance, or, NULL if not initialized. The HIC should not modify the contents of the HAL instance directly. */ extern psmi_hal_instance_t *psmi_hal_current_hal_instance; /* Declare functions called by the HAL INSTANCES. */ void psmi_hal_register_instance(psmi_hal_instance_t *); /* Declare functions that are called by the HIC: */ /* All of these functions return a negative int value to indicate failure, or >= 0 for success. */ /* Chooses one of the psmi_hal_instances that have been registered and then initializes it. Returns: -PSM_HAL_ERROR_NO_HI_REGISTERED if no HAL INSTANCES are registered, or -PSM_HAL_ERROR_INIT_FAILED when another failure has occurred during initialization.
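A typical start-up sequence is sketched here (error handling abbreviated): if (psmi_hal_initialize() < 0) ... bail out ...; ... use the psmi_hal_* dispatch macros below ...; psmi_hal_finalize(); The negative-return convention lets callers simply test "< 0" without caring which psmi_hal_errors value was negated.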
*/ int psmi_hal_initialize(void); int psmi_hal_finalize(void); #include "psm2_hal_inlines_d.h" enum psmi_hal_pre_init_cache_func_krnls { psmi_hal_pre_init_cache_func_get_num_units, psmi_hal_pre_init_cache_func_get_num_ports, psmi_hal_pre_init_cache_func_get_unit_active, psmi_hal_pre_init_cache_func_get_port_active, psmi_hal_pre_init_cache_func_get_num_contexts, psmi_hal_pre_init_cache_func_get_num_free_contexts, psmi_hal_pre_init_cache_func_get_default_pkey, }; int psmi_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...); #define PSMI_HAL_DISPATCH_PI(KERNEL,...) ( psmi_hal_pre_init_cache_func(psmi_hal_pre_init_cache_func_ ## KERNEL , ##__VA_ARGS__ ) ) #if PSMI_HAL_INST_CNT == 1 #define PSMI_HAL_DISPATCH(KERNEL,...) ( PSMI_HAL_CAT_INL_SYM(KERNEL) ( __VA_ARGS__ ) ) #else #define PSMI_HAL_DISPATCH(KERNEL,...) ( psmi_hal_current_hal_instance->hfp_ ## KERNEL ( __VA_ARGS__ )) #endif #define psmi_hal_get_num_units_(...) PSMI_HAL_DISPATCH_PI(get_num_units,##__VA_ARGS__) #define psmi_hal_get_num_ports_(...) PSMI_HAL_DISPATCH_PI(get_num_ports,##__VA_ARGS__) #define psmi_hal_get_unit_active(...) PSMI_HAL_DISPATCH_PI(get_unit_active,__VA_ARGS__) #define psmi_hal_get_port_active(...) PSMI_HAL_DISPATCH_PI(get_port_active,__VA_ARGS__) #define psmi_hal_get_num_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_contexts,__VA_ARGS__) #define psmi_hal_get_num_free_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_free_contexts,__VA_ARGS__) #define psmi_hal_get_default_pkey(...) PSMI_HAL_DISPATCH_PI(get_default_pkey,##__VA_ARGS__) #define psmi_hal_context_open(...) PSMI_HAL_DISPATCH(context_open,__VA_ARGS__) #define psmi_hal_close_context(...) PSMI_HAL_DISPATCH(close_context,__VA_ARGS__) #define psmi_hal_get_port_index2pkey(...) PSMI_HAL_DISPATCH(get_port_index2pkey,__VA_ARGS__) #define psmi_hal_get_cc_settings_bin(...) PSMI_HAL_DISPATCH(get_cc_settings_bin,__VA_ARGS__) #define psmi_hal_get_cc_table_bin(...) PSMI_HAL_DISPATCH(get_cc_table_bin,__VA_ARGS__) #define psmi_hal_get_port_lmc(...) PSMI_HAL_DISPATCH(get_port_lmc,__VA_ARGS__) #define psmi_hal_get_port_rate(...) PSMI_HAL_DISPATCH(get_port_rate,__VA_ARGS__) #define psmi_hal_get_port_sl2sc(...) PSMI_HAL_DISPATCH(get_port_sl2sc,__VA_ARGS__) #define psmi_hal_get_sc2vl_map(...) PSMI_HAL_DISPATCH(get_sc2vl_map, __VA_ARGS__) #define psmi_hal_set_pkey(...) PSMI_HAL_DISPATCH(set_pkey,__VA_ARGS__) #define psmi_hal_poll_type(...) PSMI_HAL_DISPATCH(poll_type,__VA_ARGS__) #define psmi_hal_get_port_lid(...) PSMI_HAL_DISPATCH(get_port_lid,__VA_ARGS__) #define psmi_hal_get_port_gid(...) PSMI_HAL_DISPATCH(get_port_gid,__VA_ARGS__) #define psmi_hal_free_tid(...) PSMI_HAL_DISPATCH(free_tid,__VA_ARGS__) #define psmi_hal_get_tidcache_invalidation(...) PSMI_HAL_DISPATCH(get_tidcache_invalidation,__VA_ARGS__) #define psmi_hal_update_tid(...) PSMI_HAL_DISPATCH(update_tid,__VA_ARGS__) #define psmi_hal_writev(...) PSMI_HAL_DISPATCH(writev,__VA_ARGS__) #define psmi_hal_dma_slot_available(...) PSMI_HAL_DISPATCH(dma_slot_available,__VA_ARGS__) #define psmi_hal_get_sdma_ring_slot_status(...) PSMI_HAL_DISPATCH(get_sdma_ring_slot_status,__VA_ARGS__) #define psmi_hal_get_cl_q_head_index(...) PSMI_HAL_DISPATCH(get_cl_q_head_index,__VA_ARGS__) #define psmi_hal_get_cl_q_tail_index(...) PSMI_HAL_DISPATCH(get_cl_q_tail_index,__VA_ARGS__) #define psmi_hal_set_cl_q_head_index(...) PSMI_HAL_DISPATCH(set_cl_q_head_index,__VA_ARGS__) #define psmi_hal_set_cl_q_tail_index(...) PSMI_HAL_DISPATCH(set_cl_q_tail_index,__VA_ARGS__) #define psmi_hal_cl_q_empty(...) 
PSMI_HAL_DISPATCH(cl_q_empty,__VA_ARGS__) #define psmi_hal_get_receive_event(...) PSMI_HAL_DISPATCH(get_receive_event,__VA_ARGS__) #define psmi_hal_get_egr_buff(...) PSMI_HAL_DISPATCH(get_egr_buff,__VA_ARGS__) #define psmi_hal_retire_hdr_q_entry(...) PSMI_HAL_DISPATCH(retire_hdr_q_entry,__VA_ARGS__) #define psmi_hal_get_rhf_expected_sequence_number(...) PSMI_HAL_DISPATCH(get_rhf_expected_sequence_number,__VA_ARGS__) #define psmi_hal_set_rhf_expected_sequence_number(...) PSMI_HAL_DISPATCH(set_rhf_expected_sequence_number,__VA_ARGS__) #define psmi_hal_check_rhf_sequence_number(...) PSMI_HAL_DISPATCH(check_rhf_sequence_number,__VA_ARGS__) #define psmi_hal_set_pbc(...) PSMI_HAL_DISPATCH(set_pbc,__VA_ARGS__) #define psmi_hal_tidflow_set_entry(...) PSMI_HAL_DISPATCH(tidflow_set_entry,__VA_ARGS__) #define psmi_hal_tidflow_reset(...) PSMI_HAL_DISPATCH(tidflow_reset,__VA_ARGS__) #define psmi_hal_tidflow_get(...) PSMI_HAL_DISPATCH(tidflow_get,__VA_ARGS__) #define psmi_hal_tidflow_get_hw(...) PSMI_HAL_DISPATCH(tidflow_get_hw,__VA_ARGS__) #define psmi_hal_tidflow_get_seqnum(...) PSMI_HAL_DISPATCH(tidflow_get_seqnum,__VA_ARGS__) #define psmi_hal_tidflow_get_genval(...) PSMI_HAL_DISPATCH(tidflow_get_genval,__VA_ARGS__) #define psmi_hal_tidflow_check_update_pkt_seq(...) PSMI_HAL_DISPATCH(tidflow_check_update_pkt_seq,__VA_ARGS__) #define psmi_hal_tidflow_get_flowvalid(...) PSMI_HAL_DISPATCH(tidflow_get_flowvalid,__VA_ARGS__) #define psmi_hal_tidflow_get_enabled(...) PSMI_HAL_DISPATCH(tidflow_get_enabled,__VA_ARGS__) #define psmi_hal_tidflow_get_keep_after_seqerr(...) PSMI_HAL_DISPATCH(tidflow_get_keep_after_seqerr,__VA_ARGS__) #define psmi_hal_tidflow_get_keep_on_generr(...) PSMI_HAL_DISPATCH(tidflow_get_keep_on_generr,__VA_ARGS__) #define psmi_hal_tidflow_get_keep_payload_on_generr(...) PSMI_HAL_DISPATCH(tidflow_get_keep_payload_on_generr,__VA_ARGS__) #define psmi_hal_tidflow_get_seqmismatch(...) PSMI_HAL_DISPATCH(tidflow_get_seqmismatch,__VA_ARGS__) #define psmi_hal_tidflow_get_genmismatch(...) PSMI_HAL_DISPATCH(tidflow_get_genmismatch,__VA_ARGS__) #define psmi_hal_forward_packet_to_subcontext(...) PSMI_HAL_DISPATCH(forward_packet_to_subcontext,__VA_ARGS__) #define psmi_hal_subcontext_ureg_get(...) PSMI_HAL_DISPATCH(subcontext_ureg_get,__VA_ARGS__) #define psmi_hal_finalize_(...) PSMI_HAL_DISPATCH(finalize_,__VA_ARGS__) #define psmi_hal_get_hfi_event_bits(...) PSMI_HAL_DISPATCH(get_hfi_event_bits,__VA_ARGS__) #define psmi_hal_ack_hfi_event(...) PSMI_HAL_DISPATCH(ack_hfi_event,__VA_ARGS__) #define psmi_hal_hfi_reset_context(...) PSMI_HAL_DISPATCH(hfi_reset_context,__VA_ARGS__) #define psmi_hal_get_hw_status(...) PSMI_HAL_DISPATCH(get_hw_status,__VA_ARGS__) #define psmi_hal_get_hw_status_freezemsg(...) PSMI_HAL_DISPATCH(get_hw_status_freezemsg,__VA_ARGS__) #define psmi_hal_get_user_major_bldtime_version(...) PSMI_HAL_DISPATCH(get_user_major_bldtime_version,__VA_ARGS__) #define psmi_hal_get_user_minor_bldtime_version(...) PSMI_HAL_DISPATCH(get_user_minor_bldtime_version,__VA_ARGS__) #define psmi_hal_get_user_major_runtime_version(...) PSMI_HAL_DISPATCH(get_user_major_runtime_version,__VA_ARGS__) #define psmi_hal_get_user_minor_runtime_version(...) PSMI_HAL_DISPATCH(get_user_minor_runtime_version,__VA_ARGS__) #define psmi_hal_set_pio_size(...) PSMI_HAL_DISPATCH(set_pio_size,__VA_ARGS__) #define psmi_hal_set_effective_mtu(...) PSMI_HAL_DISPATCH(set_effective_mtu,__VA_ARGS__) #define psmi_hal_set_tf_valid(...) PSMI_HAL_DISPATCH(set_tf_valid,__VA_ARGS__) #define psmi_hal_spio_init(...) 
PSMI_HAL_DISPATCH(spio_init,__VA_ARGS__) #define psmi_hal_spio_fini(...) PSMI_HAL_DISPATCH(spio_fini,__VA_ARGS__) #define psmi_hal_spio_transfer_frame(...) PSMI_HAL_DISPATCH(spio_transfer_frame,__VA_ARGS__) #define psmi_hal_spio_process_events(...) PSMI_HAL_DISPATCH(spio_process_events,__VA_ARGS__) #define psmi_hal_get_node_id(...) PSMI_HAL_DISPATCH(get_node_id,__VA_ARGS__) #define psmi_hal_get_bthqp(...) PSMI_HAL_DISPATCH(get_bthqp,__VA_ARGS__) #define psmi_hal_get_context(...) PSMI_HAL_DISPATCH(get_context,__VA_ARGS__) #define psmi_hal_get_gid_lo(...) PSMI_HAL_DISPATCH(get_gid_lo,__VA_ARGS__) #define psmi_hal_get_gid_hi(...) PSMI_HAL_DISPATCH(get_gid_hi,__VA_ARGS__) #define psmi_hal_get_hfi_type(...) PSMI_HAL_DISPATCH(get_hfi_type,__VA_ARGS__) #define psmi_hal_get_jkey(...) PSMI_HAL_DISPATCH(get_jkey,__VA_ARGS__) #define psmi_hal_get_lid(...) PSMI_HAL_DISPATCH(get_lid,__VA_ARGS__) #define psmi_hal_get_pio_size(...) PSMI_HAL_DISPATCH(get_pio_size,__VA_ARGS__) #define psmi_hal_get_port_num(...) PSMI_HAL_DISPATCH(get_port_num,__VA_ARGS__) #define psmi_hal_get_rx_egr_tid_cnt(...) PSMI_HAL_DISPATCH(get_rx_egr_tid_cnt,__VA_ARGS__) #define psmi_hal_get_rx_hdr_q_cnt(...) PSMI_HAL_DISPATCH(get_rx_hdr_q_cnt,__VA_ARGS__) #define psmi_hal_get_rx_hdr_q_ent_size(...) PSMI_HAL_DISPATCH(get_rx_hdr_q_ent_size,__VA_ARGS__) #define psmi_hal_get_sdma_req_size(...) PSMI_HAL_DISPATCH(get_sdma_req_size,__VA_ARGS__) #define psmi_hal_get_sdma_ring_size(...) PSMI_HAL_DISPATCH(get_sdma_ring_size,__VA_ARGS__) #define psmi_hal_get_subctxt(...) PSMI_HAL_DISPATCH(get_subctxt,__VA_ARGS__) #define psmi_hal_get_subctxt_cnt(...) PSMI_HAL_DISPATCH(get_subctxt_cnt,__VA_ARGS__) #define psmi_hal_get_tid_exp_cnt(...) PSMI_HAL_DISPATCH(get_tid_exp_cnt,__VA_ARGS__) #define psmi_hal_get_unit_id(...) PSMI_HAL_DISPATCH(get_unit_id,__VA_ARGS__) #define psmi_hal_get_fd(...) PSMI_HAL_DISPATCH(get_fd,__VA_ARGS__) #define psmi_hal_get_pio_stall_cnt(...) PSMI_HAL_DISPATCH(get_pio_stall_cnt,__VA_ARGS__) #define PSMI_HAL_NBITS_TO_MASK(NBITS) ((uint64_t)((1 << NBITS)-1)) #define PSMI_HAL_RHF_UNPACK(A,NAME) ((uint32_t)((A.decomposed_rhf >> \ PSMI_HAL_RHF_ ## NAME ## _SHFTC \ ) & PSMI_HAL_NBITS_TO_MASK( \ PSMI_HAL_RHF_ ## NAME ## _NBITS))) /* Define constants for the decomposed rhf error masks. Note how each of these is shifted by the ALL_ERR_FLAGS shift count.
*/ #define PSMI_HAL_RHF_ERR_MASK_64(NAME) ((uint64_t)(((PSMI_HAL_NBITS_TO_MASK( \ PSMI_HAL_RHF_ERR_ ## NAME ## _NBITS) << \ PSMI_HAL_RHF_ERR_ ## NAME ## _SHFTC )))) #define PSMI_HAL_RHF_ERR_MASK_32(NAME) ((uint32_t)(PSMI_HAL_RHF_ERR_MASK_64(NAME) >> \ PSMI_HAL_RHF_ALL_ERR_FLAGS_SHFTC)) #define PSMI_HAL_RHF_ERR_ICRC PSMI_HAL_RHF_ERR_MASK_32(ICRC) #define PSMI_HAL_RHF_ERR_ECC PSMI_HAL_RHF_ERR_MASK_32(ECC) #define PSMI_HAL_RHF_ERR_LEN PSMI_HAL_RHF_ERR_MASK_32(LEN) #define PSMI_HAL_RHF_ERR_TID PSMI_HAL_RHF_ERR_MASK_32(TID) #define PSMI_HAL_RHF_ERR_TFGEN PSMI_HAL_RHF_ERR_MASK_32(TFGEN) #define PSMI_HAL_RHF_ERR_TFSEQ PSMI_HAL_RHF_ERR_MASK_32(TFSEQ) #define PSMI_HAL_RHF_ERR_RTE PSMI_HAL_RHF_ERR_MASK_32(RTE) #define PSMI_HAL_RHF_ERR_DC PSMI_HAL_RHF_ERR_MASK_32(DC) #define PSMI_HAL_RHF_ERR_DCUN PSMI_HAL_RHF_ERR_MASK_32(DCUN) #define PSMI_HAL_RHF_ERR_KHDRLEN PSMI_HAL_RHF_ERR_MASK_32(KHDRLEN) #define psmi_hal_rhf_get_use_egr_buff(A) PSMI_HAL_RHF_UNPACK(A,USE_EGR_BUFF) #define psmi_hal_rhf_get_egr_buff_index(A) PSMI_HAL_RHF_UNPACK(A,EGR_BUFF_IDX) #define psmi_hal_rhf_get_egr_buff_offset(A) PSMI_HAL_RHF_UNPACK(A,EGR_BUFF_OFF) #define psmi_hal_rhf_get_packet_length(A) (PSMI_HAL_RHF_UNPACK(A,PKT_LEN)<<2) #define psmi_hal_rhf_get_all_err_flags(A) PSMI_HAL_RHF_UNPACK(A,ALL_ERR_FLAGS) #define psmi_hal_rhf_get_seq(A) PSMI_HAL_RHF_UNPACK(A,SEQ) #define psmi_hal_rhf_get_rx_type(A) PSMI_HAL_RHF_UNPACK(A,RX_TYPE) #define PSMI_HAL_RHF_PACK(NAME,VALUE) ((uint64_t)((((uint64_t)(VALUE)) & \ PSMI_HAL_NBITS_TO_MASK( \ PSMI_HAL_RHF_ ## NAME ## _NBITS \ )) << ( \ PSMI_HAL_RHF_ ## NAME ## _SHFTC ))) #define psmi_hal_get_hal_instance_type() psmi_hal_current_hal_instance->type #define psmi_hal_get_hal_instance_description() psmi_hal_current_hal_instance->description #define psmi_hal_get_hfi_name() psmi_hal_current_hal_instance->hfi_name #define psmi_hal_get_num_units() psmi_hal_current_hal_instance->params.num_units #define psmi_hal_get_num_ports() psmi_hal_current_hal_instance->params.num_ports #define psmi_hal_get_cap_mask() psmi_hal_current_hal_instance->params.cap_mask #define psmi_hal_set_cap_mask(NEW_MASK) (psmi_hal_current_hal_instance->params.cap_mask = (NEW_MASK)) #define psmi_hal_add_cap(CAP) (psmi_hal_current_hal_instance->params.cap_mask |= (CAP)) #define psmi_hal_sub_cap(CAP) (psmi_hal_current_hal_instance->params.cap_mask &= (~(CAP))) #define psmi_hal_has_cap(CAP) ((psmi_hal_get_cap_mask() & (CAP)) == (CAP)) #define psmi_hal_get_sw_status() psmi_hal_current_hal_instance->params.sw_status #define psmi_hal_set_sw_status(NEW_STATUS) (psmi_hal_current_hal_instance->params.sw_status = (NEW_STATUS)) #define psmi_hal_add_sw_status(STATUS) (psmi_hal_current_hal_instance->params.sw_status |= (STATUS)) #define psmi_hal_sub_sw_status(STATUS) (psmi_hal_current_hal_instance->params.sw_status &= (~(STATUS))) #define psmi_hal_has_sw_status(STATUS) ((psmi_hal_get_sw_status() & (STATUS)) == (STATUS)) #include "psm2_hal_inlines_i.h" #endif /* #ifndef __PSM2_HAL_H__ */ opa-psm2-PSM2_11.2.185/psm2_hal_inline_t.h000066400000000000000000000336271370564314600177730ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2017 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2017 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* The psm2_hal_inline_t.h file serves as a template to allow all HAL instances to easily and conveniently declare their HAL methods. 
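   The PSMI_HAL_CAT_INL_SYM() macro is expected to be defined by each HAL
   instance before this file is included. For example (an illustrative
   sketch only; the exact symbol prefix is chosen by each HAL instance),
   a single-instance build might define:

	#define PSMI_HAL_CAT_INL_SYM(KERNEL) hfp_gen1_ ## KERNEL

   so that the first declaration below would expand to:

	static PSMI_HAL_INLINE int hfp_gen1_initialize(psmi_hal_instance_t *);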
*/ static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(initialize) (psmi_hal_instance_t *); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(finalize_) (void); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_units) (void); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_ports) (void); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_unit_active) (int unit); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_node_id) (int unit, int *nodep); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_active) (int unit, int port); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_contexts) (int unit); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_free_contexts) (int unit); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(close_context) (psmi_hal_hw_context *); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(context_open) (int unit, int port, uint64_t open_timeout, psm2_ep_t ep, psm2_uuid_t const job_key, psmi_context_t *psm_ctxt, uint32_t cap_mask, unsigned); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_index2pkey) (int unit, int port, int index); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_cc_settings_bin) (int unit, int port, char *ccabuf, size_t len_ccabuf); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_cc_table_bin) (int unit, int port, uint16_t **ccatp); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_lmc) (int unit, int port); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_rate) (int unit, int port); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_sl2sc) (int unit, int port, int sl); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_sc2vl_map) (struct ips_proto *proto); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(set_pkey) (psmi_hal_hw_context, uint16_t); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(poll_type) (uint16_t, psmi_hal_hw_context); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_lid) (int unit, int port); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_gid) (int unit, int port, uint64_t *hi, uint64_t *lo); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(free_tid) (psmi_hal_hw_context, uint64_t tidlist, uint32_t tidcnt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_tidcache_invalidation) (psmi_hal_hw_context, uint64_t tidlist, uint32_t *tidcnt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(update_tid) (psmi_hal_hw_context, uint64_t vaddr, uint32_t *length, uint64_t tidlist, uint32_t *tidcnt, uint16_t flags); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(writev) (const struct iovec *iov, int iovcnt, struct ips_epinfo *, psmi_hal_hw_context); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_sdma_ring_slot_status) (int slotIdx, psmi_hal_sdma_ring_slot_status *, uint32_t *errorCode,void *); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(dma_slot_available) (int slotidx, psmi_hal_hw_context); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_hfi_event_bits) (uint64_t *event_bits, psmi_hal_hw_context); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(ack_hfi_event) (uint64_t ack_bits, psmi_hal_hw_context); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(hfi_reset_context) (psmi_hal_hw_context); static PSMI_HAL_INLINE uint64_t PSMI_HAL_CAT_INL_SYM(get_hw_status) (psmi_hal_hw_context); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_hw_status_freezemsg) (volatile char** msg, psmi_hal_hw_context); static PSMI_HAL_INLINE uint16_t PSMI_HAL_CAT_INL_SYM(get_user_major_bldtime_version) (void); static PSMI_HAL_INLINE uint16_t 
PSMI_HAL_CAT_INL_SYM(get_user_minor_bldtime_version) (void); static PSMI_HAL_INLINE uint16_t PSMI_HAL_CAT_INL_SYM(get_user_major_runtime_version) (psmi_hal_hw_context); static PSMI_HAL_INLINE uint16_t PSMI_HAL_CAT_INL_SYM(get_user_minor_runtime_version) (psmi_hal_hw_context); static PSMI_HAL_INLINE psmi_hal_cl_idx PSMI_HAL_CAT_INL_SYM(get_cl_q_head_index) (psmi_hal_cl_q, psmi_hal_hw_context); static PSMI_HAL_INLINE psmi_hal_cl_idx PSMI_HAL_CAT_INL_SYM(get_cl_q_tail_index) (psmi_hal_cl_q, psmi_hal_hw_context); static PSMI_HAL_INLINE void PSMI_HAL_CAT_INL_SYM(set_cl_q_head_index) (psmi_hal_cl_idx, psmi_hal_cl_q, psmi_hal_hw_context); static PSMI_HAL_INLINE void PSMI_HAL_CAT_INL_SYM(set_cl_q_tail_index) (psmi_hal_cl_idx, psmi_hal_cl_q, psmi_hal_hw_context); static inline int PSMI_HAL_CAT_INL_SYM(cl_q_empty) (psmi_hal_cl_idx, psmi_hal_cl_q, psmi_hal_hw_context); static inline int PSMI_HAL_CAT_INL_SYM(get_rhf) (psmi_hal_cl_idx, psmi_hal_raw_rhf_t *, psmi_hal_cl_q, psmi_hal_hw_context); static inline int PSMI_HAL_CAT_INL_SYM(get_ips_message_hdr) (psmi_hal_cl_idx, psmi_hal_raw_rhf_t, struct ips_message_header **, psmi_hal_cl_q, psmi_hal_hw_context); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_receive_event) (psmi_hal_cl_idx head_idx, psmi_hal_hw_context, struct ips_recvhdrq_event *); static PSMI_HAL_INLINE void *PSMI_HAL_CAT_INL_SYM(get_egr_buff) (psmi_hal_cl_idx, psmi_hal_cl_q, psmi_hal_hw_context); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(retire_hdr_q_entry) (psmi_hal_cl_idx *, psmi_hal_cl_q, psmi_hal_hw_context, uint32_t elemsz, uint32_t elemlast, int *emptyp); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_rhf_expected_sequence_number) (unsigned int *, psmi_hal_cl_q, psmi_hal_hw_context); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(set_rhf_expected_sequence_number) (unsigned int, psmi_hal_cl_q, psmi_hal_hw_context); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(check_rhf_sequence_number) (unsigned int); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(set_pbc) (struct ips_proto *proto, struct ips_flow *flow, uint32_t isCtrlMsg, struct psm_hal_pbc *dest, uint32_t hdrlen, uint32_t paylen); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_set_entry) (uint32_t flowid, uint32_t genval, uint32_t seqnum, psmi_hal_hw_context); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_reset) (psmi_hal_hw_context, uint32_t flowid, uint32_t genval, uint32_t seqnum); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get) (uint32_t flowid, uint64_t *ptf, psmi_hal_hw_context); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_hw) (uint32_t flowid, uint64_t *ptf, psmi_hal_hw_context); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_seqnum) (uint64_t val, uint32_t *pseqn); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_genval) (uint64_t val, uint32_t *pgv); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_check_update_pkt_seq) (void *vpprotoexp /* actually a: struct ips_protoexp *protoexp */, psmi_seqnum_t sequence_num, void *vptidrecvc /* actually a: struct ips_tid_recv_desc *tidrecvc */, struct ips_message_header *p_hdr, void (*ips_protoexp_do_tf_generr) (void *vpprotoexp /* actually a: struct ips_protoexp *protoexp */, void *vptidrecvc /* actually a: struct ips_tid_recv_desc *tidrecvc */, struct ips_message_header *p_hdr), void (*ips_protoexp_do_tf_seqerr) (void *vpprotoexp /* actually a: struct ips_protoexp *protoexp */, void *vptidrecvc /* actually a: struct ips_tid_recv_desc *tidrecvc */, struct ips_message_header *p_hdr)); 
static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_flowvalid) (uint64_t val, uint32_t *pfv); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_enabled) (uint64_t val, uint32_t *penabled); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_keep_after_seqerr) (uint64_t val, uint32_t *pkase); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_keep_on_generr) (uint64_t val, uint32_t *pkoge); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_keep_payload_on_generr) (uint64_t val, uint32_t *pkpoge); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_seqmismatch) (uint64_t val, uint32_t *psmm); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(tidflow_get_genmismatch) (uint64_t val, uint32_t *pgmm); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(forward_packet_to_subcontext) (struct ips_writehdrq *writeq, struct ips_recvhdrq_event *rcv_ev, uint32_t subcontext, psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(subcontext_ureg_get) (ptl_t *ptl, struct ips_subcontext_ureg **uregp, psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(set_pio_size) (uint32_t, psmi_hal_hw_context); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(set_effective_mtu) (uint32_t, psmi_hal_hw_context); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(set_tf_valid) (uint32_t, psmi_hal_hw_context); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_default_pkey) (void); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(spio_init) (const psmi_context_t *context, struct ptl *ptl, void **ctrl); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(spio_fini) (void **ctrl, psmi_hal_hw_context); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(spio_transfer_frame) (struct ips_proto *proto, struct ips_flow *flow, struct psm_hal_pbc *pbc, uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum, psmi_hal_hw_context #ifdef PSM_CUDA , uint32_t is_cuda_payload #endif ); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(spio_process_events) (const struct ptl *ptl); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_bthqp) (psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_context) (psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE uint64_t PSMI_HAL_CAT_INL_SYM(get_gid_lo) (psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE uint64_t PSMI_HAL_CAT_INL_SYM(get_gid_hi) (psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_hfi_type) (psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_jkey) (psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_lid) (psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_pio_size) (psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_num) (psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_rx_egr_tid_cnt) (psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_rx_hdr_q_cnt) (psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_rx_hdr_q_ent_size) (psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_sdma_req_size) (psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_sdma_ring_size) (psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_subctxt) (psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_subctxt_cnt) (psmi_hal_hw_context ctxt); static 
PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_tid_exp_cnt) (psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_unit_id) (psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_fd) (psmi_hal_hw_context ctxt); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_pio_stall_cnt) (psmi_hal_hw_context, uint64_t **); opa-psm2-PSM2_11.2.185/psm2_linker_script_map.in000066400000000000000000000063251370564314600212250ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* See http://sourceware.org/binutils/docs/ld/VERSION.html#VERSION for more info. C++ // Comments don't work in this file. */ PSM2_1.0 { /* Expose only those symbols we choose to. This way we do not pollute users namespace more than absolutely necessary. 
*/ global: psm2_*; /* Below symbols are used for hfidiags hfi1_pkt_test */ /* opa_udebug.h - global */ hfi_debug; hfi_get_unit_name; __progname; /* opa_udebug.h - _HFI_DEBUGGING */ __hfi_mylabel; hfi_set_mylabel; hfi_get_mylabel; __hfi_dbgout; /* opa_service.h */ hfi_context_open; hfi_get_port_vl2mtu; hfi_get_port_lid; hfi_context_close; hfi_cmd_write; hfi_mmap64; /* opa_user.h */ hfi_userinit; hfi_poll_type; hfi_wait_for_packet; __hfi_pico_per_cycle; /* Additional globals */ _psm2_additional_globals_; /* Make all other symbols local */ local: *; }; opa-psm2-PSM2_11.2.185/psm2_mq.h000066400000000000000000002064401370564314600157560ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2017 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2017 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef PSM2_MQ_H #define PSM2_MQ_H #include <psm2.h> #ifdef __cplusplus extern "C" { #endif /*! * @file psm2_mq.h * @brief PSM2 Matched Queues * * @page psm2_mq Matched Queues interface * * The Matched Queues (MQ) interface implements a queue-based communication * model with the distinction that queue message consumers use a 3-tuple of * metadata to match incoming messages against a list of preposted receive * buffers. These semantics are consistent with those presented by MPI-1.2, * and all the features and side-effects of Message-Passing find their way into * Matched Queues. There is currently a single MQ context. * If need be, MQs may expose a function to allocate more than * one MQ context in the future.
Since an MQ is implicitly bound to a locally * opened endpoint, all MQ functions use an MQ handle instead of an EP * handle as a communication context. * * @section tagmatch MQ Tag Matching * * A successful MQ tag match requires an endpoint address (@ref psm2_epaddr_t) * and a 3-tuple of tag objects. Two of the tag objects are provided by the * receiver when posting a receive buffer (@ref psm2_mq_irecv) and the last is * provided by the sender as part of every message sent (@ref psm2_mq_send and * @ref psm2_mq_isend). Since MQ is a receiver-directed communication model, * the tag matching done at the receiver involves matching the sent message's * origin and send tag (@c stag) with the source endpoint address, tag (@c * rtag), and tag selector (@c rtagsel) attached to every preposted receive * buffer. The incoming @c stag is compared to the posted @c rtag but only for * significant bits set to @c 1 in the @c rtagsel. The @c rtagsel can be used * to mask off parts (or even all) of the bitwise comparison between sender and * receiver tags. A successful match causes the message to be received into * the buffer with which the tag is matched. If the incoming message is too * large, it is truncated to the size of the posted receive buffer. The * bitwise operation corresponding to a successful match and receipt of an * expected message amounts to the following expression evaluating as true: * * @verbatim ((stag ^ rtag) & rtagsel) == 0 @endverbatim * * It is up to the user to encode (pack) the match criteria into the 64-bit unsigned * integers, including employing the @c rtagsel tag selector as a method to * wildcard part or all of the bits significant in the tag matching operation. * For example, MPI uses a triple based on context (MPI communicator), source * rank, and send tag. The following code example shows how the triple can be * packed into 64 bits: * * @code{.c} // // 64-bit send tag formed by packing the triple: // // ( context_id_16bits | source_rank_16bits | send_tag_32bits ) // stag = ( (((context_id)&0xffffULL)<<48)| \ (((source_rank)&0xffffULL)<<32)| \ (((send_tag)&0xffffffffULL)) ); @endcode * * Similarly, the receiver applies the @c rtag matching bits and @c rtagsel * masking bits against a list of send tags and returns the first successful * match. Zero bits in the @c rtagsel can be used to indicate wildcarded bits * in the 64-bit tag which can be useful for implementing MPI's * @c MPI_ANY_SOURCE and @c MPI_ANY_TAG. Following the example bit splicing in * the above @c stag example: * * @code{.c} // Example MPI implementation where MPI_COMM_WORLD is implemented as 0x3333 // MPI_Irecv source_rank=MPI_ANY_SOURCE, tag=7, comm=MPI_COMM_WORLD rtag = 0x3333000000000007; rtagsel = 0xffff0000ffffffff; // MPI_Irecv source_rank=3, tag=MPI_ANY_TAG, comm=MPI_COMM_WORLD rtag = 0x3333000300000000; rtagsel = 0xffffffff80000000; // can't ignore sign bit in tag // MPI_Irecv source_rank=MPI_ANY_SOURCE, tag=MPI_ANY_TAG, comm=MPI_COMM_WORLD rtag = 0x3333000300000000; rtagsel = 0xffff000080000000; // can't ignore sign bit in tag @endcode * * * Applications that do not follow tag matching semantics can simply always * pass a value of @c 0 for @c rtagsel, which will always yield a successful * match to the first preposted buffer. If a message cannot be matched to any * of the preposted buffers, the message is delivered as an unexpected * message.
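 *
 * To summarize the matching rule, the predicate itself is trivial to
 * express in C. The sketch below is illustrative only (PSM2 evaluates the
 * match internally); it simply restates the expression above as a helper
 * function:
 *
 * @code{.c}
 // Illustrative helper only: returns non-zero when an incoming send tag
 // matches a preposted receive tag under the given tag selector.
 static int tag_matches(uint64_t stag, uint64_t rtag, uint64_t rtagsel)
 {
 	return ((stag ^ rtag) & rtagsel) == 0;
 }
 @endcode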
* * @section mq_receive MQ Message Reception * * MQ messages are either received as @e expected or @e unexpected: @li The * received message is @e expected if the incoming message tag matches the * combination of tag and tag selector of at least one of the user-provided * receive buffers preposted with @ref psm2_mq_irecv. * * @li The received message is @e unexpected if the incoming message tag @b * doesn't match any combination of tag and tag selector from all the * user-provided receive buffers preposted with @ref psm2_mq_irecv. * * Unexpected messages are messages that the MQ library buffers until the * user provides a receive buffer that can match the unexpected message. * With Matched Queues and MPI alike, unexpected messages can occur as a * side-effect of the programming model, whereby the arrival of messages can be * slightly out of step with the ordering in which the user * provides receive buffers. Unexpected messages can also be triggered by the * difference between the rate at which a sender produces messages and the rate * at which a paired receiver can post buffers and hence consume the messages. * * In all cases, too many @e unexpected messages will negatively affect * performance. Users can employ some of the following mechanisms to reduce * the effect of added memory allocations and copies that result from * unexpected messages: * @li If and when possible, receive buffers should be posted as early as * possible and ideally before calling into the progress engine. * @li Use of rendezvous messaging that can be controlled with * @ref PSM2_MQ_RNDV_HFI_SZ and @ref PSM2_MQ_RNDV_SHM_SZ options. These * options default to values determined to make effective use of * bandwidth and are hence not advisable for all communication message * sizes, but rendezvous messages inherently prevent unexpected * messages by synchronizing the sender with the receiver beforehand. * @li The amount of memory that is allocated to handle unexpected messages * can be bounded by adjusting the global @ref PSM2_MQ_MAX_SYSBUF_MBYTES * option. * @li MQ statistics, such as the number of received unexpected messages and * the aggregate amount of unexpected bytes, are available in the @ref * psm2_mq_stats structure. * * Whenever a match occurs, whether the message is expected or unexpected, it * is generally up to the user to ensure that the message is not truncated. * Message truncation occurs when the size of the preposted buffer is less than * the size of the incoming matched message. MQ will correctly handle * message truncation by always copying the appropriate number of bytes so as * not to overwrite any data. While it is valid to send less data than the amount * of data that has been preposted, messages that are truncated will be marked * @ref PSM2_MQ_TRUNCATION as part of the error code in the message status * structure (@ref psm2_mq_status_t or @ref psm2_mq_status2_t). * * @section mq_completion MQ Completion Semantics * * Message completion in Matched Queues follows local completion semantics. * When sending an MQ message, it is deemed complete when MQ guarantees that * the source data has been sent and that the entire input source data memory * location can be safely overwritten. As with standard Message-Passing, * MQ does not make any remote completion guarantees for sends. MQ does, * however, allow a sender to synchronize with a receiver to send a synchronous * message which sends a message only after a matching receive buffer has been * posted by the receiver (@ref PSM2_MQ_FLAG_SENDSYNC).
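 *
 * As an illustrative sketch only ('mq', 'dest', 'stag', 'buf' and 'len'
 * are assumed to be in scope; @ref psm2_mq_send is documented later in
 * this file), a synchronous send differs from a regular blocking send
 * only in the flag passed:
 *
 * @code{.c}
 // Illustrative only: this blocking send does not return until the
 // receiver has matched the message with a posted receive buffer.
 psm2_error_t err = psm2_mq_send(mq, dest, PSM2_MQ_FLAG_SENDSYNC,
                                 stag, buf, len);
 @endcode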
* * A receive is deemed complete after it has matched its associated receive * buffer with an incoming send and the data from the send has been * completely delivered to the receive buffer. * * @section mq_progress MQ Progress Requirements * * Progress on MQs must be @e explicitly ensured by the user for correctness. * The progress requirement holds even if certain areas of the MQ * implementation require less network attention than others, or if progress * may internally be guaranteed through interrupts. The main polling function, * @ref psm2_poll, is the most general form of ensuring progress on a given * endpoint. Calling @ref psm2_poll ensures that progress is made over all the * MQs and other components instantiated over the endpoint passed to @ref * psm2_poll. * * While @ref psm2_poll is the only way to directly ensure progress, other MQ * functions will conditionally ensure progress depending on how they are used: * * @li @ref psm2_mq_wait employs polling and waits until the request is * completed. For blocking communication operations where the caller is * waiting on a single send or receive to complete, psm2_mq_wait usually * provides the best responsiveness in terms of latency. * * @li @ref psm2_mq_test can test a particular request for completion, but @b * never directly or indirectly ensures progress as it only tests the * completion status of a request, nothing more. See functional documentation * in @ref psm2_mq_test for a detailed discussion. * * @li @ref psm2_mq_ipeek ensures progress if and only if the MQ's completion * queue is empty and will not ensure progress as long as the completion queue * is non-empty. Users that always aggressively process all elements of the MQ * completion queue as part of their own progress engine will indirectly always * ensure MQ progress. The ipeek mechanism is the preferred way for * ensuring progress when many non-blocking requests are in flight since ipeek * returns requests in the order in which they complete. Depending on how the * user initiates and completes communication, this may be preferable to * calling other progress functions on individual requests. */ /*! @defgroup mq PSM Matched Queues * * @{ */ /** @brief Initialize the MQ component for MQ communication * * This function provides the Matched Queue handle necessary to perform all * Matched Queue communication operations. * * @param[in] ep Endpoint over which to initialize Matched Queue * @param[in] ignored * @param[in] opts Set of options for Matched Queue * @param[in] numopts Number of options passed * @param[out] mq User-supplied storage to return the Matched Queue handle * associated to the newly created Matched Queue. * * @remark This function can be called many times to retrieve the MQ handle * associated to an endpoint, but options are only considered the first * time the function is called. * * @post The user obtains a handle to an instantiated Matched Queue. * * The following error code is returned. Other errors are handled by the PSM * error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK A new Matched Queue has been instantiated across all the * members of the group. * * @code{.c} int try_open_endpoint_and_initialize_mq( psm2_ep_t *ep, // endpoint handle psm2_epid_t *epid, // unique endpoint ID psm2_uuid_t job_uuid, // unique job uuid, for ep_open psm2_mq_t *mq, // MQ handle initialized on endpoint 'ep' uint64_t communicator_bits) // Where we store our communicator or // context bits in the 64-bit tag.
{ // Simplified open, see psm2_ep_open documentation for more info psm2_ep_open(job_uuid, NULL, // no options ep, epid); // We initialize a matched queue by telling PSM the bits that are // order-significant in the tag. Point-to-point ordering will not be // maintained between senders where the communicator bits are not the // same. psm2_mq_init(ep, communicator_bits, NULL, // no other MQ options 0, // 0 options passed mq); // newly initialized Matched Queue return 1; } @endcode */ psm2_error_t psm2_mq_init(psm2_ep_t ep, uint64_t ignored, const struct psm2_optkey *opts, int numopts, psm2_mq_t *mq); #define PSM2_MQ_ORDERMASK_NONE 0ULL /**< This macro is reserved for future tag order masking support. */ #define PSM2_MQ_ORDERMASK_ALL 0xffffffffffffffffULL /**< This macro is reserved for future tag order masking support. */ /** @brief Finalize (close) an MQ handle * * The following error code is returned. Other errors are handled by the PSM * error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK The given Matched Queue has been freed, and any future use * of the handle produces undefined results. */ psm2_error_t psm2_mq_finalize(psm2_mq_t mq); #define PSM2_MQ_TAG_ELEMENTS 4 /**< Represents the number of 32-bit tag elements in the psm2_mq_tag_t * type plus one extra element to keep alignment and padding * as 16 bytes. */ /** @struct psm2_mq_tag ** @brief MQ Message tag * * Extended message tag type introduced in PSM 2.0. The previous 64 bit tag * values are replaced by a struct containing three 32 bit tag values for a * total of 96 bits. Matching semantics are unchanged from the previous 64-bit * matching scheme; the only difference is that 96 bits are matched instead of * 64. For interoperability with existing PSM routines, 64 bit tags are * extended to a 96 bit tag by setting the upper 32 bits (tag[2] or tag2) to * zero. Other than this caveat, all of the existing routines using 64-bit * tags are interchangeable with PSM2 routines using this psm2_mq_tag_t type. * For example, a message sent using @ref psm2_mq_send can be received using * @ref psm2_mq_irecv2, provided the tags match as described above. */ typedef //struct psm2_mq_tag { union psm2_mq_tag { // union { uint32_t tag[PSM2_MQ_TAG_ELEMENTS]; /* No longer specifying * alignment as it makes * code break with newer * compilers. */ /**< 3 x 32bit array representation of @ref psm2_mq_tag */ struct { uint32_t tag0; /**< 1 of 3 uint32_t tag values */ uint32_t tag1; /**< 2 of 3 uint32_t tag values */ uint32_t tag2; /**< 3 of 3 uint32_t tag values */ }; // }; } psm2_mq_tag_t; /** @brief MQ Non-blocking operation status * * Message completion status for asynchronous communication operations. * For wait and test functions, MQ fills in the structure upon completion. * Upon completion, receive requests fill in every field of the status * structure while send requests only return a valid error_code and context * pointer. */ typedef struct psm2_mq_status { /** Sender's original message tag (receive reqs only) */ uint64_t msg_tag; /** Sender's original message length (receive reqs only) */ uint32_t msg_length; /** Actual number of bytes transferred (receive reqs only) */ uint32_t nbytes; /** MQ error code for communication operation */ psm2_error_t error_code; /**< User-associated context for send or receive */ void *context; } psm2_mq_status_t; /** @brief MQ Non-blocking operation status * * Message completion status for asynchronous communication operations.
For * wait and test functions, MQ fills in the structure upon completion. Upon * completion, requests fill in every field of the status structure with the * exception of the nbytes field, which is only valid for receives. Version 2 * of the status type contains an @ref psm2_mq_tag_t type to represent the tag * instead of a 64-bit integer value and is for use with PSM v2 routines. */ typedef struct psm2_mq_status2 { /** Remote peer's epaddr */ psm2_epaddr_t msg_peer; /** Sender's original message tag */ psm2_mq_tag_t msg_tag __attribute__ ((aligned(16)));/* Alignment added * to preserve the * layout as is * expected by * existing code */ /** Sender's original message length */ uint32_t msg_length; /** Actual number of bytes transferred (receiver only) */ uint32_t nbytes; /** MQ error code for communication operation */ psm2_error_t error_code; /** User-associated context for send or receive */ void *context; } psm2_mq_status2_t; /** @brief PSM2 Communication handle (opaque) */ typedef struct psm2_mq_req *psm2_mq_req_t; /** @brief MQ Request Struct * * Message completion request for asynchronous communication operations. * Upon completion, requests are filled with the valid data for the * corresponding send/recv operation that was completed. This datatype * contains the status data and is converted into the * mq_status structures in wait/test functions. */ struct psm2_mq_req_user { /* Tag matching vars */ psm2_epaddr_t peer; psm2_mq_tag_t tag __attribute__ ((aligned(16)));/* Alignment added * to preserve the * layout as is * expected by * existing code */ psm2_mq_tag_t tagsel; /* used for receives */ /* Buffer attached to request. May be a system buffer for unexpected * messages or a user buffer for an expected message */ uint8_t *buf; uint32_t buf_len; uint32_t error_code; uint32_t recv_msglen; /* Message length we are ready to receive */ uint32_t send_msglen; /* Message length from sender */ /* Used for request to send messages */ void *context; /* user context associated to sends or receives */ uint64_t user_reserved[4]; }; /*! @} */ /*! @ingroup mq * @defgroup mq_options PSM Matched Queue Options * @{ * * MQ options can be modified at any point at runtime, unless otherwise noted. * The following example shows how to retrieve the current message size at * which messages are sent as synchronous. * * @code{.c} uint32_t get_hfirv_size(psm2_mq_t mq) { uint32_t rvsize; psm2_mq_getopt(mq, PSM2_MQ_RNDV_HFI_SZ, &rvsize); return rvsize; } @endcode */ /** @brief Get an MQ option (Deprecated. Use psm2_getopt with PSM2_COMPONENT_MQ) * * Function to retrieve the value of an MQ option. * * @param[in] mq Matched Queue handle * @param[in] option Index of option to retrieve. Possible values are: * @li @ref PSM2_MQ_RNDV_HFI_SZ * @li @ref PSM2_MQ_RNDV_SHM_SZ * @li @ref PSM2_MQ_MAX_SYSBUF_MBYTES * * @param[in] value Pointer to storage that can be used to store the value of * the option to be retrieved. It is up to the user to ensure that the * pointer points to a memory location large enough to accommodate * the value associated to the type. Each option documents the size * associated to its value. * * @returns PSM2_OK if option could be retrieved. * @returns PSM2_PARAM_ERR if the option is not a valid option number */ psm2_error_t psm2_mq_getopt(psm2_mq_t mq, int option, void *value); /** @brief Set an MQ option (Deprecated. Use psm2_setopt with PSM2_COMPONENT_MQ) * * Function to set the value of an MQ option. * * @param[in] mq Matched Queue handle * @param[in] option Index of option to set.
Possible values are: * @li @ref PSM2_MQ_RNDV_HFI_SZ * @li @ref PSM2_MQ_RNDV_SHM_SZ * @li @ref PSM2_MQ_MAX_SYSBUF_MBYTES * * @param[in] value Pointer to storage that contains the value to be updated * for the supplied option number. It is up to the user to * ensure that the pointer points to a memory location with a * correct size. * * @returns PSM2_OK if the option could be set. * @returns PSM2_PARAM_ERR if the option is not a valid option number * @returns PSM2_OPT_READONLY if the option to be set is a read-only option * (currently no MQ options are read-only). */ psm2_error_t psm2_mq_setopt(psm2_mq_t mq, int option, const void *value); /*! @} */ /*! @ingroup mq * @{ */ #define PSM2_MQ_FLAG_SENDSYNC 0x01 /**< MQ Send Force synchronous send */ #define PSM2_MQ_REQINVALID ((psm2_mq_req_t)(NULL)) /**< MQ request completion value */ #define PSM2_MQ_ANY_ADDR ((psm2_epaddr_t)NULL) /**< MQ receive from any source epaddr */ /** @brief MQ fast-path operation enumeration * * To provide for quick enqueuing of sends/receives from within an AM handler, * PSM2 provides fast-path send/recv options that will enqueue those ops * into the MQ. The supported operations to call in fast path are enumerated * in the @ref psm2_mq_fp_op enum. */ enum psm2_mq_fp_op { PSM2_MQ_ISEND_FP = 1, PSM2_MQ_IRECV_FP, }; /** @brief Post a fast-path isend/irecv into the MQ * * Function to only enqueue fast-path non-blocking sends or non-blocking recvs * into a particular MQ. These calls only work if the process already holds * the MQ progress lock; traditionally this case only applies to calls from * a registered AM handler. * * This function helps to enable one-sided communication models from middleware * such as OFI to provide fast >2KB message transfers for RMA operations. * * When posting irecvs, for every MQ message received on a particular MQ, * the @c tag and @c tagsel parameters are matched against the incoming * message's send tag as described in @ref tagmatch. * * When posting isends, the user guarantees that the source data will remain * unmodified until the send is locally completed through a call such as * @ref psm2_mq_wait or @ref psm2_mq_test. * * Progress on the operations enqueued into the MQ may not occur until * the next PSM2 progress API is invoked. * * @param[in] ep PSM2 endpoint * @param[in] mq Matched Queue Handle * @param[in] addr Destination EP address (used only on isends) * @param[in] tag Send/Receive tag * @param[in] tagsel Receive tag selector (used only on irecvs) * @param[in] flags Send/Receive Flags * @param[in] buf Send/Receive buffer * @param[in] len Send/Receive buffer length * @param[in] context User context pointer, available in @ref psm2_mq_status_t * upon completion * @param[in] fp_type Fast-path op requested * @param[out] req PSM MQ Request handle created by the preposted receive, to * be used for explicitly controlling message receive * completion. * * @post The supplied buffer is given to MQ to match against incoming * messages unless it is cancelled via @ref psm2_mq_cancel @e before any * match occurs. * * @remark This function may be called simultaneously from multiple threads * as long as different MQ arguments are used in each of the calls. * * The following error code is returned. Other errors are handled by the PSM * error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK The receive buffer has successfully been posted to the MQ.
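 *
 * An illustrative sketch only ('ep', 'mq', 'tag', 'tagsel', 'buf' and
 * 'len' are assumed to be in scope, and the caller is assumed to already
 * hold the MQ progress lock, e.g. inside a registered AM handler):
 *
 * @code{.c}
 // Enqueue a fast-path non-blocking receive; the address argument is
 // only used for isends, so any peer may match this receive.
 psm2_mq_req_t req;
 psm2_mq_fp_msg(ep, mq, PSM2_MQ_ANY_ADDR, &tag, &tagsel, 0,
                buf, len, NULL, PSM2_MQ_IRECV_FP, &req);
 @endcode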
*/ psm2_error_t psm2_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel, uint32_t flags, void *buf, uint32_t len, void *context, enum psm2_mq_fp_op fp_type, psm2_mq_req_t *req); /** @brief Post a receive to a Matched Queue with tag selection criteria * * Function to receive a non-blocking MQ message by providing a preposted * buffer. For every MQ message received on a particular MQ, the @c rtag and @c * rtagsel parameters are used against the incoming message's send tag as * described in @ref tagmatch. * * @param[in] mq Matched Queue Handle * @param[in] rtag Receive tag * @param[in] rtagsel Receive tag selector * @param[in] flags Receive flags (None currently supported) * @param[in] buf Receive buffer * @param[in] len Receive buffer length * @param[in] context User context pointer, available in @ref psm2_mq_status_t * upon completion * @param[out] req PSM MQ Request handle created by the preposted receive, to * be used for explicitly controlling message receive * completion. * * @post The supplied receive buffer is given to MQ to match against incoming * messages unless it is cancelled via @ref psm2_mq_cancel @e before any * match occurs. * * @remark This function may be called simultaneously from multiple threads * as long as different MQ arguments are used in each of the calls. * * The following error code is returned. Other errors are handled by the PSM * error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK The receive buffer has successfully been posted to the MQ. */ psm2_error_t psm2_mq_irecv(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel, uint32_t flags, void *buf, uint32_t len, void *context, psm2_mq_req_t *req); /** @brief Post a receive to a Matched Queue with source and tag selection * criteria * * Function to receive a non-blocking MQ message by providing a preposted * buffer. For every MQ message received on a particular MQ, the @c src, @c rtag * and @c rtagsel parameters are used against the incoming message's send tag as * described in @ref tagmatch. * * @param[in] mq Matched Queue Handle * @param[in] src Source (sender's) epaddr (may be PSM2_MQ_ANY_ADDR) * @param[in] rtag Receive tag * @param[in] rtagsel Receive tag selector * @param[in] flags Receive flags (None currently supported) * @param[in] buf Receive buffer * @param[in] len Receive buffer length * @param[in] context User context pointer, available in @ref psm2_mq_status2_t * upon completion * @param[out] req PSM MQ Request handle created by the preposted receive, to * be used for explicitly controlling message receive * completion. * * @post The supplied receive buffer is given to MQ to match against incoming * messages unless it is cancelled via @ref psm2_mq_cancel @e before any * match occurs. * * @remark This function may be called simultaneously from multiple threads * as long as different MQ arguments are used in each of the calls. * * The following error code is returned. Other errors are handled by the PSM * error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK The receive buffer has successfully been posted to the MQ. */ psm2_error_t psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *rtag, psm2_mq_tag_t *rtagsel, uint32_t flags, void *buf, uint32_t len, void *context, psm2_mq_req_t *req); /** @brief Post a receive to a Matched Queue with matched request * * Function to receive a non-blocking MQ message by providing a preposted * buffer.
The provided request should already be matched using the @ref * psm2_mq_improbe or @ref psm2_mq_improbe2 routines. It is an error to pass a * request that has not already been matched by one of those routines. * * @param[in] mq Matched Queue Handle * @param[in] flags Receive flags (None currently supported) * @param[in] buf Receive buffer * @param[in] len Receive buffer length * @param[in] context User context pointer, available in @ref psm2_mq_status_t * upon completion * @param[inout] reqo PSM MQ Request handle matched previously by a matched * probe routine (@ref psm2_mq_improbe or @ref * psm2_mq_improbe2), also to be used for explicitly * controlling message receive completion. * * @post The supplied receive buffer is given to MQ to deliver the matched * message. * * @remark This function may be called simultaneously from multiple threads * as long as different MQ arguments are used in each of the calls. * * The following error code is returned. Other errors are handled by the PSM * error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK The receive buffer has successfully been posted to the MQ. */ psm2_error_t psm2_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len, void *context, psm2_mq_req_t *reqo); /** @brief Send a blocking MQ message * * Function to send a blocking MQ message, whereby the message is locally * complete and the source data can be modified upon return. * * @param[in] mq Matched Queue Handle * @param[in] dest Destination EP address * @param[in] flags Message flags, currently: * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message * synchronously, meaning that the message will not be sent until * the receiver acknowledges that it has matched the send with a * receive buffer. * @param[in] stag Message Send Tag * @param[in] buf Source buffer pointer * @param[in] len Length of message starting at @c buf. * * @post The source buffer is reusable and the send is locally complete. * * @remark This function may be called simultaneously from multiple threads * as long as different MQ arguments are used in each of the calls. * * @note This send function has been implemented to best suit MPI_Send. * * The following error code is returned. Other errors are handled by the PSM * error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK The message has been successfully sent. */ psm2_error_t psm2_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, const void *buf, uint32_t len); /** @brief Send a blocking MQ message * * Function to send a blocking MQ message, whereby the message is locally * complete and the source data can be modified upon return. * * @param[in] mq Matched Queue Handle * @param[in] dest Destination EP address * @param[in] flags Message flags, currently: * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message * synchronously, meaning that the message will not be sent until * the receiver acknowledges that it has matched the send with a * receive buffer. * @param[in] stag Message Send Tag * @param[in] buf Source buffer pointer * @param[in] len Length of message starting at @c buf. * * @post The source buffer is reusable and the send is locally complete. * * @remark This function may be called simultaneously from multiple threads * as long as different MQ arguments are used in each of the calls. * * @note This send function has been implemented to best suit MPI_Send. * * The following error code is returned. Other errors are handled by the PSM * error handler (@ref psm2_error_register_handler). 
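 *
 * An illustrative sketch only ('mq', 'dest', 'buf' and 'len' are assumed
 * to be in scope; 'my_rank' and 'context_id' are hypothetical values the
 * caller would supply):
 *
 * @code{.c}
 // Blocking send with a 96-bit tag; returns once 'buf' is reusable.
 psm2_mq_tag_t stag;
 stag.tag0 = 7;
 stag.tag1 = my_rank;
 stag.tag2 = context_id;
 psm2_mq_send2(mq, dest, 0, &stag, buf, len);
 @endcode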
* * @retval PSM2_OK The message has been successfully sent. */ psm2_error_t psm2_mq_send2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, psm2_mq_tag_t *stag, const void *buf, uint32_t len); /** @brief Send a non-blocking MQ message * * Function to initiate the send of a non-blocking MQ message, whereby the * user guarantees that the source data will remain unmodified until the send * is locally completed through a call such as @ref psm2_mq_wait or @ref * psm2_mq_test. * * @param[in] mq Matched Queue Handle * @param[in] dest Destination EP address * @param[in] flags Message flags, currently: * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message * synchronously, meaning that the message will not be sent until * the receiver acknowledges that it has matched the send with a * receive buffer. * @param[in] stag Message Send Tag * @param[in] buf Source buffer pointer * @param[in] len Length of message starting at @c buf. * @param[in] context Optional user-provided pointer available in @ref * psm2_mq_status_t when the send is locally completed. * @param[out] req PSM MQ Request handle created by the non-blocking send, to * be used for explicitly controlling message completion. * * @post The source buffer is not reusable and the send is not locally complete * until its request is completed by either @ref psm2_mq_test or @ref * psm2_mq_wait. * * @remark This function may be called simultaneously from multiple threads * as long as different MQ arguments are used in each of the calls. * * @note This send function has been implemented to suit MPI_Isend. * * The following error code is returned. Other errors are handled by the PSM * error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK The message has been successfully initiated. * * @code{.c} psm2_mq_req_t non_blocking_send(const psm2_mq_t mq, psm2_epaddr_t dest_ep, const void *buf, uint32_t len, int context_id, int send_tag, const my_request_t *req) { psm2_mq_req_t req_mq; // Set up our send tag, assume that "my_rank" is global and represents // the rank of this process in the job uint64_t tag = ( (((context_id)&0xffffULL)<<48) | (((my_rank)&0xffffULL)<<32) | (((send_tag)&0xffffffffULL)) ); psm2_mq_isend(mq, dest_ep, 0, // no flags tag, buf, len, req, // this req is available in psm2_mq_status_t when one // of the synchronization functions is called. &req_mq); return req_mq; } @endcode */ psm2_error_t psm2_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, const void *buf, uint32_t len, void *context, psm2_mq_req_t *req); /** @brief Send a non-blocking MQ message * * Function to initiate the send of a non-blocking MQ message, whereby the * user guarantees that the source data will remain unmodified until the send * is locally completed through a call such as @ref psm2_mq_wait or @ref * psm2_mq_test. * * @param[in] mq Matched Queue Handle * @param[in] dest Destination EP address * @param[in] flags Message flags, currently: * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message * synchronously, meaning that the message will not be sent until * the receiver acknowledges that it has matched the send with a * receive buffer. * @param[in] stag Message Send Tag, array of three 32-bit values. * @param[in] buf Source buffer pointer * @param[in] len Length of message starting at @c buf. * @param[in] context Optional user-provided pointer available in @ref * psm2_mq_status2_t when the send is locally completed.
* @param[out] req PSM MQ Request handle created by the non-blocking send, to * be used for explicitly controlling message completion. * * @post The source buffer is not reusable and the send is not locally complete * until its request is completed by either @ref psm2_mq_test or @ref * psm2_mq_wait. * * @remark This function may be called simultaneously from multiple threads * as long as different MQ arguments are used in each of the calls. * * @note This send function has been implemented to suit MPI_Isend. * * The following error code is returned. Other errors are handled by the PSM * error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK The message has been successfully initiated. * * @code{.c} psm2_mq_req_t non_blocking_send(const psm2_mq_t mq, psm2_epaddr_t dest_ep, const void *buf, uint32_t len, int context_id, int send_tag, const my_request_t *req) { psm2_mq_req_t req_mq; // Set up our send tag, assume that "my_rank" is global and represents // the rank of this process in the job psm2_mq_tag_t tag; tag.tag[0] = send_tag; tag.tag[1] = my_rank; tag.tag[2] = context_id; psm2_mq_isend(mq, dest_ep, 0, // no flags &tag, buf, len, req, // this req is available in psm2_mq_status2_t when one // of the synchronization functions is called. &req_mq); return req_mq; } @endcode */ psm2_error_t psm2_mq_isend2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, psm2_mq_tag_t *stag, const void *buf, uint32_t len, void *context, psm2_mq_req_t *req); /** @brief Try to Probe if a message is received matching tag selection * criteria * * Function to verify if a message matching the supplied tag and tag selectors * has been received. The message is not fully matched until the user * provides a buffer with the successfully matching tag selection criteria * through @ref psm2_mq_irecv. * Probing for messages may be useful if the size of the * message to be received is unknown, in which case its size will be * available in the @c msg_length member of the returned @c status. * * Function ensures progress if matching request wasn’t found * after the first attempt. * * @param[in] mq Matched Queue Handle * @param[in] rtag Message receive tag * @param[in] rtagsel Message receive tag selector * @param[out] status Upon return, @c status is filled with information * regarding the matching send. * * @remark This function may be called simultaneously from multiple threads * as long as different MQ arguments are used in each of the calls. * * The following error codes are returned. Other errors are handled by the PSM * error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL. * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is * unchanged. */ psm2_error_t psm2_mq_iprobe(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel, psm2_mq_status_t *status); /** @brief Try to Probe if a message is received matching source and tag * selection criteria * * Function to verify if a message matching the supplied source, tag, and tag * selectors has been received. The message is not fully matched until the * user provides a buffer with the successfully matching tag selection criteria * through @ref psm2_mq_irecv. Probing for messages may be useful if the size * of the message to be received is unknown, in which case its size will be * available in the @c msg_length member of the returned @c status. * * Function ensures progress if matching request wasn’t found * after the first attempt. 
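 *
 * A size-discovery sketch (illustrative; @c rtag, @c rtagsel and the
 * hypothetical @c my_alloc allocator are assumptions):
 *
 * @code{.c}
   psm2_mq_status2_t status;
   while (psm2_mq_iprobe2(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel,
                          &status) == PSM2_MQ_NO_COMPLETIONS)
       ; // iprobe2 ensures progress, so spinning here is safe
   void *buf = my_alloc(status.msg_length); // size known only after probing
   // ... then post the receive using the matched tag from 'status'
   @endcode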
* * @param[in] mq Matched Queue Handle * @param[in] src Source (sender's) epaddr (may be PSM2_MQ_ANY_ADDR) * @param[in] rtag Message receive tag * @param[in] rtagsel Message receive tag selector * @param[out] status Upon return, @c status is filled with information * regarding the matching send. * * @remark This function may be called simultaneously from multiple threads * as long as different MQ arguments are used in each of the calls. * * The following error codes are returned. Other errors are handled by the PSM * error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL. * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is * unchanged. */ psm2_error_t psm2_mq_iprobe2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *rtag, psm2_mq_tag_t *rtagsel, psm2_mq_status2_t *status); /** @brief Try to Probe if a message is received matching tag selection * criteria * * Function to verify if a message matching the supplied tag and tag * selectors has been received. If a match is successful, the message is * removed from the matching queue and returned as a request object. The * message can be received using @ref psm2_mq_imrecv. It is erroneous to use * the request object returned by @ref psm2_mq_improbe for any purpose other * than passing to @ref psm2_mq_imrecv. Probing for messages may be useful if * the size of the message to be received is unknown, in which case its size * will be available in the @c msg_length member of the returned @c status. * * Function ensures progress if matching request wasn’t found * after the first attempt. * * @param[in] mq Matched Queue Handle * @param[in] rtag Message receive tag * @param[in] rtagsel Message receive tag selector * @param[out] req PSM MQ Request handle, to be used for receiving the matched * message. * @param[out] status Upon return, @c status is filled with information * regarding the matching send. * * @remark This function may be called simultaneously from multiple threads * as long as different MQ arguments are used in each of the calls. * * The following error codes are returned. Other errors are handled by the PSM * error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL. * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is unchanged. */ psm2_error_t psm2_mq_improbe(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel, psm2_mq_req_t *req, psm2_mq_status_t *status); /** @brief Try to Probe if a message is received matching source and tag * selection criteria * * Function to verify if a message matching the supplied source, tag, and tag * selectors has been received. If a match is successful, the message is removed from * the matching queue and returned as a request object. The message can be * received using @ref psm2_mq_imrecv. It is erroneous to use the request * object returned by @ref psm2_mq_improbe2 for any purpose other than passing to * @ref psm2_mq_imrecv. Probing for messages may be useful if the size of the * message to be received is unknown, in which case its size will be available * in the @c msg_length member of the returned @c status. * * Function ensures progress if matching request wasn’t found * after the first attempt.
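 *
 * A matched-probe sketch pairing @ref psm2_mq_improbe2 with @ref
 * psm2_mq_imrecv (illustrative; the tag variables and the hypothetical
 * @c my_alloc allocator are assumptions):
 *
 * @code{.c}
   psm2_mq_req_t req;
   psm2_mq_status2_t status;
   if (psm2_mq_improbe2(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel,
                        &req, &status) == PSM2_OK) {
       void *buf = my_alloc(status.msg_length);
       psm2_mq_imrecv(mq, 0, buf, status.msg_length, NULL, &req);
       psm2_mq_wait2(&req, &status); // complete the matched receive
   }
   @endcode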
* * @param[in] mq Matched Queue Handle * @param[in] src Source (sender's) epaddr (may be PSM2_MQ_ANY_ADDR) * @param[in] rtag Message receive tag * @param[in] rtagsel Message receive tag selector * @param[out] reqo PSM MQ Request handle, to be used for receiving the matched * message. * @param[out] status Upon return, @c status is filled with information * regarding the matching send. * * @remark This function may be called simultaneously from multiple threads * as long as different MQ arguments are used in each of the calls. * * The following error codes are returned. Other errors are handled by the PSM * error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL. * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is unchanged. */ psm2_error_t psm2_mq_improbe2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *rtag, psm2_mq_tag_t *rtagsel, psm2_mq_req_t *reqo, psm2_mq_status2_t *status); /** @brief Query for non-blocking requests ready for completion. * * Function to query a particular MQ for non-blocking requests that are ready * for completion. Requests "ready for completion" are not actually considered * complete by MQ until they are returned to the MQ library through @ref * psm2_mq_wait or @ref psm2_mq_test. * * If the user can deal with consuming request completions in the order in * which they complete, this function can be used both for completions and for * ensuring progress. The latter requirement is satisfied when the user * peeks an empty completion queue as a side effect of always aggressively * peeking and completing all of an MQ's requests ready for completion. * * * @param[in] mq Matched Queue Handle * @param[in,out] req MQ non-blocking request * @param[out] status Optional MQ status, can be NULL. * * @post The user has ensured progress if the function returns @ref * PSM2_MQ_NO_COMPLETIONS * * @remark This function may be called simultaneously from multiple threads * as long as different MQ arguments are used in each of the calls. * * The following error codes are returned. Other errors are handled by the PSM * error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK The peek is successful and @c req is updated with a request * ready for completion. If @c status is non-NULL, it is also * updated. * * @retval PSM2_MQ_NO_COMPLETIONS The peek is not successful, meaning that there * are no further requests ready for completion. * The contents of @c req and @c status remain * unchanged. * @code{.c} // Example that uses psm2_mq_ipeek to make progress instead of psm2_poll // We return the number of non-blocking requests that we've completed int main_progress_loop(psm2_mq_t mq) { int num_completed = 0; psm2_mq_req_t req; psm2_mq_status_t status; psm2_error_t err; my_request_t *myreq; do { err = psm2_mq_ipeek(mq, &req, NULL); // No need for status in ipeek here if (err == PSM2_MQ_NO_COMPLETIONS) return num_completed; else if (err != PSM2_OK) goto errh; num_completed++; // We obtained 'req' at the head of the completion queue. We can // now free the request with PSM and obtain our original request // from the status' context err = psm2_mq_test(&req, // will be marked as invalid &status); // we need the status myreq = (my_request_t *) status.context; // handle the completion for myreq whether myreq is a posted receive // or a non-blocking send.
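 // (For instance, one might dispatch on a hypothetical 'kind' field:
 //  if (myreq->kind == MY_KIND_RECV) handle_recv(myreq);
 //  else handle_send(myreq); -- these names are illustrative only.)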
} while (1); } @endcode */ psm2_error_t psm2_mq_ipeek(psm2_mq_t mq, psm2_mq_req_t *req, psm2_mq_status_t *status); /** @brief Query for non-blocking requests ready for completion. * * Function to query a particular MQ for non-blocking requests that are ready * for completion. Requests "ready for completion" are not actually considered * complete by MQ until they are returned to the MQ library through @ref * psm2_mq_wait or @ref psm2_mq_test. * * If the user can deal with consuming request completions in the order in * which they complete, this function can be used both for completions and for * ensuring progress. The latter requirement is satisfied when the user * peeks an empty completion queue as a side effect of always aggressively * peeking and completing all of an MQ's requests ready for completion. * * * @param[in] mq Matched Queue Handle * @param[in,out] req MQ non-blocking request * @param[out] status Optional MQ status, can be NULL. * * @post The user has ensured progress if the function returns @ref * PSM2_MQ_NO_COMPLETIONS * * @remark This function may be called simultaneously from multiple threads * as long as different MQ arguments are used in each of the calls. * * The following error codes are returned. Other errors are handled by the PSM * error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK The peek is successful and @c req is updated with a request * ready for completion. If @c status is non-NULL, it is also * updated. * * @retval PSM2_MQ_NO_COMPLETIONS The peek is not successful, meaning that there * are no further requests ready for completion. * The contents of @c req and @c status remain * unchanged. * @code{.c} // Example that uses psm2_mq_ipeek2 to make progress instead of psm2_poll // We return the number of non-blocking requests that we've completed int main_progress_loop(psm2_mq_t mq) { int num_completed = 0; psm2_mq_req_t req; psm2_mq_status2_t status; psm2_error_t err; my_request_t *myreq; do { err = psm2_mq_ipeek2(mq, &req, NULL); // No need for status in ipeek here if (err == PSM2_MQ_NO_COMPLETIONS) return num_completed; else if (err != PSM2_OK) goto errh; num_completed++; // We obtained 'req' at the head of the completion queue. We can // now free the request with PSM and obtain our original request // from the status' context err = psm2_mq_test2(&req, // will be marked as invalid &status); // we need the status myreq = (my_request_t *) status.context; // handle the completion for myreq whether myreq is a posted receive // or a non-blocking send. } while (1); } @endcode */ psm2_error_t psm2_mq_ipeek2(psm2_mq_t mq, psm2_mq_req_t *req, psm2_mq_status2_t *status); /** @brief User defined Callback function handling copy of MQ request into user datatype * * Callback function used to convert an MQ request into a user's desired * status structure. The user's callback function converts the MQ request into * the provided status_array at the specified index. * * @param[in] req MQ External non-blocking Request structure * @param[in] status_array Array of User defined status datatypes * @param[in] entry_index Index in array where the converted request will be * stored if successful * * The following error codes are returned. * * @retval < 0 The MQ conversion failed with a user defined error. * * @retval 0 The MQ was successfully processed, but was not saved * in the provided @c status_array. * * @retval 1 The MQ was successfully processed and was saved in the * @c status_array at the specified index.
* * @retval >1 The MQ was successfully processed and was saved in the * @c status_array at the specified index. This should * be the last MQ converted in the batch, even if there * are still spaces in @c status_array. */ typedef int (*psmi_mq_status_copy_user_t) (struct psm2_mq_req_user *req, void *status_array, int entry_index); /** @brief Check and dequeue MQ requests into a user's status array using a callback. * * Function to atomically check and dequeue MQ entries from the completed * queue and copy the MQ requests into a user's status datatype through a * status_copy callback function. * * Once the MQ request has been successfully converted by the callback, the * MQ request is freed and the next entry is processed making the supplied * Request pointer invalid. * * The variable "count" passed in will only be increased if the MQ request was * successfully stored into the user's passed in array. Otherwise the count * variable is unchanged. * * NOTE: a count of 0 passed into psm2_mq_ipeek_dequeue_multi will result in * no MQ elements being processed. * * @param[in] mq Matched Queue Handle * @param[in] status_array Array of User defined status datatypes * @param[in] status_copy Callback function pointer to convert * MQ to caller datatype * @param[in/out] count [in]Size of status_array, [out]number of elements * populated into status_array or user's error return code * * The following error codes are returned. * * @retval PSM2_OK The dequeue operation was successful and populated the * full @c status_array up to @c count entries. The parameter * @c count is equal to the count passed in by the user. * * @retval PSM2_MQ_NO_COMPLETIONS The dequeue operation was not able to read * @c count entries into the @c status_array. The number * of entries that were successfully written to the * @c status_array is set in the @c count for the user. * * @retval PSM2_INTERNAL_ERR The @c status_copy failed to successfully * copy the status entry into the user's datatype. * @c count is set to the return code from the * @c status_copy. */ psm2_error_t psm2_mq_ipeek_dequeue_multi(psm2_mq_t mq, void *status_array, psmi_mq_status_copy_user_t status_copy, int *count); /** @brief Check and dequeue the first request entry from the completed queue. * * Function to atomically check and dequeue the first entry from the completed * queue. It must be paired with function psm2_mq_req_free, which returns the * request to PSM2 library. * * @param[in] mq Matched Queue Handle * @param[out] req PSM MQ Request handle, to be used for receiving the matched * message. * * The following error codes are returned. * * @retval PSM2_OK The dequeue operation was successful and @c req is updated * with a request ready for completion. * * @retval PSM2_MQ_NO_COMPLETIONS The dequeue operation was not successful, * meaning that there are no further requests ready * for completion. The contents of @c req remain * unchanged. */ psm2_error_t psm2_mq_ipeek_dequeue(psm2_mq_t mq, psm2_mq_req_t *req); /** @brief Return the request to PSM2 library. * * Function returns the request previously obtained via psm2_mq_ipeek_dequeue * to the PSM2 library. * * @param[in] mq Matched Queue Handle * @param[in] req PSM MQ Request handle to be returned to PSM2 library. If @p req is NULL, no operation is performed. * * The following error codes are returned. * * @retval PSM2_OK Return of an object to PSM2 library pool was successful. 
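 *
 * A minimal drain-loop sketch pairing @ref psm2_mq_ipeek_dequeue with
 * @ref psm2_mq_req_free (illustrative; the completion handling is elided):
 *
 * @code{.c}
   psm2_mq_req_t req;
   while (psm2_mq_ipeek_dequeue(mq, &req) == PSM2_OK) {
       // ... inspect/handle the completed request here ...
       psm2_mq_req_free(mq, req); // hand the request back to the PSM2 library
   }
   @endcode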
*/ psm2_error_t psm2_mq_req_free(psm2_mq_t mq, psm2_mq_req_t req); /** @brief Wait until a non-blocking request completes * * Function to wait on requests created from either preposted receive buffers * or non-blocking sends. This is the only blocking function in the MQ * interface and will poll until the request is complete as per the progress * semantics explained in @ref mq_progress. * * @param[in,out] request MQ non-blocking request * @param[out] status Updated if non-NULL when request successfully completes * * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend * or @ref psm2_mq_irecv and passes a pointer to enough storage to write * the output of a @ref psm2_mq_status_t or NULL if status is to be * ignored. * * @pre Since MQ will internally ensure progress while the user is * suspended, the user need not ensure that progress is made prior to * calling this function. * * @post The request is assigned the value @ref PSM2_MQ_REQINVALID and all * associated MQ request storage is released back to the MQ library. * * @remark This function may be called simultaneously from multiple threads * as long as the requests that are used in each of the calls are * associated with different MQs. * * @remarks * @li This function ensures progress on the endpoint as long as the request * is incomplete. * @li @c status can be NULL, in which case no status is written upon * completion. * @li If @c request is @ref PSM2_MQ_REQINVALID, the function returns * immediately. * * The following error code is returned. Other errors are handled by the PSM * error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK The request is complete or the value of @c request was * @ref PSM2_MQ_REQINVALID. * */ psm2_error_t psm2_mq_wait(psm2_mq_req_t *request, psm2_mq_status_t *status); /** @brief Wait until a non-blocking request completes * * Function to wait on requests created from either preposted receive buffers * or non-blocking sends. This is the only blocking function in the MQ * interface and will poll until the request is complete as per the progress * semantics explained in @ref mq_progress. * * @param[in,out] request MQ non-blocking request * @param[out] status Updated if non-NULL when request successfully completes * * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend * or @ref psm2_mq_irecv and passes a pointer to enough storage to write * the output of a @ref psm2_mq_status2_t or NULL if status is to be * ignored. * * @pre Since MQ will internally ensure progress while the user is * suspended, the user need not ensure that progress is made prior to * calling this function. * * @post The request is assigned the value @ref PSM2_MQ_REQINVALID and all * associated MQ request storage is released back to the MQ library. * * @remark This function may be called simultaneously from multiple threads * as long as the requests that are used in each of the calls are * associated with different MQs. * * @remarks * @li This function ensures progress on the endpoint as long as the request * is incomplete. * @li @c status can be NULL, in which case no status is written upon * completion. * @li If @c request is @ref PSM2_MQ_REQINVALID, the function returns * immediately. * * The following error code is returned. Other errors are handled by the PSM * error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK The request is complete or the value of @c request was * @ref PSM2_MQ_REQINVALID.
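 *
 * A minimal sketch (illustrative; assumes @c req was produced by a prior
 * non-blocking call such as @ref psm2_mq_isend2, and that @c my_request_t
 * follows the pattern of the other examples in this header):
 *
 * @code{.c}
   psm2_mq_status2_t status;
   psm2_mq_wait2(&req, &status);  // blocks; progress is ensured internally
   my_request_t *myreq = (my_request_t *)status.context;
   @endcode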
* */ psm2_error_t psm2_mq_wait2(psm2_mq_req_t *request, psm2_mq_status2_t *status); /** @brief Test if a non-blocking request is complete * * Function to test requests created from either preposted receive buffers or * non-blocking sends for completion. Unlike @ref psm2_mq_wait, this function * tests @c request for completion and @e never ensures progress directly or * indirectly. It is up to the user to employ some of the progress functions * described in @ref mq_progress to ensure progress if the user chooses to * exclusively test requests for completion. * * Testing a request for completion @e never internally ensures progress; this * makes it useful for constructing higher-level completion tests over arrays * that check whether some, all or any requests have completed. For testing arrays of * requests, it is preferable for performance reasons to only ensure progress * once before testing a set of requests for completion. * * @param[in,out] request MQ non-blocking request * @param[out] status Updated if non-NULL and the request successfully * completes * * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend * or @ref psm2_mq_irecv and passes a pointer to enough storage to write * the output of a @ref psm2_mq_status_t or NULL if status is to be * ignored. * * @pre The user has ensured progress on the Matched Queue if @ref * psm2_mq_test is exclusively used for guaranteeing request completions. * * @post If the request is complete, the request is assigned the value @ref * PSM2_MQ_REQINVALID and all associated MQ request storage is released * back to the MQ library. If the request is incomplete, the contents of * @c request is unchanged. * * @post The user will ensure progress on the Matched Queue if @ref * psm2_mq_test is exclusively used for guaranteeing request completions. * * @remark This function may be called simultaneously from multiple threads * as long as the requests that are used in each of the calls are * associated with different MQs. * * The following two errors are always returned. Other errors are handled by * the PSM error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK The request is complete and @c request is set to @ref * PSM2_MQ_REQINVALID or the value of @c request was PSM2_MQ_REQINVALID * * @retval PSM2_MQ_NO_COMPLETIONS The request is not complete and @c request is * unchanged. * * @code{.c} // Function that returns the context of the first completed request in // an array of requests. void * user_testany(psm2_ep_t ep, psm2_mq_req_t *allreqs, int nreqs) { int i; void *context = NULL; // Ensure progress only once psm2_poll(ep); // Test for at least one completion and return its context psm2_mq_status_t stat; for (i = 0; i < nreqs; i++) { if (psm2_mq_test(&allreqs[i], &stat) == PSM2_OK) { context = stat.context; break; } } return context; } @endcode */ psm2_error_t psm2_mq_test(psm2_mq_req_t *request, psm2_mq_status_t *status); /** @brief Test if a non-blocking request is complete * * Function to test requests created from either preposted receive buffers or * non-blocking sends for completion. Unlike @ref psm2_mq_wait, this function * tests @c request for completion and @e never ensures progress directly or * indirectly. It is up to the user to employ some of the progress functions * described in @ref mq_progress to ensure progress if the user chooses to * exclusively test requests for completion.
* * Testing a request for completion @e never internally ensures progress; this * makes it useful for constructing higher-level completion tests over arrays * that check whether some, all or any requests have completed. For testing arrays of * requests, it is preferable for performance reasons to only ensure progress * once before testing a set of requests for completion. * * @param[in,out] request MQ non-blocking request * @param[out] status Updated if non-NULL and the request successfully * completes * * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend * or @ref psm2_mq_irecv and passes a pointer to enough storage to write * the output of a @ref psm2_mq_status2_t or NULL if status is to be * ignored. * * @pre The user has ensured progress on the Matched Queue if @ref * psm2_mq_test is exclusively used for guaranteeing request completions. * * @post If the request is complete, the request is assigned the value @ref * PSM2_MQ_REQINVALID and all associated MQ request storage is released * back to the MQ library. If the request is incomplete, the contents of * @c request is unchanged. * * @post The user will ensure progress on the Matched Queue if @ref * psm2_mq_test is exclusively used for guaranteeing request completions. * * @remark This function may be called simultaneously from multiple threads * as long as the requests that are used in each of the calls are * associated with different MQs. * * The following two errors are always returned. Other errors are handled by * the PSM error handler (@ref psm2_error_register_handler). * * @retval PSM2_OK The request is complete and @c request is set to @ref * PSM2_MQ_REQINVALID or the value of @c request was PSM2_MQ_REQINVALID * * @retval PSM2_MQ_NO_COMPLETIONS The request is not complete and @c request is * unchanged. * * @code{.c} // Function that returns the context of the first completed request in // an array of requests. void * user_testany(psm2_ep_t ep, psm2_mq_req_t *allreqs, int nreqs) { int i; void *context = NULL; // Ensure progress only once psm2_poll(ep); // Test for at least one completion and return its context psm2_mq_status2_t stat; for (i = 0; i < nreqs; i++) { if (psm2_mq_test2(&allreqs[i], &stat) == PSM2_OK) { context = stat.context; break; } } return context; } @endcode */ psm2_error_t psm2_mq_test2(psm2_mq_req_t *request, psm2_mq_status2_t *status); /** @brief Cancel a preposted request * * Function to cancel a preposted receive request returned by @ref * psm2_mq_irecv. It is currently illegal to cancel a send request initiated * with @ref psm2_mq_isend. * * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_irecv. * * @post Whether the cancel is successful or not, the user returns the * request to the library by way of @ref psm2_mq_test or @ref * psm2_mq_wait. * * @remark This function may be called simultaneously from multiple threads * as long as the requests that are used in each of the calls are * associated with different MQs. * * Only the two following errors can be returned directly, without being * handled by the error handler (@ref psm2_error_register_handler): * * @retval PSM2_OK The request could be successfully cancelled such that the * preposted receive buffer could be removed from the preposted * receive queue before a match occurred. The associated @c * request remains unchanged and the user must still return * the storage to the MQ library.
* * @retval PSM2_MQ_NO_COMPLETIONS The request could not be successfully cancelled * since the preposted receive buffer has already * matched an incoming message. The @c request * remains unchanged. * */ psm2_error_t psm2_mq_cancel(psm2_mq_req_t *req); /*! @brief MQ statistics structure */ struct psm2_mq_stats { /** Bytes received into a matched user buffer */ uint64_t rx_user_bytes; /** Messages received into a matched user buffer */ uint64_t rx_user_num; /** Bytes received into an unmatched system buffer */ uint64_t rx_sys_bytes; /** Messages received into an unmatched system buffer */ uint64_t rx_sys_num; /** Total Messages transmitted (shm and hfi) */ uint64_t tx_num; /** Messages transmitted eagerly */ uint64_t tx_eager_num; /** Bytes transmitted eagerly */ uint64_t tx_eager_bytes; /** Messages transmitted using expected TID mechanism */ uint64_t tx_rndv_num; /** Bytes transmitted using expected TID mechanism */ uint64_t tx_rndv_bytes; /** Messages transmitted (shm only) */ uint64_t tx_shm_num; /** Messages received through shm */ uint64_t rx_shm_num; /** Number of system buffers allocated */ uint64_t rx_sysbuf_num; /** Bytes allocated for system buffers */ uint64_t rx_sysbuf_bytes; /** Internally reserved for future use */ uint64_t _reserved[16]; }; #define PSM2_MQ_NUM_STATS 13 /**< How many stats are currently used in @ref psm2_mq_stats */ /*! @see psm2_mq_stats */ typedef struct psm2_mq_stats psm2_mq_stats_t; /** @brief Retrieve statistics from an instantiated MQ */ void psm2_mq_get_stats(psm2_mq_t mq, psm2_mq_stats_t *stats); /*! @} */ #ifdef __cplusplus } /* extern "C" */ #endif #endif opa-psm2-PSM2_11.2.185/psm_am.c000066400000000000000000000244741370564314600156520ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include "psm_user.h" #include "psm2_am.h" #include "psm_am_internal.h" #include "psm_mq_internal.h" int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid); /* AM capabilities parameters are initialized once in psmi_am_init_internal and copied out in __psm2_am_get_parameters. When debugging is enabled, various assertions reference these parameters for sanity checking. */ struct psm2_am_parameters psmi_am_parameters = { 0 }; static int _ignore_handler(PSMI_AM_ARGS_DEFAULT) { return 0; } int psmi_abort_handler(PSMI_AM_ARGS_DEFAULT) { abort(); return 0; } static void psmi_am_min_parameters(struct psm2_am_parameters *dest, struct psm2_am_parameters *src) { dest->max_handlers = min(dest->max_handlers, src->max_handlers); dest->max_nargs = min(dest->max_nargs, src->max_nargs); dest->max_request_short = min(dest->max_request_short, src->max_request_short); dest->max_reply_short = min(dest->max_reply_short, src->max_reply_short); } psm2_error_t psmi_am_init_internal(psm2_ep_t ep) { int i; struct psm2_ep_am_handle_entry *am_htable; struct psm2_am_parameters params; psmi_am_parameters.max_handlers = INT_MAX; psmi_am_parameters.max_nargs = INT_MAX; psmi_am_parameters.max_request_short = INT_MAX; psmi_am_parameters.max_reply_short = INT_MAX; if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { ep->ptl_self.am_get_parameters(ep, ¶ms); psmi_am_min_parameters(&psmi_am_parameters, ¶ms); } if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { ep->ptl_ips.am_get_parameters(ep, ¶ms); psmi_am_min_parameters(&psmi_am_parameters, ¶ms); } if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { ep->ptl_amsh.am_get_parameters(ep, ¶ms); psmi_am_min_parameters(&psmi_am_parameters, ¶ms); } ep->am_htable = psmi_malloc(ep, UNDEFINED, sizeof(struct psm2_ep_am_handle_entry) * PSMI_AM_NUM_HANDLERS); if (ep->am_htable == NULL) return PSM2_NO_MEMORY; am_htable = (struct psm2_ep_am_handle_entry *) ep->am_htable; for (i = 0; i < PSMI_AM_NUM_HANDLERS; i++) { am_htable[i].hfn = _ignore_handler; am_htable[i].hctx = NULL; am_htable[i].version = PSM2_AM_HANDLER_V2; } return PSM2_OK; } void psmi_am_fini_internal(psm2_ep_t ep) { if(ep->am_htable != NULL) { psmi_free(ep->am_htable); } } psm2_error_t __psm2_am_register_handlers(psm2_ep_t ep, const psm2_am_handler_fn_t *handlers, int num_handlers, int *handlers_idx) { int i, j; psmi_assert_always(ep->am_htable != NULL); PSM2_LOG_MSG("entering"); /* For now just assign any free one */ for (i = 0, j = 0; (i < PSMI_AM_NUM_HANDLERS) && (j < num_handlers); i++) { if (ep->am_htable[i].hfn == _ignore_handler) { ep->am_htable[i].hfn = handlers[j]; ep->am_htable[i].hctx = NULL; ep->am_htable[i].version = PSM2_AM_HANDLER_V1; handlers_idx[j] = i; if (++j == num_handlers) /* all registered */ break; } } if (j < num_handlers) { /* Not enough free handlers, restore unused handlers */ for (i = 0; i < j; i++) { ep->am_htable[handlers_idx[i]].hfn = _ignore_handler; ep->am_htable[handlers_idx[i]].hctx = NULL; 
ep->am_htable[handlers_idx[i]].version = PSM2_AM_HANDLER_V2; } PSM2_LOG_MSG("leaving"); return psmi_handle_error(ep, PSM2_EP_NO_RESOURCES, "Insufficient " "available AM handlers: registered %d of %d requested handlers", j, num_handlers); } else { PSM2_LOG_MSG("leaving"); return PSM2_OK; } } PSMI_API_DECL(psm2_am_register_handlers) psm2_error_t __psm2_am_register_handlers_2(psm2_ep_t ep, const psm2_am_handler_2_fn_t *handlers, int num_handlers, void **hctx, int *handlers_idx) { int i, j; psmi_assert_always(ep->am_htable != NULL); PSM2_LOG_MSG("entering"); /* For now just assign any free one */ for (i = 0, j = 0; (i < PSMI_AM_NUM_HANDLERS) && (j < num_handlers); i++) { if (ep->am_htable[i].hfn == _ignore_handler) { ep->am_htable[i].hfn = handlers[j]; ep->am_htable[i].hctx = hctx[j]; ep->am_htable[i].version = PSM2_AM_HANDLER_V2; handlers_idx[j] = i; if (++j == num_handlers) /* all registered */ break; } } if (j < num_handlers) { /* Not enough free handlers, restore unused handlers */ for (i = 0; i < j; i++) { ep->am_htable[handlers_idx[i]].hfn = _ignore_handler; ep->am_htable[handlers_idx[i]].hctx = NULL; ep->am_htable[handlers_idx[i]].version = PSM2_AM_HANDLER_V2; } PSM2_LOG_MSG("leaving"); return psmi_handle_error(ep, PSM2_EP_NO_RESOURCES, "Insufficient " "available AM handlers: registered %d of %d requested handlers", j, num_handlers); } else { PSM2_LOG_MSG("leaving"); return PSM2_OK; } } PSMI_API_DECL(psm2_am_register_handlers_2) void __psm2_am_unregister_handlers(psm2_ep_t ep) { int i; PSM2_LOG_MSG("entering"); for (i = 0; i < PSMI_AM_NUM_HANDLERS; i++) { if (ep->am_htable[i].hfn != _ignore_handler) { ep->am_htable[i].hfn = _ignore_handler; ep->am_htable[i].hctx = NULL; ep->am_htable[i].version = PSM2_AM_HANDLER_V2; } } PSM2_LOG_MSG("leaving"); } PSMI_API_DECL(psm2_am_unregister_handlers) psm2_error_t __psm2_am_request_short(psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, int flags, psm2_am_completion_fn_t completion_fn, void *completion_ctxt) { psm2_error_t err; ptl_ctl_t *ptlc = epaddr->ptlctl; PSM2_LOG_MSG("entering"); PSMI_ASSERT_INITIALIZED(); psmi_assert(epaddr != NULL); psmi_assert(handler >= 0 && handler < psmi_am_parameters.max_handlers); psmi_assert(nargs >= 0 && nargs <= psmi_am_parameters.max_nargs); psmi_assert(nargs > 0 ? args != NULL : 1); psmi_assert(len >= 0 && len <= psmi_am_parameters.max_request_short); psmi_assert(len > 0 ? src != NULL : 1); PSMI_LOCK(ptlc->ep->mq->progress_lock); err = ptlc->am_short_request(epaddr, handler, args, nargs, src, len, flags, completion_fn, completion_ctxt); PSMI_UNLOCK(ptlc->ep->mq->progress_lock); PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_am_request_short) psm2_error_t __psm2_am_reply_short(psm2_am_token_t token, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, int flags, psm2_am_completion_fn_t completion_fn, void *completion_ctxt) { psm2_error_t err; struct psmi_am_token *tok; psm2_epaddr_t epaddr; ptl_ctl_t *ptlc; PSM2_LOG_MSG("entering"); PSMI_ASSERT_INITIALIZED(); psmi_assert_always(token != NULL); psmi_assert(handler >= 0 && handler < psmi_am_parameters.max_handlers); psmi_assert(nargs >= 0 && nargs <= psmi_am_parameters.max_nargs); psmi_assert(nargs > 0 ? args != NULL : 1); psmi_assert(len >= 0 && len <= psmi_am_parameters.max_reply_short); psmi_assert(len > 0 ? 
src != NULL : 1); tok = (struct psmi_am_token *)token; epaddr = tok->epaddr_incoming; ptlc = epaddr->ptlctl; /* No locking here since we are already within handler context and already * locked */ err = ptlc->am_short_reply(token, handler, args, nargs, src, len, flags, completion_fn, completion_ctxt); PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_am_reply_short) psm2_error_t __psm2_am_get_source(psm2_am_token_t token, psm2_epaddr_t *epaddr_out) { struct psmi_am_token *tok; PSM2_LOG_MSG("entering"); if (token == NULL || epaddr_out == NULL) { PSM2_LOG_MSG("leaving"); return psmi_handle_error(NULL, PSM2_PARAM_ERR, "Invalid %s parameters", __FUNCTION__); } tok = (struct psmi_am_token *)token; *epaddr_out = tok->epaddr_incoming; PSM2_LOG_MSG("leaving"); return PSM2_OK; } PSMI_API_DECL(psm2_am_get_source) psm2_error_t __psm2_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters, size_t sizeof_parameters_in, size_t *sizeof_parameters_out) { size_t s; PSM2_LOG_MSG("entering"); if (parameters == NULL) { PSM2_LOG_MSG("leaving"); return psmi_handle_error(NULL, PSM2_PARAM_ERR, "Invalid %s parameters", __FUNCTION__); } memset(parameters, 0, sizeof_parameters_in); s = min(sizeof(psmi_am_parameters), sizeof_parameters_in); memcpy(parameters, &psmi_am_parameters, s); *sizeof_parameters_out = s; PSM2_LOG_MSG("leaving"); return PSM2_OK; } PSMI_API_DECL(psm2_am_get_parameters) opa-psm2-PSM2_11.2.185/psm_am_internal.h000066400000000000000000000072301370564314600175440ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef _PSM2_AM_INTERNAL_H #define _PSM2_AM_INTERNAL_H #define PSMI_AM_MAX_ARGS 10 #define PSMI_AM_NUM_HANDLERS 256 /* must be power of 2 */ #define PSMI_AM_ARGS_DEFAULT psm2_am_token_t token, \ psm2_amarg_t *args, int nargs, \ void *src, uint32_t len, \ void *hctx enum psm2_am_handler_version { PSM2_AM_HANDLER_V1 = 0, PSM2_AM_HANDLER_V2, }; struct psm2_ep_am_handle_entry { void *hfn; void *hctx; enum psm2_am_handler_version version; }; struct psmi_am_token { psm2_epaddr_t epaddr_incoming; uint32_t flags; /* Can handler reply? i.e. Not OPCODE_AM_REQUEST_NOREPLY request */ uint32_t can_reply; /* PTLs may add other stuff here */ }; /* AM capabilities parameters are initialized once in psmi_am_init_internal and copied out in __psm2_am_get_parameters. When debugging is enabled, various assertions reference these parameters for sanity checking. */ extern struct psm2_am_parameters psmi_am_parameters; PSMI_ALWAYS_INLINE(struct psm2_ep_am_handle_entry * psm_am_get_handler_function(psm2_ep_t ep, psm2_handler_t handler_idx)) { int hidx = handler_idx & (PSMI_AM_NUM_HANDLERS - 1); struct psm2_ep_am_handle_entry *hentry = &ep->am_htable[hidx]; psmi_assert_always(hentry != NULL); return hentry; } /* PSM internal initialization */ psm2_error_t psmi_am_init_internal(psm2_ep_t ep); void psmi_am_fini_internal(psm2_ep_t ep); #endif opa-psm2-PSM2_11.2.185/psm_config.h000066400000000000000000000142311370564314600165170ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2018 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2018 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef PSM_CONFIG_H #define PSM_CONFIG_H /* * The following flags can be used instead of `make` switches in order to * change behavior achieved when using `make` without parameters. */ #ifndef RDPMC_PERF_FRAMEWORK /* #define RDPMC_PERF_FRAMEWORK */ #endif #ifndef PSM2_MOCK_TESTING /* #define PSM2_MOCK_TESTING */ #endif #ifndef PSM_CUDA /* #define PSM_CUDA */ /* #define NVIDIA_GPU_DIRECT */ #endif #ifndef HFI_BRAKE_DEBUG /* #define HFI_BRAKE_DEBUG */ #endif #ifndef PSM_DEBUG /* #define PSM_DEBUG */ /* #define _HFI_DEBUGGING 1 */ /* #define _FORTIFY_SOURCE 2 */ #endif #ifndef PSM_HEAP_DEBUG /* #define PSM_HEAP_DEBUG */ #endif #ifndef PSM_PROFILE /* #define PSM_PROFILE */ #endif #define PSMI_MIN_EP_CONNECT_TIMEOUT (2 * SEC_ULL) #define PSMI_MIN_EP_CLOSE_TIMEOUT (1 * SEC_ULL) #define PSMI_MAX_EP_CLOSE_TIMEOUT (2 * SEC_ULL) #define PSMI_MIN_EP_CLOSE_GRACE_INTERVAL (1 * SEC_ULL) #define PSMI_MAX_EP_CLOSE_GRACE_INTERVAL (2 * SEC_ULL) #define HFI_MAX_RAILS 4 #define AFFINITY_SHM_BASENAME "/psm2_hfi_affinity_shm" #define AFFINITY_SHMEMSIZE sysconf(_SC_PAGE_SIZE) #define AFFINITY_SHM_REF_COUNT_LOCATION 0 #define AFFINITY_SHM_HFI_INDEX_LOCATION 1 #define SEM_AFFINITY_SHM_RW_BASENAME "/psm2_hfi_affinity_shm_rw_mutex" #define PSMI_RCVTHREAD_FLAGS 0x1 /**< * Default setting for Receive thread * * 0x0 disables rcvthread by default * 0x1 enables ips receive thread by default */ /* * Define one of these below. * * Spinlock gives the best performance and makes sense with the progress thread * only because the progress thread does a "trylock" and then goes back to * sleep in a poll. * * Mutexlock should be used for experimentation while the more useful * mutexlock-debug should be enabled during development to catch potential * errors. */ #ifdef PSM_DEBUG #define PSMI_LOCK_IS_MUTEXLOCK_DEBUG #else #define PSMI_LOCK_IS_SPINLOCK /* #define PSMI_LOCK_IS_MUTEXLOCK */ /* #define PSMI_LOCK_IS_MUTEXLOCK_DEBUG */ /* #define PSMI_PLOCK_IS_NOLOCK */ #endif #ifdef PSM_CUDA /* XXX TODO: Getting the gpu page size from driver at init time */ #define PSMI_GPU_PAGESIZE 65536 #define CUDA_SMALLHOSTBUF_SZ (256*1024) #define CUDA_WINDOW_PREFETCH_DEFAULT 2 #define GPUDIRECT_THRESH_RV 3 #define GDR_COPY_THRESH_SEND 32 #define GDR_COPY_THRESH_RECV 64000 /* All GPU transfers beyond this threshold use * RNDV protocol. It is mostly a send side knob. 
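 * (With the default below, GPU sends of 32768 bytes or more take the
 * rendezvous path; smaller transfers remain eager.)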
*/ #define CUDA_THRESH_RNDV 32768 #endif #define MQ_HFI_THRESH_TINY 8 #define MQ_HFI_THRESH_EGR_SDMA_XEON 34000 /* Eager Xeon blocking */ #define MQ_HFI_THRESH_EGR_SDMA_PHI2 200000 /* Eager Phi2 blocking */ #define MQ_HFI_THRESH_EGR_SDMA_SQ_XEON 16384 /* Eager Xeon non-blocking */ #define MQ_HFI_THRESH_EGR_SDMA_SQ_PHI2 65536 /* Eager Phi2 non-blocking */ #define MQ_HFI_THRESH_RNDV_PHI2 200000 #define MQ_HFI_THRESH_RNDV_XEON 64000 #define MQ_HFI_WINDOW_RNDV_PHI2 4194304 #define MQ_HFI_WINDOW_RNDV_XEON 131072 #ifdef PSM_CUDA #define MQ_HFI_WINDOW_RNDV_CUDA 2097152 #endif #define MQ_SHM_THRESH_RNDV 16000 #define NUM_HASH_BUCKETS 64 #define HASH_THRESHOLD 65 #define NUM_HASH_CONFIGS 3 #define NUM_MQ_SUBLISTS (NUM_HASH_CONFIGS + 1) #define REMOVE_ENTRY 1 /* Keep timer stats */ #define PSMI_TIMER_STATS 0 /* Psm context */ #define HAL_CONTEXT_OPEN_RETRY_MAX 3 /* * By default, PSMI_DEVICES_DEFAULT establishes the bind order a component is * tested for reachability to each peer. First self, then shm and finally * hfi. The order should really only affect endpoints that happen to be on * the same node. PSM will correctly detect that two endpoints are on the same * node even though they may be using different host interfaces. */ #define PSMI_DEVICES_DEFAULT "self,shm,hfi" /* Lock */ #define PSMI_USE_PTHREAD_SPINLOCKS 0 /* Utils */ #define PSMI_EPID_TABSIZE_CHUNK 128 #define PSMI_EPID_TABLOAD_FACTOR ((float)0.7) #define PSMI_EP_HOSTNAME_LEN 64 /* hostname only */ #define PSMI_EP_NAME_LEN 96 /* hostname:LID:context:subcontext */ #define PSMI_FAULTINJ_SPEC_NAMELEN 32 #endif /* PSM_CONFIG_H */ opa-psm2-PSM2_11.2.185/psm_context.c000066400000000000000000000515611370564314600167400ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ #include #include #include "psm_user.h" #include "psm2_hal.h" static int psmi_get_hfi_selection_algorithm(void); psm2_error_t psmi_context_interrupt_set(psmi_context_t *context, int enable) { int poll_type; int ret; if (!enable == !psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED)) return PSM2_OK; if (enable) poll_type = PSMI_HAL_POLL_TYPE_URGENT; else poll_type = 0; ret = psmi_hal_poll_type(poll_type, context->psm_hw_ctxt); if (ret != 0) return PSM2_EP_NO_RESOURCES; else { if (enable) psmi_hal_add_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED); else psmi_hal_sub_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED); return PSM2_OK; } } int psmi_context_interrupt_isenabled(psmi_context_t *context) { return psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED); } /* Returns 1 when all of the active units have their number of free contexts * equal to the number of contexts. This is an indication that no * jobs are currently running. * * Note that this code is clearly racy: it may run concurrently * in two or more processes, and this point of observation * occurs earlier than the decision about which context to assign, * which in turn occurs earlier than the moment the context is * actually assigned. And, when the context is finally assigned, * that assignment changes the "nfreectxts" observed below. */ static int psmi_all_active_units_have_max_freecontexts(int nunits) { int u; for (u=0;u < nunits;u++) { if (psmi_hal_get_unit_active(u) > 0) { int nfreectxts=psmi_hal_get_num_free_contexts(u), nctxts=psmi_hal_get_num_contexts(u); if (nfreectxts > 0 && nctxts > 0) { if (nfreectxts != nctxts) return 0; } } } return 1; } /* returns the integer value of an environment variable, or 0 if the environment * variable is not set. */ static int psmi_get_envvar(const char *env) { const char *env_val = getenv(env); if (env_val && *env_val) { int r = atoi(env_val); return (r >= 0) ? r : 0; } return 0; } /* returns the 8-bit hash value of an uuid. */ static inline uint8_t psmi_get_uuid_hash(psm2_uuid_t const uuid) { int i; uint8_t hashed_uuid = 0; for (i=0; i < sizeof(psm2_uuid_t); ++i) hashed_uuid ^= *((uint8_t const *)uuid + i); return hashed_uuid; } int psmi_get_current_proc_location() { int core_id, node_id; core_id = sched_getcpu(); if (core_id < 0) return -EINVAL; node_id = numa_node_of_cpu(core_id); if (node_id < 0) return -EINVAL; return node_id; } static void psmi_spread_hfi_selection(psm2_uuid_t const job_key, long *unit_start, long *unit_end, int nunits) { /* if the number of ranks on the host is 1 and ... */ if ((psmi_get_envvar("MPI_LOCALNRANKS") == 1) && /* * All of the active units have free contexts equal to the * number of contexts. */ psmi_all_active_units_have_max_freecontexts(nunits)) { /* we start looking at unit 0, and end at nunits-1: */ *unit_start = 0; *unit_end = nunits - 1; } else { /* else, we are going to look at: (a hash of the job key plus the local rank id) mod nunits. 
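	   For example, with nunits == 4 and (hash + local rank id) == 6, the
	   scan starts at unit 2 and wraps around the units: 2, 3, 0, 1.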
*/ *unit_start = (psmi_get_envvar("MPI_LOCALRANKID") + psmi_get_uuid_hash(job_key)) % nunits; if (*unit_start > 0) *unit_end = *unit_start - 1; else *unit_end = nunits-1; } } static int psmi_create_and_open_affinity_shm(psm2_uuid_t const job_key) { int shm_fd, ret; int first_to_create = 0; size_t shm_name_len = 256; shared_affinity_ptr = NULL; affinity_shm_name = NULL; affinity_shm_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, shm_name_len); psmi_assert_always(affinity_shm_name != NULL); snprintf(affinity_shm_name, shm_name_len, AFFINITY_SHM_BASENAME".%d", psmi_get_uuid_hash(job_key)); shm_fd = shm_open(affinity_shm_name, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); if ((shm_fd < 0) && (errno == EEXIST)) { shm_fd = shm_open(affinity_shm_name, O_RDWR, S_IRUSR | S_IWUSR); if (shm_fd < 0) { _HFI_VDBG("Cannot open affinity shared mem fd:%s, errno=%d\n", affinity_shm_name, errno); return shm_fd; } } else if (shm_fd > 0) { first_to_create = 1; } else { _HFI_VDBG("Cannot create affinity shared mem fd:%s, errno=%d\n", affinity_shm_name, errno); } ret = ftruncate(shm_fd, AFFINITY_SHMEMSIZE); if ( ret < 0 ) { _HFI_VDBG("Cannot truncate affinity shared mem fd:%s, errno=%d\n", affinity_shm_name, errno); if (shm_fd >= 0) close(shm_fd); return ret; } shared_affinity_ptr = (uint64_t *) mmap(NULL, AFFINITY_SHMEMSIZE, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0); if (shared_affinity_ptr == MAP_FAILED) { _HFI_VDBG("Cannot mmap affinity shared memory. errno=%d\n", errno); close(shm_fd); return -1; } close(shm_fd); psmi_affinity_shared_file_opened = 1; if (first_to_create) { _HFI_VDBG("Creating shm to store HFI affinity per socket\n"); memset(shared_affinity_ptr, 0, AFFINITY_SHMEMSIZE); /* * Once shm object is initialized, unlock others to be able to * use it. */ psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name); } else { _HFI_VDBG("Opening shm object to read/write HFI affinity per socket\n"); } /* * Start critical section to increment reference count when creating * or opening shm object. Decrement of ref count will be done before * closing the shm. */ if (psmi_sem_timedwait(sem_affinity_shm_rw, sem_affinity_shm_rw_name)) { _HFI_VDBG("Could not enter critical section to update shm refcount\n"); return -1; } shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] += 1; /* End critical section */ psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name); return 0; } /* * Spread HFI selection between units if we find more than one within a socket. */ static void psmi_spread_hfi_within_socket(long *unit_start, long *unit_end, int node_id, int *saved_hfis, int found, psm2_uuid_t const job_key) { int ret, shm_location; /* * Take affinity lock and open shared memory region to be able to * accurately determine which HFI to pick for this process. If any * issues, bail by picking first known HFI. 
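	 *
	 * The shm slot for this NUMA node holds the next HFI selection to
	 * hand out; each process reads it inside the semaphore-protected
	 * critical section below and advances it round-robin modulo 'found'.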
*/ if (!psmi_affinity_semaphore_open) goto spread_hfi_fallback; ret = psmi_create_and_open_affinity_shm(job_key); if (ret < 0) goto spread_hfi_fallback; shm_location = AFFINITY_SHM_HFI_INDEX_LOCATION + node_id; if (shm_location > AFFINITY_SHMEMSIZE) goto spread_hfi_fallback; /* Start critical section to read/write shm object */ if (psmi_sem_timedwait(sem_affinity_shm_rw, sem_affinity_shm_rw_name)) { _HFI_VDBG("Could not enter critical section to update HFI index\n"); goto spread_hfi_fallback; } *unit_start = *unit_end = shared_affinity_ptr[shm_location]; shared_affinity_ptr[shm_location] = (shared_affinity_ptr[shm_location] + 1) % found; _HFI_VDBG("Selected HFI index= %ld, Next HFI=%ld, node = %d, local rank=%d, found=%d.\n", *unit_start, shared_affinity_ptr[shm_location], node_id, psmi_get_envvar("MPI_LOCALRANKID"), found); /* End Critical Section */ psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name); return; spread_hfi_fallback: *unit_start = *unit_end = saved_hfis[0]; } static void psmi_create_affinity_semaphores(psm2_uuid_t const job_key) { int ret; sem_affinity_shm_rw_name = NULL; size_t sem_len = 256; /* * If already opened, no need to do anything else. * This could be true for Multi-EP cases where a different thread has * already created the semaphores. We don't need separate locks here as * we are protected by the overall "psmi_creation_lock" which each * thread will take in psm2_ep_open() */ if (psmi_affinity_semaphore_open) return; sem_affinity_shm_rw_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, sem_len); psmi_assert_always(sem_affinity_shm_rw_name != NULL); snprintf(sem_affinity_shm_rw_name, sem_len, SEM_AFFINITY_SHM_RW_BASENAME".%d", psmi_get_uuid_hash(job_key)); ret = psmi_init_semaphore(&sem_affinity_shm_rw, sem_affinity_shm_rw_name, S_IRUSR | S_IWUSR, 0); if (ret) { _HFI_VDBG("Cannot initialize semaphore: %s for read-write access to shm object.\n", sem_affinity_shm_rw_name); sem_close(sem_affinity_shm_rw); psmi_free(sem_affinity_shm_rw_name); sem_affinity_shm_rw_name = NULL; return; } _HFI_VDBG("Semaphore: %s created for read-write access to shm object.\n", sem_affinity_shm_rw_name); psmi_affinity_semaphore_open = 1; return; } static psm2_error_t psmi_compute_start_and_end_unit(long unit_param,int nunitsactive,int nunits, psm2_uuid_t const job_key, long *unit_start,long *unit_end) { unsigned short hfi_sel_alg = PSMI_UNIT_SEL_ALG_ACROSS; int node_id, unit_id, found = 0; int saved_hfis[nunits]; /* if the user did not set HFI_UNIT then ... */ if (unit_param == HFI_UNIT_ID_ANY) { /* Get the actual selection algorithm from the environment: */ hfi_sel_alg = psmi_get_hfi_selection_algorithm(); /* If round-robin is selection algorithm and ... */ if ((hfi_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS) && /* there are more than 1 active units then ... */ (nunitsactive > 1)) { /* * Pick first HFI we find on same root complex * as current task. If none found, fall back to * load-balancing algorithm. 
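		 */

		/*
		 * Summary of the unit selection performed by this function
		 * (a restatement of the branches below and above, not
		 * additional behavior):
		 *
		 *	HFI_UNIT set           -> unit_start = unit_end = HFI_UNIT
		 *	ACROSS, >1 active      -> prefer NUMA-local HFIs; if
		 *	                          several, round-robin via shm;
		 *	                          if none, psmi_spread_hfi_selection()
		 *	ACROSS_ALL, >1 active  -> psmi_spread_hfi_selection()
		 *	otherwise              -> scan units 0 .. nunits-1
		 */

		/*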
*/ node_id = psmi_get_current_proc_location(); if (node_id >= 0) { for (unit_id = 0; unit_id < nunits; unit_id++) { if (psmi_hal_get_unit_active(unit_id) <= 0) continue; int node_id_i; if (!psmi_hal_get_node_id(unit_id, &node_id_i)) { if (node_id_i == node_id) { saved_hfis[found] = unit_id; found++; } } } if (found > 1) { psmi_create_affinity_semaphores(job_key); psmi_spread_hfi_within_socket(unit_start, unit_end, node_id, saved_hfis, found, job_key); } else if (found == 1) { *unit_start = *unit_end = saved_hfis[0]; } } if (node_id < 0 || !found) { psmi_spread_hfi_selection(job_key, unit_start, unit_end, nunits); } } else if ((hfi_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS_ALL) && (nunitsactive > 1)) { psmi_spread_hfi_selection(job_key, unit_start, unit_end, nunits); } else { *unit_start = 0; *unit_end = nunits - 1; } } else if (unit_param >= 0) { /* the user specified HFI_UNIT, we use it. */ *unit_start = *unit_end = unit_param; } else { psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, "PSM2 can't open unit: %ld for reading and writing", unit_param); return PSM2_EP_DEVICE_FAILURE; } return PSM2_OK; } psm2_error_t psmi_context_open(const psm2_ep_t ep, long unit_param, long port, psm2_uuid_t const job_key, int64_t timeout_ns, psmi_context_t *context) { long open_timeout = 0, unit_start, unit_end, unit_id, unit_id_prev; psm2_error_t err = PSM2_OK; int nunits = psmi_hal_get_num_units(), nunitsactive=0; /* * If shared contexts are enabled, try our best to schedule processes * across one or many devices */ /* if no units, then no joy. */ if (nunits <= 0) { err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, "PSM2 no hfi units are available"); goto ret; } /* Calculate the number of active units: */ for (unit_id=0;unit_id < nunits;unit_id++) { if (psmi_hal_get_unit_active(unit_id) > 0) nunitsactive++; } /* if no active units, then no joy. */ if (nunitsactive == 0) { err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, "PSM2 no hfi units are active"); goto ret; } if (timeout_ns > 0) open_timeout = (long)(timeout_ns / MSEC_ULL); unit_start = 0; unit_end = nunits - 1; err = psmi_compute_start_and_end_unit(unit_param, nunitsactive, nunits, job_key, &unit_start, &unit_end); if (err != PSM2_OK) return err; /* this is the start of a loop that starts at unit_start and goes to unit_end. but note that the way the loop computes the loop control variable is by an expression involving the mod operator. */ int success = 0; unit_id_prev = unit_id = unit_start; do { /* close previous opened unit fd before attempting open of current unit. */ if (psmi_hal_get_fd(context->psm_hw_ctxt) > 0) { psmi_hal_close_context(&context->psm_hw_ctxt); context->psm_hw_ctxt = 0; } /* if the unit_id is not active, go to next one. */ if (psmi_hal_get_unit_active(unit_id) <= 0) { unit_id_prev = unit_id; unit_id = (unit_id + 1) % nunits; continue; } /* open this unit. */ int rv = psmi_hal_context_open(unit_id, port, open_timeout, ep, job_key, context, psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED), HAL_CONTEXT_OPEN_RETRY_MAX); /* go to next unit if failed to open. 
*/
		if (rv || context->psm_hw_ctxt == NULL) {
			unit_id_prev = unit_id;
			unit_id = (unit_id + 1) % nunits;
			continue;
		}

		success = 1;
		break;

	} while (unit_id_prev != unit_end);

	if (!success) {
		err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
					"PSM2 can't open hfi unit: %ld", unit_param);
		goto bail;
	}

	context->ep = (psm2_ep_t) ep;

	/* Check backward compatibility bits here and save the info */
	if (psmi_hal_has_cap(PSM_HAL_CAP_GPUDIRECT_OT))
	{
#ifdef PSM_CUDA
		is_driver_gpudirect_enabled = 1;
#else
		psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "FATAL ERROR: "
				  "CUDA version of hfi1 driver is loaded with non-CUDA version of "
				  "psm2 library.\n");
#endif
	}
#ifdef PSM_CUDA
	else
		fprintf(stderr, "WARNING: running CUDA version of libpsm2 with non-CUDA version of hfi1 driver.\n");
#endif

	_HFI_VDBG("hfi_userinit() passed.\n");

	/* Fetch hw parameters from the HAL (that were obtained while opening
	   the context above). */

	int lid = psmi_hal_get_lid(context->psm_hw_ctxt);
	ep->unit_id = psmi_hal_get_unit_id(context->psm_hw_ctxt);
	ep->portnum = psmi_hal_get_port_num(context->psm_hw_ctxt);
	ep->gid_lo = psmi_hal_get_gid_lo(context->psm_hw_ctxt);
	ep->gid_hi = psmi_hal_get_gid_hi(context->psm_hw_ctxt);
	int ctxt = psmi_hal_get_context(context->psm_hw_ctxt);
	int subctxt = psmi_hal_get_subctxt(context->psm_hw_ctxt);
	uint32_t hfi_type = psmi_hal_get_hfi_type(context->psm_hw_ctxt);
	context->ep = (psm2_ep_t) ep;

	/* Construct epid for this Endpoint */
	switch (PSMI_EPID_VERSION) {
	case PSMI_EPID_V1:
		context->epid = PSMI_EPID_PACK_V1(lid, ctxt, subctxt,
						  ep->unit_id,
						  PSMI_EPID_VERSION, 0x3ffffff);
		break;
	case PSMI_EPID_V2:
		context->epid = PSMI_EPID_PACK_V2(lid, ctxt, subctxt,
						  PSMI_EPID_IPS_SHM, /* Not an only-shm epid */
						  PSMI_EPID_VERSION, ep->gid_hi);
		break;
	default:
		/* Epid version is greater than the max supported version. */
		psmi_assert_always(PSMI_EPID_VERSION <= PSMI_EPID_V2);
		break;
	}

	_HFI_VDBG("construct epid: lid %d ctxt %d subctxt %d hcatype %d mtu %d\n",
		  lid, ctxt, subctxt, hfi_type, ep->mtu);

	goto ret;

bail:
	_HFI_PRDBG("open failed: unit_id: %ld, err: %d (%s)\n",
		   unit_id, err, strerror(errno));
	if (psmi_hal_get_fd(context->psm_hw_ctxt) > 0)
		psmi_hal_close_context(&context->psm_hw_ctxt);
ret:

	_HFI_VDBG("psmi_context_open() return %d\n", err);
	return err;
}

psm2_error_t psmi_context_close(psmi_context_t *context)
{
	if (psmi_hal_get_fd(context->psm_hw_ctxt) > 0)
		psmi_hal_close_context(&context->psm_hw_ctxt);

	return PSM2_OK;
}

/*
 * This function works whether a context is initialized or not in a psm2_ep.
 *
 * Returns one of
 *
 * PSM2_OK: Port status is ok (or context not initialized yet but still "ok")
 * PSM2_OK_NO_PROGRESS: Cable pulled
 * PSM2_EP_NO_NETWORK: No network, no lid, ...
 * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc.
 * The message follows the per-port status
 * As of 7322-ready driver, need to check port-specific qword for IB
 * as well as older unit-only.
For now, we don't have the port interface * defined, so just check port 0 qword for spi_status */ psm2_error_t psmi_context_check_status(const psmi_context_t *contexti) { psm2_error_t err = PSM2_OK; psmi_context_t *context = (psmi_context_t *) contexti; char *errmsg = NULL; uint64_t status = psmi_hal_get_hw_status(context->psm_hw_ctxt); /* Fatal chip-related errors */ if (!(status & PSM_HAL_HW_STATUS_CHIP_PRESENT) || !(status & PSM_HAL_HW_STATUS_INITTED) || (status & PSM_HAL_HW_STATUS_HWERROR)) { err = PSM2_EP_DEVICE_FAILURE; if (err != context->status_lasterr) { /* report once */ volatile char *errmsg_sp="no err msg"; psmi_hal_get_hw_status_freezemsg(&errmsg_sp, context->psm_hw_ctxt); if (*errmsg_sp) psmi_handle_error(context->ep, err, "Hardware problem: %s", errmsg_sp); else { if (status & PSM_HAL_HW_STATUS_HWERROR) errmsg = "Hardware error"; else errmsg = "Hardware not found"; psmi_handle_error(context->ep, err, "%s", errmsg); } } } /* Fatal network-related errors with timeout: */ else if (!(status & PSM_HAL_HW_STATUS_IB_CONF) || !(status & PSM_HAL_HW_STATUS_IB_READY)) { err = PSM2_EP_NO_NETWORK; if (err != context->status_lasterr) { /* report once */ context->networkLostTime = time(NULL); } else { time_t now = time(NULL); static const double seventySeconds = 70.0; /* The linkup time duration for a system should allow the time needed to complete 3 LNI passes which is: 50 seconds for a passive copper channel 65 seconds for optical channel. (we add 5 seconds of margin.) */ if (difftime(now,context->networkLostTime) > seventySeconds) { volatile char *errmsg_sp="no err msg"; psmi_hal_get_hw_status_freezemsg(&errmsg_sp, context->psm_hw_ctxt); psmi_handle_error(context->ep, err, "%s", *errmsg_sp ? errmsg_sp : "Network down"); } } } if (err == PSM2_OK && context->status_lasterr != PSM2_OK) context->status_lasterr = PSM2_OK; /* clear error */ else if (err != PSM2_OK) context->status_lasterr = err; /* record error */ return err; } static int psmi_get_hfi_selection_algorithm(void) { union psmi_envvar_val env_hfi1_alg; int hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS; /* If a specific unit is set in the environment, use that one. */ psmi_getenv("HFI_SELECTION_ALG", "HFI Device Selection Algorithm to use. Round Robin (Default) " ", Packed or Round Robin All.", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)"Round Robin", &env_hfi1_alg); if (!strcasecmp(env_hfi1_alg.e_str, "Round Robin")) hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS; else if (!strcasecmp(env_hfi1_alg.e_str, "Packed")) hfi1_alg = PSMI_UNIT_SEL_ALG_WITHIN; else if (!strcasecmp(env_hfi1_alg.e_str, "Round Robin All")) hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS_ALL; else { _HFI_ERROR ("Unknown HFI selection algorithm %s. Defaulting to Round Robin " "allocation of HFIs.\n", env_hfi1_alg.e_str); hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS; } return hfi1_alg; } opa-psm2-PSM2_11.2.185/psm_context.h000066400000000000000000000102561370564314600167410ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
  General Public License for more details.

  Contact Information:
  Intel Corporation, www.intel.com

  BSD LICENSE

  Copyright(c) 2015 Intel Corporation.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */

#ifndef _PSMI_IN_USER_H
#error psm_context.h not meant to be included directly, include psm_user.h instead
#endif

#ifndef _PSM_CONTEXT_H
#define _PSM_CONTEXT_H

typedef struct psmi_context {
	/* The following three member variables are used for sharing contexts
	   among subcontexts and they have the following common properties:

	   a. They are all initialized below the HAL layer when the context
	      is opened.
	   b. If they are NULL, no context is being shared among subcontexts;
	      non-NULL means a context is being shared among some number of
	      subcontexts.
	   c. The initialization code is currently found in the gen1 hal
	      instance. */

	void *spio_ctrl;
	void *tid_ctrl;
	void *tf_ctrl;
	/* end of shared context member variables. */

	psmi_hal_hw_context psm_hw_ctxt;

	psm2_ep_t ep;		/* psm ep handle */
	psm2_epid_t epid;	/* psm integral ep id */
	psm2_error_t status_lasterr;
	time_t networkLostTime;
} psmi_context_t;

psm2_error_t
psmi_context_open(const psm2_ep_t ep, long unit_id, long port,
		  psm2_uuid_t const job_key,
		  int64_t timeout_ns, psmi_context_t *context);

psm2_error_t psmi_context_close(psmi_context_t *context);

/* Check status of context */
psm2_error_t psmi_context_check_status(const psmi_context_t *context);

psm2_error_t psmi_context_interrupt_set(psmi_context_t *context, int enable);
int psmi_context_interrupt_isenabled(psmi_context_t *context);

/*
 * round robin contexts across HFIs, then
 * ports; this is the default.
 * This option spreads the HFI selection within the local socket.
 * If it is preferred to spread the job over the entire set of
 * HFIs within the system, see ALG_ACROSS_ALL below.
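 */

/*
 * These constants are selected at runtime via the HFI_SELECTION_ALG
 * environment variable, parsed in psmi_get_hfi_selection_algorithm()
 * (psm_context.c):
 *
 *	"Round Robin"      ->  PSMI_UNIT_SEL_ALG_ACROSS      (default)
 *	"Packed"           ->  PSMI_UNIT_SEL_ALG_WITHIN
 *	"Round Robin All"  ->  PSMI_UNIT_SEL_ALG_ACROSS_ALL
 */

/*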
*/ #define PSMI_UNIT_SEL_ALG_ACROSS PSM_HAL_ALG_ACROSS #define PSMI_UNIT_SEL_ALG_ACROSS_ALL PSM_HAL_ALG_ACROSS_ALL /* * use all contexts on an HFI (round robin * active ports within), then next HFI */ #define PSMI_UNIT_SEL_ALG_WITHIN PSM_HAL_ALG_WITHIN #endif /* PSM_CONTEXT_H */ opa-psm2-PSM2_11.2.185/psm_diags.c000066400000000000000000000244601370564314600163410ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ #include "psm_user.h" #include "psm_mq_internal.h" typedef void (*memcpy_fn_t) (void *dst, const void *src, size_t n); static int psmi_test_memcpy(memcpy_fn_t, const char *name); static int psmi_test_epid_table(int numelems); int psmi_diags(void); #define diags_assert(x) do { \ if (!(x)) { \ _HFI_ERROR("Diags assertion failure: %s\n", \ #x); \ goto fail; \ } \ } while (0) #define DIAGS_RETURN_PASS(str) \ do { _HFI_INFO("%s: PASSED %s\n", __func__, str); return 0; } \ while (0) #define DIAGS_RETURN_FAIL(str) \ do { _HFI_INFO("%s: FAILED %s\n", __func__, str); return 1; } \ while (0) int psmi_diags(void) { int ret = 0; ret |= psmi_test_epid_table(2048); ret |= psmi_test_memcpy((memcpy_fn_t) psmi_memcpyo, "psmi_memcpyo"); /* ret |= psmi_test_memcpy((memcpy_fn_t) psmi_mq_mtucpy, "psmi_mq_mtucpy"); */ if (ret) DIAGS_RETURN_FAIL(""); else DIAGS_RETURN_PASS(""); } /* * Hash table test */ #define NALLOC 1024 static int psmi_test_epid_table(int numelems) { ptl_ctl_t ctl; psm2_epaddr_t *ep_array, epaddr, ep_alloc; psm2_epid_t *epid_array, epid_tmp; psm2_ep_t ep = (psm2_ep_t) (uintptr_t) 0xabcdef00; struct psmi_epid_table *tab; int i, j; struct drand48_data drand48_data; ep_alloc = (psm2_epaddr_t) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems, sizeof(struct psm2_epaddr)); ep_array = (psm2_epaddr_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems, sizeof(struct psm2_epaddr *)); epid_array = (psm2_epid_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems, sizeof(psm2_epid_t)); diags_assert(ep_alloc != NULL); diags_assert(ep_array != NULL); diags_assert(epid_array != NULL); srand48_r(12345678, &drand48_data); psmi_epid_init(); tab = &psmi_epid_table; ctl.ep = ep; for (i = 0; i < numelems; i++) { epid_array[i] = i; ep_alloc[i].ptlctl = &ctl; ep_alloc[i].epid = epid_array[i]; ep_array[i] = &ep_alloc[i]; } for (i = 0; i < numelems; i++) { psmi_epid_add(ep, epid_array[i], ep_array[i]); } /* Randomize epid_array */ for (i = 0; i < numelems; i++) { long int rand_result; lrand48_r(&drand48_data, &rand_result); j = (int)(rand_result % numelems); epid_tmp = epid_array[i]; epid_array[i] = epid_array[j]; epid_array[j] = epid_tmp; } /* Lookup. */ for (i = 0; i < numelems; i++) { epaddr = psmi_epid_lookup(ep, epid_array[i]); diags_assert(epaddr != NULL); diags_assert(epaddr->epid == epid_array[i]); diags_assert(epaddr->ptlctl->ep == ep); } /* Randomize epid_array again */ for (i = 0; i < numelems; i++) { long int rand_result; lrand48_r(&drand48_data, &rand_result); j = (int)(rand_result % numelems); epid_tmp = epid_array[i]; epid_array[i] = epid_array[j]; epid_array[j] = epid_tmp; } /* Delete half */ for (i = 0; i < numelems / 2; i++) { epaddr = psmi_epid_remove(ep, epid_array[i]); diags_assert(epaddr != NULL); diags_assert(epaddr->epid == epid_array[i]); diags_assert(epaddr->ptlctl->ep == ep); } /* Lookup other half -- expect non-NULL, then delete */ for (i = numelems / 2; i < numelems; i++) { epaddr = psmi_epid_lookup(ep, epid_array[i]); diags_assert(epaddr != NULL); diags_assert(epaddr->epid == epid_array[i]); diags_assert(epaddr->ptlctl->ep == ep); epaddr = psmi_epid_remove(ep, epid_array[i]); epaddr = psmi_epid_lookup(ep, epid_array[i]); diags_assert(epaddr == NULL); } /* Lookup whole thing, expect done */ for (i = 0; i < numelems; i++) { epaddr = psmi_epid_lookup(ep, epid_array[i]); diags_assert(epaddr == NULL); } for (i = 0; i < tab->tabsize; i++) { diags_assert(tab->table[i].entry == NULL || tab->table[i].entry == EPADDR_DELETED); } /* Make sure we're not leaking memory somewhere... 
*/ diags_assert(tab->tabsize > tab->tabsize_used && tab->tabsize * PSMI_EPID_TABLOAD_FACTOR > tab->tabsize_used); /* Only free on success */ psmi_epid_fini(); psmi_free(epid_array); psmi_free(ep_array); psmi_free(ep_alloc); DIAGS_RETURN_PASS(""); fail: /* Klocwork scan report memory leak. */ psmi_epid_fini(); if (epid_array) psmi_free(epid_array); if (ep_array) psmi_free(ep_array); if (ep_alloc) psmi_free(ep_alloc); DIAGS_RETURN_FAIL(""); } /* * Memcpy correctness test */ static int memcpy_check_size(memcpy_fn_t fn, int *p, int *f, size_t n); static void *memcpy_check_one(memcpy_fn_t fn, void *dst, void *src, size_t n); static int psmi_test_memcpy(memcpy_fn_t fn, const char *memcpy_name) { const int CORNERS = 0; const long long lo = 1; const long long hi = 16 * 1024 * 1024; const long long below = 32; const long long above = 32; long long n, m; char buf[128]; int ret = 0; int memcpy_passed; int memcpy_failed; memcpy_passed = 0; memcpy_failed = 0; ret = memcpy_check_size(fn, &memcpy_passed, &memcpy_failed, 0); if (ret < 0) DIAGS_RETURN_FAIL("no heap space"); for (n = lo; n <= hi; n <<= 1) { _HFI_INFO("%s %d align=0..16\n", memcpy_name, (int)n); for (m = n - below; m <= n + above; m++) { if (m == n) { ret = memcpy_check_size(fn, &memcpy_passed, &memcpy_failed, n); if (ret < 0) DIAGS_RETURN_FAIL("no heap space"); } else if (CORNERS && m >= lo && m <= hi && m > (n >> 1) && m < max(n, ((n << 1) - below))) { ret = memcpy_check_size(fn, &memcpy_passed, &memcpy_failed, (size_t) m); if (ret < 0) DIAGS_RETURN_FAIL("no heap space"); } } } int total = memcpy_passed + memcpy_failed; if (total > 0) { _HFI_INFO("%d memcpy tests with %d passed (%.2f%%) " "and %d failed (%.2f%%)\n", total, memcpy_passed, (100.0 * memcpy_passed) / total, memcpy_failed, (100.0 * memcpy_failed) / total); } if (memcpy_failed) { snprintf(buf, sizeof(buf), "%s %.2f%% of tests memcpy_failed", memcpy_name, (100.0 * memcpy_failed) / total); DIAGS_RETURN_FAIL(buf); } else { DIAGS_RETURN_PASS(memcpy_name); } } void *memcpy_check_one(memcpy_fn_t fn, void *dst, void *src, size_t n) { int ok = 1; unsigned int seed = (unsigned int) ((uintptr_t) dst ^ (uintptr_t) src ^ (uintptr_t) n); size_t i; struct drand48_data drand48_data; if (!n) return dst; memset(src, 0x55, n); memset(dst, 0xaa, n); srand48_r(seed, &drand48_data); for (i = 0; i < n; i++) { long int rand_result; lrand48_r(&drand48_data, &rand_result); ((uint8_t *) src)[i] = (((int)(rand_result & INT_MAX)) >> 16) & 0xff; } fn(dst, src, n); memset(src, 0, n); srand48_r(seed, &drand48_data); for (i = 0; i < n; i++) { long int rand_result; lrand48_r(&drand48_data, &rand_result); int value = (int)(uint8_t) (((int)(rand_result % INT_MAX)) >> 16); int v = (int)((uint8_t *) dst)[i]; if (v != value) { _HFI_ERROR ("Error on index %llu : got %d instead of %d\n", (unsigned long long)i, v, value); ok = 0; } } return ok ? 
dst : NULL; } int memcpy_check_size(memcpy_fn_t fn, int *p, int *f, size_t n) { #define num_aligns 16 #define USE_MALLOC 0 #define DEBUG 0 uint8_t *src; uint8_t *dst; size_t size = n * 2 + num_aligns; if (USE_MALLOC) { src = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size); dst = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size); if (src == NULL || dst == NULL) { if (src) psmi_free(src); if (dst) psmi_free(dst); return -1; } } else { void *src_p = NULL, *dst_p = NULL; if (posix_memalign(&src_p, 64, size) != 0 || posix_memalign(&dst_p, 64, size) != 0) { if (src_p) free(src_p); if (dst_p) free(dst_p); return -1; } src = (uint8_t *) src_p; dst = (uint8_t *) dst_p; } int src_align, dst_align; for (src_align = 0; src_align < num_aligns; src_align++) { for (dst_align = 0; dst_align < num_aligns; dst_align++) { uint8_t *d = ((uint8_t *) dst) + dst_align; uint8_t *s = ((uint8_t *) src) + src_align; int ok = (memcpy_check_one(fn, d, s, n) != NULL); if (DEBUG || !ok) { _HFI_INFO("memcpy(%p, %p, %llu) : %s\n", d, s, (unsigned long long)n, ok ? "passed" : "failed"); } if (ok) { (*p)++; } else { (*f)++; } } } if (USE_MALLOC) { psmi_free(src); psmi_free(dst); } else { free(src); free(dst); } return 0; } opa-psm2-PSM2_11.2.185/psm_ep.c000066400000000000000000001273111370564314600156550ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/ #include #include #include /* cpu_set */ #include /* isalpha */ #include #include "psm_user.h" #include "psm2_hal.h" #include "psm_mq_internal.h" #include "psm_am_internal.h" #ifdef PSM_CUDA #include "psm_gdrcpy.h" #endif /* * Endpoint management */ psm2_ep_t psmi_opened_endpoint = NULL; int psmi_opened_endpoint_count = 0; static uint16_t *hfi_lids; static uint32_t nlids; static psm2_error_t psmi_ep_open_device(const psm2_ep_t ep, const struct psm2_ep_open_opts *opts, const psm2_uuid_t unique_job_key, struct psmi_context *context, psm2_epid_t *epid); /* * Device management * * PSM uses "devices" as components to manage communication to self, to peers * reachable via shared memory and finally to peers reachable only through * hfi. */ static psm2_error_t psmi_parse_devices(int devices[PTL_MAX_INIT], const char *devstr); static int psmi_device_is_enabled(const int devices[PTL_MAX_INIT], int devid); int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid); psm2_error_t __psm2_ep_num_devunits(uint32_t *num_units_o) { static int num_units = -1; PSM2_LOG_MSG("entering"); PSMI_ERR_UNLESS_INITIALIZED(NULL); if (num_units == -1) { num_units = psmi_hal_get_num_units(); if (num_units == -1) num_units = 0; } *num_units_o = (uint32_t) num_units; PSM2_LOG_MSG("leaving"); return PSM2_OK; } PSMI_API_DECL(psm2_ep_num_devunits) static int cmpfunc(const void *p1, const void *p2) { uint64_t a = ((uint64_t *) p1)[0]; uint64_t b = ((uint64_t *) p2)[0]; if (a < b) return -1; if (a == b) return 0; return 1; } static psm2_error_t psmi_ep_multirail(int *num_rails, uint32_t *unit, uint16_t *port) { uint32_t num_units; uint64_t gid_hi, gid_lo; int i, j, ret, count = 0; char *env; psm2_error_t err = PSM2_OK; uint64_t gidh[HFI_MAX_RAILS][3]; union psmi_envvar_val env_multirail; int multirail_within_socket_used = 0; int node_id = -1, found = 0; psmi_getenv("PSM2_MULTIRAIL", "Use all available HFIs in the system for communication.\n" "0: Disabled (default),\n" "1: Enable multirail across all available HFIs,\n" "2: Enable multirail within socket.\n" "\t For multirail within a socket, we try to find at\n" "\t least one HFI on the same socket as current task.\n" "\t If none found, we continue to use other HFIs within\n" "\t the system.", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)0, &env_multirail); if (!env_multirail.e_int) { *num_rails = 0; return err; } if (env_multirail.e_int == 2) multirail_within_socket_used = 1; /* * map is in format: unit:port,unit:port,... */ if ((env = getenv("PSM2_MULTIRAIL_MAP"))) { if (sscanf(env, "%d:%d", &i, &j) == 2) { char *comma = strchr(env, ','); unit[count] = i; port[count] = j; count++; while (comma) { if (sscanf(comma, ",%d:%d", &i, &j) != 2) { break; } unit[count] = i; port[count] = j; count++; if (count == HFI_MAX_RAILS) break; comma = strchr(comma + 1, ','); } } *num_rails = count; /* * Check if any of the port is not usable. 
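	 */

	/*
	 * Example of the map format parsed above (the values are
	 * hypothetical): with PSM2_MULTIRAIL_MAP="0:1,1:1" the loop yields
	 *
	 *	unit[0] = 0; port[0] = 1;
	 *	unit[1] = 1; port[1] = 1;
	 *	num_rails = 2;
	 *
	 * and each pair is then validated below for an active port, a lid
	 * and a gid.
	 */

	/*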
*/ for (i = 0; i < count; i++) { ret = psmi_hal_get_port_active(unit[i], port[i]); if (ret <= 0) { err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, "Unit/port: %d:%d is not active.", unit[i], port[i]); return err; } ret = psmi_hal_get_port_lid(unit[i], port[i]); if (ret <= 0) { err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, "Couldn't get lid for unit %d:%d", unit[i], port[i]); return err; } ret = psmi_hal_get_port_gid(unit[i], port[i], &gid_hi, &gid_lo); if (ret == -1) { err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, "Couldn't get gid for unit %d:%d", unit[i], port[i]); return err; } } return err; } if ((err = psm2_ep_num_devunits(&num_units))) { return err; } if (num_units > HFI_MAX_RAILS) { _HFI_INFO ("Found %d units, max %d units are supported, use %d\n", num_units, HFI_MAX_RAILS, HFI_MAX_RAILS); num_units = HFI_MAX_RAILS; } /* * PSM2_MULTIRAIL=2 functionality- * - Try to find at least find one HFI in the same root * complex. If none found, continue to run and * use remaining HFIs in the system. * - If we do find at least one HFI in same root complex, we * go ahead and add to list. */ if (multirail_within_socket_used) { node_id = psmi_get_current_proc_location(); for (i = 0; i < num_units; i++) { if (psmi_hal_get_unit_active(i) <= 0) continue; int node_id_i; if (!psmi_hal_get_node_id(i, &node_id_i)) { if (node_id_i == node_id) { found = 1; break; } } } } /* * Get all the ports with a valid lid and gid, one per unit. */ for (i = 0; i < num_units; i++) { int node_id_i; if (!psmi_hal_get_node_id(i, &node_id_i)) { if (multirail_within_socket_used && found && (node_id_i != node_id)) continue; } for (j = HFI_MIN_PORT; j <= HFI_MAX_PORT; j++) { ret = psmi_hal_get_port_lid(i, j); if (ret <= 0) continue; ret = psmi_hal_get_port_gid(i, j, &gid_hi, &gid_lo); if (ret == -1) continue; gidh[count][0] = gid_hi; gidh[count][1] = i; gidh[count][2] = j; count++; break; } } /* * Sort all the ports with gidh from small to big. * This is for multiple fabrics, and we use fabric with the * smallest gid to make the master connection. 
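	 */

	/*
	 * Layout of the rows sorted below: each gidh[] entry is the triplet
	 *
	 *	{ gid_hi, unit, port }
	 *
	 * and cmpfunc() compares only the first element, so after qsort()
	 * the rails are enumerated smallest-gid-first and rail 0 (the
	 * master connection) lands on the fabric with the smallest gid.
	 */

	/*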
*/ qsort(gidh, count, sizeof(uint64_t) * 3, cmpfunc); for (i = 0; i < count; i++) { unit[i] = (uint32_t) gidh[i][1]; port[i] = (uint16_t) (uint32_t) gidh[i][2]; } *num_rails = count; return err; } static psm2_error_t psmi_ep_devlids(uint16_t **lids, uint32_t *num_lids_o, uint64_t my_gid_hi, uint64_t my_gid_lo) { uint32_t num_units; int i; psm2_error_t err = PSM2_OK; PSMI_ERR_UNLESS_INITIALIZED(NULL); if (hfi_lids == NULL) { if ((err = psm2_ep_num_devunits(&num_units))) goto fail; hfi_lids = (uint16_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, num_units * psmi_hal_get_num_ports(), sizeof(uint16_t)); if (hfi_lids == NULL) { err = psmi_handle_error(NULL, PSM2_NO_MEMORY, "Couldn't allocate memory for dev_lids structure"); goto fail; } for (i = 0; i < num_units; i++) { int j; for (j = HFI_MIN_PORT; j <= HFI_MAX_PORT; j++) { int lid = psmi_hal_get_port_lid(i, j); int ret; uint64_t gid_hi = 0, gid_lo = 0; if (lid <= 0) continue; ret = psmi_hal_get_port_gid(i, j, &gid_hi, &gid_lo); if (ret == -1) continue; else if (my_gid_hi != gid_hi) { _HFI_VDBG("LID %d, unit %d, port %d, " "mismatched GID %llx:%llx and " "%llx:%llx\n", lid, i, j, (unsigned long long)gid_hi, (unsigned long long)gid_lo, (unsigned long long)my_gid_hi, (unsigned long long) my_gid_lo); continue; } _HFI_VDBG("LID %d, unit %d, port %d, " "matching GID %llx:%llx and " "%llx:%llx\n", lid, i, j, (unsigned long long)gid_hi, (unsigned long long)gid_lo, (unsigned long long)my_gid_hi, (unsigned long long)my_gid_lo); hfi_lids[nlids++] = (uint16_t) lid; } } if (nlids == 0) { err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, "Couldn't get lid&gid from any unit/port"); goto fail; } } *lids = hfi_lids; *num_lids_o = nlids; fail: return err; } static psm2_error_t psmi_ep_verify_pkey(psm2_ep_t ep, uint16_t pkey, uint16_t *opkey) { int i, ret; psm2_error_t err; for (i = 0; i < 16; i++) { ret = psmi_hal_get_port_index2pkey(ep->unit_id, ep->portnum, i); if (ret < 0) { err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, "Can't get a valid pkey value from pkey table\n"); return err; } else if ((ret & 0x7fff) == 0x7fff) { continue; /* management pkey, not for app traffic. */ } if ((pkey & 0x7fff) == (uint16_t)(ret & 0x7fff)) { break; } } /* if pkey does not match */ if (i == 16) { err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, "Wrong pkey 0x%x, please use PSM2_PKEY to specify a valid pkey\n", pkey); return err; } if (((uint16_t)ret & 0x8000) == 0) { err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, "Limited Member pkey 0x%x, please use PSM2_PKEY to specify a valid pkey\n", (uint16_t)ret); return err; } /* return the final pkey */ *opkey = (uint16_t)ret; return PSM2_OK; } uint64_t __psm2_epid_nid(psm2_epid_t epid) { uint64_t rv; PSM2_LOG_MSG("entering"); rv = (uint64_t) PSMI_EPID_GET_LID(epid); PSM2_LOG_MSG("leaving"); return rv; } PSMI_API_DECL(psm2_epid_nid) /* Currently not exposed to users, we don't acknowledge the existence of * subcontexts */ uint64_t psmi_epid_subcontext(psm2_epid_t epid) { return (uint64_t) PSMI_EPID_GET_SUBCONTEXT(epid); } /* Currently not exposed to users, we don't acknowledge the existence of * service levels encoding within epids. 
This may require * changing to expose SLs */ uint64_t psmi_epid_version(psm2_epid_t epid) { return (uint64_t) PSMI_EPID_GET_EPID_VERSION(epid); } uint64_t __psm2_epid_context(psm2_epid_t epid) { uint64_t rv; PSM2_LOG_MSG("entering"); rv = (uint64_t) PSMI_EPID_GET_CONTEXT(epid); PSM2_LOG_MSG("leaving"); return rv; } PSMI_API_DECL(psm2_epid_context) uint64_t __psm2_epid_port(psm2_epid_t epid) { uint64_t rv; PSM2_LOG_MSG("entering"); rv = __psm2_epid_context(epid); PSM2_LOG_MSG("leaving"); return rv; } PSMI_API_DECL(psm2_epid_port) psm2_error_t __psm2_ep_query(int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo) { psm2_error_t err = PSM2_OK; int i; psm2_ep_t ep; PSM2_LOG_MSG("entering"); PSMI_ERR_UNLESS_INITIALIZED(NULL); if (*num_of_epinfo <= 0) { err = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Invalid psm2_ep_query parameters"); PSM2_LOG_MSG("leaving"); return err; } if (psmi_opened_endpoint == NULL) { err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED, "PSM Endpoint is closed or does not exist"); PSM2_LOG_MSG("leaving"); return err; } ep = psmi_opened_endpoint; for (i = 0; i < *num_of_epinfo; i++) { if (ep == NULL) break; array_of_epinfo[i].ep = ep; array_of_epinfo[i].epid = ep->epid; array_of_epinfo[i].jkey = ep->jkey; memcpy(array_of_epinfo[i].uuid, (void *)ep->uuid, sizeof(psm2_uuid_t)); psmi_uuid_unparse(ep->uuid, array_of_epinfo[i].uuid_str); ep = ep->user_ep_next; } *num_of_epinfo = i; PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_ep_query) psm2_error_t __psm2_ep_epid_lookup(psm2_epid_t epid, psm2_epconn_t *epconn) { psm2_error_t err = PSM2_OK; psm2_epaddr_t epaddr; psm2_ep_t ep; PSM2_LOG_MSG("entering"); PSMI_ERR_UNLESS_INITIALIZED(NULL); /* Need to have an opened endpoint before we can resolve epids */ if (psmi_opened_endpoint == NULL) { err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED, "PSM Endpoint is closed or does not exist"); PSM2_LOG_MSG("leaving"); return err; } ep = psmi_opened_endpoint; while (ep) { epaddr = psmi_epid_lookup(ep, epid); if (!epaddr) { ep = ep->user_ep_next; continue; } /* Found connection for epid. Return info about endpoint to caller. */ psmi_assert_always(epaddr->ptlctl->ep == ep); epconn->addr = epaddr; epconn->ep = ep; epconn->mq = ep->mq; PSM2_LOG_MSG("leaving"); return err; } err = psmi_handle_error(NULL, PSM2_EPID_UNKNOWN, "Endpoint connection status unknown"); PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_ep_epid_lookup); psm2_error_t __psm2_ep_epid_lookup2(psm2_ep_t ep, psm2_epid_t epid, psm2_epconn_t *epconn) { psm2_error_t err = PSM2_OK; PSM2_LOG_MSG("entering"); PSMI_ERR_UNLESS_INITIALIZED(NULL); /* Need to have an opened endpoint before we can resolve epids */ if (ep == NULL) { err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED, "PSM Endpoint is closed or does not exist"); PSM2_LOG_MSG("leaving"); return err; } if (epconn == NULL) { err = psmi_handle_error(ep, PSM2_PARAM_ERR, "Invalid output parameter"); PSM2_LOG_MSG("leaving"); return err; } psm2_epaddr_t epaddr = psmi_epid_lookup(ep, epid); if (epaddr) { /* Found connection for epid. Return info about endpoint to caller. 
*/ psmi_assert_always(epaddr->ptlctl->ep == ep); epconn->addr = epaddr; epconn->ep = ep; epconn->mq = ep->mq; PSM2_LOG_MSG("leaving"); return err; } err = psmi_handle_error(ep, PSM2_EPID_UNKNOWN, "Endpoint connection status unknown"); PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_ep_epid_lookup2); psm2_error_t __psm2_epaddr_to_epid(psm2_epaddr_t epaddr, psm2_epid_t *epid) { psm2_error_t err = PSM2_OK; PSM2_LOG_MSG("entering"); if (epaddr && epid) { *epid = epaddr->epid; } else { err = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Invalid input epaddr or output epid parameter"); } PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_epaddr_to_epid); psm2_error_t __psm2_ep_epid_share_memory(psm2_ep_t ep, psm2_epid_t epid, int *result_o) { uint32_t num_lids = 0; uint16_t *lids = NULL; int i; uint16_t epid_lid; int result = 0; psm2_error_t err; PSM2_LOG_MSG("entering"); psmi_assert_always(ep != NULL); PSMI_ERR_UNLESS_INITIALIZED(ep); if ((!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) || (psmi_epid_version(epid) == PSMI_EPID_VERSION_SHM)) { /* If we are in the no hfi-mode, or the other process is, * the epid doesn't help us - so assume both we're on the same * machine and try to connect. */ result = 1; } else { epid_lid = (uint16_t) psm2_epid_nid(epid); err = psmi_ep_devlids(&lids, &num_lids, ep->gid_hi, ep->gid_lo); if (err) { PSM2_LOG_MSG("leaving"); return err; } for (i = 0; i < num_lids; i++) { if (epid_lid == lids[i]) { /* we share memory if the lid is the same. */ result = 1; break; } } } *result_o = result; PSM2_LOG_MSG("leaving"); return PSM2_OK; } PSMI_API_DECL(psm2_ep_epid_share_memory) psm2_error_t __psm2_ep_open_opts_get_defaults(struct psm2_ep_open_opts *opts) { PSM2_LOG_MSG("entering"); PSMI_ERR_UNLESS_INITIALIZED(NULL); if (!opts) return PSM2_PARAM_ERR; /* Set in order in the structure. 
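	 */

	/*
	 * Typical application usage of these defaults (sketch; error
	 * handling omitted, and job_key, ep and epid are caller-provided):
	 *
	 *	struct psm2_ep_open_opts opts;
	 *	psm2_ep_open_opts_get_defaults(&opts);
	 *	opts.unit = 0;		pin to HFI 0 instead of ANY
	 *	psm2_ep_open(job_key, &opts, &ep, &epid);
	 *
	 * Fields left untouched keep the values assigned below.
	 */

	/*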
*/ opts->timeout = 30000000000LL; /* 30 sec */ opts->unit = HFI_UNIT_ID_ANY; opts->affinity = PSM2_EP_OPEN_AFFINITY_SET; opts->shm_mbytes = 0; /* deprecated in psm2.h */ opts->sendbufs_num = 1024; opts->network_pkey = psmi_hal_get_default_pkey(); opts->port = HFI_PORT_NUM_ANY; opts->outsl = PSMI_SL_DEFAULT; opts->service_id = HFI_DEFAULT_SERVICE_ID; opts->path_res_type = PSM2_PATH_RES_NONE; opts->senddesc_num = 4096; opts->imm_size = 128; PSM2_LOG_MSG("leaving"); return PSM2_OK; } PSMI_API_DECL(psm2_ep_open_opts_get_defaults) psm2_error_t psmi_poll_noop(ptl_t *ptl, int replyonly); psm2_error_t __psm2_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, struct psm2_ep_open_opts const *opts_i, psm2_mq_t mq, psm2_ep_t *epo, psm2_epid_t *epido) { psm2_ep_t ep = NULL; uint32_t num_units; size_t len; psm2_error_t err; psm2_epaddr_t epaddr = NULL; char buf[128], *p, *e; union psmi_envvar_val envvar_val; size_t ptl_sizes; struct psm2_ep_open_opts opts; ptl_t *amsh_ptl, *ips_ptl, *self_ptl; int i; /* First get the set of default options, we overwrite with the user's * desired values afterwards */ if ((err = psm2_ep_open_opts_get_defaults(&opts))) goto fail; if (opts_i != NULL) { if (opts_i->timeout != -1) opts.timeout = opts_i->timeout; if (opts_i->unit != -1) opts.unit = opts_i->unit; if (opts_i->affinity != -1) opts.affinity = opts_i->affinity; if (opts_i->sendbufs_num != -1) opts.sendbufs_num = opts_i->sendbufs_num; if (opts_i->network_pkey != psmi_hal_get_default_pkey()) opts.network_pkey = opts_i->network_pkey; if (opts_i->port != 0) opts.port = opts_i->port; if (opts_i->outsl != -1) opts.outsl = opts_i->outsl; if (opts_i->service_id) opts.service_id = (uint64_t) opts_i->service_id; if (opts_i->path_res_type != PSM2_PATH_RES_NONE) opts.path_res_type = opts_i->path_res_type; if (opts_i->senddesc_num) opts.senddesc_num = opts_i->senddesc_num; if (opts_i->imm_size) opts.imm_size = opts_i->imm_size; } /* Get Service ID from environment */ if (!psmi_getenv("PSM2_IB_SERVICE_ID", "HFI Service ID for path resolution", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_ULONG_ULONG, (union psmi_envvar_val)HFI_DEFAULT_SERVICE_ID, &envvar_val)) { opts.service_id = (uint64_t) envvar_val.e_ulonglong; } /* Get Path resolution type from environment Possible choices are: * * NONE : Default same as previous instances. Utilizes static data. * OPP : Use OFED Plus Plus library to do path record queries. * UMAD : Use raw libibumad interface to form and process path records. */ if (!psmi_getenv("PSM2_PATH_REC", "Mechanism to query HFI path record (default is no path query)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)"none", &envvar_val)) { if (!strcasecmp(envvar_val.e_str, "none")) opts.path_res_type = PSM2_PATH_RES_NONE; else if (!strcasecmp(envvar_val.e_str, "opp")) opts.path_res_type = PSM2_PATH_RES_OPP; else if (!strcasecmp(envvar_val.e_str, "umad")) opts.path_res_type = PSM2_PATH_RES_UMAD; else { _HFI_ERROR("Unknown path resolution type %s. " "Disabling use of path record query.\n", envvar_val.e_str); opts.path_res_type = PSM2_PATH_RES_NONE; } } /* If a specific unit is set in the environment, use that one. */ if (!psmi_getenv("HFI_UNIT", "Device Unit number (-1 autodetects)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG, (union psmi_envvar_val)HFI_UNIT_ID_ANY, &envvar_val)) { opts.unit = envvar_val.e_long; } /* Get user specified port number to use. 
*/
	if (!psmi_getenv("HFI_PORT",
			 "IB Port number (0 autodetects)",
			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
			 (union psmi_envvar_val)HFI_PORT_NUM_ANY,
			 &envvar_val)) {
		opts.port = envvar_val.e_long;
	}

	/* Get service level from environment, path-query overrides it */
	if (!psmi_getenv
	    ("HFI_SL", "HFI outgoing ServiceLevel number (default 0)",
	     PSMI_ENVVAR_LEVEL_USER,
	     PSMI_ENVVAR_TYPE_LONG,
	     (union psmi_envvar_val)PSMI_SL_DEFAULT, &envvar_val)) {
		opts.outsl = envvar_val.e_long;
	}

	/* Get network key from environment. MVAPICH and other vendor MPIs do not
	 * specify it on ep open and we may require it for vFabrics.
	 * path-query will override it.
	 */
	if (!psmi_getenv("PSM2_PKEY",
			 "HFI PKey to use for endpoint",
			 PSMI_ENVVAR_LEVEL_USER,
			 PSMI_ENVVAR_TYPE_ULONG,
			 (union psmi_envvar_val)((unsigned int)(psmi_hal_get_default_pkey())),
			 &envvar_val)) {
		opts.network_pkey = (uint64_t) envvar_val.e_ulong;
	}

	/* BACKWARDS COMPATIBILITY: Open MPI likes to choose its own PKEY of
	   0x7FFF. That's no longer a valid default, so override it if the
	   client was compiled against PSM v1 */
	if (PSMI_VERNO_GET_MAJOR(psmi_verno_client()) < 2 &&
	    opts.network_pkey == 0x7FFF) {
		opts.network_pkey = psmi_hal_get_default_pkey();
	}

	/* Get number of default send buffers from environment */
	if (!psmi_getenv("PSM2_NUM_SEND_BUFFERS",
			 "Number of send buffers to allocate [1024]",
			 PSMI_ENVVAR_LEVEL_USER,
			 PSMI_ENVVAR_TYPE_UINT,
			 (union psmi_envvar_val)1024, &envvar_val)) {
		opts.sendbufs_num = envvar_val.e_uint;
	}

	/* Get immediate data size - transfers less than immediate data size do
	 * not consume a send buffer and require just a send descriptor.
	 */
	if (!psmi_getenv("PSM2_SEND_IMMEDIATE_SIZE",
			 "Immediate data send size not requiring a buffer [128]",
			 PSMI_ENVVAR_LEVEL_USER,
			 PSMI_ENVVAR_TYPE_UINT,
			 (union psmi_envvar_val)128, &envvar_val)) {
		opts.imm_size = envvar_val.e_uint;
	}

	/* Get number of send descriptors - by default this is 4 times the number
	 * of send buffers - mainly used for short/inlined messages.
	 */
	if (!psmi_getenv("PSM2_NUM_SEND_DESCRIPTORS",
			 "Number of send descriptors to allocate [4096]",
			 PSMI_ENVVAR_LEVEL_USER,
			 PSMI_ENVVAR_TYPE_UINT,
			 (union psmi_envvar_val)4096, &envvar_val)) {
		opts.senddesc_num = envvar_val.e_uint;
	}

	if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
		if ((err = psm2_ep_num_devunits(&num_units)) != PSM2_OK)
			goto fail;
	} else
		num_units = 0;

	/* do some error checking */
	if (opts.timeout < -1) {
		err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
					"Invalid timeout value %lld",
					(long long)opts.timeout);
		goto fail;
	} else if (num_units && (opts.unit < -1 || opts.unit >= (int)num_units)) {
		err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
					"Invalid Device Unit ID %d (%d units found)",
					opts.unit, num_units);
		goto fail;
	} else if ((opts.port < HFI_MIN_PORT || opts.port > HFI_MAX_PORT) &&
		   opts.port != HFI_PORT_NUM_ANY) {
		err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
					"Invalid Device port number %d",
					opts.port);
		goto fail;
	} else if (opts.affinity < 0 ||
		   opts.affinity > PSM2_EP_OPEN_AFFINITY_FORCE) {
		err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
					"Invalid Affinity option: %d",
					opts.affinity);
		goto fail;
	} else if (opts.outsl < PSMI_SL_MIN || opts.outsl > PSMI_SL_MAX) {
		err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
					"Invalid SL number: %lld",
					(unsigned long long)opts.outsl);
		goto fail;
	}

	/* Allocate end point structure storage */
	ptl_sizes =
	    (psmi_device_is_enabled(devid_enabled, PTL_DEVID_SELF) ?
	     psmi_ptl_self.sizeof_ptl() : 0) +
	    (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS) ?
psmi_ptl_ips.sizeof_ptl() : 0) + (psmi_device_is_enabled(devid_enabled, PTL_DEVID_AMSH) ? psmi_ptl_amsh.sizeof_ptl() : 0); if (ptl_sizes == 0) return PSM2_EP_NO_DEVICE; ep = (psm2_ep_t) psmi_memalign(PSMI_EP_NONE, UNDEFINED, 64, sizeof(struct psm2_ep) + ptl_sizes); epaddr = (psm2_epaddr_t) psmi_calloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, 1, sizeof(struct psm2_epaddr)); if (ep == NULL || epaddr == NULL) { err = psmi_handle_error(NULL, PSM2_NO_MEMORY, "Couldn't allocate memory for %s structure", ep == NULL ? "psm2_ep" : "psm2_epaddr"); goto fail; } memset(ep, 0, sizeof(struct psm2_ep) + ptl_sizes); /* Copy PTL enabled status */ for (i = 0; i < PTL_MAX_INIT; i++) ep->devid_enabled[i] = devid_enabled[i]; /* Matched Queue initialization. We do this early because we have to * make sure ep->mq exists and is valid before calling ips_do_work. */ ep->mq = mq; /* Get ready for PTL initialization */ memcpy(&ep->uuid, (void *)unique_job_key, sizeof(psm2_uuid_t)); ep->epaddr = epaddr; ep->memmode = mq->memmode; ep->hfi_num_sendbufs = opts.sendbufs_num; ep->service_id = opts.service_id; ep->path_res_type = opts.path_res_type; ep->hfi_num_descriptors = opts.senddesc_num; ep->hfi_imm_size = opts.imm_size; ep->errh = psmi_errhandler_global; /* by default use the global one */ ep->ptl_amsh.ep_poll = psmi_poll_noop; ep->ptl_ips.ep_poll = psmi_poll_noop; ep->connections = 0; /* See how many iterations we want to spin before yielding */ psmi_getenv("PSM2_YIELD_SPIN_COUNT", "Spin poll iterations before yield", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD, &envvar_val); ep->yield_spin_cnt = envvar_val.e_uint; /* Set skip_affinity flag if PSM is not allowed to set affinity */ if (opts.affinity == PSM2_EP_OPEN_AFFINITY_SKIP) ep->skip_affinity = true; ptl_sizes = 0; amsh_ptl = ips_ptl = self_ptl = NULL; if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { amsh_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); ptl_sizes += psmi_ptl_amsh.sizeof_ptl(); } if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { ips_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); ptl_sizes += psmi_ptl_ips.sizeof_ptl(); } if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { self_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); ptl_sizes += psmi_ptl_self.sizeof_ptl(); } if ((err = psmi_ep_open_device(ep, &opts, unique_job_key, &(ep->context), &ep->epid))) goto fail; psmi_assert_always(ep->epid != 0); ep->epaddr->epid = ep->epid; _HFI_VDBG("psmi_ep_open_device() passed\n"); /* Set our new label as soon as we know what it is */ strncpy(buf, psmi_gethostname(), sizeof(buf) - 1); buf[sizeof(buf) - 1] = '\0'; p = buf + strlen(buf); /* If our rank is set, use it. 
If not, use context.subcontext notation */ if (((e = getenv("MPI_RANKID")) != NULL && *e) || ((e = getenv("PSC_MPI_RANK")) != NULL && *e)) len = snprintf(p, sizeof(buf) - strlen(buf), ":%d.", atoi(e)); else len = snprintf(p, sizeof(buf) - strlen(buf), ":%d.%d.", (uint32_t) psm2_epid_context(ep->epid), (uint32_t) psmi_epid_subcontext(ep->epid)); *(p + len) = '\0'; ep->context_mylabel = psmi_strdup(ep, buf); if (ep->context_mylabel == NULL) { err = PSM2_NO_MEMORY; goto fail; } /* hfi_set_mylabel(ep->context_mylabel); */ if ((err = psmi_epid_set_hostname(psm2_epid_nid(ep->epid), buf, 0))) goto fail; _HFI_VDBG("start ptl device init...\n"); if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { if ((err = psmi_ptl_self.init(ep, self_ptl, &ep->ptl_self))) goto fail; } if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { if ((err = psmi_ptl_ips.init(ep, ips_ptl, &ep->ptl_ips))) goto fail; } /* If we're shm-only, this device is enabled above */ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { if ((err = psmi_ptl_amsh.init(ep, amsh_ptl, &ep->ptl_amsh))) goto fail; } else { /* We may have pre-attached as part of getting our rank for enabling * shared contexts. */ } _HFI_VDBG("finish ptl device init...\n"); /* * Keep only IPS since only IPS support multi-rail, other devices * are only setup once. IPS device can come to this function again. */ for (i = 0; i < PTL_MAX_INIT; i++) { if (devid_enabled[i] != PTL_DEVID_IPS) { devid_enabled[i] = -1; } } *epido = ep->epid; *epo = ep; return PSM2_OK; fail: if (ep != NULL) { psmi_hal_close_context(&ep->context.psm_hw_ctxt); psmi_free(ep); } if (epaddr != NULL) psmi_free(epaddr); return err; } psm2_error_t __psm2_ep_open(psm2_uuid_t const unique_job_key, struct psm2_ep_open_opts const *opts_i, psm2_ep_t *epo, psm2_epid_t *epido) { psm2_error_t err; psm2_mq_t mq; psm2_epid_t epid; psm2_ep_t ep, tmp; uint32_t units[HFI_MAX_RAILS]; uint16_t ports[HFI_MAX_RAILS]; int i, num_rails = 0; char *uname = "HFI_UNIT"; char *pname = "HFI_PORT"; char uvalue[6], pvalue[6]; int devid_enabled[PTL_MAX_INIT]; union psmi_envvar_val devs; PSM2_LOG_MSG("entering"); PSMI_ERR_UNLESS_INITIALIZED(NULL); if (!epo || !epido) return PSM2_PARAM_ERR; /* Allowing only one EP (unless explicitly enabled). */ if (psmi_opened_endpoint_count > 0 && !psmi_multi_ep_enabled) { PSM2_LOG_MSG("leaving"); return PSM2_TOO_MANY_ENDPOINTS; } /* Matched Queue initialization. We do this early because we have to * make sure ep->mq exists and is valid before calling ips_do_work. */ err = psmi_mq_malloc(&mq); PSMI_LOCK(psmi_creation_lock); if (err != PSM2_OK) goto fail; /* Set some of the MQ thresholds from the environment. Do this before ptl initialization - the ptl may have other constraints that will limit the MQ's settings. 
*/
	err = psmi_mq_initialize_defaults(mq);
	if (err != PSM2_OK)
		goto fail;

	psmi_init_lock(&(mq->progress_lock));

	/* See which ptl devices we want to use for this ep to be opened */
	psmi_getenv("PSM2_DEVICES",
		    "Ordered list of PSM-level devices",
		    PSMI_ENVVAR_LEVEL_USER,
		    PSMI_ENVVAR_TYPE_STR,
		    (union psmi_envvar_val)PSMI_DEVICES_DEFAULT, &devs);

	if ((err = psmi_parse_devices(devid_enabled, devs.e_str)))
		goto fail;

	if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
		err = psmi_ep_multirail(&num_rails, units, ports);
		if (err != PSM2_OK)
			goto fail;

		/* If multi-rail is used, set the first ep unit/port */
		if (num_rails > 0) {
			snprintf(uvalue, 6, "%1d", units[0]);
			snprintf(pvalue, 6, "%1d", ports[0]);
			setenv(uname, uvalue, 1);
			setenv(pname, pvalue, 1);
		}
	}
#ifdef PSM_CUDA
	if (PSMI_IS_GDR_COPY_ENABLED)
		hfi_gdr_open();
#endif

	err = __psm2_ep_open_internal(unique_job_key,
				      devid_enabled, opts_i, mq, &ep, &epid);
	if (err != PSM2_OK)
		goto fail;

	if (psmi_opened_endpoint == NULL) {
		psmi_opened_endpoint = ep;
	} else {
		tmp = psmi_opened_endpoint;
		while (tmp->user_ep_next)
			tmp = tmp->user_ep_next;
		tmp->user_ep_next = ep;
	}
	psmi_opened_endpoint_count++;
	ep->mctxt_prev = ep->mctxt_next = ep;
	ep->mctxt_master = ep;
	mq->ep = ep;

	/* Active Message initialization */
	err = psmi_am_init_internal(ep);
	if (err != PSM2_OK)
		goto fail;

	*epo = ep;
	*epido = epid;

	if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
		for (i = 1; i < num_rails; i++) {
			snprintf(uvalue, 6, "%1d", units[i]);
			snprintf(pvalue, 6, "%1d", ports[i]);
			setenv(uname, uvalue, 1);
			setenv(pname, pvalue, 1);

			/* Create slave EP */
			err = __psm2_ep_open_internal(unique_job_key,
						      devid_enabled, opts_i,
						      mq, &tmp, &epid);
			if (err)
				goto fail;

			/* Point back to shared resources on the master EP */
			tmp->am_htable = ep->am_htable;

			/* Link slave EP after master EP. */
			PSM_MCTXT_APPEND(ep, tmp);
		}
	}

	_HFI_VDBG("psm2_ep_open() OK....\n");

fail:
	PSMI_UNLOCK(psmi_creation_lock);
	PSM2_LOG_MSG("leaving");
	return err;
}
PSMI_API_DECL(psm2_ep_open)

psm2_error_t __psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in)
{
	psm2_error_t err = PSM2_OK;

#if _HFI_DEBUGGING
	uint64_t t_start = 0;
	if (_HFI_PRDBG_ON) {
		t_start = get_cycles();
	}
#endif

#ifdef PSM_CUDA
	/*
	 * The close on the gdr fd needs to be called before the
	 * close on the hfi fd as the gdr device will hold a
	 * reference count on the hfi device which will make the close
	 * on the hfi fd return without actually closing the fd.
	 */
	if (PSMI_IS_GDR_COPY_ENABLED)
		hfi_gdr_close();
#endif
	union psmi_envvar_val timeout_intval;
	psm2_ep_t tmp;
	psm2_mq_t mmq;

	PSM2_LOG_MSG("entering");
	PSMI_ERR_UNLESS_INITIALIZED(ep);
	psmi_assert_always(ep->mctxt_master == ep);

	PSMI_LOCK(psmi_creation_lock);

	psmi_am_fini_internal(ep);

	if (psmi_opened_endpoint == NULL) {
		err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
					"PSM Endpoint is closed or does not exist");
		PSM2_LOG_MSG("leaving");
		PSMI_UNLOCK(psmi_creation_lock);
		return err;
	}

	tmp = psmi_opened_endpoint;
	while (tmp && tmp != ep) {
		tmp = tmp->user_ep_next;
	}
	if (!tmp) {
		err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
					"PSM Endpoint is closed or does not exist");
		PSM2_LOG_MSG("leaving");
		PSMI_UNLOCK(psmi_creation_lock);
		return err;
	}

	psmi_getenv("PSM2_CLOSE_TIMEOUT",
		    "End-point close timeout over-ride.",
		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
		    (union psmi_envvar_val)0, &timeout_intval);

	if (getenv("PSM2_CLOSE_TIMEOUT")) {
		timeout_in = timeout_intval.e_uint * SEC_ULL;
	} else if (timeout_in > 0) {
		/* The timeout parameter provides the minimum timeout.
A heuristic * is used to scale up the timeout linearly with the number of * endpoints, and we allow one second per 100 endpoints. */ timeout_in = max(timeout_in, (ep->connections * SEC_ULL) / 100); } if (timeout_in > 0 && timeout_in < PSMI_MIN_EP_CLOSE_TIMEOUT) timeout_in = PSMI_MIN_EP_CLOSE_TIMEOUT; /* Infinite and excessive close time-outs are limited here to a max. * The "rationale" is that there is no point waiting around forever for * graceful termination. Normal (or forced) process termination should clean * up the context state correctly even if termination is not graceful. */ if (timeout_in <= 0 || timeout_in > PSMI_MAX_EP_CLOSE_TIMEOUT) timeout_in = PSMI_MAX_EP_CLOSE_TIMEOUT; _HFI_PRDBG("Closing endpoint %p with force=%s and to=%.2f seconds and " "%d connections\n", ep, mode == PSM2_EP_CLOSE_FORCE ? "YES" : "NO", (double)timeout_in / 1e9, (int)ep->connections); /* XXX We currently cheat in the sense that we leave each PTL the allowed * timeout. There's no good way to do this until we change the PTL * interface to allow asynchronous finalization */ /* Check if a transfer of ownership of the receive thread is needed before closing the ep. * With PSM2_MULTI_EP support, the receive thread is created and assigned * to the first opened endpoint, and is killed when that * endpoint is closed. */ if (ep->user_ep_next != NULL) { /* The receive thread will be transferred and assigned to ep->user_ep_next * only if the currently working receive thread (which will be killed) is * assigned to ep and there isn't any assigned to ep->user_ep_next. */ if ((psmi_ptl_ips_rcvthread.is_enabled(ep->ptl_ips.ptl)) && (!psmi_ptl_ips_rcvthread.is_enabled(ep->user_ep_next->ptl_ips.ptl))) psmi_ptl_ips_rcvthread.transfer_ownership(ep->ptl_ips.ptl, ep->user_ep_next->ptl_ips.ptl); } /* * Before freeing the master ep itself, * remove it from the global linklist. * We do it here to let the atexit handler in the ptl_am directory * search the global linklist and free the shared memory file. */ if (psmi_opened_endpoint == ep) { /* Removing ep from global endpoint list. */ psmi_opened_endpoint = ep->user_ep_next; } else { tmp = psmi_opened_endpoint; while (tmp->user_ep_next != ep) { tmp = tmp->user_ep_next; } /* Removing ep from global endpoint list. */ tmp->user_ep_next = ep->user_ep_next; } psmi_opened_endpoint_count--; /* * This do/while loop is used to close endpoints and free their memory. * * If the MULTIRAIL feature is disabled, this loop makes only one pass * and only the endpoint passed to psm2_ep_close is closed/removed. * * If the MULTIRAIL feature is enabled, this loop makes multiple passes * (depending on the number of rails). The order in which * endpoints will be closed is shown below: * * |--this is the master endpoint in case of multirail * | this endpoint is passed to psm2_ep_close and * V is the only endpoint known to the user. * +<-Ep0<-Ep1<-Ep2<-Ep3 * |__________________| Ep3->mctxt_prev points to Ep2 * (3) (2) (1) (4) Ep2->mctxt_prev points to Ep1 * ^ Ep1->mctxt_prev points to Ep0 * | Ep0->mctxt_prev points to Ep3 (master ep) * | * |---- order in which endpoints will be closed. * * Closing MULTIRAIL endpoints starts by closing the slaves (Ep2, Ep1, Ep0). * If MULTIRAIL is enabled then Ep3->mctxt_prev will point to Ep2; if the * feature is disabled then Ep3->mctxt_prev will point to Ep3 and the * do/while loop will make one pass. * * With MULTIRAIL enabled, Ep3, which is the master endpoint, will be * closed as the last one.
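* To restate the traversal: tmp starts at ep->mctxt_prev, so the loop below visits Ep2, Ep1 and Ep0 first and frees the master Ep3 last; with a single rail, ep->mctxt_prev == ep and the body runs exactly once.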
*/ mmq = ep->mq; tmp = ep->mctxt_prev; do { ep = tmp; tmp = ep->mctxt_prev; PSMI_LOCK(ep->mq->progress_lock); PSM_MCTXT_REMOVE(ep); if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) err = psmi_ptl_amsh.fini(ep->ptl_amsh.ptl, mode, timeout_in); if ((err == PSM2_OK || err == PSM2_TIMEOUT) && psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) err = psmi_ptl_ips.fini(ep->ptl_ips.ptl, mode, timeout_in); /* If there are timeouts in the disconnect requests, * still make sure that we get to close the * endpoint and mark it closed */ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) psmi_context_close(&ep->context); psmi_epid_remove_all(ep); psmi_free(ep->epaddr); psmi_free(ep->context_mylabel); PSMI_UNLOCK(ep->mq->progress_lock); ep->mq = NULL; psmi_free(ep); } while ((err == PSM2_OK || err == PSM2_TIMEOUT) && tmp != ep); if (mmq) { psmi_destroy_lock(&(mmq->progress_lock)); err = psmi_mq_free(mmq); } if (hfi_lids) { psmi_free(hfi_lids); hfi_lids = NULL; nlids = 0; } PSMI_UNLOCK(psmi_creation_lock); if (_HFI_PRDBG_ON) { _HFI_PRDBG_ALWAYS("Closed endpoint in %.3f secs\n", (double)cycles_to_nanosecs(get_cycles() - t_start) / SEC_ULL); } PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_ep_close) static psm2_error_t psmi_ep_open_device(const psm2_ep_t ep, const struct psm2_ep_open_opts *opts, const psm2_uuid_t unique_job_key, struct psmi_context *context, psm2_epid_t *epid) { psm2_error_t err = PSM2_OK; /* Skip affinity. No affinity if: * 1. User explicitly sets no-affinity=YES in environment. * 2. User doesn't set affinity in environment and PSM is opened with * option affinity skip. */ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { union psmi_envvar_val env_rcvthread; static int norcvthread; /* only for first rail */ ep->out_sl = opts->outsl; if ((err = psmi_context_open(ep, opts->unit, opts->port, unique_job_key, opts->timeout, context)) != PSM2_OK) goto fail; _HFI_DBG("[%d]use unit %d port %d\n", getpid(), psmi_hal_get_unit_id(ep->context.psm_hw_ctxt), 1); /* At this point, we have the unit id and port number, so * check that the pkey is not 0x0/0x7fff/0xffff and matches one * of the pkeys in the table. */ if ((err = psmi_ep_verify_pkey(ep, (uint16_t) opts->network_pkey, &ep->network_pkey)) != PSM2_OK) goto fail; /* See if we want to activate support for the receive thread */ psmi_getenv("PSM2_RCVTHREAD", "Recv thread flags (0 disables thread)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)(norcvthread++ ? 0 : PSMI_RCVTHREAD_FLAGS), &env_rcvthread); /* If enabled, use the polling capability to implement a receive * interrupt thread that can handle urgent packets */ if (env_rcvthread.e_uint) { psmi_hal_add_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD); #ifdef PSMI_PLOCK_IS_NOLOCK psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "#define PSMI_PLOCK_IS_NOLOCK not functional yet " "with RCVTHREAD on"); #endif } *epid = context->epid; } else if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { int rank; /* In shm-only mode, we need to derive a valid epid * based on our rank. We try to get it from the * environment if it's available, or resort to using * our PID as the rank.
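* The lookup order implemented below is MPI_LOCALRANKID first, then PSC_MPI_NODE_RANK, and finally getpid() when neither is set.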
*/ union psmi_envvar_val env_rankid; if (psmi_getenv ("MPI_LOCALRANKID", "Shared context rankid", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)-1, &env_rankid)) { if (psmi_getenv ("PSC_MPI_NODE_RANK", "Shared context rankid", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)-1, &env_rankid)) { rank = getpid(); } else rank = env_rankid.e_int; } else rank = env_rankid.e_int; /* * We use a LID of 0 for non-HFI communication. * Since a jobkey is not available from IPS, pull the * first 16 bits from the UUID. */ switch (PSMI_EPID_VERSION) { case PSMI_EPID_V1: *epid = PSMI_EPID_PACK_V1(((uint16_t *) unique_job_key)[0], (rank >> 3), rank, 0, PSMI_EPID_VERSION_SHM, rank); break; case PSMI_EPID_V2: /* Construct epid for this Endpoint */ *epid = PSMI_EPID_PACK_V2_SHM(getpid(), PSMI_EPID_SHM_ONLY, /*is a only-shm epid */ PSMI_EPID_VERSION); break; default: /* Epid version is greater than the max supported version. */ psmi_assert_always(PSMI_EPID_VERSION <= PSMI_EPID_V2); break; } } else { /* Self-only, meaning only 1 proc max */ switch (PSMI_EPID_VERSION) { case PSMI_EPID_V1: *epid = PSMI_EPID_PACK_V1( 0, 0, 0, 0, PSMI_EPID_VERSION_SHM, 0x3ffffff); break; case PSMI_EPID_V2: *epid = PSMI_EPID_PACK_V2_SHM(0, PSMI_EPID_SHM_ONLY, /*is a only-shm epid */ PSMI_EPID_VERSION); break; default: /* Epid version is greater than the max supported version. */ psmi_assert_always(PSMI_EPID_VERSION <= PSMI_EPID_V2); break; } } fail: return err; } /* Get a list of PTLs we want to use. The order is important, it affects * whether node-local processes use shm or ips */ static psm2_error_t psmi_parse_devices(int devices[PTL_MAX_INIT], const char *devstring) { char *devstr = NULL; char *b_new, *e, *ee, *b; psm2_error_t err = PSM2_OK; int len; int i = 0; psmi_assert_always(devstring != NULL); len = strlen(devstring) + 1; for (i = 0; i < PTL_MAX_INIT; i++) devices[i] = -1; devstr = (char *)psmi_calloc(PSMI_EP_NONE, UNDEFINED, 2, len); if (devstr == NULL) goto fail; b_new = (char *)devstr; e = b_new + len; strncpy(e, devstring, len); ee = e + len; i = 0; while (e < ee && *e && i < PTL_MAX_INIT) { while (*e && !isalpha(*e)) e++; b = e; while (*e && isalpha(*e)) e++; *e = '\0'; if (*b) { if (!strcasecmp(b, "self")) { devices[i++] = PTL_DEVID_SELF; b_new = strcpy(b_new, "self,"); b_new += 5; } else if (!strcasecmp(b, "shm") || !strcasecmp(b, "shmem") || !strcasecmp(b, "amsh")) { devices[i++] = PTL_DEVID_AMSH; strcpy(b_new, "amsh,"); b_new += 5; } else if (!strcasecmp(b, "hfi") || !strcasecmp(b, "ipath") || !strcasecmp(b, "ips")) { devices[i++] = PTL_DEVID_IPS; strcpy(b_new, "ips,"); b_new += 4; } else { err = psmi_handle_error(NULL, PSM2_PARAM_ERR, "%s set in environment variable PSM_PTL_DEVICES=\"%s\" " "is not one of the recognized PTL devices (%s)", b, devstring, PSMI_DEVICES_DEFAULT); goto fail; } e++; } } if (b_new != devstr) /* we parsed something, remove trailing comma */ *(b_new - 1) = '\0'; _HFI_PRDBG("PSM Device allocation order: %s\n", devstr); fail: if (devstr != NULL) psmi_free(devstr); return err; } static int psmi_device_is_enabled(const int devid_enabled[PTL_MAX_INIT], int devid) { int i; for (i = 0; i < PTL_MAX_INIT; i++) if (devid_enabled[i] == devid) return 1; return 0; } int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid) { return psmi_device_is_enabled(ep->devid_enabled, devid); } opa-psm2-PSM2_11.2.185/psm_ep.h000066400000000000000000000172771370564314600156710ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license.
When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ #ifndef _PSMI_IN_USER_H #error psm2_ep.h not meant to be included directly, include psm_user.h instead #endif #ifndef _PSMI_EP_H #define _PSMI_EP_H /* * EPIDs encode the following information: * * LID:16 bits - LID for endpoint * CONTEXT:8 bits - Context used for endpoint (up to 256 contexts) * SUBCONTEXT:3 bits - Subcontext used for endpoint * HFIUNIT: 2 bits - HFI unit number * HFITYPE: 3 bits - OPA1, OPA2, ...
* RANK: 26 bits - process rank * reserved: 6 bits - for future use */ #define PSMI_HFI_TYPE_UNKNOWN 0 #define PSMI_HFI_TYPE_OPA1 1 #define PSMI_HFI_TYPE_OPA2 2 #define PSMI_SL_DEFAULT 0 #define PSMI_SC_DEFAULT 0 #define PSMI_VL_DEFAULT 0 #define PSMI_SL_MIN 0 #define PSMI_SL_MAX 31 #define PSMI_SC_ADMIN 15 #define PSMI_VL_ADMIN 15 #define PSMI_SC_NBITS 5 /* Number of bits in SC */ #define PSMI_N_SCS (1 << PSMI_SC_NBITS) /* The number of SC's */ #define PSMI_EPID_PACK_V1(lid, context, subcontext, hfiunit, epid_version, rank) \ (((((uint64_t)lid)&0xffff)<<16) | \ ((((uint64_t)context)&0xff)<<8) | \ ((((uint64_t)subcontext)&0x7)<<5) | \ ((((uint64_t)hfiunit)&0x3)<<3) | \ ((((uint64_t)epid_version)&0x7)<<0) | \ ((((uint64_t)rank)&0x3ffffff)<<32)) #define PSMI_EPID_PACK_V2(lid, context, subcontext, shmbool, epid_version, subnet_id) \ (((((uint64_t)lid)&0xffffff)<<16) | \ ((((uint64_t)context)&0xff)<<8) | \ ((((uint64_t)subcontext)&0x7)<<5) | \ ((((uint64_t)shmbool)&0x1)<<3) | \ ((((uint64_t)epid_version)&0x7)<<0) | \ ((((uint64_t)subnet_id)&0xffff)<<48)) #define PSMI_EPID_PACK_V2_SHM(process_id, shmbool, epid_version) \ (((((uint64_t)process_id)&0xffffffff)<<32) | \ ((((uint64_t)shmbool)&0x1)<<3) | \ ((((uint64_t)epid_version)&0x7)<<0)) #define PSMI_EPID_GET_LID_V1(epid) (((epid)>>16)&0xffff) #define PSMI_EPID_GET_LID_V2(epid) (((epid)>>16)&0xffffff) #define PSMI_EPID_GET_CONTEXT(epid) (((epid)>>8)&0xff) #define PSMI_EPID_GET_SUBCONTEXT(epid) (((epid)>>5)&0x7) #define PSMI_EPID_GET_HFIUNIT(epid) (((epid)>>3)&0x3) #define PSMI_EPID_GET_EPID_VERSION(epid) (((epid)>>0)&0x7) #define PSMI_EPID_GET_RANK(epid) (((epid)>>32)&0x3ffffff) #define PSMI_EPID_GET_SHMBOOL(epid) (((epid)>>3)&0x1) #define PSMI_EPID_GET_SUBNET_ID(epid) (((epid)>>48)&0xffff) #define PSMI_EPID_GET_PROCESS_ID(epid) (((epid)>>32)&0xffffffff) #define PSM_MCTXT_APPEND(head, node) \ node->mctxt_prev = head->mctxt_prev; \ node->mctxt_next = head; \ head->mctxt_prev->mctxt_next = node; \ head->mctxt_prev = node; \ node->mctxt_master = head #define PSM_MCTXT_REMOVE(node) \ node->mctxt_prev->mctxt_next = node->mctxt_next; \ node->mctxt_next->mctxt_prev = node->mctxt_prev; \ node->mctxt_next = node->mctxt_prev = node; \ node->mctxt_master = NULL struct psm2_ep { psm2_epid_t epid; /**> This endpoint's Endpoint ID */ psm2_epaddr_t epaddr; /**> This ep's ep address */ psm2_mq_t mq; /**> only 1 MQ */ int unit_id; uint16_t portnum; uint16_t out_sl; uint16_t mtu; /* out_sl-->vl-->mtu in sysfs */ uint16_t network_pkey; /**> OPA Pkey */ int did_syslog; psm2_uuid_t uuid; uint16_t jkey; uint64_t service_id; /* OPA service ID */ psm2_path_res_t path_res_type; /* Path resolution for endpoint */ psm2_ep_errhandler_t errh; int devid_enabled[PTL_MAX_INIT]; int memmode; /**> min, normal, large memory mode */ uint32_t hfi_num_sendbufs;/**> Number of allocated send buffers */ uint32_t hfi_num_descriptors;/** Number of allocated scb descriptors*/ uint32_t hfi_imm_size; /** Immediate data size */ uint32_t connections; /**> Number of connections */ psmi_context_t context; char *context_mylabel; uint32_t yield_spin_cnt; /* EP link-lists */ struct psm2_ep *user_ep_next; /* EP link-lists for multi-context.
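These form a circular, doubly-linked ring maintained by the PSM_MCTXT_APPEND/PSM_MCTXT_REMOVE macros above; a standalone endpoint points both links at itself.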
*/ struct psm2_ep *mctxt_prev; struct psm2_ep *mctxt_next; struct psm2_ep *mctxt_master; /* Active Message handler table */ struct psm2_ep_am_handle_entry *am_htable; uint64_t gid_hi; uint64_t gid_lo; ptl_ctl_t ptl_amsh; ptl_ctl_t ptl_ips; ptl_ctl_t ptl_self; /* All ptl data is allocated inline below */ uint8_t ptl_base_data[0] __attribute__ ((aligned(64))); bool skip_affinity; }; struct mqq { psm2_mq_req_t first; psm2_mq_req_t last; }; typedef union psmi_seqnum { struct { uint32_t psn_seq:11; uint32_t psn_gen:20; }; struct { uint32_t psn_num:31; }; uint32_t psn_val; } psmi_seqnum_t; /* * PSM end point address. One per connection and per rail. */ struct psm2_epaddr { psm2_epid_t epid; /* peer's epid */ ptl_ctl_t *ptlctl; /* The control structure for the ptl */ struct ips_proto *proto; /* only for ips protocol */ void *usr_ep_ctxt; /* User context associated with endpoint */ }; #ifndef PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD # define PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD 250 #endif /* * Users of BLOCKUNTIL should check the value of err upon return */ #define PSMI_BLOCKUNTIL(ep, err, cond) do { \ int spin_cnt = 0; \ PSMI_PROFILE_BLOCK(); \ while (!(cond)) { \ err = psmi_poll_internal(ep, 1); \ if (err == PSM2_OK_NO_PROGRESS) { \ PSMI_PROFILE_REBLOCK(1); \ if (++spin_cnt == (ep)->yield_spin_cnt) { \ spin_cnt = 0; \ PSMI_YIELD((ep)->mq->progress_lock); \ } \ } \ else if (err == PSM2_OK) { \ PSMI_PROFILE_REBLOCK(0); \ spin_cnt = 0; \ } \ else \ break; \ } \ PSMI_PROFILE_UNBLOCK(); \ } while (0) #endif /* _PSMI_EP_H */ opa-psm2-PSM2_11.2.185/psm_ep_connect.c000066400000000000000000000441031370564314600173630ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include "psm_user.h" #include "psm_mq_internal.h" int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid); #if _HFI_DEBUGGING PSMI_ALWAYS_INLINE( char *psmi_getdevice(int type)) { switch (type) { case PTL_DEVID_IPS: return "ips"; case PTL_DEVID_AMSH: return "amsh"; case PTL_DEVID_SELF: return "self"; default: return "ips"; } } #endif psm2_error_t __psm2_ep_connect(psm2_ep_t ep, int num_of_epid, psm2_epid_t const *array_of_epid, int const *array_of_epid_mask, /* can be NULL */ psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr, int64_t timeout) { psm2_error_t err = PSM2_OK; ptl_ctl_t *ptlctl; ptl_t *ptl; int i, j, dup_idx; int num_toconnect = 0; int *epid_mask = NULL; int *epid_mask_isdupof = NULL; uint64_t t_start = get_cycles(); uint64_t t_left; union psmi_envvar_val timeout_intval; PSM2_LOG_MSG("entering"); PSMI_ERR_UNLESS_INITIALIZED(ep); /* * Normally we would lock here, but instead each implemented ptl component * does its own locking. This is mostly because the ptl components are * ahead of the PSM2 interface in that they can disconnect their peers. */ if (ep == NULL || array_of_epaddr == NULL || array_of_epid == NULL || num_of_epid < 1) { err = psmi_handle_error(ep, PSM2_PARAM_ERR, "Invalid psm2_ep_connect parameters"); goto fail_nolock; } PSMI_LOCK(ep->mq->progress_lock); /* We need two of these masks to detect duplicates */ err = PSM2_NO_MEMORY; epid_mask = (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid); if (epid_mask == NULL) goto fail; epid_mask_isdupof = (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid); if (epid_mask_isdupof == NULL) goto fail; err = PSM2_OK; /* Eventually handle timeouts across all connects. */ for (j = 0; j < num_of_epid; j++) { if (array_of_epid_mask != NULL && !array_of_epid_mask[j]) epid_mask[j] = 0; else { epid_mask[j] = 1; array_of_errors[j] = PSM2_EPID_UNKNOWN; array_of_epaddr[j] = NULL; if (psmi_epid_version(array_of_epid[j]) > PSMI_EPID_VERSION) { psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, " Unknown version of EPID - %"PRIu64" \n" "Please upgrade PSM2 or set PSM2_ADDR_FMT=1 in the environment to force EPID version 1 \n", psmi_epid_version(array_of_epid[j])); } num_toconnect++; } epid_mask_isdupof[j] = -1; } psmi_getenv("PSM2_CONNECT_TIMEOUT", "End-point connection timeout over-ride. 0 for no time-out.", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)0, &timeout_intval); if (getenv("PSM2_CONNECT_TIMEOUT")) { timeout = timeout_intval.e_uint * SEC_ULL; } else if (timeout > 0) { /* The timeout parameter provides the minimum timeout. A heuristic * is used to scale up the timeout linearly with the number of * endpoints, and we allow one second per 100 endpoints. 
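* For example, connecting 1000 endpoints raises any smaller caller-supplied timeout to (1000 * SEC_ULL) / 100 = 10 seconds.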
*/ timeout = max(timeout, (num_toconnect * SEC_ULL) / 100); } if (timeout > 0 && timeout < PSMI_MIN_EP_CONNECT_TIMEOUT) timeout = PSMI_MIN_EP_CONNECT_TIMEOUT; _HFI_PRDBG("Connect to %d endpoints with time-out of %.2f secs\n", num_toconnect, (double)timeout / 1e9); /* Look for duplicates in input array */ for (i = 0; i < num_of_epid; i++) { for (j = i + 1; j < num_of_epid; j++) { if (array_of_epid[i] == array_of_epid[j] && epid_mask[i] && epid_mask[j]) { epid_mask[j] = 0; /* don't connect more than once */ epid_mask_isdupof[j] = i; } } } for (i = 0; i < PTL_MAX_INIT; i++) { if (ep->devid_enabled[i] == -1) continue; /* Set up the right connect ptrs */ switch (ep->devid_enabled[i]) { case PTL_DEVID_IPS: ptlctl = &ep->ptl_ips; ptl = ep->ptl_ips.ptl; break; case PTL_DEVID_AMSH: ptlctl = &ep->ptl_amsh; ptl = ep->ptl_amsh.ptl; break; case PTL_DEVID_SELF: ptlctl = &ep->ptl_self; ptl = ep->ptl_self.ptl; break; default: ptlctl = &ep->ptl_ips; /*no-unused */ ptl = ep->ptl_ips.ptl; /*no-unused */ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unknown/unhandled PTL id %d\n", ep->devid_enabled[i]); break; } t_left = psmi_cycles_left(t_start, timeout); if (_HFI_VDBG_ON) { _HFI_VDBG_ALWAYS ("Trying to connect with device %s\n", psmi_getdevice(ep->devid_enabled[i])); } if ((err = ptlctl->ep_connect(ptl, num_of_epid, array_of_epid, epid_mask, array_of_errors, array_of_epaddr, cycles_to_nanosecs(t_left)))) { if (_HFI_PRDBG_ON) { _HFI_PRDBG_ALWAYS ("Connect failure in device %s err=%d\n", psmi_getdevice(ep->devid_enabled[i]), err); } goto connect_fail; } /* Now process what's been connected */ for (j = 0; j < num_of_epid; j++) { dup_idx = epid_mask_isdupof[j]; if (!epid_mask[j] && dup_idx == -1) continue; if (dup_idx != -1) { /* dup */ array_of_epaddr[j] = array_of_epaddr[dup_idx]; array_of_errors[j] = array_of_errors[dup_idx]; epid_mask_isdupof[j] = -1; } if (array_of_errors[j] == PSM2_OK) { epid_mask[j] = 0; /* don't try on next ptl */ ep->connections++; } } } for (i = 0; i < num_of_epid; i++) { ptl_ctl_t *c = NULL; if (array_of_epid_mask != NULL && !array_of_epid_mask[i]) continue; /* If we see unreachable here, that means some PTLs were not enabled */ if (array_of_errors[i] == PSM2_EPID_UNREACHABLE) { err = PSM2_EPID_UNREACHABLE; break; } psmi_assert_always(array_of_epaddr[i] != NULL); c = array_of_epaddr[i]->ptlctl; psmi_assert_always(c != NULL); _HFI_VDBG("%-20s DEVICE %s (%p)\n", psmi_epaddr_get_name(array_of_epid[i]), c == &ep->ptl_ips ? "hfi" : (c == &ep->ptl_amsh ? "amsh" : "self"), (void *)array_of_epaddr[i]->ptlctl->ptl); } if (err == PSM2_OK) for (i=0; i<num_of_epid; i++) if (array_of_epid_mask == NULL || array_of_epid_mask[i]) array_of_epaddr[i]->usr_ep_ctxt = NULL; connect_fail: /* If the error is a timeout (at worst) and the client is OPA MPI, * just return timeout to let OPA MPI handle the hostnames that * timed out */ if (err != PSM2_OK) { char errbuf[PSM2_ERRSTRING_MAXLEN]; size_t len; int j = 0; if (err == PSM2_EPID_UNREACHABLE) { char *deverr = "of an incorrect setting"; char *eperr = ""; char *devname = NULL; if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { deverr = "there is no shared memory PSM2 device (shm)"; eperr = " shared memory"; } else if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { deverr = "there is no OPA PSM2 device (hfi)"; eperr = " OPA"; } len = snprintf(errbuf, sizeof(errbuf) - 1, "Some%s endpoints could not be connected because %s " "in the currently enabled PSM2_DEVICES (", eperr, deverr); for (i = 0; i < PTL_MAX_INIT && len < sizeof(errbuf) - 1; i++) { switch (ep->devid_enabled[i]) { case PTL_DEVID_IPS: devname = "hfi"; break; case PTL_DEVID_AMSH: devname = "shm"; break; case PTL_DEVID_SELF: default: devname = "self"; break; } len += snprintf(errbuf + len, sizeof(errbuf) - len - 1, "%s,", devname); } if (len < sizeof(errbuf) - 1 && devname != NULL) /* parsed something, remove trailing comma */ errbuf[len - 1] = ')'; } else len = snprintf(errbuf, sizeof(errbuf) - 1, "%s", err == PSM2_TIMEOUT ? "Detected connection timeout" : psm2_error_get_string(err)); /* first pass, look for all nodes with the error */ for (i = 0; i < num_of_epid && len < sizeof(errbuf) - 1; i++) { if (array_of_epid_mask != NULL && !array_of_epid_mask[i]) continue; if (array_of_errors[i] == PSM2_OK) continue; if (array_of_errors[i] == PSM2_EPID_UNREACHABLE && err != PSM2_EPID_UNREACHABLE) continue; if (err == array_of_errors[i]) { len += snprintf(errbuf + len, sizeof(errbuf) - len - 1, "%c %s", j == 0 ?
':' : ',', psmi_epaddr_get_hostname (array_of_epid[i])); j++; } } errbuf[sizeof(errbuf) - 1] = '\0'; err = psmi_handle_error(ep, err, "%s", errbuf); } fail: PSMI_UNLOCK(ep->mq->progress_lock); fail_nolock: if (epid_mask != NULL) psmi_free(epid_mask); if (epid_mask_isdupof != NULL) psmi_free(epid_mask_isdupof); PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_ep_connect) psm2_error_t __psm2_ep_disconnect(psm2_ep_t ep, int num_of_epaddr, psm2_epaddr_t *array_of_epaddr, const int *array_of_epaddr_mask, psm2_error_t *array_of_errors, int64_t timeout) { return psm2_ep_disconnect2(ep, num_of_epaddr, array_of_epaddr, array_of_epaddr_mask, array_of_errors, PSM2_EP_DISCONNECT_GRACEFUL, timeout); } PSMI_API_DECL(psm2_ep_disconnect) psm2_error_t __psm2_ep_disconnect2(psm2_ep_t ep, int num_of_epaddr, psm2_epaddr_t *array_of_epaddr, const int *array_of_epaddr_mask, psm2_error_t *array_of_errors, int mode, int64_t timeout) { psm2_error_t err = PSM2_OK; ptl_ctl_t *ptlctl; ptl_t *ptl; int i, j, dup_idx; int num_todisconnect = 0; int *epaddr_mask = NULL; int *epaddr_mask_isdupof = NULL; uint64_t t_start = get_cycles(); uint64_t t_left; union psmi_envvar_val timeout_intval; PSM2_LOG_MSG("entering"); PSMI_ERR_UNLESS_INITIALIZED(ep); /* * Normally we would lock here, but instead each implemented ptl component * does its own locking. This is mostly because the ptl components are * ahead of the PSM2 interface in that they can disconnect their peers. */ if (ep == NULL || array_of_epaddr == NULL || num_of_epaddr < 1) { err = psmi_handle_error(ep, PSM2_PARAM_ERR, "Invalid psm2_ep_disconnect parameters"); goto fail_nolock; } PSMI_LOCK(ep->mq->progress_lock); /* We need two of these masks to detect duplicates */ err = PSM2_NO_MEMORY; epaddr_mask = (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epaddr); if (epaddr_mask == NULL) goto fail; epaddr_mask_isdupof = (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epaddr); if (epaddr_mask_isdupof == NULL) goto fail; err = PSM2_OK; /* Eventually handle timeouts across all connects. */ for (j = 0; j < num_of_epaddr; j++) { if (array_of_epaddr_mask != NULL && !array_of_epaddr_mask[j]) epaddr_mask[j] = 0; else { epaddr_mask[j] = 1; array_of_errors[j] = PSM2_EPID_UNKNOWN; num_todisconnect++; } epaddr_mask_isdupof[j] = -1; } psmi_getenv("PSM2_DISCONNECT_TIMEOUT", "End-point disconnection timeout over-ride. 0 for no time-out.", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)0, &timeout_intval); if (getenv("PSM2_DISCONNECT_TIMEOUT")) { timeout = timeout_intval.e_uint * SEC_ULL; } else if (timeout > 0) { /* The timeout parameter provides the minimum timeout. A heuristic * is used to scale up the timeout linearly with the number of * endpoints, and we allow one second per 100 endpoints. 
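* (This is the same scaling that __psm2_ep_connect applies above.)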
*/ timeout = max(timeout, (num_todisconnect * SEC_ULL) / 100); } if (timeout > 0 && timeout < PSMI_MIN_EP_CONNECT_TIMEOUT) timeout = PSMI_MIN_EP_CONNECT_TIMEOUT; _HFI_PRDBG("Disconnect %d endpoints with time-out of %.2f secs\n", num_todisconnect, (double)timeout / 1e9); /* Look for duplicates in input array */ for (i = 0; i < num_of_epaddr; i++) { for (j = i + 1; j < num_of_epaddr; j++) { if (array_of_epaddr[i] == array_of_epaddr[j] && epaddr_mask[i] && epaddr_mask[j]) { epaddr_mask[j] = 0; /* don't disconnect more than once */ epaddr_mask_isdupof[j] = i; } } } for (i = 0; i < PTL_MAX_INIT; i++) { if (ep->devid_enabled[i] == -1) continue; /* Set up the right connect ptrs */ switch (ep->devid_enabled[i]) { case PTL_DEVID_IPS: ptlctl = &ep->ptl_ips; ptl = ep->ptl_ips.ptl; break; case PTL_DEVID_AMSH: ptlctl = &ep->ptl_amsh; ptl = ep->ptl_amsh.ptl; break; case PTL_DEVID_SELF: ptlctl = &ep->ptl_self; ptl = ep->ptl_self.ptl; break; default: ptlctl = &ep->ptl_ips; /*no-unused */ ptl = ep->ptl_ips.ptl; /*no-unused */ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unknown/unhandled PTL id %d\n", ep->devid_enabled[i]); break; } t_left = psmi_cycles_left(t_start, timeout); if (_HFI_VDBG_ON) { _HFI_VDBG_ALWAYS ("Trying to disconnect with device %s\n", psmi_getdevice(ep->devid_enabled[i])); } if ((err = ptlctl->ep_disconnect(ptl, (mode == PSM2_EP_DISCONNECT_FORCE), num_of_epaddr, array_of_epaddr, epaddr_mask, array_of_errors, cycles_to_nanosecs(t_left)))) { if (_HFI_PRDBG_ON) { _HFI_PRDBG_ALWAYS ("Disconnect failure in device %s err=%d\n", psmi_getdevice(ep->devid_enabled[i]), err); } goto disconnect_fail; } /* Now process what's been disconnected */ for (j = 0; j < num_of_epaddr; j++) { dup_idx = epaddr_mask_isdupof[j]; if (!epaddr_mask[j] && dup_idx == -1) continue; if (dup_idx != -1) { /* dup */ array_of_errors[j] = array_of_errors[dup_idx]; epaddr_mask_isdupof[j] = -1; } if (array_of_errors[j] == PSM2_OK) { epaddr_mask[j] = 0; /* don't try on next ptl */ array_of_epaddr[j] = NULL; ep->connections--; } } } for (i = 0; i < num_of_epaddr; i++) { if (array_of_epaddr_mask != NULL && !array_of_epaddr_mask[i]) continue; /* If we see unreachable here, that means some PTLs were not enabled */ if (array_of_errors[i] == PSM2_EPID_UNREACHABLE) { err = PSM2_EPID_UNREACHABLE; break; } } disconnect_fail: /* If the error is a timeout (at worst) and the client is OPA MPI, * just return timeout to let OPA MPI handle the hostnames that * timed out */ if (err != PSM2_OK) { char errbuf[PSM2_ERRSTRING_MAXLEN]; size_t len; int j = 0; if (err == PSM2_EPID_UNREACHABLE) { char *deverr = "of an incorrect setting"; char *eperr = ""; char *devname = NULL; if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { deverr = "there is no shared memory PSM2 device (shm)"; eperr = " shared memory"; } else if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { deverr = "there is no OPA PSM2 device (hfi)"; eperr = " OPA"; } len = snprintf(errbuf, sizeof(errbuf) - 1, "Some%s endpoints could not be disconnected because %s " "in the currently enabled PSM2_DEVICES (", eperr, deverr); for (i = 0; i < PTL_MAX_INIT && len < sizeof(errbuf) - 1; i++) { switch (ep->devid_enabled[i]) { case PTL_DEVID_IPS: devname = "hfi"; break; case PTL_DEVID_AMSH: devname = "shm"; break; case PTL_DEVID_SELF: default: devname = "self"; break; } len += snprintf(errbuf + len, sizeof(errbuf) - len - 1, "%s,", devname); } if (len < sizeof(errbuf) - 1 && devname != NULL) /* parsed something, remove trailing comma */ errbuf[len - 1] = ')'; } else len =
snprintf(errbuf, sizeof(errbuf) - 1, "%s", err == PSM2_TIMEOUT ? "Detected disconnect timeout" : psm2_error_get_string(err)); /* first pass, look for all nodes with the error */ for (i = 0; i < num_of_epaddr && len < sizeof(errbuf) - 1; i++) { if (array_of_epaddr_mask != NULL && !array_of_epaddr_mask[i]) continue; if (array_of_errors[i] == PSM2_OK) continue; if (array_of_errors[i] == PSM2_EPID_UNREACHABLE && err != PSM2_EPID_UNREACHABLE) continue; if (err == array_of_errors[i]) { len += snprintf(errbuf + len, sizeof(errbuf) - len - 1, "%c %s", j == 0 ? ':' : ',', psmi_epaddr_get_hostname (array_of_epaddr[i]->epid)); j++; } } errbuf[sizeof(errbuf) - 1] = '\0'; err = psmi_handle_error(ep, err, "%s", errbuf); } fail: PSMI_UNLOCK(ep->mq->progress_lock); fail_nolock: if (epaddr_mask != NULL) psmi_free(epaddr_mask); if (epaddr_mask_isdupof != NULL) psmi_free(epaddr_mask_isdupof); PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_ep_disconnect2) opa-psm2-PSM2_11.2.185/psm_error.c000066400000000000000000000277211370564314600164060ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ #include "psm_user.h" #define PSMI_NOLOG -1 struct psm2_error_token { psm2_ep_t ep; psm2_error_t error; char err_string[PSM2_ERRSTRING_MAXLEN]; }; static psm2_error_t psmi_errhandler_noop(psm2_ep_t ep, const psm2_error_t err, const char *error_string, psm2_error_token_t token) { return err; } static psm2_error_t psmi_errhandler_psm(psm2_ep_t ep, const psm2_error_t err, const char *error_string, psm2_error_token_t token) { /* we want the error to be seen through ssh, etc., so we flush and then * sleep a bit. Not perfect, but not doing so means it almost never * gets seen. */ fprintf(stderr, "%s%s\n", hfi_get_mylabel(), token->err_string); fflush(stdout); fflush(stderr); /* XXX Eventually, this will hook up to a connection manager, and we'll * issue an upcall into the connection manager at shutdown time */ sleep(3); /* We use this "special" ep internally to handle internal errors that are * triggered from within code that is not expected to return to the user. * Errors of this sort are not expected to be handled by users and always * mean we have an internal PSM bug. */ if (err == PSM2_INTERNAL_ERR) abort(); else exit(-1); } psm2_ep_errhandler_t psmi_errhandler_global = psmi_errhandler_noop; psm2_error_t __psm2_error_defer(psm2_error_token_t token) { psm2_error_t rv; PSM2_LOG_MSG("entering"); rv = psmi_errhandler_psm(token->ep, token->error, token->err_string, token); PSM2_LOG_MSG("leaving"); return rv; } PSMI_API_DECL(psm2_error_defer) psm2_error_t __psm2_error_register_handler(psm2_ep_t ep, const psm2_ep_errhandler_t errhandler) { psm2_ep_errhandler_t *errh; PSM2_LOG_MSG("entering"); if (ep == NULL) errh = &psmi_errhandler_global; else errh = &ep->errh; if (errhandler == PSM2_ERRHANDLER_PSM_HANDLER) *errh = psmi_errhandler_psm; else if (errhandler == PSM2_ERRHANDLER_NO_HANDLER) *errh = psmi_errhandler_noop; else *errh = errhandler; PSM2_LOG_MSG("leaving"); return PSM2_OK; } PSMI_API_DECL(psm2_error_register_handler) psm2_error_t MOCKABLE (psmi_handle_error)(psm2_ep_t ep, psm2_error_t error, const char *buf, ...) { va_list argptr; int syslog_level; int console_print = 0; psm2_error_t newerr; struct psm2_error_token token; char *c, fullmsg[PSM2_ERRSTRING_MAXLEN]; token.error = error; snprintf(fullmsg, PSM2_ERRSTRING_MAXLEN - 1, "%s", buf); fullmsg[PSM2_ERRSTRING_MAXLEN - 1] = '\0'; va_start(argptr, buf); vsnprintf(token.err_string, PSM2_ERRSTRING_MAXLEN - 1, fullmsg, argptr); va_end(argptr); token.err_string[PSM2_ERRSTRING_MAXLEN - 1] = '\0'; /* Unless the user has set PSM2_NO_VERBOSE_ERRORS, always print errors to * console */ c = getenv("PSM2_NO_VERBOSE_ERRORS"); console_print = 0; if (ep == PSMI_EP_LOGEVENT) console_print = 1; else if (!c || *c == '\0') { /* no desire to prevent verbose errors */ /* Remove the console print if we're internally handling the error */ if (ep == PSMI_EP_NORETURN) console_print = 0; else if (ep == NULL && psmi_errhandler_global != psmi_errhandler_psm) console_print = 1; else if (ep != NULL && ep->errh != psmi_errhandler_psm) console_print = 1; } /* Before we let the user even handle the error, send to syslog */ syslog_level = psmi_error_syslog_level(error); if (syslog_level != PSMI_NOLOG || ep == PSMI_EP_LOGEVENT) psmi_syslog(ep, console_print, ep == PSMI_EP_LOGEVENT ?
LOG_NOTICE : syslog_level, "%s (err=%d)", token.err_string, error); if (ep == PSMI_EP_LOGEVENT) /* we're just logging */ newerr = PSM2_OK; else if (ep == PSMI_EP_NORETURN) newerr = psmi_errhandler_psm(NULL, error, token.err_string, &token); else if (ep == NULL) newerr = psmi_errhandler_global(NULL, error, token.err_string, &token); else newerr = ep->errh(ep, error, token.err_string, &token); return newerr; } MOCK_DEF_EPILOGUE(psmi_handle_error); /* Returns the "worst" error out of errA and errB */ psm2_error_t psmi_error_cmp(psm2_error_t errA, psm2_error_t errB) { #define _PSMI_ERR_IS(err) if (errA == (err) || errB == (err)) return (err) /* Bad runtime or before initialization */ _PSMI_ERR_IS(PSM2_NO_MEMORY); _PSMI_ERR_IS(PSM2_INTERNAL_ERR); _PSMI_ERR_IS(PSM2_INIT_NOT_INIT); _PSMI_ERR_IS(PSM2_INIT_BAD_API_VERSION); /* Before we get an endpoint */ _PSMI_ERR_IS(PSM2_EP_NO_DEVICE); _PSMI_ERR_IS(PSM2_EP_UNIT_NOT_FOUND); _PSMI_ERR_IS(PSM2_EP_DEVICE_FAILURE); _PSMI_ERR_IS(PSM2_EP_NO_PORTS_AVAIL); _PSMI_ERR_IS(PSM2_TOO_MANY_ENDPOINTS); /* As we open/close the endpoint */ _PSMI_ERR_IS(PSM2_EP_NO_NETWORK); _PSMI_ERR_IS(PSM2_SHMEM_SEGMENT_ERR); _PSMI_ERR_IS(PSM2_EP_CLOSE_TIMEOUT); _PSMI_ERR_IS(PSM2_EP_INVALID_UUID_KEY); _PSMI_ERR_IS(PSM2_EP_NO_RESOURCES); /* In connect phase */ _PSMI_ERR_IS(PSM2_EPID_NETWORK_ERROR); _PSMI_ERR_IS(PSM2_EPID_INVALID_NODE); _PSMI_ERR_IS(PSM2_EPID_INVALID_CONNECT); _PSMI_ERR_IS(PSM2_EPID_INVALID_PKEY); _PSMI_ERR_IS(PSM2_EPID_INVALID_VERSION); _PSMI_ERR_IS(PSM2_EPID_INVALID_UUID_KEY); _PSMI_ERR_IS(PSM2_EPID_INVALID_MTU); /* Timeout if nothing else */ _PSMI_ERR_IS(PSM2_TIMEOUT); /* Last resort */ return max(errA, errB); } struct psmi_error_item { int syslog_level; const char *error_string; }; static struct psmi_error_item psmi_error_items[] = { {PSMI_NOLOG, "Success"}, /* PSM2_OK = 0, */ {PSMI_NOLOG, "No events were progressed in psm_poll"}, /* PSM2_OK_NO_PROGRESS = 1 */ {PSMI_NOLOG, "unknown 2"}, {PSMI_NOLOG, "Error in a function parameter"}, /* PSM2_PARAM_ERR = 3 */ {LOG_CRIT, "Ran out of memory"}, /* PSM2_NO_MEMORY = 4 */ {PSMI_NOLOG, "PSM has not been initialized by psm2_init"}, /* PSM2_INIT_NOT_INIT = 5 */ {LOG_INFO, "API version passed in psm2_init is incompatible"}, /* PSM2_INIT_BAD_API_VERSION = 6 */ {PSMI_NOLOG, "PSM Could not set affinity"}, /* PSM2_NO_AFFINITY = 7 */ {LOG_ALERT, "PSM Unresolved internal error"}, /* PSM2_INTERNAL_ERR = 8 */ {LOG_CRIT, "PSM could not set up shared memory segment"}, /* PSM2_SHMEM_SEGMENT_ERR = 9 */ {PSMI_NOLOG, "PSM option is a read-only option"}, /* PSM2_OPT_READONLY = 10 */ {PSMI_NOLOG, "Operation timed out"}, /* PSM2_TIMEOUT = 11 */ {LOG_INFO, "Exceeded supported amount of endpoints"}, /* PSM2_TOO_MANY_ENDPOINTS = 12 */ {PSMI_NOLOG, "PSM is in the finalized state"}, /* PSM2_IS_FINALIZED = 13 */ {PSMI_NOLOG, "unknown 14"}, {PSMI_NOLOG, "unknown 15"}, {PSMI_NOLOG, "unknown 16"}, {PSMI_NOLOG, "unknown 17"}, {PSMI_NOLOG, "unknown 18"}, {PSMI_NOLOG, "unknown 19"}, {PSMI_NOLOG, "Endpoint was closed"}, /* PSM2_EP_WAS_CLOSED = 20 */ {LOG_ALERT, "PSM Could not find an OPA Unit"}, /* PSM2_EP_NO_DEVICE = 21 */ {PSMI_NOLOG, "User passed a bad unit number"}, /* PSM2_EP_UNIT_NOT_FOUND = 22 */ {LOG_ALERT, "Failure in initializing endpoint"}, /* PSM2_EP_DEVICE_FAILURE = 23 */ {PSMI_NOLOG, "Error closing the endpoint"}, /* PSM2_EP_CLOSE_TIMEOUT = 24 */ {PSMI_NOLOG, "No free contexts could be obtained"}, /* PSM2_EP_NO_PORTS_AVAIL = 25 */ {LOG_ALERT, "Could not detect network connectivity"}, /* PSM2_EP_NO_NETWORK = 26 */ {LOG_INFO, "Invalid
Unique job-wide UUID Key"}, /* PSM2_EP_INVALID_UUID_KEY = 27 */ {LOG_INFO, "Out of endpoint resources"}, /* PSM2_EP_NO_RESOURCES = 28 */ {PSMI_NOLOG, "unknown 29"}, {PSMI_NOLOG, "unknown 30"}, {PSMI_NOLOG, "unknown 31"}, {PSMI_NOLOG, "unknown 32"}, {PSMI_NOLOG, "unknown 33"}, {PSMI_NOLOG, "unknown 34"}, {PSMI_NOLOG, "unknown 35"}, {PSMI_NOLOG, "unknown 36"}, {PSMI_NOLOG, "unknown 37"}, {PSMI_NOLOG, "unknown 38"}, {PSMI_NOLOG, "unknown 39"}, {PSMI_NOLOG, "Unknown/unresolved connection status (other errors occurred)"}, /* PSM2_EPID_UNKNOWN = 40 */ {PSMI_NOLOG, "Endpoint could not be reached"}, /* PSM2_EPID_UNREACHABLE = 41 */ {PSMI_NOLOG, "unknown 42"}, {LOG_CRIT, "Invalid node (mismatch in bit width 32/64 or byte order)"}, /* PSM2_EPID_INVALID_NODE = 43 */ {LOG_CRIT, "Invalid MTU"}, /* PSM2_EPID_INVALID_MTU = 44 */ {PSMI_NOLOG, "UUID key mismatch"}, /* PSM2_EPID_INVALID_UUID_KEY = 45 */ {LOG_ERR, "Incompatible PSM version"}, /* PSM2_EPID_INVALID_VERSION = 46 */ {LOG_CRIT, "Connect received garbled connection information"}, /* PSM2_EPID_INVALID_CONNECT = 47 */ {PSMI_NOLOG, "Endpoint was already connected"}, /* PSM2_EPID_ALREADY_CONNECTED = 48 */ {LOG_CRIT, "Two or more endpoints have the same network id (LID)"}, /* PSM2_EPID_NETWORK_ERROR = 49 */ {LOG_CRIT, "Endpoint provided incompatible Partition Key"}, {LOG_CRIT, "Unable to resolve network path. Is the SM running?"}, {PSMI_NOLOG, "unknown 52"}, {PSMI_NOLOG, "unknown 53"}, {PSMI_NOLOG, "unknown 54"}, {PSMI_NOLOG, "unknown 55"}, {PSMI_NOLOG, "unknown 56"}, {PSMI_NOLOG, "unknown 57"}, {PSMI_NOLOG, "unknown 58"}, {PSMI_NOLOG, "unknown 59"}, {PSMI_NOLOG, "MQ Non-blocking request is incomplete"}, /* PSM2_MQ_NO_COMPLETIONS = 60 */ {PSMI_NOLOG, "MQ Message has been truncated at the receiver"}, /* PSM2_MQ_TRUNCATION = 61 */ {PSMI_NOLOG, "unknown 62"}, {PSMI_NOLOG, "unknown 63"}, {PSMI_NOLOG, "unknown 64"}, {PSMI_NOLOG, "unknown 65"}, {PSMI_NOLOG, "unknown 66"}, {PSMI_NOLOG, "unknown 67"}, {PSMI_NOLOG, "unknown 68"}, {PSMI_NOLOG, "unknown 69"}, {PSMI_NOLOG, "Invalid AM reply"}, {PSMI_NOLOG, "unknown 71"}, {PSMI_NOLOG, "unknown 72"}, {PSMI_NOLOG, "unknown 73"}, {PSMI_NOLOG, "unknown 74"}, {PSMI_NOLOG, "unknown 75"}, {PSMI_NOLOG, "unknown 76"}, {PSMI_NOLOG, "unknown 77"}, {PSMI_NOLOG, "unknown 78"}, {PSMI_NOLOG, "unknown 79"}, {PSMI_NOLOG, "unknown 80"}, }; const char *__psm2_error_get_string(psm2_error_t error) { PSM2_LOG_MSG("entering"); if (error >= PSM2_ERROR_LAST) { PSM2_LOG_MSG("leaving"); return "unknown"; } else { PSM2_LOG_MSG("leaving"); return psmi_error_items[error].error_string; } } PSMI_API_DECL(psm2_error_get_string) int psmi_error_syslog_level(psm2_error_t error) { if (error >= PSM2_ERROR_LAST) return PSMI_NOLOG; else return psmi_error_items[error].syslog_level; } opa-psm2-PSM2_11.2.185/psm_error.h000066400000000000000000000057201370564314600164060ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include "psm2_mock_testing.h" #ifndef _PSMI_IN_USER_H #error psm_error.h not meant to be included directly, include psm_user.h instead #endif #ifndef _PSMI_ERROR_H #define _PSMI_ERROR_H #define PSMI_EP_NONE (NULL) #define PSMI_EP_NORETURN ((psm2_ep_t) -2) #define PSMI_EP_LOGEVENT ((psm2_ep_t) -3) extern psm2_ep_errhandler_t psmi_errhandler_global; psm2_error_t MOCKABLE(psmi_handle_error)(psm2_ep_t ep, psm2_error_t error, const char *buf, ...) __attribute__((format(printf, 3, 4))); MOCK_DCL_EPILOGUE(psmi_handle_error); psm2_error_t psmi_error_cmp(psm2_error_t errA, psm2_error_t errB); int psmi_error_syslog_level(psm2_error_t error); #endif /* _PSMI_ERROR_H */ opa-psm2-PSM2_11.2.185/psm_gdrcpy.h000066400000000000000000000051501370564314600165420ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2018 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2018 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2018 Intel Corporation. All rights reserved. */ #ifndef GDR_CPY_H #define GDR_CPY_H #ifdef PSM_CUDA #include "ptl_ips/ips_proto.h" #define GDR_FD get_gdr_fd() int get_gdr_fd(); void hfi_gdr_open(); void hfi_gdr_close(); void * gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf, size_t size, int flags, struct ips_proto* proto); uint64_t gdr_cache_evict(); #endif #endif opa-psm2-PSM2_11.2.185/psm_hal_gen1/000077500000000000000000000000001370564314600165565ustar00rootroot00000000000000opa-psm2-PSM2_11.2.185/psm_hal_gen1/Makefile000066400000000000000000000055751370564314600202320ustar00rootroot00000000000000# # This file is provided under a dual BSD/GPLv2 license. When using or # redistributing this file, you may do so under either license. # # GPL LICENSE SUMMARY # # Copyright(c) 2017 Intel Corporation. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # Contact Information: # Intel Corporation, www.intel.com # # BSD LICENSE # # Copyright(c) 2017 Intel Corporation. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # * Neither the name of Intel Corporation nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # OUTDIR = . this_srcdir = $(shell readlink -m .) top_srcdir := $(this_srcdir)/.. INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips ${TARGLIB}-objs := psm_hal_gen1.o opa_service_gen1.o opa_utils_gen1.o \ opa_proto_gen1.o opa_i2cflash_gen1.o psm_gdrcpy.o ${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs}) DEPS:= $(${TARGLIB}-objs:.o=.d) -include $(DEPS) all: ${${TARGLIB}-objs} $(OUTDIR)/%.o: $(this_srcdir)/%.c $(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -MMD -c $< -o $@ clean: @if [ -d $(OUTDIR) ]; then \ cd $(OUTDIR); \ rm -f *.o *.d *.gcda *.gcno; \ cd -; \ fi install: @echo "Nothing to do for install." opa-psm2-PSM2_11.2.185/psm_hal_gen1/hfi1_deprecated_gen1.h000066400000000000000000000134751370564314600226620ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* hfi1_deprecated_gen1.h Contains certain features of the hfi1 module that have been deprecated. 
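(Chiefly the write()-based command interface used by hfi1 modules older than IOCTL_CMD_API_MODULE_MAJOR, and the legacy HFI1_ALG_* context-selection values defined below.)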
These features may still need to be supported by the psm library for reasons of backwards compatibility. */ #ifndef __HFI1_DEPRECATED_GEN1_H__ #define __HFI1_DEPRECATED_GEN1_H__ /* First, include the current hfi1_user.h file: */ #include <rdma/hfi/hfi1_user.h> /* Determine if we need to define and declare deprecated entities based on the IB_IOCTL_MAGIC macro. */ #if defined( IB_IOCTL_MAGIC ) /* The macro: PSM2_SUPPORT_IW_CMD_API is used to stipulate adding compile-time support of either the ioctl() or write() command interfaces to the driver. Note though that the final decision whether to support this depends on factors only known at runtime. */ #define PSM2_SUPPORT_IW_CMD_API 1 /* IOCTL_CMD_API_MODULE_MAJOR defines the first version of the hfi1 * module that supports the ioctl() command interface. Prior to this * (IOCTL_CMD_API_MODULE_MAJOR - 1 and smaller), the module used * write() for the command interface. */ #define IOCTL_CMD_API_MODULE_MAJOR 6 /* * round robin contexts across HFIs, then * ports; this is the default. * This option spreads the HFI selection within the local socket. * If it is preferred to spread the job over the entire set of * HFIs within the system, see ALG_ACROSS_ALL below. */ #define HFI1_ALG_ACROSS_DEP 0 /* * use all contexts on an HFI (round robin * active ports within), then next HFI */ #define HFI1_ALG_WITHIN_DEP 1 struct hfi1_cmd_deprecated { __u32 type; /* command type */ __u32 len; /* length of struct pointed to by add */ __u64 addr; /* pointer to user structure */ }; #define hfi1_cmd hfi1_cmd_deprecated #define HFI1_ALG_ACROSS HFI1_ALG_ACROSS_DEP #define HFI1_ALG_WITHIN HFI1_ALG_WITHIN_DEP #else #define HFI1_SWMAJOR_SHIFT 16 #endif /* defined( IB_IOCTL_MAGIC )*/ #define HFI1_ALG_ACROSS_ALL_DEP 2 #define HFI1_ALG_ACROSS_ALL HFI1_ALG_ACROSS_ALL_DEP /* Note that struct hfi1_user_info_dep declaration is identical to the struct hfi1_user_info declaration from MAJOR version 5 of the hfi1_user.h file. */ struct hfi1_user_info_dep { /* * version of user software, to detect compatibility issues. * Should be set to HFI1_USER_SWVERSION. */ __u32 userversion; __u16 pad; /* HFI selection algorithm, if a unit has not been selected */ __u16 hfi1_alg; /* * If two or more processes wish to share a context, each process * must set the subcontext_cnt and subcontext_id to the same * values. The only restriction on the subcontext_id is that * it be unique for a given node. */ __u16 subctxt_cnt; __u16 subctxt_id; /* 128bit UUID passed in by PSM. */ __u8 uuid[16]; }; /* * We assume here that we have the hfi1_user.h file installed in the system path * with the 'flags' field defined in struct sdma_req_info. (At least, when the * user needs to run GPU workloads, this _should_ be the version of hfi1_user.h * file installed by the IFS.) */ struct sdma_req_info_v6_3 { /* * bits 0-3 - version (currently unused) * bits 4-7 - opcode (enum sdma_req_opcode) * bits 8-15 - io vector count */ __u16 ctrl; /* * Number of fragments contained in this request. * User-space has already computed how many * fragment-sized packets the user buffer will be * split into. */ __u16 npkts; /* * Size of each fragment the user buffer will be * split into. */ __u16 fragsize; /* * Index of the slot in the SDMA completion ring * this request should be using. User-space is * in charge of managing its own ring.
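* As an illustration, a 64 KiB user buffer sent with fragsize = 8 KiB is described by npkts = 8, and its completion is reported through the ring slot named by comp_idx.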
*/ __u16 comp_idx; } __attribute__((packed)); #endif /* #ifndef __HFI1_DEPRECATED_GEN1_H__ */ opa-psm2-PSM2_11.2.185/psm_hal_gen1/opa_common_gen1.h000066400000000000000000000046461370564314600220020ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef OPA_COMMON_GEN1_H #define OPA_COMMON_GEN1_H #include #include "hfi1_deprecated_gen1.h" #endif /* OPA_COMMON_GEN1_H */ opa-psm2-PSM2_11.2.185/psm_hal_gen1/opa_i2cflash_gen1.c000066400000000000000000000057231370564314600221750ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include #include #include #include #include #include #include #include #include "opa_user_gen1.h" uint8_t hfi_flash_csum(struct hfi_flash *ifp, int adjust) { uint8_t *ip = (uint8_t *) ifp; uint8_t csum = 0, len; /* * Limit length checksummed to max length of actual data. * Checksum of erased eeprom will still be bad, but we avoid * reading past the end of the buffer we were passed. */ len = ifp->if_length; if (len > sizeof(struct hfi_flash)) len = sizeof(struct hfi_flash); while (len--) csum += *ip++; csum -= ifp->if_csum; csum = ~csum; if (adjust) ifp->if_csum = csum; return csum; } opa-psm2-PSM2_11.2.185/psm_hal_gen1/opa_proto_gen1.c000066400000000000000000000440001370564314600216340ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* This file contains the initialization functions used by the low level hfi protocol code. */ #include #include #include #include #include #include #include #include #include #include #include #include #include "opa_user_gen1.h" #include "opa_udebug.h" #include size_t arrsz[MAPSIZE_MAX] = { 0 }; static int map_hfi_mem(int fd, struct _hfi_ctrl *ctrl, size_t subctxt_cnt) { #define CREDITS_NUM 64 struct hfi1_ctxt_info *cinfo = &ctrl->ctxt_info; struct hfi1_base_info *binfo = &ctrl->base_info; size_t sz; __u64 off; void *maddr; /* 1. Map the PIO credits address */ off = binfo->sc_credits_addr &~ HFI_MMAP_PGMASK; sz = HFI_MMAP_PGSIZE; maddr = HFI_MMAP_ERRCHECK(fd, binfo, sc_credits_addr, sz, PROT_READ); hfi_touch_mmap(maddr, sz); arrsz[SC_CREDITS] = sz; binfo->sc_credits_addr |= off; /* 2. Map the PIO buffer SOP address * Skipping the cast of cinfo->credits to size_t. This causes the outcome of the multiplication * to be sign-extended in the event of too large input values. This results in a very large product * when treated as unsigned which in turn will make the HFI_MMAP_ERRCHECK() macro fail and give an * adequate error report. TODO: Consider sanitizing the credits value explicitly */ sz = cinfo->credits * CREDITS_NUM; HFI_MMAP_ERRCHECK(fd, binfo, pio_bufbase_sop, sz, PROT_WRITE); arrsz[PIO_BUFBASE_SOP] = sz; /* 3. Map the PIO buffer address */ sz = cinfo->credits * CREDITS_NUM; HFI_MMAP_ERRCHECK(fd, binfo, pio_bufbase, sz, PROT_WRITE); arrsz[PIO_BUFBASE] = sz; /* 4. Map the receive header queue * (u16 * u16 -> max value 0xfffe0001) */ sz = (size_t)cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize; maddr = HFI_MMAP_ERRCHECK(fd, binfo, rcvhdr_bufbase, sz, PROT_READ); hfi_touch_mmap(maddr, sz); arrsz[RCVHDR_BUFBASE] = sz; /* 5. Map the receive eager buffer * (u16 * u32. Assuming size_t's precision is 64 bits - no overflow) */ sz = (size_t)cinfo->egrtids * cinfo->rcvegr_size; maddr = HFI_MMAP_ERRCHECK(fd, binfo, rcvegr_bufbase, sz, PROT_READ); hfi_touch_mmap(maddr, sz); arrsz[RCVEGR_BUFBASE] = sz; /* 6. Map the sdma completion queue */ if (cinfo->runtime_flags & HFI1_CAP_SDMA) { sz = cinfo->sdma_ring_size * sizeof(struct hfi1_sdma_comp_entry); HFI_MMAP_ERRCHECK(fd, binfo, sdma_comp_bufbase, sz, PROT_READ); } else { sz = 0; binfo->sdma_comp_bufbase = (__u64)0; } arrsz[SDMA_COMP_BUFBASE] = sz; /* 7. Map RXE per-context CSRs */ sz = HFI_MMAP_PGSIZE; HFI_MMAP_ERRCHECK(fd, binfo, user_regbase, sz, PROT_WRITE|PROT_READ); arrsz[USER_REGBASE] = sz; /* Set up addresses for optimized register writeback routines. 
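(Caching these pointers is what makes later queue updates cheap: advancing a head or tail index then costs a single 64-bit store through the mapped register page instead of a system call. An illustrative helper, with an invented name, sketching how such a cached pointer is used:

    static inline void sketch_update_rcvhdrhead(struct _hfi_ctrl *c, uint64_t hdrhead)
    {
        *c->__hfi_rcvhdrhead = __cpu_to_le64(hdrhead);   // one MMIO store, no ioctl/write()
    }

where __cpu_to_le64 matches the __le64 typing of these registers.)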
* This is for the real onchip registers, shared context or not */ uint64_t *regbasep = (uint64_t *)binfo->user_regbase; ctrl->__hfi_rcvhdrtail = (volatile __le64 *)(regbasep + ur_rcvhdrtail); ctrl->__hfi_rcvhdrhead = (volatile __le64 *)(regbasep + ur_rcvhdrhead); ctrl->__hfi_rcvegrtail = (volatile __le64 *)(regbasep + ur_rcvegrindextail); ctrl->__hfi_rcvegrhead = (volatile __le64 *)(regbasep + ur_rcvegrindexhead); ctrl->__hfi_rcvofftail = (volatile __le64 *)(regbasep + ur_rcvegroffsettail); if (cinfo->runtime_flags & HFI1_CAP_HDRSUPP) { ctrl->__hfi_rcvtidflow = (volatile __le64 *)(regbasep + ur_rcvtidflowtable); ctrl->__hfi_tfvalid = 1; } else { ctrl->__hfi_rcvtidflow = ctrl->regs; ctrl->__hfi_tfvalid = 0; } /* 8. Map the rcvhdrq tail register address */ if (cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL) { sz = HFI_MMAP_PGSIZE; HFI_MMAP_ERRCHECK(fd, binfo, rcvhdrtail_base, sz, PROT_READ); } else { /* We don't use the receive header queue tail register to detect new packets, * but here we save the address for false-eager-full recovery */ sz = 0; /* This points inside the previously established mapping (user_regbase). Don't munmap()! */ binfo->rcvhdrtail_base = (uint64_t) (uintptr_t) ctrl->__hfi_rcvhdrtail; } ctrl->__hfi_rcvtail = (__le64 *)binfo->rcvhdrtail_base; arrsz[RCVHDRTAIL_BASE] = sz; /* 9. Map the event page */ off = binfo->events_bufbase &~ HFI_MMAP_PGMASK; sz = HFI_MMAP_PGSIZE; HFI_MMAP_ERRCHECK(fd, binfo, events_bufbase, sz, PROT_READ); arrsz[EVENTS_BUFBASE] = sz; /* keep the offset in the address */ binfo->events_bufbase |= off; /* 10. Map the status page */ sz = HFI_MMAP_PGSIZE; HFI_MMAP_ERRCHECK(fd, binfo, status_bufbase, sz, PROT_READ); arrsz[STATUS_BUFBASE] = sz; if (!subctxt_cnt) return 0; /* 11. If subcontext is used, map the buffers */ const char *errstr = "Incorrect input values for the subcontext"; size_t factor; /* 11a) subctxt_uregbase */ sz = HFI_MMAP_PGSIZE; maddr = HFI_MMAP_ERRCHECK(fd, binfo, subctxt_uregbase, sz, PROT_READ|PROT_WRITE); hfi_touch_mmap(maddr, sz); arrsz[SUBCTXT_UREGBASE] = sz; /* 11b) subctxt_rcvhdrbuf * u16 * u16. Prevent promotion to int through an explicit cast to size_t */ factor = (size_t)cinfo->rcvhdrq_cnt * cinfo->rcvhdrq_entsize; factor = ALIGN(factor, HFI_MMAP_PGSIZE); sz = factor * subctxt_cnt; maddr = HFI_MMAP_ERRCHECK(fd, binfo, subctxt_rcvhdrbuf, sz, PROT_READ|PROT_WRITE); hfi_touch_mmap(maddr, sz); arrsz[SUBCTXT_RCVHDRBUF] = sz;
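/* Illustrative aside, not part of the original flow: step 11c below protects
 * its size computation with a division test. The same multiply-overflow idiom
 * as a standalone sketch (all names are local to this disabled snippet):
 */
#if 0
static int checked_mul_size(size_t a, size_t b, size_t *out)
{
	size_t prod = a * b;            /* may wrap if the product overflows */
	if (b != 0 && prod / b != a)    /* wrapped: a * b does not fit in size_t */
		return -1;
	*out = prod;
	return 0;
}
#endif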
/* 11c) subctxt_rcvegrbuf * u16 * u32. Assuming size_t's precision to be 64 bits (no overflow) */ factor = (size_t)cinfo->egrtids * cinfo->rcvegr_size; factor = ALIGN(factor, HFI_MMAP_PGSIZE); sz = factor * subctxt_cnt; if (sz / subctxt_cnt != factor) { _HFI_INFO("%s (rcvegrbuf)\n", errstr); goto err_int_overflow_subctxt_rcvegrbuf; } maddr = HFI_MMAP_ERRCHECK(fd, binfo, subctxt_rcvegrbuf, sz, PROT_READ|PROT_WRITE); hfi_touch_mmap(maddr, sz); arrsz[SUBCTXT_RCVEGRBUF] = sz; return 0; err_int_overflow_subctxt_rcvegrbuf: err_mmap_subctxt_rcvegrbuf: /* if we got here, subctxt_cnt must be != 0 */ HFI_MUNMAP_ERRCHECK(binfo, subctxt_rcvhdrbuf, arrsz[SUBCTXT_RCVHDRBUF]); err_mmap_subctxt_rcvhdrbuf: /* if we got here, subctxt_cnt must be != 0 */ HFI_MUNMAP_ERRCHECK(binfo, subctxt_uregbase, arrsz[SUBCTXT_UREGBASE]); err_mmap_subctxt_uregbase: HFI_MUNMAP_ERRCHECK(binfo, status_bufbase, arrsz[STATUS_BUFBASE]); err_mmap_status_bufbase: HFI_MUNMAP_ERRCHECK(binfo, events_bufbase, arrsz[EVENTS_BUFBASE]); err_mmap_events_bufbase: if(cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL) { HFI_MUNMAP_ERRCHECK(binfo, rcvhdrtail_base, arrsz[RCVHDRTAIL_BASE]); } err_mmap_rcvhdrtail_base: HFI_MUNMAP_ERRCHECK(binfo, user_regbase, arrsz[USER_REGBASE]); err_mmap_user_regbase: /* the condition could be: if(cinfo->runtime_flags & HFI1_CAP_SDMA) too */ if(binfo->sdma_comp_bufbase != 0) { HFI_MUNMAP_ERRCHECK(binfo, sdma_comp_bufbase, arrsz[SDMA_COMP_BUFBASE]); } err_mmap_sdma_comp_bufbase: HFI_MUNMAP_ERRCHECK(binfo, rcvegr_bufbase, arrsz[RCVEGR_BUFBASE]); err_mmap_rcvegr_bufbase: HFI_MUNMAP_ERRCHECK(binfo, rcvhdr_bufbase, arrsz[RCVHDR_BUFBASE]); err_mmap_rcvhdr_bufbase: HFI_MUNMAP_ERRCHECK(binfo, pio_bufbase, arrsz[PIO_BUFBASE]); err_mmap_pio_bufbase: HFI_MUNMAP_ERRCHECK(binfo, pio_bufbase_sop, arrsz[PIO_BUFBASE_SOP]); err_mmap_pio_bufbase_sop: HFI_MUNMAP_ERRCHECK(binfo, sc_credits_addr, arrsz[SC_CREDITS]); err_mmap_sc_credits_addr: return -1; } /* It is allowed to have multiple devices (and of different types) simultaneously opened and initialized, although this is (still! Oct 07) not implemented. This routine is used by the low level hfi protocol code (and any other code that has similar low level functionality). This is the only routine that takes a file descriptor, rather than a struct _hfi_ctrl *. The struct _hfi_ctrl * used for everything else is returned as part of hfi1_base_info. */ struct _hfi_ctrl *hfi_userinit_internal(int fd, bool skip_affinity, struct hfi1_user_info_dep *uinfo) { struct _hfi_ctrl *spctrl = NULL; struct hfi1_ctxt_info *cinfo; struct hfi1_base_info *binfo; struct hfi1_cmd c; int __hfi_pg_sz; #ifdef PSM2_SUPPORT_IW_CMD_API /* for major version 6 of driver, we will use uinfo_new. See below for details. */ struct hfi1_user_info uinfo_new = {0}; #endif /* First get the page size */ __hfi_pg_sz = sysconf(_SC_PAGESIZE); if (!(spctrl = calloc(1, sizeof(struct _hfi_ctrl)))) { _HFI_INFO("can't allocate memory for hfi_ctrl: %s\n", strerror(errno)); goto err_calloc_hfi_ctrl; } cinfo = &spctrl->ctxt_info; binfo = &spctrl->base_info; _HFI_VDBG("uinfo: ver %x, alg %d, subc_cnt %d, subc_id %d\n", uinfo->userversion, uinfo->hfi1_alg, uinfo->subctxt_cnt, uinfo->subctxt_id); /* 1. ask driver to assign context to current process */ memset(&c, 0, sizeof(struct hfi1_cmd)); c.type = PSMI_HFI_CMD_ASSIGN_CTXT; #ifdef PSM2_SUPPORT_IW_CMD_API /* If psm is communicating with a MAJOR version 6 driver, we need to pass in an actual struct hfi1_user_info not a hfi1_user_info_dep.
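(Concretely: struct hfi1_user_info_dep carries a __u16 hfi1_alg member that the MAJOR version 6 layout evidently no longer has, since it is the one field never copied below, so every member after it would shift by two bytes if the two structs were used interchangeably. That is why the code below copies field by field rather than passing the deprecated struct through or doing a whole-struct memcpy.)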
Else if psm is communicating with a MAJOR version 5 driver, we can just continue to pass a hfi1_user_info_dep as struct hfi1_user_info_dep is identical to the MAJOR version 5 struct hfi1_user_info. */ if (hfi_get_user_major_version() == IOCTL_CMD_API_MODULE_MAJOR) { /* If psm is communicating with a MAJOR version 6 driver, we copy uinfo into uinfo_new and pass uinfo_new to the driver. */ c.len = sizeof(uinfo_new); c.addr = (__u64) (&uinfo_new); uinfo_new.userversion = uinfo->userversion; uinfo_new.pad = uinfo->pad; uinfo_new.subctxt_cnt = uinfo->subctxt_cnt; uinfo_new.subctxt_id = uinfo->subctxt_id; memcpy(uinfo_new.uuid,uinfo->uuid,sizeof(uinfo_new.uuid)); } else { /* If psm is working with an old driver, we continue to use the struct hfi1_user_info_dep version of the struct: */ c.len = sizeof(*uinfo); c.addr = (__u64) uinfo; } #else c.len = sizeof(*uinfo); c.addr = (__u64) uinfo; #endif if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) { if (errno == ENODEV) { _HFI_INFO("PSM2 and driver version mismatch\n"); /* Overwrite errno. One would wish that the driver * didn't return ENODEV for a version mismatch */ errno = EPROTONOSUPPORT; } else { _HFI_INFO("assign_context command failed: %s\n", strerror(errno)); } goto err_hfi_cmd_assign_ctxt; } #ifdef PSM2_SUPPORT_IW_CMD_API if (hfi_get_user_major_version() == IOCTL_CMD_API_MODULE_MAJOR) { /* for the new driver, we copy the results of the call back to uinfo from uinfo_new. */ uinfo->userversion = uinfo_new.userversion; uinfo->pad = uinfo_new.pad; uinfo->subctxt_cnt = uinfo_new.subctxt_cnt; uinfo->subctxt_id = uinfo_new.subctxt_id; memcpy(uinfo->uuid,uinfo_new.uuid,sizeof(uinfo_new.uuid)); } #endif /* 2. get context info from driver */ c.type = PSMI_HFI_CMD_CTXT_INFO; c.len = sizeof(*cinfo); c.addr = (__u64) cinfo; if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) { _HFI_INFO("CTXT_INFO command failed: %s\n", strerror(errno)); goto err_hfi_cmd_ctxt_info; } /* sanity checking... 
*/ if (cinfo->rcvtids%8) { _HFI_INFO("rcvtids not 8 multiple: %d\n", cinfo->rcvtids); goto err_sanity_check; } if (cinfo->egrtids%8) { _HFI_INFO("egrtids not 8 multiple: %d\n", cinfo->egrtids); goto err_sanity_check; } if (cinfo->rcvtids < cinfo->egrtids) { _HFI_INFO("rcvtids(%d) < egrtids(%d)\n", cinfo->rcvtids, cinfo->egrtids); goto err_sanity_check; } if (cinfo->rcvhdrq_cnt%32) { _HFI_INFO("rcvhdrq_cnt not 32 multiple: %d\n", cinfo->rcvhdrq_cnt); goto err_sanity_check; } if (cinfo->rcvhdrq_entsize%64) { _HFI_INFO("rcvhdrq_entsize not 64 multiple: %d\n", cinfo->rcvhdrq_entsize); goto err_sanity_check; } if (cinfo->rcvegr_size%__hfi_pg_sz) { _HFI_INFO("rcvegr_size not page multiple: %d\n", cinfo->rcvegr_size); goto err_sanity_check; } _HFI_VDBG("ctxtinfo: runtime_flags %llx, rcvegr_size %d\n", cinfo->runtime_flags, cinfo->rcvegr_size); _HFI_VDBG("ctxtinfo: active %d, unit %d, ctxt %d, subctxt %d\n", cinfo->num_active, cinfo->unit, cinfo->ctxt, cinfo->subctxt); _HFI_VDBG("ctxtinfo: rcvtids %d, credits %d\n", cinfo->rcvtids, cinfo->credits); _HFI_VDBG("ctxtinfo: numa %d, cpu %x, send_ctxt %d\n", cinfo->numa_node, cinfo->rec_cpu, cinfo->send_ctxt); _HFI_VDBG("ctxtinfo: rcvhdrq_cnt %d, rcvhdrq_entsize %d\n", cinfo->rcvhdrq_cnt, cinfo->rcvhdrq_entsize); _HFI_VDBG("ctxtinfo: egrtids %d, sdma_ring_size %d\n", cinfo->egrtids, cinfo->sdma_ring_size); /* if affinity has not been setup, set it */ if (getenv("HFI_FORCE_CPUAFFINITY") || (cinfo->rec_cpu != (__u16) -1 && !(getenv("HFI_NO_CPUAFFINITY") || skip_affinity))) { cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(cinfo->rec_cpu, &cpuset); if (sched_setaffinity(0, sizeof(cpuset), &cpuset)) { _HFI_INFO("Couldn't set runon processor %u " "(unit:context %u:%u) (%u active chips): %s\n", cinfo->rec_cpu, cinfo->unit, cinfo->ctxt, cinfo->num_active, strerror(errno)); } } /* 4. Get user base info from driver */ c.type = PSMI_HFI_CMD_USER_INFO; c.len = sizeof(*binfo); c.addr = (__u64) binfo; if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) { _HFI_INFO("BASE_INFO command failed: %s\n", strerror(errno)); goto err_hfi_cmd_user_info; } hfi_set_user_version(binfo->sw_version); _HFI_VDBG("baseinfo: hwver %x, swver %x, jkey %d, qp %d\n", binfo->hw_version, binfo->sw_version, binfo->jkey, binfo->bthqp); _HFI_VDBG("baseinfo: credit_addr %llx, sop %llx, pio %llx\n", binfo->sc_credits_addr, binfo->pio_bufbase_sop, binfo->pio_bufbase); _HFI_VDBG("baseinfo: hdrbase %llx, egrbase %llx, sdmabase %llx\n", binfo->rcvhdr_bufbase, binfo->rcvegr_bufbase, binfo->sdma_comp_bufbase); _HFI_VDBG("baseinfo: ureg %llx, eventbase %llx, " "statusbase %llx, tailaddr %llx\n", binfo->user_regbase, binfo->events_bufbase, binfo->status_bufbase, binfo->rcvhdrtail_base); /* * Check if driver version matches PSM version, * this is different from PSM API version. */ if ((binfo->sw_version >> HFI1_SWMAJOR_SHIFT) != hfi_get_user_major_version()) { _HFI_INFO ("User major version 0x%x not same as driver major 0x%x\n", hfi_get_user_major_version(), binfo->sw_version >> HFI1_SWMAJOR_SHIFT); if ((binfo->sw_version >> HFI1_SWMAJOR_SHIFT) < hfi_get_user_major_version()) goto err_version_mismatch; /* else assume driver knows how to be compatible */ } else if ((binfo->sw_version & 0xffff) != HFI1_USER_SWMINOR) { _HFI_PRDBG ("User minor version 0x%x not same as driver minor 0x%x\n", HFI1_USER_SWMINOR, binfo->sw_version & 0xffff); } if (map_hfi_mem(fd, spctrl, uinfo->subctxt_cnt) == -1) goto err_map_hfi_mem; /* Save some info. 
*/ spctrl->fd = fd; spctrl->__hfi_unit = cinfo->unit; /* * driver should provide the port where the context is opened for, But * OPA driver does not have port interface to psm because there is only * one port. So we hardcode the port to 1 here. When we work on the * version of PSM for the successor to OPA, we should have port returned * from driver and will be set accordingly. */ /* spctrl->__hfi_port = cinfo->port; */ spctrl->__hfi_port = 1; spctrl->__hfi_tidegrcnt = cinfo->egrtids; spctrl->__hfi_tidexpcnt = cinfo->rcvtids - cinfo->egrtids; return spctrl; err_map_hfi_mem: err_version_mismatch: err_hfi_cmd_user_info: /* TODO: restore the original CPU affinity? */ err_sanity_check: err_hfi_cmd_ctxt_info: /* TODO: ioctl de-assign context here? */ // without de-assigning the context, all subsequent hfi_userinit_internal() // calls are going to fail _HFI_ERROR("An unrecoverable error occurred while communicating with the driver\n"); abort(); /* TODO: or do we want to include psm_user.h to use psmi_handle_error()? */ // no recovery here /* if we failed to allocate memory or to assign the context, we might still recover from this. * Returning NULL will cause the function to be reinvoked n times. Do we really want this * behavior? */ err_hfi_cmd_assign_ctxt: free(spctrl); err_calloc_hfi_ctrl: return NULL; } struct _hfi_ctrl *hfi_userinit(int fd, struct hfi1_user_info_dep *uinfo) { return hfi_userinit_internal(fd, false, uinfo); } opa-psm2-PSM2_11.2.185/psm_hal_gen1/opa_service_gen1.c000066400000000000000000000514301370564314600221360ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2018 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2018 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* This file contains the hfi service routine interface used by the low level hfi protocol code. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opa_service_gen1.h" #include "psmi_wrappers.h" typedef union { struct { uint16_t minor; uint16_t major; }; uint32_t version; } sw_version_t; static sw_version_t sw_version = { { .major = HFI1_USER_SWMAJOR, .minor = HFI1_USER_SWMINOR } }; /* fwd declaration */ ustatic int _hfi_cmd_write(int fd, struct hfi1_cmd *cmd, size_t count); #ifdef PSM2_SUPPORT_IW_CMD_API /* fwd declaration */ ustatic int _hfi_cmd_ioctl(int fd, struct hfi1_cmd *cmd, size_t count); /* Function pointer. */ static int (*_hfi_cmd_send)(int fd, struct hfi1_cmd *cmd, size_t count) = _hfi_cmd_ioctl; #else /* Function pointer. */ static int (*const _hfi_cmd_send)(int fd, struct hfi1_cmd *cmd, size_t count) = _hfi_cmd_write; #endif uint16_t hfi_get_user_major_version(void) { return sw_version.major; } void hfi_set_user_major_version(uint16_t major_version) { sw_version.major = major_version; } uint16_t hfi_get_user_minor_version(void) { return sw_version.minor; } void hfi_set_user_version(uint32_t version) { sw_version.version = version; } int hfi_context_open(int unit, int port, uint64_t open_timeout) { char dev_name_ignored[256]; return hfi_context_open_ex(unit, port, open_timeout, dev_name_ignored, sizeof(dev_name_ignored)); } int hfi_context_open_ex(int unit, int port, uint64_t open_timeout, char *dev_name,size_t dev_name_len) { int fd; if (unit != HFI_UNIT_ID_ANY && unit >= 0) snprintf(dev_name, dev_name_len, "%s_%u", HFI_DEVICE_PATH_GEN1, unit); else snprintf(dev_name, dev_name_len, "%s_%u", HFI_DEVICE_PATH_GEN1, 0); if ((fd = open(dev_name, O_RDWR)) == -1) { _HFI_DBG("Can't open %s for reading and writing\n", dev_name); return -1; } if (fcntl(fd, F_SETFD, FD_CLOEXEC)) _HFI_INFO("Failed to set close on exec for device: %s\n", strerror(errno)); #ifdef PSM2_SUPPORT_IW_CMD_API { /* if hfi1DriverMajor == -1, then we are potentially talking to a new driver. Let's confirm by issuing an ioctl version request: */ struct hfi1_cmd c; memset(&c, 0, sizeof(struct hfi1_cmd)); c.type = PSMI_HFI_CMD_GET_VERS; c.len = 0; c.addr = 0; if (hfi_cmd_write(fd, &c, sizeof(c)) == -1) { /* Let's assume that the driver is the old driver */ hfi_set_user_major_version(IOCTL_CMD_API_MODULE_MAJOR - 1); /* the old driver uses write() for its command interface: */ _hfi_cmd_send = _hfi_cmd_write; } else { int major = c.addr >> HFI1_SWMAJOR_SHIFT; if (major != hfi_get_user_major_version()) { /* If there is a skew between the major version of the driver that is executing and the major version which was used during compilation of PSM, we treat that as a fatal error. */ _HFI_INFO("PSM2 and driver version mismatch: (%d != %d)\n", major, hfi_get_user_major_version()); close(fd); return -1; } } } #endif return fd; } /* * Check if non-double word multiple message size for SDMA is allowed to be * passed to the driver.
Starting from 6.2 driver version, PSM is able to pass * to the driver message which size is not a multiple of double word for SDMA. */ uint32_t hfi_check_non_dw_mul_sdma(void) { uint16_t major = hfi_get_user_major_version(); uint16_t minor = hfi_get_user_minor_version(); if ((major > HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED) || ((major == HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED) && (minor >= HFI1_USER_SWMINOR_NON_DW_MUL_MSG_SIZE_ALLOWED))) return 1; return 0; } void hfi_context_close(int fd) { (void)close(fd); } int hfi_cmd_writev(int fd, const struct iovec *iov, int iovcnt) { return writev(fd, iov, iovcnt); } int hfi_cmd_write(int fd, struct hfi1_cmd *cmd, size_t count) { return _hfi_cmd_send(fd, cmd, count); } ustatic int _hfi_cmd_write(int fd, struct hfi1_cmd *cmd, size_t count) { const static unsigned int cmdTypeToWriteNum[PSMI_HFI_CMD_LAST] = { [PSMI_HFI_CMD_ASSIGN_CTXT] = LEGACY_HFI1_CMD_ASSIGN_CTXT, [PSMI_HFI_CMD_CTXT_INFO] = LEGACY_HFI1_CMD_CTXT_INFO, [PSMI_HFI_CMD_USER_INFO] = LEGACY_HFI1_CMD_USER_INFO, [PSMI_HFI_CMD_TID_UPDATE] = LEGACY_HFI1_CMD_TID_UPDATE, [PSMI_HFI_CMD_TID_FREE] = LEGACY_HFI1_CMD_TID_FREE, [PSMI_HFI_CMD_CREDIT_UPD] = LEGACY_HFI1_CMD_CREDIT_UPD, [PSMI_HFI_CMD_RECV_CTRL] = LEGACY_HFI1_CMD_RECV_CTRL, [PSMI_HFI_CMD_POLL_TYPE] = LEGACY_HFI1_CMD_POLL_TYPE, [PSMI_HFI_CMD_ACK_EVENT] = LEGACY_HFI1_CMD_ACK_EVENT, [PSMI_HFI_CMD_SET_PKEY] = LEGACY_HFI1_CMD_SET_PKEY, [PSMI_HFI_CMD_CTXT_RESET] = LEGACY_HFI1_CMD_CTXT_RESET, [PSMI_HFI_CMD_TID_INVAL_READ] = LEGACY_HFI1_CMD_TID_INVAL_READ, [PSMI_HFI_CMD_GET_VERS] = LEGACY_HFI1_CMD_GET_VERS, }; if (cmd->type < PSMI_HFI_CMD_LAST) { cmd->type = cmdTypeToWriteNum[cmd->type]; return psmi_write(fd, cmd, count); } else { errno = EINVAL; return -1; } } #ifdef PSM2_SUPPORT_IW_CMD_API ustatic int _hfi_cmd_ioctl(int fd, struct hfi1_cmd *cmd, size_t count) { uint64_t addrOrLiteral[2] = { (uint64_t)cmd->addr, (uint64_t)&cmd->addr }; const static struct { unsigned int ioctlCmd; unsigned int addrOrLiteralIdx; } cmdTypeToIoctlNum[PSMI_HFI_CMD_LAST] = { [PSMI_HFI_CMD_ASSIGN_CTXT] = {HFI1_IOCTL_ASSIGN_CTXT , 0}, [PSMI_HFI_CMD_CTXT_INFO] = {HFI1_IOCTL_CTXT_INFO , 0}, [PSMI_HFI_CMD_USER_INFO] = {HFI1_IOCTL_USER_INFO , 0}, [PSMI_HFI_CMD_TID_UPDATE] = {HFI1_IOCTL_TID_UPDATE , 0}, [PSMI_HFI_CMD_TID_FREE] = {HFI1_IOCTL_TID_FREE , 0}, [PSMI_HFI_CMD_CREDIT_UPD] = {HFI1_IOCTL_CREDIT_UPD , 1}, [PSMI_HFI_CMD_RECV_CTRL] = {HFI1_IOCTL_RECV_CTRL , 1}, [PSMI_HFI_CMD_POLL_TYPE] = {HFI1_IOCTL_POLL_TYPE , 1}, [PSMI_HFI_CMD_ACK_EVENT] = {HFI1_IOCTL_ACK_EVENT , 1}, [PSMI_HFI_CMD_SET_PKEY] = {HFI1_IOCTL_SET_PKEY , 1}, [PSMI_HFI_CMD_CTXT_RESET] = {HFI1_IOCTL_CTXT_RESET , 1}, [PSMI_HFI_CMD_TID_INVAL_READ] = {HFI1_IOCTL_TID_INVAL_READ, 0}, [PSMI_HFI_CMD_GET_VERS] = {HFI1_IOCTL_GET_VERS , 1}, #ifdef PSM_CUDA [PSMI_HFI_CMD_TID_UPDATE_V2] = {HFI1_IOCTL_TID_UPDATE_V2 , 0}, #endif }; if (cmd->type < PSMI_HFI_CMD_LAST) return psmi_ioctl(fd, cmdTypeToIoctlNum[cmd->type].ioctlCmd, addrOrLiteral[cmdTypeToIoctlNum[cmd->type].addrOrLiteralIdx]); else { errno = EINVAL; return -1; } } #endif /* #ifdef PSM2_SUPPORT_IW_CMD_API */ /* we use mmap64() because we compile in both 32 and 64 bit mode, and we have to map physical addresses that are > 32 bits long. While linux implements mmap64, it doesn't have a man page, and isn't declared in any header file, so we declare it here ourselves. 
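(As a usage illustration with made-up numbers: given an open device descriptor fd, mapping one page at a device offset past 4 GiB, say

    __off64_t off = (__off64_t)1 << 33;   // 8 GiB: not representable in a 32-bit off_t
    void *p = hfi_mmap64(NULL, 4096, PROT_READ, MAP_SHARED, fd, off);

is exactly the case a plain 32-bit mmap() cannot express; hfi_mmap64() below simply forwards to mmap64.)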
We'd like to just use -D_LARGEFILE64_SOURCE, to make off_t 64 bits and redirect mmap to mmap64 for us, but at least through suse10 and fc4, it doesn't work when the address being mapped is > 32 bits. It chips off bits 32 and above. So we stay with mmap64. */ void *hfi_mmap64(void *addr, size_t length, int prot, int flags, int fd, __off64_t offset) { return mmap64(addr, length, prot, flags, fd, offset); } /* get the number of units supported by the driver. Does not guarantee */ /* that a working chip has been found for each possible unit #. */ /* number of units >=0 (0 means none found). */ /* formerly used sysfs file "num_units" */ int hfi_get_num_units(void) { int ret; for (ret = 0;; ret++) { char pathname[PATH_MAX]; struct stat st; int r; snprintf(pathname, sizeof(pathname), HFI_DEVICE_PATH_GEN1 "_%d", ret); r = stat(pathname, &st); if (!r) continue; else break; } return ret; } /* Given a unit number, returns 1 if any port on the unit is active. returns 0 if no port on the unit is active. returns -1 when an error occurred. */ int hfi_get_unit_active(int unit) { int p,rv; for (p = HFI_MIN_PORT; p <= HFI_MAX_PORT; p++) if ((rv=hfi_get_port_lid(unit, p)) > 0) break; if (p <= HFI_MAX_PORT) { return 1; } return rv; } /* get the number of contexts from the unit id. */ /* Returns 0 if no unit or no match. */ int hfi_get_num_contexts(int unit_id) { int n = 0; int units; int64_t val; uint32_t p = HFI_MIN_PORT; units = hfi_get_num_units(); if_pf(units <= 0) return 0; if (unit_id == HFI_UNIT_ID_ANY) { uint32_t u; for (u = 0; u < units; u++) { for (p = HFI_MIN_PORT; p <= HFI_MAX_PORT; p++) if (hfi_get_port_lid(u, p) > 0) break; if (p <= HFI_MAX_PORT && !hfi_sysfs_unit_read_s64(u, "nctxts", &val, 0)) n += (uint32_t) val; } } else { for (; p <= HFI_MAX_PORT; p++) if (hfi_get_port_lid(unit_id, p) > 0) break; if (p <= HFI_MAX_PORT && !hfi_sysfs_unit_read_s64(unit_id, "nctxts", &val, 0)) n += (uint32_t) val; } return n; } /* Given a unit number and port number, returns 1 if the unit and port are active. returns 0 if the unit and port are not active. returns -1 when an error occurred. */ int hfi_get_port_active(int unit, int port) { int ret; char *state; ret = hfi_sysfs_port_read(unit, port, "phys_state", &state); if (ret == -1) { if (errno == ENODEV) /* this is "normal" for port != 1, on single port chips */ _HFI_VDBG ("Failed to get phys_state for unit %u:%u: %s\n", unit, port, strerror(errno)); else _HFI_DBG ("Failed to get phys_state for unit %u:%u: %s\n", unit, port, strerror(errno)); return -1; } else { if (strncmp(state, "5: LinkUp", 9)) { _HFI_DBG("Link is not Up for unit %u:%u\n", unit, port); free(state); return 0; } free(state); return 1; } } /* Given the unit number, return an error, or the corresponding LID. For now, it's used only so the MPI code can determine its own LID, and which other LIDs (if any) are also assigned to this node. Returns an int, so -1 indicates an error. 0 may indicate that the unit is valid, but no LID has been assigned.
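(A caller therefore has to tell four outcomes apart; an illustrative probe over the helpers in this file:

    int unit, lid, units = hfi_get_num_units();
    for (unit = 0; unit < units; unit++) {
        lid = hfi_get_port_lid(unit, 1);    // gen1 hardware exposes only port 1
        if (lid > 0)
            break;                          // active port with an assigned LID
        // lid == 0: port active but the SM has not assigned a LID yet
        // lid == -2: port not active; lid == -1: query error
    }

)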
No error print because we call this for both potential ports without knowing if both ports exist (or are connected) */ int hfi_get_port_lid(int unit, int port) { int ret; int64_t val; if (hfi_get_port_active(unit,port) != 1) return -2; ret = hfi_sysfs_port_read_s64(unit, port, "lid", &val, 0); _HFI_VDBG("hfi_get_port_lid: ret %d, unit %d port %d\n", ret, unit, port); if (ret == -1) { if (errno == ENODEV) /* this is "normal" for port != 1, on single port chips */ _HFI_VDBG("Failed to get LID for unit %u:%u: %s\n", unit, port, strerror(errno)); else _HFI_DBG("Failed to get LID for unit %u:%u: %s\n", unit, port, strerror(errno)); } else { ret = val; /* disable this feature since we don't have a way to provide file descriptor in multiple context case. */ #if 0 if (getenv("HFI_DIAG_LID_LOOP")) { /* provides diagnostic ability to run MPI, etc. even */ /* on loopback, by claiming a different LID for each context */ struct hfi1_ctxt_info info; struct hfi1_cmd cmd; cmd.type = PSMI_HFI_CMD_CTXT_INFO; cmd.cmd.ctxt_info = (uintptr_t) &info; if (__hfi_lastfd == -1) _HFI_INFO ("Can't run CONTEXT_INFO for lid_loop, fd not set\n"); else if (write(__hfi_lastfd, &cmd, sizeof(cmd)) == -1) _HFI_INFO("CONTEXT_INFO command failed: %s\n", strerror(errno)); else if (!info.context) _HFI_INFO("CONTEXT_INFO returned context 0!\n"); else { _HFI_PRDBG ("Using lid 0x%x, base %x, context %x\n", ret + info.context, ret, info.context); ret += info.context; } } #endif } return ret; } /* Given the unit number, return an error, or the corresponding GID For now, it's used only so the MPI code can determine its fabric ID. Returns an int, so -1 indicates an error. No error print because we call this for both potential ports without knowing if both ports exist (or are connected) */ int hfi_get_port_gid(int unit, int port, uint64_t *hi, uint64_t *lo) { int ret; char *gid_str = NULL; ret = hfi_sysfs_port_read(unit, port, "gids/0", &gid_str); if (ret == -1) { if (errno == ENODEV) /* this is "normal" for port != 1, on single * port chips */ _HFI_VDBG("Failed to get GID for unit %u:%u: %s\n", unit, port, strerror(errno)); else _HFI_DBG("Failed to get GID for unit %u:%u: %s\n", unit, port, strerror(errno)); } else { uint32_t gid[8]; if (sscanf(gid_str, "%4x:%4x:%4x:%4x:%4x:%4x:%4x:%4x", &gid[0], &gid[1], &gid[2], &gid[3], &gid[4], &gid[5], &gid[6], &gid[7]) != 8) { _HFI_DBG("Failed to parse GID for unit %u:%u: %s\n", unit, port, gid_str); ret = -1; } else { *hi = (((uint64_t) gid[0]) << 48) | (((uint64_t) gid[1]) << 32) | (((uint64_t) gid[2]) << 16) | (((uint64_t) gid[3]) << 0); *lo = (((uint64_t) gid[4]) << 48) | (((uint64_t) gid[5]) << 32) | (((uint64_t) gid[6]) << 16) | (((uint64_t) gid[7]) << 0); } free(gid_str); } return ret; } /* Given the unit number, return an error, or the corresponding LMC value for the port */ /* Returns an int, so -1 indicates an error. 0 */ int hfi_get_port_lmc(int unit, int port) { int ret; int64_t val; ret = hfi_sysfs_port_read_s64(unit, port, "lid_mask_count", &val, 0); if (ret == -1) { _HFI_INFO("Failed to get LMC for unit %u:%u: %s\n", unit, port, strerror(errno)); } else ret = val; return ret; } /* Given the unit number, return an error, or the corresponding link rate for the port */ /* Returns an int, so -1 indicates an error. 
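(The sysfs "rate" attribute is a human-readable string whose exact wording is driver dependent; something of the shape "100 Gb/sec (4X 25.78125 Gbps)" is typical, and that string is only an example here, not a guaranteed format. The implementation below leans on that shape:

    char *end;
    double r = strtod("100 Gb/sec (4X 25.78125 Gbps)", &end);   // r == 100.0, end points at " Gb/sec..."

strtod() yields the leading number and ignores the rest, and the ((int)(rate * 2) >> 1) step then reduces it to a whole number of Gb/sec.)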
*/ int hfi_get_port_rate(int unit, int port) { int ret; double rate; char *data_rate = NULL, *newptr; ret = hfi_sysfs_port_read(unit, port, "rate", &data_rate); if (ret == -1) goto get_port_rate_error; else { rate = strtod(data_rate, &newptr); if ((rate == 0) && (data_rate == newptr)) goto get_port_rate_error; } free(data_rate); return ((int)(rate * 2) >> 1); get_port_rate_error: _HFI_INFO("Failed to get link rate for unit %u:%u: %s\n", unit, port, strerror(errno)); return ret; } /* Given a unit, port and SL, return an error, or the corresponding SC for the SL as programmed by the SM */ /* Returns an int, so -1 indicates an error. */ int hfi_get_port_sl2sc(int unit, int port, int sl) { int ret; int64_t val; char sl2scpath[16]; snprintf(sl2scpath, sizeof(sl2scpath), "sl2sc/%d", sl); ret = hfi_sysfs_port_read_s64(unit, port, sl2scpath, &val, 0); if (ret == -1) { _HFI_DBG ("Failed to get SL2SC mapping for SL %d unit %u:%u: %s\n", sl, unit, port, strerror(errno)); } else ret = val; return ret; } /* Given a unit, port and SC, return an error, or the corresponding VL for the SC as programmed by the SM */ /* Returns an int, so -1 indicates an error. */ int hfi_get_port_sc2vl(int unit, int port, int sc) { int ret; int64_t val; char sc2vlpath[16]; snprintf(sc2vlpath, sizeof(sc2vlpath), "sc2vl/%d", sc); ret = hfi_sysfs_port_read_s64(unit, port, sc2vlpath, &val, 0); if (ret == -1) { _HFI_DBG ("Failed to get SC2VL mapping for SC %d unit %u:%u: %s\n", sc, unit, port, strerror(errno)); } else ret = val; return ret; } /* Given a unit, port and VL, return an error, or the corresponding MTU for the VL as programmed by the SM */ /* Returns an int, so -1 indicates an error. */ int hfi_get_port_vl2mtu(int unit, int port, int vl) { int ret; int64_t val; char vl2mtupath[16]; snprintf(vl2mtupath, sizeof(vl2mtupath), "vl2mtu/%d", vl); ret = hfi_sysfs_port_read_s64(unit, port, vl2mtupath, &val, 0); if (ret == -1) { _HFI_DBG ("Failed to get VL2MTU mapping for VL %d unit %u:%u: %s\n", vl, unit, port, strerror(errno)); } else ret = val; return ret; } /* Given a unit, port and index, return an error, or the corresponding pkey value for the index as programmed by the SM */ /* Returns an int, so -1 indicates an error. */ int hfi_get_port_index2pkey(int unit, int port, int index) { int ret; int64_t val; char index2pkeypath[16]; snprintf(index2pkeypath, sizeof(index2pkeypath), "pkeys/%d", index); ret = hfi_sysfs_port_read_s64(unit, port, index2pkeypath, &val, 0); if (ret == -1) { _HFI_DBG ("Failed to get index2pkey mapping for index %d unit %u:%u: %s\n", index, unit, port, strerror(errno)); } else ret = val; return ret; } int hfi_get_cc_settings_bin(int unit, int port, char *ccabuf, size_t len_ccabuf) { int fd; /* * 4 bytes for 'control map' * 2 bytes 'port control' * 32 (#SLs) * 6 bytes 'congestion setting' (per-SL) */ const size_t count = 4 + 2 + (32 * 6); if (count > len_ccabuf) return -2; /* * Check qib driver CCA setting, and try to use it if available. * Fall to self CCA setting if errors. */ if (snprintf(ccabuf, len_ccabuf, "%s%d/ports/%d/CCMgtA/cc_settings_bin", hfi_sysfs_path(), unit, port) >= (len_ccabuf-1)) return -1; fd = open(ccabuf, O_RDONLY); if (fd < 0) { return 0; } if (read(fd, ccabuf, count) != count) { _HFI_CCADBG("Read cc_settings_bin failed. 
using static CCA\n"); close(fd); return 0; } close(fd); return 1; } int hfi_get_cc_table_bin(int unit, int port, uint16_t **cctp) { int i; unsigned short ccti_limit; uint16_t *cct; int fd; char pathname[256]; *cctp = NULL; if (snprintf(pathname,sizeof(pathname), "%s%d/ports/%d/CCMgtA/cc_table_bin", hfi_sysfs_path(), unit, port) >= (sizeof(pathname)-1)) return -1; fd = open(pathname, O_RDONLY); if (fd < 0) { _HFI_CCADBG("Open cc_table_bin failed. using static CCA\n"); return 0; } if (read(fd, &ccti_limit, sizeof(ccti_limit)) != sizeof(ccti_limit)) { _HFI_CCADBG("Read ccti_limit failed. using static CCA\n"); close(fd); return 0; } _HFI_CCADBG("ccti_limit = %d\n", ccti_limit); if (ccti_limit < 63) { _HFI_CCADBG("Read ccti_limit %d not in range [63, 65535], " "using static CCA.\n", ccti_limit); close(fd); return 0; } i = (ccti_limit + 1) * sizeof(uint16_t); cct = malloc(i); if (!cct) { close(fd); return -1; } if (read(fd, cct, i) != i) { _HFI_CCADBG("Read ccti_entry_list, using static CCA\n"); free(cct); close(fd); return 0; } close(fd); _HFI_CCADBG("cct[0] = 0x%04x\n", cct[0]); *cctp = cct; return ccti_limit; } /* * This is for diag function hfi_wait_for_packet() only */ int hfi_cmd_wait_for_packet(int fd) { int ret; struct pollfd pfd; pfd.fd = fd; pfd.events = POLLIN; ret = poll(&pfd, 1, 500 /* ms */); return ret; } opa-psm2-PSM2_11.2.185/psm_hal_gen1/opa_service_gen1.h000066400000000000000000000274131370564314600221470ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef OPA_SERVICE_GEN1_H #define OPA_SERVICE_GEN1_H /* This file contains all the lowest level routines calling into sysfs */ /* and qib driver. All other calls are based on these routines. */ #include #include "opa_intf.h" #include "opa_common_gen1.h" #include "opa_udebug.h" #include "opa_byteorder.h" /* upper and lower bounds for HFI port numbers */ #define HFI_MIN_PORT 1 #define HFI_MAX_PORT 1 #ifndef HFI_NUM_PORTS_GEN1 #define HFI_NUM_PORTS_GEN1 (HFI_MAX_PORT - HFI_MIN_PORT + 1) #endif /* any unit id to match. */ #define HFI_UNIT_ID_ANY ((long)-1) /* any port num to match. */ #define HFI_PORT_NUM_ANY ((long)0) /* base name of path (without unit #) for qib driver */ #ifndef HFI_DEVICE_PATH_GEN1 #define HFI_DEVICE_PATH_GEN1 "/dev/hfi1" #endif #ifdef PSM_CUDA #define GDR_DEVICE_PATH "/dev/hfi1_gdr" #endif /* The major and minor versions of driver that support non-DW multiple SDMA */ #define HFI1_USER_SWMAJOR_NON_DW_MUL_MSG_SIZE_ALLOWED 6 #define HFI1_USER_SWMINOR_NON_DW_MUL_MSG_SIZE_ALLOWED 2 /* Commands used to communicate with driver. */ enum PSMI_HFI_CMD { PSMI_HFI_CMD_ASSIGN_CTXT = 0, /* allocate HFI and context */ PSMI_HFI_CMD_CTXT_INFO, /* find out what resources we got */ PSMI_HFI_CMD_USER_INFO, /* set up userspace */ PSMI_HFI_CMD_TID_UPDATE, /* update expected TID entries */ PSMI_HFI_CMD_TID_FREE, /* free expected TID entries */ PSMI_HFI_CMD_CREDIT_UPD, /* force an update of PIO credit */ PSMI_HFI_CMD_RECV_CTRL, /* control receipt of packets */ PSMI_HFI_CMD_POLL_TYPE, /* set the kind of polling we want */ PSMI_HFI_CMD_ACK_EVENT, /* ack & clear user status bits */ PSMI_HFI_CMD_SET_PKEY, /* set context's pkey */ PSMI_HFI_CMD_CTXT_RESET, /* reset context's HW send context */ PSMI_HFI_CMD_TID_INVAL_READ, /* read TID cache invalidations */ PSMI_HFI_CMD_GET_VERS, /* get the version of the user cdev */ #ifdef PSM_CUDA PSMI_HFI_CMD_TID_UPDATE_V2 = 28, #endif PSMI_HFI_CMD_LAST, }; /* Legacy commands used to communicate with driver using 'write' */ enum LEGACY_HFI1_CMD { LEGACY_HFI1_CMD_ASSIGN_CTXT = 1, /* allocate HFI and context */ LEGACY_HFI1_CMD_CTXT_INFO = 2, /* find out what resources we got */ LEGACY_HFI1_CMD_USER_INFO = 3, /* set up userspace */ LEGACY_HFI1_CMD_TID_UPDATE = 4, /* update expected TID entries */ LEGACY_HFI1_CMD_TID_FREE = 5, /* free expected TID entries */ LEGACY_HFI1_CMD_CREDIT_UPD = 6, /* force an update of PIO credit */ LEGACY_HFI1_CMD_RECV_CTRL = 8, /* control receipt of packets */ LEGACY_HFI1_CMD_POLL_TYPE = 9, /* set the kind of polling we want */ LEGACY_HFI1_CMD_ACK_EVENT = 10, /* ack & clear user status bits */ LEGACY_HFI1_CMD_SET_PKEY = 11, /* set context's pkey */ LEGACY_HFI1_CMD_CTXT_RESET = 12, /* reset context's HW send context */ LEGACY_HFI1_CMD_TID_INVAL_READ = 13, /* read TID cache invalidations */ LEGACY_HFI1_CMD_GET_VERS = 14 /* get the version of the user cdev */ }; /* Given a unit number and port number, returns 1 if the unit and port are active. returns 0 if the unit and port are not active. returns -1 when an error occurred. 
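(Combined with the LID and GID getters declared below, a minimal "is this port usable" probe could read, purely as an illustration:

    uint64_t hi, lo;
    int usable = hfi_get_port_active(unit, port) == 1
              && hfi_get_port_lid(unit, port) > 0
              && hfi_get_port_gid(unit, port, &hi, &lo) != -1;

with unit and port supplied by the caller.)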
*/ int hfi_get_port_active(int, int); /* Given the unit number and port, return an error, or the corresponding LID */ /* Returns an int, so -1 indicates a general error. -2 indicates that the unit/port are not active. 0 indicates that the unit is valid, but no LID has been assigned. */ int hfi_get_port_lid(int, int); /* Given the unit number and port, return an error, or the corresponding GID */ /* Returns an int, so -1 indicates an error. */ int hfi_get_port_gid(int, int, uint64_t *hi, uint64_t *lo); /* Given the unit number, return an error, or the corresponding LMC value for the port */ /* Returns an int, so -1 indicates an error. 0 */ int hfi_get_port_lmc(int unit, int port); /* Given the unit number, return an error, or the corresponding link rate for the port */ /* Returns an int, so -1 indicates an error. */ int hfi_get_port_rate(int unit, int port); /* Given a unit, port and SL, return an error, or the corresponding SC for the SL as programmed by the SM */ /* Returns an int, so -1 indicates an error. */ int hfi_get_port_sl2sc(int unit, int port, int sl); /* Given a unit, port and SC, return an error, or the corresponding VL for the SC as programmed by the SM */ /* Returns an int, so -1 indicates an error. */ int hfi_get_port_sc2vl(int unit, int port, int sc); /* Given a unit, port and VL, return an error, or the corresponding MTU for the VL as programmed by the SM */ /* Returns an int, so -1 indicates an error. */ int hfi_get_port_vl2mtu(int unit, int port, int vl); /* Given a unit, port and index, return an error, or the corresponding pkey for the index as programmed by the SM */ /* Returns an int, so -1 indicates an error. */ int hfi_get_port_index2pkey(int unit, int port, int index); /* Get the number of units supported by the driver. Does not guarantee that a working chip has been found for each possible unit #. Returns -1 with errno set, or number of units >=0 (0 means none found). */ int hfi_get_num_units(); /* Given a unit number, returns 1 if any port on the unit is active. returns 0 if no port on the unit is active. returns -1 when an error occurred. */ int hfi_get_unit_active(int unit); /* get the number of contexts from the unit id. */ int hfi_get_num_contexts(int unit); /* Open hfi device file, return -1 on error. */ int hfi_context_open(int unit, int port, uint64_t open_timeout); int hfi_context_open_ex(int unit, int port, uint64_t open_timeout, char *dev_name,size_t dev_name_len); uint32_t hfi_check_non_dw_mul_sdma(void); void hfi_context_close(int fd); /* hfi_get_user_major_version() returns the major version of the driver that should be used for this session of psm. Valid only after hfi_context_open has been called. */ uint16_t hfi_get_user_major_version(void); /* hfi_get_user_minor_version() return the minor version of the driver */ uint16_t hfi_get_user_minor_version(void); void hfi_set_user_version(uint32_t version); void hfi_set_user_major_version(uint16_t major_version); int hfi_cmd_write(int fd, struct hfi1_cmd *, size_t count); int hfi_cmd_writev(int fd, const struct iovec *iov, int iovcnt); /* hfi_get_cc_settings_bin() returns less than or equal to 0 on failure, returns greater than 0 on success. */ int hfi_get_cc_settings_bin(int unit, int port, char *ccabuf, size_t len_ccabuf); int hfi_get_cc_table_bin(int unit, int port, uint16_t **cctp); /* We use mmap64() because we compile in both 32 and 64 bit mode, and we have to map physical addresses that are > 32 bits long. 
While linux implements mmap64, it doesn't have a man page, and isn't declared in any header file, so we declare it here ourselves. */ /* We'd like to just use -D_LARGEFILE64_SOURCE, to make off_t 64 bits and redirects mmap to mmap64 for us, but at least through suse10 and fc4, it doesn't work when the address being mapped is > 32 bits. It chips off bits 32 and above. So we stay with mmap64. */ extern void *mmap64(void *, size_t, int, int, int, __off64_t); void *hfi_mmap64(void *, size_t, int, int, int, __off64_t); /* Statistics maintained by the driver */ int hfi_get_stats(uint64_t *, int); int hfi_get_stats_names(char **namep); /* Counters maintained in the chip, globally, and per-prot */ int hfi_get_ctrs_unit(int unitno, uint64_t *, int); int hfi_get_ctrs_unit_names(int unitno, char **namep); int hfi_get_ctrs_port(int unitno, int port, uint64_t *, int); int hfi_get_ctrs_port_names(int unitno, char **namep); /* sysfs helper routines (only those currently used are exported; * try to avoid using others) */ const char *hfi_sysfs_path(void); /* read a string value */ int hfi_sysfs_port_read(uint32_t unit, uint32_t port, const char *attr, char **datap); /* read a string value into buff, no more than size bytes. returns the number of bytes read */ size_t hfi_sysfs_unit_port_read(uint32_t unit, uint32_t port, const char *attr, char *buff, size_t size); /* open attribute in unit's sysfs directory via open(2) */ int hfi_sysfs_unit_open(uint32_t unit, const char *attr, int flags); int hfi_sysfs_port_open(uint32_t unit, uint32_t port, const char *attr, int flags); /* print to attribute in {unit,port} sysfs directory */ int hfi_sysfs_port_printf(uint32_t unit, uint32_t port, const char *attr, const char *fmt, ...) __attribute__((format(printf, 4, 5))); int hfi_sysfs_unit_printf(uint32_t unit, const char *attr, const char *fmt, ...) __attribute__((format(printf, 3, 4))); int hfi_hfifs_unit_write(uint32_t unit, const char *attr, const void *data, size_t len); /* read up to one page of malloc'ed data (caller must free), returning number of bytes read or -1 */ int hfi_hfifs_read(const char *attr, char **datap); int hfi_hfifs_unit_read(uint32_t unit, const char *attr, char **data); /* read a signed 64-bit quantity, in some arbitrary base */ int hfi_sysfs_unit_read_s64(uint32_t unit, const char *attr, int64_t *valp, int base); int hfi_sysfs_port_read_s64(uint32_t unit, uint32_t port, const char *attr, int64_t *valp, int base); int64_t hfi_sysfs_unit_read_node_s64(uint32_t unit); /* these read directly into supplied buffer and take a count */ int hfi_hfifs_rd(const char *, void *, int); int hfi_hfifs_unit_rd(uint32_t unit, const char *, void *, int); int hfi_hfifs_open(const char *relname, int flags); int hfi_cmd_wait_for_packet(int fd); #endif /* OPA_SERVICE_GEN1_H */ opa-psm2-PSM2_11.2.185/psm_hal_gen1/opa_user_gen1.h000066400000000000000000000554011370564314600214630ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef OPA_USER_GEN1_H #define OPA_USER_GEN1_H /* This file contains all of the data structures and routines that are publicly visible and usable (to low level infrastructure code; it is not expected that any application, or even normal application-level library, will ever need to use any of this). Additional entry points and data structures that are used by these routines may be referenced in this file, but they should not be generally available; they are visible here only to allow use in inlined functions. Any variable, data structure, or function that starts with a leading "_" is in this category. */ /* Include header files we need that are unlikely to otherwise be needed by */ /* programs. 
*/ #include <stdint.h> #include <stdlib.h> #include <stdio.h> #include <string.h> #include <unistd.h> #include <errno.h> #include <limits.h> #include <sys/types.h> #include <sys/stat.h> #include <sys/mman.h> #include "opa_intf.h" #include "opa_common_gen1.h" #include "opa_byteorder.h" #include "opa_udebug.h" #include "opa_service_gen1.h" #include "opa_user.h" #define HFI_RHF_USE_EGRBFR_MASK 0x1 #define HFI_RHF_USE_EGRBFR_SHIFT 15 #define HFI_RHF_EGRBFR_INDEX_MASK 0x7FF #define HFI_RHF_EGRBFR_INDEX_SHIFT 16 #define HFI_RHF_SEQ_MASK 0xF #define HFI_RHF_SEQ_SHIFT 28 #define HFI_RHF_EGRBFR_OFFSET_MASK 0xFFF #define HFI_RHF_EGRBFR_OFFSET_SHIFT 0 #define HFI_RHF_HDRQ_OFFSET_MASK 0x1FF #define HFI_RHF_HDRQ_OFFSET_SHIFT 12 #define HFI_RHF_TIDERR 0x08000000 /* TidFlow related bits */ #define HFI_TF_SEQNUM_SHIFT 0 #define HFI_TF_SEQNUM_MASK 0x7ff #define HFI_TF_GENVAL_SHIFT 11 #define HFI_TF_GENVAL_MASK 0xfffff #define HFI_TF_FLOWVALID_SHIFT 32 #define HFI_TF_FLOWVALID_MASK 0x1 #define HFI_TF_HDRSUPP_ENABLED_SHIFT 33 #define HFI_TF_HDRSUPP_ENABLED_MASK 0x1 #define HFI_TF_KEEP_AFTER_SEQERR_SHIFT 34 #define HFI_TF_KEEP_AFTER_SEQERR_MASK 0x1 #define HFI_TF_KEEP_ON_GENERR_SHIFT 35 #define HFI_TF_KEEP_ON_GENERR_MASK 0x1 #define HFI_TF_KEEP_PAYLOAD_ON_GENERR_SHIFT 36 #define HFI_TF_KEEP_PAYLOAD_ON_GENERR_MASK 0x1 #define HFI_TF_STATUS_SEQMISMATCH_SHIFT 37 #define HFI_TF_STATUS_SEQMISMATCH_MASK 0x1 #define HFI_TF_STATUS_GENMISMATCH_SHIFT 38 #define HFI_TF_STATUS_GENMISMATCH_MASK 0x1 /* PBC bits */ #define HFI_PBC_STATICRCC_SHIFT 0 #define HFI_PBC_STATICRCC_MASK 0xffff #define HFI_PBC_SC4_SHIFT 4 #define HFI_PBC_SC4_MASK 0x1 #define HFI_PBC_INTR_SHIFT 31 #define HFI_PBC_DCINFO_SHIFT 30 #define HFI_PBC_TESTEBP_SHIFT 29 #define HFI_PBC_PACKETBYPASS_SHIFT 28 #define HFI_PBC_INSERTHCRC_SHIFT 26 #define HFI_PBC_INSERTHCRC_MASK 0x3 #define HFI_PBC_CREDITRETURN_SHIFT 25 #define HFI_PBC_INSERTBYPASSICRC_SHIFT 24 #define HFI_PBC_TESTBADICRC_SHIFT 23 #define HFI_PBC_FECN_SHIFT 22 #define HFI_PBC_VL_SHIFT 12 #define HFI_PBC_VL_MASK 0xf #define HFI_PBC_LENGTHDWS_SHIFT 0 #define HFI_PBC_LENGTHDWS_MASK 0xfff /* this portion only defines what we currently use */ struct hfi_pbc { __u32 pbc0; __u16 PbcStaticRateControlCnt; __u16 fill1; }; typedef enum mapsize { SC_CREDITS, PIO_BUFBASE_SOP, PIO_BUFBASE, RCVHDR_BUFBASE, RCVEGR_BUFBASE, SDMA_COMP_BUFBASE, USER_REGBASE, RCVHDRTAIL_BASE, EVENTS_BUFBASE, STATUS_BUFBASE, SUBCTXT_UREGBASE, SUBCTXT_RCVHDRBUF, SUBCTXT_RCVEGRBUF, MAPSIZE_MAX } mapsize_t; /* TODO: consider casting in the ALIGN() macro */ #define ALIGN(x, a) (((x)+(a)-1)&~((a)-1)) #define ALIGNDOWN_PTR(x, a) ((void*)(((uintptr_t)(x))&~((uintptr_t)((a)-1)))) /* using the same flags for all the mappings */ #define HFI_MMAP_FLAGS (MAP_SHARED|MAP_LOCKED) #define HFI_MMAP_PGSIZE sysconf(_SC_PAGESIZE) /* cast to uintptr_t as opposed to intptr_t which evaluates to a signed type * * on which one should not perform bitwise operations (undefined behavior) * */ #define HFI_MMAP_PGMASK (~(uintptr_t)(HFI_MMAP_PGSIZE-1)) /* this is only an auxiliary macro for HFI_MMAP_ERRCHECK() * @off expected to be unsigned in order to AND with the page mask and avoid undefined behavior */ #define U64_TO_OFF64_PGMASK(off) ((__off64_t)((off) & HFI_MMAP_PGMASK)) #define HFI_MMAP_ALIGNOFF(fd, off, size, prot) hfi_mmap64(0,(size),(prot),HFI_MMAP_FLAGS,(fd),U64_TO_OFF64_PGMASK((off))) /* complementary */ #define HFI_MUNMAP(addr, size) munmap((addr), (size)) /* make sure uintmax_t can hold the result of unsigned int multiplication */ #if UINT_MAX > (UINTMAX_MAX / UINT_MAX) #error We cannot safely multiply unsigned integers on this platform #endif /* @member assumed
to be of type u64 and validated to be so */ #define HFI_MMAP_ERRCHECK(fd, binfo, member, size, prot) ({ \ typeof((binfo)->member) *__tptr = (__u64 *)NULL; \ (void)__tptr; \ void *__maddr = HFI_MMAP_ALIGNOFF((fd), (binfo)->member, (size), (prot)); \ do { \ if (unlikely(__maddr == MAP_FAILED)) { \ uintmax_t outval = (uintmax_t)((binfo)->member); \ _HFI_INFO("mmap of " #member " (0x%jx) size %zu failed: %s\n", \ outval, size, strerror(errno)); \ goto err_mmap_##member; \ } \ (binfo)->member = (__u64)__maddr; \ _HFI_VDBG(#member " mmap %jx successful\n", (uintmax_t)((binfo)->member)); \ } while(0); \ __maddr; \ }) /* assigns 0 to the member after unmapping */ #define HFI_MUNMAP_ERRCHECK(binfo, member, size) \ do { typeof((binfo)->member) *__tptr = (__u64 *)NULL; \ (void)__tptr; \ void *__addr = ALIGNDOWN_PTR((binfo)->member, HFI_MMAP_PGSIZE); \ if (unlikely( __addr == NULL || (munmap(__addr, (size)) == -1))) { \ _HFI_INFO("unmap of " #member " (%p) failed: %s\n", \ __addr, strerror(errno)); \ } \ else { \ _HFI_VDBG("unmap of " #member "(%p) succeeded\n", __addr); \ (binfo)->member = 0; \ } \ } while(0) #define HFI_PCB_SIZE_IN_BYTES 8 /* Usable bytes in header (hdrsize - lrh - bth) */ #define HFI_MESSAGE_HDR_SIZE_HFI (HFI_MESSAGE_HDR_SIZE-20) /* * SDMA includes 8B sdma hdr, 8B PBC, and message header. * If we are using GPU workloads, we need to set a new * "flags" member which takes another 2 bytes in the * sdma hdr. We let the driver know of these 2 extra bytes * at runtime when we set the length for the iovecs. */ #define HFI_SDMA_HDR_SIZE (8+8+56) static inline __u32 hfi_hdrget_seq(const __le32 *rbuf) { return (__le32_to_cpu(rbuf[0]) >> HFI_RHF_SEQ_SHIFT) & HFI_RHF_SEQ_MASK; } static inline __u32 hfi_hdrget_hdrq_offset(const __le32 *rbuf) { return (__le32_to_cpu(rbuf[1]) >> HFI_RHF_HDRQ_OFFSET_SHIFT) & HFI_RHF_HDRQ_OFFSET_MASK; } struct _hfi_ctrl { int32_t fd; /* device file descriptor */ /* tidflow valid */ uint32_t __hfi_tfvalid; /* unit id */ uint32_t __hfi_unit; /* port id */ uint32_t __hfi_port; /* number of eager tid entries */ uint32_t __hfi_tidegrcnt; /* number of expected tid entries */ uint32_t __hfi_tidexpcnt; /* effective mtu size, should be <= base_info.mtu */ uint32_t __hfi_mtusize; /* max PIO size, should be <= effective mtu size */ uint32_t __hfi_piosize; /* two structs output by the driver. */ struct hfi1_ctxt_info ctxt_info; struct hfi1_base_info base_info; /* local storage used in certain conditions: */ /* as storage of __hfi_rcvtidflow in hfi_userinit_internal(). */ __le64 regs[HFI_TF_NFLOWS]; /* location to which OPA writes the rcvhdrtail register whenever it changes, so that no chip registers are read in the performance path. */ volatile __le64 *__hfi_rcvtail; /* address where ur_rcvhdrtail is written */ volatile __le64 *__hfi_rcvhdrtail; /* address where ur_rcvhdrhead is written */ volatile __le64 *__hfi_rcvhdrhead; /* address where ur_rcvegrindextail is read */ volatile __le64 *__hfi_rcvegrtail; /* address where ur_rcvegrindexhead is written */ volatile __le64 *__hfi_rcvegrhead; /* address where ur_rcvegroffsettail is read */ volatile __le64 *__hfi_rcvofftail; /* address where ur_rcvtidflow is written */ volatile __le64 *__hfi_rcvtidflow; }; /* After the device is opened, hfi_userinit() is called to give the driver the parameters the user code wants to use, and to get the implementation values, etc. back. 0 is returned on success, a positive value is a standard errno, and a negative value is reserved for future use.
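A minimal open-and-init sketch (illustrative only: the unit 0 device path, the zeroed hfi1_user_info_dep, and the omitted error handling are all assumptions, not the library's prescribed sequence; real callers also fill in the version and subcontext fields of uinfo before the call):

	int fd = open("/dev/hfi1_0", O_RDWR);
	struct hfi1_user_info_dep uinfo;
	memset(&uinfo, 0, sizeof(uinfo));
	struct _hfi_ctrl *ctrl = hfi_userinit(fd, &uinfo);
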
The first argument is the file descriptor returned by the device open. It is allowed to have multiple devices (and of different types) simultaneously opened and initialized, although this won't be fully implemented initially. This routine is used by the low level hfi protocol code (and any other code that has similar low level functionality). This is the only routine that takes a file descriptor, rather than a struct _hfi_ctrl *. The struct _hfi_ctrl * used for everything else is returned by this routine. */ struct _hfi_ctrl *hfi_userinit(int32_t, struct hfi1_user_info_dep *); /* Internal function extends API, while original remains for backwards compatibility with external code */ struct _hfi_ctrl *hfi_userinit_internal(int32_t, bool, struct hfi1_user_info_dep *); /* don't inline these; it's all init code, and not inlining makes the */ /* overall code shorter and easier to debug */ void hfi_touch_mmap(void *, size_t) __attribute__ ((noinline)); /* set the BTH pkey to check for this process. */ /* This is for receive checks, not for sends. It isn't necessary to set the default key, that's always allowed by the hardware. If too many pkeys are in use for the hardware to support, this will return EAGAIN, and the caller should then fail and exit or use the default key and check the pkey in the received packet checking. */ /* set send context pkey to verify, error if driver is not configured with */ /* this pkey in its pkey table. */ int hfi_set_pkey(struct _hfi_ctrl *, uint16_t); int hfi_wait_for_packet(struct _hfi_ctrl *); /* New user event mechanism, using spi_sendbuf_status HFI_EVENT_* bits obsoletes hfi_disarm_bufs(), and extends it, although old mechanism remains for binary compatibility. */ int hfi_event_ack(struct _hfi_ctrl *ctrl, __u64 ackbits); /* set whether we want an interrupt on all packets, or just urgent ones */ int hfi_poll_type(struct _hfi_ctrl *ctrl, uint16_t poll_type); /* reset halted send context, error if context is not halted. */ int hfi_reset_context(struct _hfi_ctrl *ctrl); /* * Safe version of hfi_[d/q]wordcpy that is guaranteed to only copy each byte once. */ #if defined(__x86_64__) void hfi_dwordcpy_safe(volatile uint32_t *dest, const uint32_t *src, uint32_t ndwords); void hfi_qwordcpy_safe(volatile uint64_t *dest, const uint64_t *src, uint32_t nqwords); #else #define hfi_dwordcpy_safe hfi_dwordcpy #define hfi_qwordcpy_safe hfi_qwordcpy #endif static __inline__ void hfi_tidflow_set_entry(struct _hfi_ctrl *ctrl, uint32_t flowid, uint32_t genval, uint32_t seqnum) { /* For proper behavior with RSM interception of FECN packets for CCA, * the tidflow entry needs the KeepAfterSequenceError bit set. * A packet that is converted from expected to eager by RSM will not * trigger an update in the tidflow state. This will cause the tidflow * to incorrectly report a sequence error on any non-FECN packets that * arrive after the RSM intercepted packets. If the KeepAfterSequenceError * bit is set, PSM can properly detect this "false SeqErr" condition, * and recover without dropping packets. * Note that if CCA/RSM are not important, this change will slightly * increase the CPU load when packets are dropped. If this is significant, * consider hiding this change behind a CCA/RSM environment variable.
*/ ctrl->__hfi_rcvtidflow[flowid] = __cpu_to_le64( ((genval & HFI_TF_GENVAL_MASK) << HFI_TF_GENVAL_SHIFT) | ((seqnum & HFI_TF_SEQNUM_MASK) << HFI_TF_SEQNUM_SHIFT) | ((uint64_t)ctrl->__hfi_tfvalid << HFI_TF_FLOWVALID_SHIFT) | (1ULL << HFI_TF_HDRSUPP_ENABLED_SHIFT) | /* KeepAfterSequenceError = 1 -- previously was 0 */ (1ULL << HFI_TF_KEEP_AFTER_SEQERR_SHIFT) | (1ULL << HFI_TF_KEEP_ON_GENERR_SHIFT) | /* KeepPayloadOnGenErr = 0 */ (1ULL << HFI_TF_STATUS_SEQMISMATCH_SHIFT) | (1ULL << HFI_TF_STATUS_GENMISMATCH_SHIFT)); } static __inline__ void hfi_tidflow_reset(struct _hfi_ctrl *ctrl, uint32_t flowid, uint32_t genval, uint32_t seqnum) { /* * If a tidflow table entry is set to "Invalid", we want to drop the * header if the payload is dropped, and we want to get a header if the * payload is delivered. * * We set a tidflow table entry "Invalid" by setting FlowValid=1 and * GenVal=0x1FFF/0xFFFFF; this is a special generation number and no * packet will use this value. We don't care about SeqNum, but we set it * to 0x7FF. So if GenVal does not match, the payload is dropped because * KeepPayloadOnGenErr=0; for the packet header, KeepOnGenErr=0 makes sure * the header is not generated. But if a packet happens to have the special * generation number, the payload is delivered; HdrSuppEnabled=0 makes * sure a header is generated if SeqNum matches; if SeqNum does not match, * KeepAfterSeqErr=1 makes sure the header is generated. */ ctrl->__hfi_rcvtidflow[flowid] = __cpu_to_le64( /* genval = 0x1FFF or 0xFFFFF */ ((genval & HFI_TF_GENVAL_MASK) << HFI_TF_GENVAL_SHIFT) | /* seqnum = 0x7FF */ ((seqnum & HFI_TF_SEQNUM_MASK) << HFI_TF_SEQNUM_SHIFT) | ((uint64_t)ctrl->__hfi_tfvalid << HFI_TF_FLOWVALID_SHIFT) | /* HdrSuppEnabled = 0 */ (1ULL << HFI_TF_KEEP_AFTER_SEQERR_SHIFT) | /* KeepOnGenErr = 0 */ /* KeepPayloadOnGenErr = 0 */ (1ULL << HFI_TF_STATUS_SEQMISMATCH_SHIFT) | (1ULL << HFI_TF_STATUS_GENMISMATCH_SHIFT)); } /* * This should only be used for debugging. * Normally, we shouldn't read the chip.
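 * An illustrative debugging read (sketch; ctrl and flowid are assumed to
 * be a valid control struct and flow index):
 *   uint64_t v = hfi_tidflow_get(ctrl, flowid);
 *   _HFI_VDBG("flow %u: seq %u gen %u valid %u\n", flowid,
 *     hfi_tidflow_get_seqnum(v), hfi_tidflow_get_genval(v),
 *     hfi_tidflow_get_flowvalid(v));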
*/ static __inline__ uint64_t hfi_tidflow_get(struct _hfi_ctrl *ctrl, uint32_t flowid) { return __le64_to_cpu(ctrl->__hfi_rcvtidflow[flowid]); } static __inline__ uint32_t hfi_tidflow_get_seqnum(uint64_t val) { return (val >> HFI_TF_SEQNUM_SHIFT) & HFI_TF_SEQNUM_MASK; } static __inline__ uint32_t hfi_tidflow_get_genval(uint64_t val) { return (val >> HFI_TF_GENVAL_SHIFT) & HFI_TF_GENVAL_MASK; } static __inline__ uint32_t hfi_tidflow_get_flowvalid(uint64_t val) { return (val >> HFI_TF_FLOWVALID_SHIFT) & HFI_TF_FLOWVALID_MASK; } static __inline__ uint32_t hfi_tidflow_get_enabled(uint64_t val) { return (val >> HFI_TF_HDRSUPP_ENABLED_SHIFT) & HFI_TF_HDRSUPP_ENABLED_MASK; } static __inline__ uint32_t hfi_tidflow_get_keep_after_seqerr(uint64_t val) { return (val >> HFI_TF_KEEP_AFTER_SEQERR_SHIFT) & HFI_TF_KEEP_AFTER_SEQERR_MASK; } static __inline__ uint32_t hfi_tidflow_get_keep_on_generr(uint64_t val) { return (val >> HFI_TF_KEEP_ON_GENERR_SHIFT) & HFI_TF_KEEP_ON_GENERR_MASK; } static __inline__ uint32_t hfi_tidflow_get_keep_payload_on_generr(uint64_t val) { return (val >> HFI_TF_KEEP_PAYLOAD_ON_GENERR_SHIFT) & HFI_TF_KEEP_PAYLOAD_ON_GENERR_MASK; } static __inline__ uint32_t hfi_tidflow_get_seqmismatch(uint64_t val) { return (val >> HFI_TF_STATUS_SEQMISMATCH_SHIFT) & HFI_TF_STATUS_SEQMISMATCH_MASK; } static __inline__ uint32_t hfi_tidflow_get_genmismatch(uint64_t val) { return (val >> HFI_TF_STATUS_GENMISMATCH_SHIFT) & HFI_TF_STATUS_GENMISMATCH_MASK; } /* * This should only be used by a process to write the eager index into * a subcontext's eager header entry. */ static __inline__ void hfi_hdrset_use_egrbfr(__le32 *rbuf, uint32_t val) { rbuf[0] = (rbuf[0] & __cpu_to_le32(~(HFI_RHF_USE_EGRBFR_MASK << HFI_RHF_USE_EGRBFR_SHIFT))) | __cpu_to_le32((val & HFI_RHF_USE_EGRBFR_MASK) << HFI_RHF_USE_EGRBFR_SHIFT); } static __inline__ void hfi_hdrset_egrbfr_index(__le32 *rbuf, uint32_t val) { rbuf[0] = (rbuf[0] & __cpu_to_le32(~(HFI_RHF_EGRBFR_INDEX_MASK << HFI_RHF_EGRBFR_INDEX_SHIFT))) | __cpu_to_le32((val & HFI_RHF_EGRBFR_INDEX_MASK) << HFI_RHF_EGRBFR_INDEX_SHIFT); } static __inline__ void hfi_hdrset_egrbfr_offset(__le32 *rbuf, uint32_t val) { rbuf[1] = (rbuf[1] & __cpu_to_le32(~(HFI_RHF_EGRBFR_OFFSET_MASK << HFI_RHF_EGRBFR_OFFSET_SHIFT))) | __cpu_to_le32((val & HFI_RHF_EGRBFR_OFFSET_MASK) << HFI_RHF_EGRBFR_OFFSET_SHIFT); } /* * This should only be used by a process to update the receive header * error flags. */ static __inline__ void hfi_hdrset_err_flags(__le32 *rbuf, uint32_t val) { rbuf[1] |= __cpu_to_le32(val); } /* * This should only be used by a process to write the rhf seq number into * a subcontext's eager header entry. */ static __inline__ void hfi_hdrset_seq(__le32 *rbuf, uint32_t val) { rbuf[0] = (rbuf[0] & __cpu_to_le32(~(HFI_RHF_SEQ_MASK << HFI_RHF_SEQ_SHIFT))) | __cpu_to_le32((val & HFI_RHF_SEQ_MASK) << HFI_RHF_SEQ_SHIFT); } /* Manage TID entries. It is possible that not all entries requested may be allocated. A matching hfi_free_tid() must be done for each hfi_update_tid(), because currently no caching or reuse of expected tid entries is allowed, to work around malloc/free and mmap/munmap issues. The driver decides which TID entries to allocate. If hfi_free_tid is called to free entries in use by a different send by the same process, data corruption will probably occur, but only within that process, not for other processes. */ /* update tidcnt expected TID entries from the array pointed to by tidinfo. */ /* Returns 0 on success, else an errno. 
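An illustrative update/free pairing (sketch; ctrl, vaddr, len and the caller-allocated tids array are assumptions):

	uint32_t cnt = 0;
	if (hfi_update_tid(ctrl, vaddr, &len,
			   (uint64_t)(uintptr_t)tids, &cnt, 0) == 0) {
		... use the cnt expected TID entries, then release them:
		hfi_free_tid(ctrl, (uint64_t)(uintptr_t)tids, cnt);
	}
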
See full description at declaration */ static __inline__ int32_t hfi_update_tid(struct _hfi_ctrl *ctrl, uint64_t vaddr, uint32_t *length, uint64_t tidlist, uint32_t *tidcnt, uint16_t flags) { struct hfi1_cmd cmd; struct hfi1_tid_info tidinfo; #ifdef PSM_CUDA struct hfi1_tid_info_v2 tidinfov2; #endif int err; tidinfo.vaddr = vaddr; /* base address for this send to map */ tidinfo.length = *length; /* length of vaddr */ tidinfo.tidlist = tidlist; /* driver copies tids back directly */ tidinfo.tidcnt = 0; /* clear to zero */ cmd.type = PSMI_HFI_CMD_TID_UPDATE; cmd.len = sizeof(tidinfo); cmd.addr = (__u64) &tidinfo; #ifdef PSM_CUDA if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) { /* Copy values to v2 struct */ tidinfov2.vaddr = tidinfo.vaddr; tidinfov2.length = tidinfo.length; tidinfov2.tidlist = tidinfo.tidlist; tidinfov2.tidcnt = tidinfo.tidcnt; tidinfov2.flags = flags; cmd.type = PSMI_HFI_CMD_TID_UPDATE_V2; cmd.len = sizeof(tidinfov2); cmd.addr = (__u64) &tidinfov2; } #endif err = hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)); if (err != -1) { struct hfi1_tid_info *rettidinfo = (struct hfi1_tid_info *)cmd.addr; *length = rettidinfo->length; *tidcnt = rettidinfo->tidcnt; } return err; } static __inline__ int32_t hfi_free_tid(struct _hfi_ctrl *ctrl, uint64_t tidlist, uint32_t tidcnt) { struct hfi1_cmd cmd; struct hfi1_tid_info tidinfo; int err; tidinfo.tidlist = tidlist; /* input to driver */ tidinfo.tidcnt = tidcnt; cmd.type = PSMI_HFI_CMD_TID_FREE; cmd.len = sizeof(tidinfo); cmd.addr = (__u64) &tidinfo; err = hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)); return err; } static __inline__ int32_t hfi_get_invalidation(struct _hfi_ctrl *ctrl, uint64_t tidlist, uint32_t *tidcnt) { struct hfi1_cmd cmd; struct hfi1_tid_info tidinfo; int err; tidinfo.tidlist = tidlist; /* driver copies tids back directly */ tidinfo.tidcnt = 0; /* clear to zero */ cmd.type = PSMI_HFI_CMD_TID_INVAL_READ; cmd.len = sizeof(tidinfo); cmd.addr = (__u64) &tidinfo; err = hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)); if (err != -1) *tidcnt = tidinfo.tidcnt; return err; } /* * Data layout in I2C flash (for GUID, etc.) * All fields are little-endian binary unless otherwise stated */ #define HFI_FLASH_VERSION 2 struct hfi_flash { /* flash layout version (HFI_FLASH_VERSION) */ __u8 if_fversion; /* checksum protecting if_length bytes */ __u8 if_csum; /* * valid length (in use, protected by if_csum), including * if_fversion and if_csum themselves) */ __u8 if_length; /* the GUID, in network order */ __u8 if_guid[8]; /* number of GUIDs to use, starting from if_guid */ __u8 if_numguid; /* the (last 10 characters of) board serial number, in ASCII */ char if_serial[12]; /* board mfg date (YYYYMMDD ASCII) */ char if_mfgdate[8]; /* last board rework/test date (YYYYMMDD ASCII) */ char if_testdate[8]; /* logging of error counts, TBD */ __u8 if_errcntp[4]; /* powered on hours, updated at driver unload */ __u8 if_powerhour[2]; /* ASCII free-form comment field */ char if_comment[32]; /* Backwards compatible prefix for longer QLogic Serial Numbers */ char if_sprefix[4]; /* 82 bytes used, min flash size is 128 bytes */ __u8 if_future[46]; }; #endif /* OPA_USER_GEN1_H */ opa-psm2-PSM2_11.2.185/psm_hal_gen1/opa_utils_gen1.c000066400000000000000000000200371370564314600216350ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* This file contains hfi service routine interface used by the low */ /* level hfi protocol code. */ #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <ctype.h> #include <sys/mman.h> #include <errno.h> #include <string.h> #include <strings.h> #include <stdio.h> #include <unistd.h> #include <stdlib.h> #include <time.h> #include <stdarg.h> #include "opa_user_gen1.h" /* touch the pages, with a 32 bit read */ void hfi_touch_mmap(void *m, size_t bytes) { volatile uint32_t *b = (volatile uint32_t *)m, c; size_t i; /* m is always page aligned, so pgcnt exact */ int __hfi_pg_sz; /* First get the page size */ __hfi_pg_sz = sysconf(_SC_PAGESIZE); _HFI_VDBG("Touch %lu mmap'ed pages starting at %p\n", (unsigned long)bytes / __hfi_pg_sz, m); bytes /= sizeof(c); for (i = 0; i < bytes; i += __hfi_pg_sz / sizeof(c)) c = b[i]; } /* ack event bits, and clear them. Usage: check *spi_sendbuf_status, pass the bits you are prepared to handle to hfi_event_ack(), perform the appropriate actions for bits that were set, and then (if appropriate) check the bits again. */ int hfi_event_ack(struct _hfi_ctrl *ctrl, __u64 ackbits) { struct hfi1_cmd cmd; cmd.type = PSMI_HFI_CMD_ACK_EVENT; cmd.len = 0; cmd.addr = ackbits; if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) { if (errno != EINVAL) /* not implemented in driver. */ _HFI_DBG("event ack failed: %s\n", strerror(errno)); return -1; } return 0; } /* Tell the driver to change the way packets can generate interrupts. HFI1_POLL_TYPE_URGENT: Generate interrupt only when packet sets HFI_KPF_INTR HFI1_POLL_TYPE_ANYRCV: wakeup on any rcv packet (when polled on).
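An illustrative call (sketch; ctrl is an assumed initialized control struct): hfi_poll_type(ctrl, HFI1_POLL_TYPE_URGENT);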
PSM: Uses TYPE_URGENT in ips protocol */ int hfi_poll_type(struct _hfi_ctrl *ctrl, uint16_t poll_type) { struct hfi1_cmd cmd; cmd.type = PSMI_HFI_CMD_POLL_TYPE; cmd.len = 0; cmd.addr = (uint64_t) poll_type; if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) { if (errno != EINVAL) /* not implemented in driver */ _HFI_INFO("poll type failed: %s\n", strerror(errno)); return -1; } return 0; } /* set the send context pkey to check BTH pkey in each packet. The driver should check its pkey table to see if it can find this pkey; if not, the driver should return an error. */ int hfi_set_pkey(struct _hfi_ctrl *ctrl, uint16_t pkey) { struct hfi1_cmd cmd; struct hfi1_base_info tbinfo; cmd.type = PSMI_HFI_CMD_SET_PKEY; cmd.len = 0; cmd.addr = (uint64_t) pkey; _HFI_VDBG("Setting context pkey to 0x%04x.\n", pkey); if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) { _HFI_INFO("Setting context pkey to 0x%04x failed: %s\n", pkey, strerror(errno)); return -1; } else { _HFI_VDBG("Successfully set context pkey to 0x%04x.\n", pkey); } if (getenv("PSM2_SELINUX")) { /* * If SELinux is in use the kernel may have changed our JKey based on * what we supply for the PKey so go ahead and interrogate the user info * again and update our saved copy. In the future there may be a new * IOCTL to get the JKey only. For now, this temporary workaround works. */ cmd.type = PSMI_HFI_CMD_USER_INFO; cmd.len = sizeof(tbinfo); cmd.addr = (uint64_t) &tbinfo; if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) { _HFI_VDBG("BASE_INFO command failed in setpkey: %s\n", strerror(errno)); return -1; } _HFI_VDBG("PSM2_SELINUX is set, updating jkey to 0x%04x\n", tbinfo.jkey); ctrl->base_info.jkey = tbinfo.jkey; } return 0; } /* Tell the driver to reset the send context. If the send context is halted, reset it; if not, return an error to the caller. After context reset, the credit return should be reset to zero by a hardware credit return DMA. The driver will return ENOLCK if the reset times out; in this case PSM needs to call again. */ int hfi_reset_context(struct _hfi_ctrl *ctrl) { struct hfi1_cmd cmd; cmd.type = PSMI_HFI_CMD_CTXT_RESET; cmd.len = 0; cmd.addr = 0; retry: if (hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)) == -1) { if (errno == ENOLCK) goto retry; if (errno != EINVAL) _HFI_INFO("reset ctxt failed: %s\n", strerror(errno)); return -1; } return 0; } /* wait for a received packet for our context. This allows us to not busy wait, if nothing has happened for a while, which allows better measurements of cpu utilization, and in some cases, slightly better performance. Called where we would otherwise call sched_yield(). It is not guaranteed that a packet has arrived, so the normal checking loop(s) should be done.
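An illustrative idle loop (sketch; packet_ready() stands in for a hypothetical caller-side check of the receive queue): while (!packet_ready(ctrl)) hfi_wait_for_packet(ctrl);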
PSM: not used as is, PSM has its own use of polling for interrupt-only packets (sets hfi_poll_type to TYPE_URGENT) */ int hfi_wait_for_packet(struct _hfi_ctrl *ctrl) { return hfi_cmd_wait_for_packet(ctrl->fd); } /* These have been fixed to read the values, but they are not * compatible with the hfi driver; they return new info with * the qib driver */ static int hfi_count_names(const char *namep) { int n = 0; while (*namep != '\0') { if (*namep == '\n') n++; namep++; } return n; } int hfi_lookup_stat(const char *attr, char *namep, uint64_t *stats, uint64_t *s) { const char *p; int i, ret = -1, len = strlen(attr); int nelem = hfi_count_names(namep); for (i = 0; i < nelem; i++) { p = hfi_get_next_name(&namep); if (p == NULL) break; if (strncasecmp(p, attr, len + 1) == 0) { ret = i; *s = stats[i]; } } return ret; } int hfi_get_single_portctr(int unit, int port, const char *attr, uint64_t *s) { int nelem, n = 0, ret = -1; char *namep = NULL; uint64_t *stats = NULL; nelem = hfi_get_ctrs_port_names(unit, &namep); if (nelem == -1 || namep == NULL) goto bail; stats = calloc(nelem, sizeof(uint64_t)); if (stats == NULL) goto bail; n = hfi_get_ctrs_port(unit, port, stats, nelem); if (n != nelem) goto bail; ret = hfi_lookup_stat(attr, namep, stats, s); bail: if (namep != NULL) free(namep); if (stats != NULL) free(stats); return ret; } opa-psm2-PSM2_11.2.185/psm_hal_gen1/psm_gdrcpy.c000066400000000000000000000151571370564314600211020ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2018 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2018 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef PSM_CUDA #include "psm_user.h" #include "psm2_hal.h" #include "psm_gdrcpy.h" #include <fcntl.h> #include <sys/ioctl.h> #include <time.h> #include "ptl_ips/ips_tid.h" #include "ptl_ips/ips_expected_proto.h" #include "opa_user_gen1.h" static int gdr_fd; int get_gdr_fd(){ return gdr_fd; } #define GPU_PAGE_OFFSET_MASK (PSMI_GPU_PAGESIZE -1) #define GPU_PAGE_MASK ~GPU_PAGE_OFFSET_MASK uint64_t gdr_cache_evict() { int ret; struct hfi1_gdr_cache_evict_params params; params.evict_params_in.version = HFI1_GDR_VERSION; params.evict_params_in.pages_to_evict = 4; ret = ioctl(gdr_fd, HFI1_IOCTL_GDR_GPU_CACHE_EVICT, &params); if (ret) { /* Fatal error */ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "PIN/MMAP ioctl failed ret %d errno %d\n", ret, errno); return ret; } return params.evict_params_out.pages_evicted; } uint64_t ips_sdma_gpu_cache_evict(int fd) { int ret; struct hfi1_sdma_gpu_cache_evict_params params; params.evict_params_in.version = HFI1_GDR_VERSION; params.evict_params_in.pages_to_evict = 2; ret = ioctl(fd, HFI1_IOCTL_SDMA_CACHE_EVICT, &params); if (ret) { /* Fatal error */ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "SDMA Cache Evict failed ret %d errno %d\n", ret, errno); return ret; } return params.evict_params_out.pages_evicted; } /* handle_out_of_bar_space is called when the driver tries * to self-evict in the GDR cache and finds no entries. * This could be due to the fact that all the pages pinned * in the BAR1 region are cached in the SDMA and TID cache. * We try to evict from both the caches for 30 seconds after * which we bail out. If successful, we retry the PIN/MMAP once * again */ uint64_t handle_out_of_bar_space(struct ips_proto *proto) { time_t lastEvictTime = 0; uint64_t lengthEvicted; time_t now; retry: now = time(NULL); if (!lastEvictTime) lastEvictTime = now; if (proto->protoexp && proto->protoexp->tidc.tid_cachemap.payload.nidle) { lengthEvicted = ips_tidcache_evict(&proto->protoexp->tidc, -1); if (lengthEvicted) { lastEvictTime = 0; return lengthEvicted; /* signals a retry of the writev command. */ } } lengthEvicted = ips_sdma_gpu_cache_evict(psmi_hal_get_fd(proto->ep->context.psm_hw_ctxt)); if (lengthEvicted) { lastEvictTime = 0; return lengthEvicted; } static const double thirtySeconds = 30.0; if (difftime(now, lastEvictTime) > thirtySeconds) { return 0; } else { goto retry; } } void * gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf, size_t size, int flags, struct ips_proto* proto) { struct hfi1_gdr_query_params query_params; void *host_addr_buf; int ret; query_params.query_params_in.version = HFI1_GDR_VERSION; uintptr_t pageaddr = buf & GPU_PAGE_MASK; /* As size is guaranteed to be in the range of 0-8kB * there is a guarantee that buf+size-1 does not overflow * 64 bits.
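 * A worked example (assuming a hypothetical 64KiB GPU page, i.e.
 * PSMI_GPU_PAGESIZE == 0x10000): buf = 0x7f001fff0, size = 0x100 spans a
 * page boundary, so pageaddr = 0x7f0010000 and pagelen computes to
 * 0x10000 + (0x7f00200ef & GPU_PAGE_MASK) - 0x7f0010000 = 0x20000,
 * i.e. the two GPU pages covering [buf, buf+size).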
*/ uint32_t pagelen = (uint32_t) (PSMI_GPU_PAGESIZE + ((buf + size - 1) & GPU_PAGE_MASK) - pageaddr); _HFI_VDBG("(gpudirect) buf=%p size=%zu pageaddr=%p pagelen=%u flags=0x%x proto=%p\n", (void *)buf, size, (void *)pageaddr, pagelen, flags, proto); query_params.query_params_in.gpu_buf_addr = pageaddr; query_params.query_params_in.gpu_buf_size = pagelen; retry: ret = ioctl(gdr_fd, HFI1_IOCTL_GDR_GPU_PIN_MMAP, &query_params); if (ret) { if (errno == ENOMEM || errno == EINVAL) { if (!handle_out_of_bar_space(proto)) { /* Fatal error */ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to PIN GPU pages (Out of BAR1 space) (errno: %d)\n", errno); return NULL; } else { goto retry; } } else { /* Fatal error */ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "PIN/MMAP ioctl failed ret %d errno %d\n", ret, errno); return NULL; } } host_addr_buf = (void *)query_params.query_params_out.host_buf_addr; return host_addr_buf + (buf & GPU_PAGE_OFFSET_MASK); } void hfi_gdr_open(){ gdr_fd = open(GDR_DEVICE_PATH, O_RDWR); if (-1 == gdr_fd) { /* Non-fatal error. If the device cannot be found we assume * that the driver does not support GDR Copy and we fall back * to sending all GPU messages using the rndv protocol */ _HFI_INFO("Warning: The installed HFI1 driver does not support GPUDirect RDMA" " fast copy. Turning off GDR fast copy in PSM\n"); is_gdr_copy_enabled = 0; return; } return; } void hfi_gdr_close() { close(GDR_FD); } #endif opa-psm2-PSM2_11.2.185/psm_hal_gen1/psm_hal_gen1.c000066400000000000000000000213521370564314600212620ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2017 Intel Corporation.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "psm_user.h" #include "psm2_hal.h" #if PSMI_HAL_INST_CNT > 1 #define PSMI_HAL_CAT_INL_SYM(KERNEL) hfp_gen1_ ## KERNEL #include "psm2_hal_inline_t.h" #include "psm_hal_inline_i.h" #endif /* define the singleton that implements hal for gen1 */ static hfp_gen1_t psm_gen1_hi = { /* start of public psmi_hal_instance_t data */ .phi = { .type = PSM_HAL_INSTANCE_GEN1, .description = "PSM2 HAL instance for GEN1" #ifdef PSM_CUDA " (cuda)" #endif , .hfi_name = "hfi1", .hfi_sys_class_path = "/sys/class/infiniband/hfi1", .params = {0}, /* The following methods are alphabetized */ #if PSMI_HAL_INST_CNT > 1 .hfp_ack_hfi_event = hfp_gen1_ack_hfi_event, .hfp_check_rhf_sequence_number = hfp_gen1_check_rhf_sequence_number, .hfp_cl_q_empty = hfp_gen1_cl_q_empty, .hfp_close_context = hfp_gen1_close_context, .hfp_context_open = hfp_gen1_context_open, .hfp_dma_slot_available = hfp_gen1_dma_slot_available, .hfp_finalize_ = hfp_gen1_finalize_, .hfp_forward_packet_to_subcontext = hfp_gen1_forward_packet_to_subcontext, .hfp_free_tid = hfp_gen1_free_tid, .hfp_get_bthqp = hfp_gen1_get_bthqp, .hfp_get_cc_settings_bin = hfp_gen1_get_cc_settings_bin, .hfp_get_cc_table_bin = hfp_gen1_get_cc_table_bin, .hfp_get_cl_q_head_index = hfp_gen1_get_cl_q_head_index, .hfp_get_cl_q_tail_index = hfp_gen1_get_cl_q_tail_index, .hfp_get_context = hfp_gen1_get_context, .hfp_get_egr_buff = hfp_gen1_get_egr_buff, .hfp_get_fd = hfp_gen1_get_fd, .hfp_get_gid_hi = hfp_gen1_get_gid_hi, .hfp_get_gid_lo = hfp_gen1_get_gid_lo, .hfp_get_hfi_event_bits = hfp_gen1_get_hfi_event_bits, .hfp_get_hfi_type = hfp_gen1_get_hfi_type, .hfp_get_hw_status = hfp_gen1_get_hw_status, .hfp_get_hw_status_freezemsg = hfp_gen1_get_hw_status_freezemsg, .hfp_get_jkey = hfp_gen1_get_jkey, .hfp_get_lid = hfp_gen1_get_lid, .hfp_get_node_id = hfp_gen1_get_node_id, .hfp_get_pio_size = hfp_gen1_get_pio_size, .hfp_get_pio_stall_cnt = hfp_gen1_get_pio_stall_cnt, .hfp_get_port_gid = hfp_gen1_get_port_gid, .hfp_get_port_index2pkey = hfp_gen1_get_port_index2pkey, .hfp_get_port_lid = hfp_gen1_get_port_lid, .hfp_get_port_lmc = hfp_gen1_get_port_lmc, .hfp_get_port_num = hfp_gen1_get_port_num, .hfp_get_port_rate = hfp_gen1_get_port_rate, .hfp_get_sc2vl_map = hfp_gen1_get_sc2vl_map, .hfp_get_port_sl2sc = hfp_gen1_get_port_sl2sc, .hfp_get_receive_event = hfp_gen1_get_receive_event, .hfp_get_rhf_expected_sequence_number = hfp_gen1_get_rhf_expected_sequence_number, .hfp_get_rx_egr_tid_cnt = hfp_gen1_get_rx_egr_tid_cnt, .hfp_get_rx_hdr_q_cnt = hfp_gen1_get_rx_hdr_q_cnt, .hfp_get_rx_hdr_q_ent_size = hfp_gen1_get_rx_hdr_q_ent_size, .hfp_get_sdma_req_size = hfp_gen1_get_sdma_req_size, .hfp_get_sdma_ring_size = hfp_gen1_get_sdma_ring_size, .hfp_get_sdma_ring_slot_status = hfp_gen1_get_sdma_ring_slot_status, .hfp_get_subctxt = hfp_gen1_get_subctxt, .hfp_get_subctxt_cnt = hfp_gen1_get_subctxt_cnt, .hfp_get_tid_exp_cnt = hfp_gen1_get_tid_exp_cnt, .hfp_get_tidcache_invalidation = hfp_gen1_get_tidcache_invalidation, .hfp_get_unit_id = hfp_gen1_get_unit_id, 
.hfp_get_user_major_bldtime_version = hfp_gen1_get_user_major_bldtime_version, .hfp_get_user_major_runtime_version = hfp_gen1_get_user_major_runtime_version, .hfp_get_user_minor_bldtime_version = hfp_gen1_get_user_minor_bldtime_version, .hfp_get_user_minor_runtime_version = hfp_gen1_get_user_minor_runtime_version, .hfp_hfi_reset_context = hfp_gen1_hfi_reset_context, .hfp_poll_type = hfp_gen1_poll_type, .hfp_retire_hdr_q_entry = hfp_gen1_retire_hdr_q_entry, .hfp_set_cl_q_head_index = hfp_gen1_set_cl_q_head_index, .hfp_set_cl_q_tail_index = hfp_gen1_set_cl_q_tail_index, .hfp_set_effective_mtu = hfp_gen1_set_effective_mtu, .hfp_set_pbc = hfp_gen1_set_pbc, .hfp_set_pio_size = hfp_gen1_set_pio_size, .hfp_set_pkey = hfp_gen1_set_pkey, .hfp_set_rhf_expected_sequence_number = hfp_gen1_set_rhf_expected_sequence_number, .hfp_set_tf_valid = hfp_gen1_set_tf_valid, .hfp_spio_fini = hfp_gen1_spio_fini, .hfp_spio_init = hfp_gen1_spio_init, .hfp_spio_process_events = hfp_gen1_spio_process_events, .hfp_spio_transfer_frame = hfp_gen1_spio_transfer_frame, .hfp_subcontext_ureg_get = hfp_gen1_subcontext_ureg_get, .hfp_tidflow_check_update_pkt_seq = hfp_gen1_tidflow_check_update_pkt_seq, .hfp_tidflow_get = hfp_gen1_tidflow_get, .hfp_tidflow_get_enabled = hfp_gen1_tidflow_get_enabled, .hfp_tidflow_get_flowvalid = hfp_gen1_tidflow_get_flowvalid, .hfp_tidflow_get_genmismatch = hfp_gen1_tidflow_get_genmismatch, .hfp_tidflow_get_genval = hfp_gen1_tidflow_get_genval, .hfp_tidflow_get_hw = hfp_gen1_tidflow_get_hw, .hfp_tidflow_get_keep_after_seqerr = hfp_gen1_tidflow_get_keep_after_seqerr, .hfp_tidflow_get_keep_on_generr = hfp_gen1_tidflow_get_keep_on_generr, .hfp_tidflow_get_keep_payload_on_generr = hfp_gen1_tidflow_get_keep_payload_on_generr, .hfp_tidflow_get_seqmismatch = hfp_gen1_tidflow_get_seqmismatch, .hfp_tidflow_get_seqnum = hfp_gen1_tidflow_get_seqnum, .hfp_tidflow_reset = hfp_gen1_tidflow_reset, .hfp_tidflow_set_entry = hfp_gen1_tidflow_set_entry, .hfp_update_tid = hfp_gen1_update_tid, .hfp_writev = hfp_gen1_writev, #endif .hfp_get_default_pkey = hfp_gen1_get_default_pkey, .hfp_get_num_contexts = hfp_gen1_get_num_contexts, .hfp_get_num_free_contexts = hfp_gen1_get_num_free_contexts, .hfp_get_num_units = hfp_gen1_get_num_units, .hfp_get_num_ports = hfp_gen1_get_num_ports, .hfp_get_port_active = hfp_gen1_get_port_active, .hfp_get_unit_active = hfp_gen1_get_unit_active, .hfp_initialize = hfp_gen1_initialize, }, /* start of private hfp_gen1_private data */ .hfp_private = { .sdmahdr_req_size = 0, .dma_rtail = 0, .hdrq_rhf_off = 0, } }; /* __psmi_hal_gen1_constructor */ static void __attribute__ ((constructor)) __psmi_hal_gen1_constructor(void) { psmi_hal_register_instance((psmi_hal_instance_t*)&psm_gen1_hi); } opa-psm2-PSM2_11.2.185/psm_hal_gen1/psm_hal_gen1.h000066400000000000000000000116721370564314600212730ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2017 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2017 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "psm_user.h" #include "ips_proto.h" #include "ips_proto_internal.h" #include "psm_hal_gen1_spio.h" #include "psm_mq_internal.h" #include "opa_user_gen1.h" #define LAST_RHF_SEQNO 13 typedef struct { volatile uint64_t *cl_q_head; volatile uint64_t *cl_q_tail; union { /* hdr_qe's are only present in *_RX_HDR_Q* CL Q types: */ struct { uint32_t rx_hdrq_rhf_seq; uint32_t *p_rx_hdrq_rhf_seq; uint32_t *hdrq_base_addr; } hdr_qe; /* header queue entry */ /* egr_buffs's are only present in *_RX_EGR_Q* CL Q types: */ void **egr_buffs; }; } psm_hal_gen1_cl_q_t; COMPILE_TIME_ASSERT(MAX_SHARED_CTXTS_MUST_MATCH, PSM_HAL_MAX_SHARED_CTXTS == HFI1_MAX_SHARED_CTXTS); /* Private struct on a per-context basis. */ typedef struct _hfp_gen1_pc_private { struct _hfi_ctrl *ctrl; /* driver opaque hfi_proto */ psm_hal_gen1_cl_q_t cl_qs[PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(7) + 1]; struct ips_hwcontext_ctrl *hwcontext_ctrl; struct ips_subcontext_ureg *subcontext_ureg[HFI1_MAX_SHARED_CTXTS]; struct ips_spio spio_ctrl; struct hfi1_user_info_dep user_info; uint16_t sc2vl[PSMI_N_SCS]; } hfp_gen1_pc_private; /* At the end of each scb struct, we have space reserved to accommodate * three structures (for GEN1)- * struct psm_hal_sdma_req_info, struct psm_hal_pbc and struct ips_message_header. * The HIC should get the size needed for the extended memory region * using a HAL call (psmi_hal_get_scb_extended_mem_size). For Gen1, this API * will return the size of the below struct psm_hal_gen1_scb_extended * aligned up to be able to fit struct psm_hal_pbc on a 64-byte boundary. 
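 * A plausible size computation for that alignment (sketch only, not
 * necessarily the HAL's exact formula):
 *   (sizeof(struct psm_hal_gen1_scb_extended) + 63) & ~(size_t)63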
*/ #define PSMI_SHARED_CONTEXTS_ENABLED_BY_DEFAULT 1 struct psm_hal_gen1_scb_extended { union { struct sdma_req_info sri1; struct sdma_req_info_v6_3 sri2; }; struct { struct psm_hal_pbc pbc; struct ips_message_header ips_lrh; } PSMI_CACHEALIGN; }; /* declare the hfp_gen1_private struct */ typedef struct _hfp_gen1_private { /* GEN1 specific data that are common to all contexts: */ int sdmahdr_req_size; int dma_rtail; uint32_t hdrq_rhf_off; } hfp_gen1_private_t; /* declare hfp_gen1_t struct, (combines public psmi_hal_instance_t together with a private struct) */ typedef struct _hfp_gen1 { psmi_hal_instance_t phi; hfp_gen1_private_t hfp_private; } hfp_gen1_t; static const struct { uint32_t hfi1_event_bit, psmi_hal_hfi_event_bit; } hfi1_events_map[] = { { HFI1_EVENT_FROZEN, PSM_HAL_HFI_EVENT_FROZEN }, { HFI1_EVENT_LINKDOWN, PSM_HAL_HFI_EVENT_LINKDOWN }, { HFI1_EVENT_LID_CHANGE, PSM_HAL_HFI_EVENT_LID_CHANGE }, { HFI1_EVENT_LMC_CHANGE, PSM_HAL_HFI_EVENT_LMC_CHANGE }, { HFI1_EVENT_SL2VL_CHANGE, PSM_HAL_HFI_EVENT_SL2VL_CHANGE }, { HFI1_EVENT_TID_MMU_NOTIFY, PSM_HAL_HFI_EVENT_TID_MMU_NOTIFY}, }; opa-psm2-PSM2_11.2.185/psm_hal_gen1/psm_hal_gen1_spio.c000066400000000000000000000721041370564314600223150ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2017 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2017 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2017 Intel Corporation. All rights reserved. 
*/ /* included header files */ #include <stdio.h> #include <string.h> #include <errno.h> #include <unistd.h> #include <pthread.h> #include "ips_proto.h" #include "ips_proto_internal.h" #include "psm_hal_gen1_spio.h" #include "ips_proto_params.h" /* Report PIO stalls every 20 seconds at the least */ #define SPIO_STALL_WARNING_INTERVAL (nanosecs_to_cycles(20e9)) #define SPIO_MAX_CONSECUTIVE_SEND_FAIL (1<<20) /* 1M */ /* RESYNC_CONSECUTIVE_SEND_FAIL has to be a multiple of MAX_CONSECUTIVE */ #define SPIO_RESYNC_CONSECUTIVE_SEND_FAIL (1<<4) /* 16 */ static void spio_report_stall(struct ips_spio *ctrl, uint64_t t_cyc_now, uint64_t send_failures); static void spio_handle_stall(struct ips_spio *ctrl, uint64_t send_failures); static psm2_error_t spio_reset_hfi(struct ips_spio *ctrl); static psm2_error_t spio_reset_hfi_shared(struct ips_spio *ctrl); static psm2_error_t spio_credit_return_update(struct ips_spio *ctrl); static psm2_error_t spio_credit_return_update_shared(struct ips_spio *ctrl); static PSMI_HAL_INLINE psm2_error_t ips_spio_init(const struct psmi_context *context, struct ptl *ptl, struct ips_spio *ctrl #ifdef PSM_AVX512 , int is_avx512_enabled #endif ) { cpuid_t id; hfp_gen1_pc_private *psm_hw_ctxt = context->psm_hw_ctxt; struct _hfi_ctrl *con_ctrl = psm_hw_ctxt->ctrl; ctrl->ptl = ptl; ctrl->context = context; ctrl->unit_id = context->ep->unit_id; ctrl->portnum = context->ep->portnum; pthread_spin_init(&ctrl->spio_lock, PTHREAD_PROCESS_PRIVATE); ctrl->spio_credits_addr = (volatile __le64 *) con_ctrl->base_info.sc_credits_addr; ctrl->spio_bufbase_sop = (volatile uint64_t *)con_ctrl->base_info.pio_bufbase_sop; ctrl->spio_bufbase = (volatile uint64_t *)con_ctrl->base_info.pio_bufbase; ctrl->spio_consecutive_failures = 0; ctrl->spio_num_stall = 0ULL; ctrl->spio_num_stall_total = 0ULL; ctrl->spio_next_stall_warning = 0ULL; ctrl->spio_last_stall_cyc = 0ULL; ctrl->spio_init_cyc = get_cycles(); ctrl->spio_total_blocks = con_ctrl->ctxt_info.credits; ctrl->spio_block_index = 0; ctrl->spio_ctrl = (struct ips_spio_ctrl *)context->spio_ctrl; if (!ctrl->spio_ctrl) { ctrl->spio_ctrl = (volatile struct ips_spio_ctrl *) psmi_calloc(context->ep, UNDEFINED, 1, sizeof(struct ips_spio_ctrl)); if (ctrl->spio_ctrl == NULL) { return PSM2_NO_MEMORY; } ctrl->spio_reset_hfi = spio_reset_hfi; ctrl->spio_credit_return_update = spio_credit_return_update; } else { ctrl->spio_reset_hfi = spio_reset_hfi_shared; ctrl->spio_credit_return_update = spio_credit_return_update_shared; } /* * Only the master process can initialize. */ if (psmi_hal_get_subctxt(context->psm_hw_ctxt) == 0) { pthread_spin_init(&ctrl->spio_ctrl->spio_ctrl_lock, PTHREAD_PROCESS_SHARED); ctrl->spio_ctrl->spio_write_in_progress = 0; ctrl->spio_ctrl->spio_reset_count = 0; ctrl->spio_ctrl->spio_frozen_count = 0; ctrl->spio_ctrl->spio_available_blocks = ctrl->spio_total_blocks; ctrl->spio_ctrl->spio_block_index = 0; ctrl->spio_ctrl->spio_fill_counter = 0; psmi_assert(SPIO_CREDITS_Counter (ctrl->spio_ctrl->spio_credits.value) == 0); psmi_assert(SPIO_CREDITS_Status (ctrl->spio_ctrl->spio_credits.value) == 0); ctrl->spio_ctrl->spio_credits.credit_return = *ctrl->spio_credits_addr; } /* * Setup the PIO block copying routines.
*/ get_cpuid(0x1, 0, &id); /* 16B copying supported */ ctrl->spio_blockcpy_med = (id.edx & (1 << SSE2_BIT)) ? hfi_pio_blockcpy_128 : hfi_pio_blockcpy_64; get_cpuid(0x7, 0, &id); /* 32B copying supported */ ctrl->spio_blockcpy_large = (id.ebx & (1 << AVX2_BIT)) ? hfi_pio_blockcpy_256 : ctrl->spio_blockcpy_med; #ifdef PSM_AVX512 /* 64B copying supported */ ctrl->spio_blockcpy_large = (is_avx512_enabled && (id.ebx & (1 << AVX512F_BIT))) ? hfi_pio_blockcpy_512 : ctrl->spio_blockcpy_large; #endif #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED) { PSMI_CUDA_CALL(cuMemHostAlloc, (void **) &ctrl->cuda_pio_buffer, MAX_CUDA_MTU, CU_MEMHOSTALLOC_PORTABLE); } #endif _HFI_PRDBG("ips_spio_init() done\n"); return PSM2_OK; } static PSMI_HAL_INLINE psm2_error_t ips_spio_fini(struct ips_spio *ctrl) { #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED) PSMI_CUDA_CALL(cuMemFreeHost, (void *) ctrl->cuda_pio_buffer); #endif spio_report_stall(ctrl, get_cycles(), 0ULL); if (!ctrl->context->spio_ctrl) psmi_free((void *)ctrl->spio_ctrl); return PSM2_OK; } static PSMI_HAL_INLINE void spio_report_stall(struct ips_spio *ctrl, uint64_t t_cyc_now, uint64_t send_failures) { size_t off = 0; char buf[1024]; if (ctrl->spio_num_stall == 0) return; if (send_failures > 0) { char bufctr[128]; uint64_t tx_stat, rx_stat; int ret; off = snprintf(buf, sizeof(buf) - 1, "PIO Send context %d with total blocks %d , available blocks %d, " "fill counter %d, free counter %d ", (int)psm2_epid_context(ctrl->context->epid), ctrl->spio_total_blocks, ctrl->spio_ctrl->spio_available_blocks, ctrl->spio_ctrl->spio_fill_counter, SPIO_CREDITS_Counter(ctrl->spio_ctrl-> spio_credits.value)); buf[off] = '\0'; /* In case hfifs isn't running */ ret = hfi_get_single_portctr(ctrl->unit_id, ctrl->portnum, "TxPkt", &tx_stat); if (ret != -1) { ret = hfi_get_single_portctr(ctrl->unit_id, ctrl->portnum, "RxPkt", &rx_stat); if (ret != -1) { snprintf(bufctr, sizeof(bufctr) - 1, "(TxPktCnt=%llu,RxPktCnt=%llu)", (unsigned long long)tx_stat, (unsigned long long)rx_stat); bufctr[sizeof(bufctr) - 1] = '\0'; } else bufctr[0] = '\0'; } else bufctr[0] = '\0'; _HFI_DBG ("PIO Send Stall after at least %.2fM failed send attempts " "(elapsed=%.3fs, last=%.3fs, pio_stall_count=%lld) %s %s\n", send_failures / 1e6, PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_init_cyc), PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_last_stall_cyc), (unsigned long long)ctrl->spio_num_stall, bufctr[0] != '\0' ? bufctr : "", buf); } else { _HFI_DBG ("PIO Send Stall Summary: count=%llu, last=%.3fs, elapsed=%.3fs", (unsigned long long)ctrl->spio_num_stall, PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_init_cyc), PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_last_stall_cyc)); } return; } static PSMI_HAL_INLINE void spio_handle_stall(struct ips_spio *ctrl, uint64_t send_failures) { uint64_t t_cyc_now = get_cycles(); /* We handle the pio-stall every time but only report something every 20 * seconds. We print a summary at the end while closing the device */ ctrl->spio_num_stall++; ctrl->spio_num_stall_total++; if (ctrl->spio_next_stall_warning <= t_cyc_now) { /* If context status is ok (i.e. no cables pulled or anything) */ if (psmi_context_check_status(ctrl->context) == PSM2_OK) spio_report_stall(ctrl, t_cyc_now, send_failures); ctrl->spio_next_stall_warning = get_cycles() + SPIO_STALL_WARNING_INTERVAL; } /* re-initialize our shadow from the real registers; by this time, * we know the hardware has to have done the update. * Also, kernel check may have changed things. */ ctrl->spio_credit_return_update(ctrl); ctrl->spio_last_stall_cyc = t_cyc_now; return; } /* * A send context halt is detected in several ways: * 1. during pio for normal credit return update; * 2.
during event processing when there is no event; * when an hfi is frozen, we recover the hfi by calling this routine. */ static PSMI_HAL_INLINE void spio_reset_context(struct ips_spio *ctrl) { /* if there are too many resets, tear down the process */ ctrl->spio_ctrl->spio_reset_count++; if (ctrl->spio_ctrl->spio_reset_count > IPS_CTXT_RESET_MAX) psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Too many send context resets, tearing down...\n"); /* * Because there are many epaddrs and many flows using the * same PIO queue, it is hard to search all the unacked * queue and find the correct retry point. Instead we just * let the upper level flow control to NAK the packets and * do the retry from the right point. */ /* Call into driver to reset send context, driver will * block this routine until the send context is actually * reset. */ ips_wmb(); if (psmi_hal_hfi_reset_context(ctrl->context->psm_hw_ctxt)) psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Send context reset failed: %d.\n", errno); /* Reset spio shared control struct. */ ctrl->spio_ctrl->spio_available_blocks = ctrl->spio_total_blocks; ctrl->spio_ctrl->spio_block_index = 0; ctrl->spio_ctrl->spio_fill_counter = 0; /* Get updated credit return again after reset. */ ctrl->spio_ctrl->spio_credits.credit_return = *ctrl->spio_credits_addr; psmi_assert(SPIO_CREDITS_Counter (ctrl->spio_ctrl->spio_credits.value) == 0); psmi_assert(SPIO_CREDITS_Status (ctrl->spio_ctrl->spio_credits.value) == 0); } /* * An hfi freeze is detected when checking events from the driver; * psm checks events in the main receive loop * when there is no normal traffic. */ static PSMI_HAL_INLINE void spio_reset_hfi_internal(struct ips_spio *ctrl) { struct ips_recvhdrq *recvq = &((struct ptl_ips *)(ctrl->ptl))->recvq; struct ips_proto *proto = (struct ips_proto *)&((struct ptl_ips *)(ctrl->ptl))->proto; /* Reset receive queue state; this must be done first * because after send context reset, the hardware starts to * receive new packets. */ recvq->state->hdrq_head = 0; recvq->state->rcv_egr_index_head = NO_EAGER_UPDATE; recvq->state->num_hdrq_done = 0; recvq->state->hdr_countdown = 0; /* set the expected sequence number to 1. */ if (!(get_psm_gen1_hi()->hfp_private.dma_rtail)) psmi_hal_set_rhf_expected_sequence_number(1, recvq->psm_hal_cl_hdrq, ((struct ptl_ips *)proto->ptl)->context->psm_hw_ctxt); /* Reset send context */ spio_reset_context(ctrl); /* Reset sdma completion queue; this should be done last * because when the send context is reset, the driver will complete * all the sdma requests with error code -2. This error * code is ignored by PSM, but other error codes are * caught inside the routine. */ while (proto->sdma_done_index != proto->sdma_fill_index) ips_proto_dma_completion_update(proto); } static PSMI_HAL_INLINE psm2_error_t spio_reset_hfi(struct ips_spio *ctrl) { /* Drain the receive header queue before resetting the hfi; we use * the main progression loop to do this so we return from * here. */ if (!ips_recvhdrq_isempty(&((struct ptl_ips *)(ctrl->ptl))->recvq)) return PSM2_OK_NO_PROGRESS; /* do the real reset work: * 1. reset receive header queue; * 2. reset send context; * 3. drain sdma completion queue; */ spio_reset_hfi_internal(ctrl); return PSM2_OK; } /* * There is a shared count and a per-process count, all initialized to * zero. If a process' local count is equal to the shared count, it is * the first process and does the hfi reset; this process also moves * both counts up by one.
 * If a process' local count is not equal to
 * the shared count, it means another process has done the hfi reset;
 * it just saves the shared count to its local count and returns. All
 * these operations are protected by spio_ctrl_lock.
 */
static PSMI_HAL_INLINE psm2_error_t spio_reset_hfi_shared(struct ips_spio *ctrl)
{
	volatile struct ips_spio_ctrl *spio_ctrl = ctrl->spio_ctrl;

	/* Drain the receive header queue before resetting the hfi; we use
	 * the main progression loop to do this, so we return from here.
	 * We don't reset the software receive header queue.
	 */
	if (!ips_recvhdrq_isempty(&((struct ptl_ips *)(ctrl->ptl))->recvq))
		return PSM2_OK_NO_PROGRESS;

	pthread_spin_lock(&spio_ctrl->spio_ctrl_lock);

	/*
	 * In context sharing mode, if there is a subcontext
	 * process in PIO writing, we need to wait till the PIO
	 * writing is done. So we spin wait here. If another
	 * process comes here and does the hfi reset, that is
	 * perfectly fine.
	 */
	while (ctrl->spio_ctrl->spio_write_in_progress) {
		pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock);
		usleep(1000);
		pthread_spin_lock(&spio_ctrl->spio_ctrl_lock);
	}

	if (ctrl->spio_frozen_count == ctrl->spio_ctrl->spio_frozen_count) {
		ctrl->spio_frozen_count++;
		ctrl->spio_ctrl->spio_frozen_count++;

		spio_reset_hfi_internal(ctrl);
	} else
		ctrl->spio_frozen_count = ctrl->spio_ctrl->spio_frozen_count;

	pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock);

	return PSM2_OK;
}

/*
 * return value:
 * PSM2_OK: new credits updated;
 * PSM2_OK_NO_PROGRESS: no new credits;
 */
static PSMI_HAL_INLINE psm2_error_t
spio_credit_return_update(struct ips_spio *ctrl)
{
	uint64_t credit_return;

	credit_return = *ctrl->spio_credits_addr;
	/* Update available blocks based on the fill counter and free counter */
	if (ctrl->spio_ctrl->spio_credits.credit_return == credit_return)
		return PSM2_OK_NO_PROGRESS;
	ctrl->spio_ctrl->spio_credits.credit_return = credit_return;

	/* If Status is set, then the send context is halted */
	if (SPIO_CREDITS_Status(ctrl->spio_ctrl->spio_credits.value)) {
		spio_reset_context(ctrl);
	} else {
		/*
		 * OPA1 has a 1M PIO buffer, but each context can have at most
		 * 64K, which is 1K 64B blocks, so the distance between the
		 * fill counter and the credit return counter is no more than
		 * 1024. Both the fill counter and the credit return counter
		 * are 11-bit values, representing the range [0, 2047].
		 */
		psmi_assert((ctrl->spio_ctrl->spio_available_blocks +
			     ((ctrl->spio_ctrl->spio_fill_counter -
			       SPIO_CREDITS_Counter(ctrl->spio_ctrl->
						    spio_credits.value)) &
			      0x7FF)) <= ctrl->spio_total_blocks);
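		/*
		 * Worked example (illustrative numbers): if the fill counter
		 * has wrapped to 3 while the hardware free counter reads
		 * 2045, then (3 - 2045) & 0x7FF = 6 blocks are still in
		 * flight, so the computation below yields
		 * available = total - 6.
		 */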
		ctrl->spio_ctrl->spio_available_blocks =
		    ctrl->spio_total_blocks -
		    ((ctrl->spio_ctrl->spio_fill_counter -
		      SPIO_CREDITS_Counter(ctrl->spio_ctrl->
					   spio_credits.value)) & 0x7FF);

		/* A successful credit update; clear the reset count */
		ctrl->spio_ctrl->spio_reset_count = 0;
	}

	return PSM2_OK;
}

/*
 * return value:
 * PSM2_OK: new credits updated;
 * PSM2_OK_NO_PROGRESS: no new credits;
 */
static PSMI_HAL_INLINE psm2_error_t
spio_credit_return_update_shared(struct ips_spio *ctrl)
{
	uint64_t credit_return;

	pthread_spin_lock(&ctrl->spio_ctrl->spio_ctrl_lock);

	credit_return = *ctrl->spio_credits_addr;
	/* Update available blocks based on the fill counter and free counter */
	if (ctrl->spio_ctrl->spio_credits.credit_return == credit_return) {
		pthread_spin_unlock(&ctrl->spio_ctrl->spio_ctrl_lock);
		return PSM2_OK_NO_PROGRESS;
	}
	ctrl->spio_ctrl->spio_credits.credit_return = credit_return;

	/* If Status is set, then the send context is halted */
	if (SPIO_CREDITS_Status(ctrl->spio_ctrl->spio_credits.value)) {
		/*
		 * In context sharing mode, if there is a subcontext
		 * process in PIO writing, we need to wait till the PIO
		 * writing is done. So we spin wait here. Other processes
		 * won't come here because for them, there is NO new
		 * credit return change (the first 'if' check in this
		 * routine).
		 */
		while (ctrl->spio_ctrl->spio_write_in_progress) {
			pthread_spin_unlock(&ctrl->spio_ctrl->spio_ctrl_lock);
			usleep(1000);
			pthread_spin_lock(&ctrl->spio_ctrl->spio_ctrl_lock);
		}

		spio_reset_context(ctrl);
	} else {
		/*
		 * OPA1 has a 1M PIO buffer, but each context can have at most
		 * 64K, which is 1K 64B blocks, so the distance between the
		 * fill counter and the credit return counter is no more than
		 * 1024. Both the fill counter and the credit return counter
		 * are 11-bit values, representing the range [0, 2047].
		 */
		psmi_assert((ctrl->spio_ctrl->spio_available_blocks +
			     ((ctrl->spio_ctrl->spio_fill_counter -
			       SPIO_CREDITS_Counter(ctrl->spio_ctrl->
						    spio_credits.value)) &
			      0x7FF)) <= ctrl->spio_total_blocks);
		ctrl->spio_ctrl->spio_available_blocks =
		    ctrl->spio_total_blocks -
		    ((ctrl->spio_ctrl->spio_fill_counter -
		      SPIO_CREDITS_Counter(ctrl->spio_ctrl->
					   spio_credits.value)) & 0x7FF);

		/* A successful credit update; clear the reset count */
		ctrl->spio_ctrl->spio_reset_count = 0;
	}

	pthread_spin_unlock(&ctrl->spio_ctrl->spio_ctrl_lock);

	return PSM2_OK;
}

/*
 * Check and process events.
 * return value:
 * PSM2_OK: normal event processing;
 * PSM2_OK_NO_PROGRESS: no event is processed;
 */
static PSMI_HAL_INLINE psm2_error_t
ips_spio_process_events(const struct ptl *ptl_gen)
{
	struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen;
	struct ips_spio *ctrl = ptl->proto.spioc;
	uint64_t event_mask;
	int rc = psmi_hal_get_hfi_event_bits(&event_mask,
					     ctrl->context->psm_hw_ctxt);

	if (rc)
		return PSM2_OK_NO_PROGRESS;

	/*
	 * If there is no event, try to do a credit return update
	 * to catch a send context halt.
	 */
	if_pf(event_mask == 0)
		return ctrl->spio_credit_return_update(ctrl);

	/*
	 * Process the mmu invalidation event; this will invalidate
	 * all cached items removed by the mmu notifier.
	 */
	if (event_mask & PSM_HAL_HFI_EVENT_TID_MMU_NOTIFY) {
		/*
		 * The driver will clear the event bit before returning;
		 * PSM does not need to ack the event.
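		 * (The remaining event bits handled below are acknowledged
		 * back to the driver explicitly via psmi_hal_ack_hfi_event().)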
*/ return ips_tidcache_invalidation(&ptl->proto.protoexp->tidc); } /* Check if HFI is frozen */ if (event_mask & PSM_HAL_HFI_EVENT_FROZEN) { /* if no progress, return and retry */ if (ctrl->spio_reset_hfi(ctrl) != PSM2_OK) return PSM2_OK_NO_PROGRESS; } /* First ack the driver the receipt of the events */ _HFI_VDBG("Acking event(s) 0x%" PRIx64 " to qib driver.\n", (uint64_t) event_mask); psmi_hal_ack_hfi_event(event_mask, ctrl->context->psm_hw_ctxt); if (event_mask & PSM_HAL_HFI_EVENT_LINKDOWN) { /* A link down event can clear the LMC and SL2VL * change as those events are implicitly handled * in the link up/down event handler. */ event_mask &= ~(PSM_HAL_HFI_EVENT_LMC_CHANGE | PSM_HAL_HFI_EVENT_SL2VL_CHANGE); ips_ibta_link_updown_event(&((struct ptl_ips *)(ctrl->ptl))->proto); _HFI_VDBG("Link down detected.\n"); } if (event_mask & PSM_HAL_HFI_EVENT_LID_CHANGE) { /* Display a warning that LID change has occurred during * the run. This is not supported in the current * implementation and in general is bad for the SM to * re-assign LIDs during a run. */ _HFI_INFO ("Warning! LID change detected during run. " "Old LID: %d, New Lid: %d\n", (int)PSMI_EPID_GET_LID(ctrl->context->epid), (int)psmi_hal_get_port_lid(ctrl->unit_id, ctrl->portnum)); } if (event_mask & PSM_HAL_HFI_EVENT_LMC_CHANGE) _HFI_INFO("Fabric LMC changed.\n"); if (event_mask & PSM_HAL_HFI_EVENT_SL2VL_CHANGE) { _HFI_INFO("SL2VL mapping changed for port.\n"); ips_ibta_init_sl2sc_table(&((struct ptl_ips *)(ctrl->ptl))->proto); } return PSM2_OK; } static PSMI_HAL_INLINE void spio_handle_resync(struct ips_spio *ctrl, uint64_t consecutive_send_failed) { /* hfi_force_pio_avail_update(ctrl->context->ctrl); */ if (!(consecutive_send_failed & (SPIO_MAX_CONSECUTIVE_SEND_FAIL - 1))) spio_handle_stall(ctrl, consecutive_send_failed); } /* * This function attempts to write a packet to a PIO. * * Recoverable errors: * PSM2_OK: Packet triggered through PIO. * PSM2_EP_NO_RESOURCES: No PIO bufs available or cable pulled. * * Unrecoverable errors: * PSM2_EP_NO_NETWORK: No network, no lid, ... * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc. */ static inline psm2_error_t ips_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, struct psm_hal_pbc *pbc, uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum #ifdef PSM_CUDA , uint32_t is_cuda_payload #endif ) { struct ips_spio *ctrl = proto->spioc; volatile struct ips_spio_ctrl *spio_ctrl = ctrl->spio_ctrl; volatile uint64_t *pioaddr; uint32_t paylen, nblks; psm2_error_t err = PSM2_OK; int do_lock = psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); if (do_lock) pthread_spin_lock(&ctrl->spio_lock); #ifdef PSM_FI if_pf(PSMI_FAULTINJ_ENABLED()) { PSMI_FAULTINJ_STATIC_DECL(fi_lost, "piosend", 1, IPS_FAULTINJ_PIOLOST); PSMI_FAULTINJ_STATIC_DECL(fi_busy, "piobusy", 1, IPS_FAULTINJ_PIOBUSY); if (psmi_faultinj_is_fault(fi_lost)) { if (do_lock) pthread_spin_unlock(&ctrl->spio_lock); return PSM2_OK; } else if (psmi_faultinj_is_fault(fi_busy)) goto fi_busy; /* else fall through normal processing path, i.e. no faults */ } #endif /* #ifdef PSM_FI */ psmi_assert((length & 0x3) == 0); paylen = length + (cksum_valid ? 
PSM_CRC_SIZE_IN_BYTES : 0); nblks = 1 + ((paylen + 63) >> 6); if (spio_ctrl->spio_available_blocks < nblks) { ctrl->spio_credit_return_update(ctrl); if_pf(spio_ctrl->spio_available_blocks < nblks) { /* Check unit status */ #ifdef PSM_FI fi_busy: #endif /* #ifdef PSM_FI */ if ((err = psmi_context_check_status(ctrl->context)) == PSM2_OK) { if (0 == (++ctrl-> spio_consecutive_failures & (SPIO_RESYNC_CONSECUTIVE_SEND_FAIL - 1))) spio_handle_resync(ctrl, ctrl-> spio_consecutive_failures); err = PSM2_EP_NO_RESOURCES; } /* If cable is pulled, we don't count it as a consecutive failure, * we just make it as though no send pio was available */ else if (err == PSM2_OK_NO_PROGRESS) err = PSM2_EP_NO_RESOURCES; /* else something bad happened in check_status */ if (do_lock) pthread_spin_unlock(&ctrl->spio_lock); return err; } } /* * if context->spio_ctrl is set, it is pointing to shared context ureg * page, and we are using context sharing. */ if (ctrl->context->spio_ctrl) { pthread_spin_lock(&spio_ctrl->spio_ctrl_lock); if (spio_ctrl->spio_available_blocks < nblks) { pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock); if (do_lock) pthread_spin_unlock(&ctrl->spio_lock); return PSM2_EP_NO_RESOURCES; } } _HFI_VDBG("credits: total %d, avail %d index %d, fill %d " "free %d: %d %d %d %d %d; addr %llx\n", ctrl->spio_total_blocks, spio_ctrl->spio_available_blocks, spio_ctrl->spio_block_index, spio_ctrl->spio_fill_counter, SPIO_CREDITS_Counter(spio_ctrl->spio_credits.value), SPIO_CREDITS_Status(spio_ctrl->spio_credits.value), SPIO_CREDITS_DueToPbc(spio_ctrl->spio_credits.value), SPIO_CREDITS_DueToTheshold(spio_ctrl->spio_credits.value), SPIO_CREDITS_DueToErr(spio_ctrl->spio_credits.value), SPIO_CREDITS_DueToForce(spio_ctrl->spio_credits.value), *ctrl->spio_credits_addr); /* * Save the assigned locally, update the shared for other processes. */ ctrl->spio_block_index = spio_ctrl->spio_block_index; spio_ctrl->spio_available_blocks -= nblks; /* fill counter should be 11 bits value, same as credit return counter */ spio_ctrl->spio_fill_counter = (spio_ctrl->spio_fill_counter + nblks) & 0x7FF; spio_ctrl->spio_block_index += nblks; if (spio_ctrl->spio_block_index >= ctrl->spio_total_blocks) spio_ctrl->spio_block_index -= ctrl->spio_total_blocks; /* * Unlock in context sharing mode, but increase refcount to * indicate I am in progress to write to PIO blocks. */ if (ctrl->context->spio_ctrl) { spio_ctrl->spio_write_in_progress++; pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock); } ctrl->spio_num_stall = 0; /* now able to send, so clear if set */ ctrl->spio_consecutive_failures = 0; if (do_lock) pthread_spin_unlock(&ctrl->spio_lock); _HFI_VDBG("PIO write: nblks %d length %d, paylen %d\n", nblks, length, paylen); /* Setup PBC for this packet */ ips_proto_pbc_update(proto, flow, isCtrlMsg, pbc, sizeof(struct ips_message_header), paylen); /* Write to PIO: SOP block */ pioaddr = ctrl->spio_bufbase_sop + ctrl->spio_block_index * 8; if (++ctrl->spio_block_index == ctrl->spio_total_blocks) ctrl->spio_block_index = 0; ctrl->spio_blockcpy_med(pioaddr, (uint64_t *) pbc, 1); _HFI_VDBG("pio qw write sop %p: 8\n", pioaddr); /* Write to PIO: other blocks of payload */ #ifdef PSM_CUDA if (is_cuda_payload) { /* Since the implementation of cuMemcpy is unknown, and the HFI specifies several conditions for how PIO writes must occur, for safety reasons we should not assume that cuMemcpy will follow the HFI's requirements. The cuMemcpy should instead write into a buffer in host memory, and then PSM can copy to the HFI as usual. 
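	   As an illustrative sketch (names approximate), the staging
	   sequence below amounts to:

	       cuMemcpyDtoH(cuda_pio_buffer, gpu_payload, paylen); // GPU -> pinned host
	       blockcpy/qwordcpy(pio_mmio, cuda_pio_buffer, ...);  // host -> HFI PIO

	   so the device-to-host copy never targets the PIO MMIO window
	   directly, and the HFI's write-ordering rules are preserved.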
*/ PSMI_CUDA_CALL(cuMemcpyDtoH, ctrl->cuda_pio_buffer, (CUdeviceptr)payload, paylen); payload = (uint32_t *) ctrl->cuda_pio_buffer; } #endif if (length >= 64) { ips_spio_blockcpy_fn_t blockcpy_fn; if (length >= 256) { blockcpy_fn = ctrl->spio_blockcpy_large; } else { blockcpy_fn = ctrl->spio_blockcpy_med; } uint32_t blks2send = length >> 6; uint32_t blks2end = ctrl->spio_total_blocks - ctrl->spio_block_index; pioaddr = ctrl->spio_bufbase + ctrl->spio_block_index * 8; if (blks2end >= blks2send) { blockcpy_fn(pioaddr, (uint64_t *)payload, blks2send); _HFI_VDBG("pio blk write %p: %d\n", pioaddr, blks2send); ctrl->spio_block_index += blks2send; if (ctrl->spio_block_index == ctrl->spio_total_blocks) ctrl->spio_block_index = 0; payload += blks2send*16; } else { blockcpy_fn(pioaddr, (uint64_t *)payload, blks2end); _HFI_VDBG("pio blk write %p: %d\n", pioaddr, blks2end); payload += blks2end*16; pioaddr = ctrl->spio_bufbase; blockcpy_fn(pioaddr, (uint64_t *)payload, (blks2send-blks2end)); _HFI_VDBG("pio blk write %p: %d\n", pioaddr, (blks2send-blks2end)); ctrl->spio_block_index = blks2send - blks2end; payload += (blks2send-blks2end)*16; } length -= blks2send*64; } /* * The following code makes sure to write to pioaddr in * qword granularity, this is required by hardware. */ paylen = length + (cksum_valid ? PSM_CRC_SIZE_IN_BYTES : 0); if (paylen > 0) { uint32_t blkbuf[32]; uint32_t qws = length >> 3; uint32_t dws = 0; pioaddr = ctrl->spio_bufbase + ctrl->spio_block_index * 8; if (++ctrl->spio_block_index == ctrl->spio_total_blocks) ctrl->spio_block_index = 0; /* Write the remaining qwords of payload */ if (qws) { hfi_qwordcpy_safe(pioaddr, (uint64_t *) payload, qws); _HFI_VDBG("pio qw write %p: %d\n", pioaddr, qws); payload += qws << 1; length -= qws << 3; pioaddr += qws; paylen -= qws << 3; } /* if we have last one dword payload */ if (length > 0) { blkbuf[dws++] = payload[0]; } /* if we have checksum to attach */ if (paylen > length) { blkbuf[dws++] = cksum; blkbuf[dws++] = cksum; } /* Write the rest of qwords of current block */ hfi_qwordcpy_safe(pioaddr, (uint64_t *) blkbuf, 8 - qws); _HFI_VDBG("pio qw write %p: %d\n", pioaddr, 8 - qws); if (paylen > ((8 - qws) << 3)) { /* We need another block */ pioaddr = ctrl->spio_bufbase + ctrl->spio_block_index * 8; if (++ctrl->spio_block_index == ctrl->spio_total_blocks) ctrl->spio_block_index = 0; /* Write the last block */ hfi_qwordcpy_safe(pioaddr, (uint64_t *) &blkbuf[(8 - qws) << 1], 8); _HFI_VDBG("pio qw write %p: %d\n", pioaddr, 8); } } /* * In context sharing, we need to track who is in progress of * writing to PIO block, this is for halted send context reset. * I am done with PIO blocks writing, decrease the refcount. */ if (ctrl->context->spio_ctrl) { pthread_spin_lock(&spio_ctrl->spio_ctrl_lock); spio_ctrl->spio_write_in_progress--; pthread_spin_unlock(&spio_ctrl->spio_ctrl_lock); } return err; } /* ips_spio_transfer_frame() */ opa-psm2-PSM2_11.2.185/psm_hal_gen1/psm_hal_gen1_spio.h000066400000000000000000000143221370564314600223200ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2017 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2017 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2017 Intel Corporation. All rights reserved. 
*/ #ifndef IPS_SPIO_H #define IPS_SPIO_H #include "psm_user.h" #define IPS_CTXT_RESET_MAX 1000 /* max send context reset */ struct ips_spio; struct ptl; struct ips_proto; struct ips_flow; /* 64B move instruction support */ #define AVX512F_BIT 16 /* level 07h, ebx */ /* 32B move instruction support */ #define AVX2_BIT 5 /* level 07h, ebx */ /* 16B move instruction support */ #define SSE2_BIT 26 /* level 01h, edx */ typedef void (*ips_spio_blockcpy_fn_t)(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock); #ifdef PSM_AVX512 void hfi_pio_blockcpy_512(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock); #endif void hfi_pio_blockcpy_256(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock); void hfi_pio_blockcpy_128(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock); void hfi_pio_blockcpy_64(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock); static PSMI_HAL_INLINE psm2_error_t ips_spio_init(const psmi_context_t *context, struct ptl *ptl, struct ips_spio *ctrl #ifdef PSM_AVX512 , int is_avx512_enabled #endif ); static PSMI_HAL_INLINE psm2_error_t ips_spio_fini(struct ips_spio *ctrl); static inline psm2_error_t ips_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, struct psm_hal_pbc *pbc, uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum #ifdef PSM_CUDA , uint32_t is_cuda_payload #endif ); static psm2_error_t ips_spio_process_events(const struct ptl *ptl); #define SPIO_CREDITS_Counter(value) (((value) >> 0) & 0x7FF) #define SPIO_CREDITS_Status(value) (((value) >> 11) & 0x1) #define SPIO_CREDITS_DueToPbc(value) (((value) >> 12) & 0x1) #define SPIO_CREDITS_DueToTheshold(value) (((value) >> 13) & 0x1) #define SPIO_CREDITS_DueToErr(value) (((value) >> 14) & 0x1) #define SPIO_CREDITS_DueToForce(value) (((value) >> 15) & 0x1) struct ips_spio_credits { /* don't use bit operation for performance reason, * using above macro instead. 
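 * Worked example (illustrative): a credit_return word whose low 16 bits
 * are 0x0801 decodes via the macros above as Counter = 0x0801 & 0x7FF = 1
 * and Status = (0x0801 >> 11) & 0x1 = 1, i.e. one credit returned with
 * the send context halted. For reference, the encoded fields are: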
uint16_t Counter:11; uint16_t Status:1; uint16_t CreditReturnDueToPbc:1; uint16_t CreditReturnDueToThreshold:1; uint16_t CreditReturnDueToErr:1; uint16_t CreditReturnDueToForce:1; */ union { struct { uint16_t value; uint16_t pad0; uint32_t pad1; }; uint64_t credit_return; }; }; struct ips_spio_ctrl { /* credit return lock for context sharing */ pthread_spinlock_t spio_ctrl_lock; /* PIO write in progress for context sharing */ volatile uint16_t spio_write_in_progress; /* send context reset count */ volatile uint16_t spio_reset_count; /* HFI frozen count, shared copy */ volatile uint16_t spio_frozen_count; volatile uint16_t spio_available_blocks; volatile uint16_t spio_block_index; volatile uint16_t spio_fill_counter; volatile struct ips_spio_credits spio_credits; } __attribute__ ((aligned(64))); struct ips_spio { const psmi_context_t *context; struct ptl *ptl; uint16_t unit_id; uint16_t portnum; pthread_spinlock_t spio_lock; /* thread lock */ volatile __le64 *spio_credits_addr __attribute__ ((aligned(64))); volatile uint64_t *spio_bufbase_sop; volatile uint64_t *spio_bufbase; volatile struct ips_spio_ctrl *spio_ctrl; uint16_t spio_frozen_count; /* local copy */ uint16_t spio_total_blocks; uint16_t spio_block_index; uint32_t spio_consecutive_failures; uint64_t spio_num_stall; uint64_t spio_num_stall_total; uint64_t spio_next_stall_warning; uint64_t spio_last_stall_cyc; uint64_t spio_init_cyc; psm2_error_t (*spio_reset_hfi)(struct ips_spio *ctrl); psm2_error_t (*spio_credit_return_update)(struct ips_spio *ctrl); /* copying routines based on block size */ ips_spio_blockcpy_fn_t spio_blockcpy_med; ips_spio_blockcpy_fn_t spio_blockcpy_large; #ifdef PSM_CUDA /* Use an intermediate buffer when writing PIO data from the GPU to ensure that we follow the HFI's write ordering rules. */ unsigned char *cuda_pio_buffer; #define MAX_CUDA_MTU 10240 #endif }; #endif /* IPS_SPIO_H */ opa-psm2-PSM2_11.2.185/psm_hal_gen1/psm_hal_inline_i.h000066400000000000000000001770531370564314600222350ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2017 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2017 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "psm_hal_gen1.h" extern size_t arrsz[MAPSIZE_MAX]; static inline struct _hfp_gen1 *get_psm_gen1_hi(void) { return (struct _hfp_gen1*) psmi_hal_current_hal_instance; } /* hfp_gen1_initialize */ static PSMI_HAL_INLINE int hfp_gen1_initialize(psmi_hal_instance_t *phi) { return 0; } /* hfp_gen1_finalize_ */ static PSMI_HAL_INLINE int hfp_gen1_finalize_(void) { return 0; } /* hfp_gen1_get_num_units */ static PSMI_HAL_INLINE int hfp_gen1_get_num_units(void) { return hfi_get_num_units(); } /* hfp_gen1_get_num_ports */ static PSMI_HAL_INLINE int hfp_gen1_get_num_ports(void) { return HFI_NUM_PORTS_GEN1; } /* hfp_gen1_get_unit_active */ static PSMI_HAL_INLINE int hfp_gen1_get_unit_active(int unit) { return hfi_get_unit_active(unit); } /* hfp_gen1_get_port_active */ static PSMI_HAL_INLINE int hfp_gen1_get_port_active(int unit, int port) { return hfi_get_port_active(unit, port); } /* hfp_gen1_get_contexts */ static PSMI_HAL_INLINE int hfp_gen1_get_num_contexts(int unit) { int64_t nctxts=0; if (!hfi_sysfs_unit_read_s64(unit, "nctxts", &nctxts, 0)) { return (int)nctxts; } return -PSM_HAL_ERROR_GENERAL_ERROR; } /* hfp_gen1_get_num_free_contexts */ static PSMI_HAL_INLINE int hfp_gen1_get_num_free_contexts(int unit) { int64_t nfreectxts=0; if (!hfi_sysfs_unit_read_s64(unit, "nfreectxts", &nfreectxts, 0)) { return (int)nfreectxts; } return -PSM_HAL_ERROR_GENERAL_ERROR; } static void free_egr_buffs(hfp_gen1_pc_private *psm_hw_ctxt) { #define FREE_EGR_BUFFS_TABLE(cl_qs_arr, index) ips_recvq_egrbuf_table_free(((cl_qs_arr)[index]).egr_buffs) size_t i, index, subctxt_cnt; psm_hal_gen1_cl_q_t *cl_qs; cl_qs = psm_hw_ctxt->cl_qs; index = PSM_HAL_CL_Q_RX_EGR_Q; FREE_EGR_BUFFS_TABLE(cl_qs, index); subctxt_cnt = psm_hw_ctxt->user_info.subctxt_cnt; for (i = 0; i < subctxt_cnt; i++) { index = PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(i); FREE_EGR_BUFFS_TABLE(cl_qs, index); } #undef FREE_EGR_BUFFS_TABLE } static void unmap_hfi_mem(hfp_gen1_pc_private *psm_hw_ctxt) { size_t subctxt_cnt = psm_hw_ctxt->user_info.subctxt_cnt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; struct hfi1_base_info *binfo = &ctrl->base_info; struct hfi1_ctxt_info *cinfo = &ctrl->ctxt_info; /* 1. Unmap the PIO credits address */ HFI_MUNMAP_ERRCHECK(binfo, sc_credits_addr, arrsz[SC_CREDITS]); /* 2. Unmap the PIO buffer SOP address */ HFI_MUNMAP_ERRCHECK(binfo, pio_bufbase_sop, arrsz[PIO_BUFBASE_SOP]); /* 3. Unmap the PIO buffer address */ HFI_MUNMAP_ERRCHECK(binfo, pio_bufbase, arrsz[PIO_BUFBASE]); /* 4. Unmap the receive header queue */ HFI_MUNMAP_ERRCHECK(binfo, rcvhdr_bufbase, arrsz[RCVHDR_BUFBASE]); /* 5. Unmap the receive eager buffer */ HFI_MUNMAP_ERRCHECK(binfo, rcvegr_bufbase, arrsz[RCVEGR_BUFBASE]); /* 6. 
	   Unmap the sdma completion queue */
	HFI_MUNMAP_ERRCHECK(binfo, sdma_comp_bufbase, arrsz[SDMA_COMP_BUFBASE]);

	/* 7. Unmap RXE per-context CSRs */
	HFI_MUNMAP_ERRCHECK(binfo, user_regbase, arrsz[USER_REGBASE]);
	ctrl->__hfi_rcvhdrtail = NULL;
	ctrl->__hfi_rcvhdrhead = NULL;
	ctrl->__hfi_rcvegrtail = NULL;
	ctrl->__hfi_rcvegrhead = NULL;
	ctrl->__hfi_rcvofftail = NULL;
	if (cinfo->runtime_flags & HFI1_CAP_HDRSUPP) {
		ctrl->__hfi_rcvtidflow = NULL;
	}

	/* 8. Unmap the rcvhdrq tail register address */
	if (cinfo->runtime_flags & HFI1_CAP_DMA_RTAIL) {
		/* only unmap the RTAIL if it was enabled in the first place */
		HFI_MUNMAP_ERRCHECK(binfo, rcvhdrtail_base, arrsz[RCVHDRTAIL_BASE]);
	} else {
		binfo->rcvhdrtail_base = 0;
	}

	/* 9. Unmap the event page */
	HFI_MUNMAP_ERRCHECK(binfo, events_bufbase, arrsz[EVENTS_BUFBASE]);

	/* 10. Unmap the status page */
	HFI_MUNMAP_ERRCHECK(binfo, status_bufbase, arrsz[STATUS_BUFBASE]);

	/* 11. If subcontexts are used, unmap the buffers */
	if (subctxt_cnt > 0) {
		/* only unmap subcontext-related buffers if subcontexts are enabled */
		HFI_MUNMAP_ERRCHECK(binfo, subctxt_uregbase, arrsz[SUBCTXT_UREGBASE]);
		HFI_MUNMAP_ERRCHECK(binfo, subctxt_rcvhdrbuf, arrsz[SUBCTXT_RCVHDRBUF]);
		HFI_MUNMAP_ERRCHECK(binfo, subctxt_rcvegrbuf, arrsz[SUBCTXT_RCVEGRBUF]);
	}
}

/* hfp_gen1_close_context */
static PSMI_HAL_INLINE int hfp_gen1_close_context(psmi_hal_hw_context *ctxtp)
{
	hfp_gen1_pc_private *psm_hw_ctxt;

	if (!ctxtp || !*ctxtp)
		return PSM_HAL_ERROR_OK;

	psm_hw_ctxt = (hfp_gen1_pc_private *)(*ctxtp);

	/* Free the egress buffers */
	free_egr_buffs(psm_hw_ctxt);

	/* Unmap the HFI memory */
	unmap_hfi_mem(psm_hw_ctxt);

	/* Clean up the rest */
	close(psm_hw_ctxt->ctrl->fd);
	free(psm_hw_ctxt->ctrl);
	psmi_free(psm_hw_ctxt);

	return PSM_HAL_ERROR_OK;
}

/* Moved from psm_context.c */
ustatic PSMI_HAL_INLINE int MOCKABLE(psmi_sharedcontext_params)(int *nranks, int *rankid);
MOCK_DCL_EPILOGUE(psmi_sharedcontext_params);

ustatic PSMI_HAL_INLINE psm2_error_t psmi_init_userinfo_params(psm2_ep_t ep,
					       int unit_id,
					       psm2_uuid_t const unique_job_key,
					       struct hfi1_user_info_dep *user_info);

/*
 * Prepare user_info params for driver open, used only in psmi_context_open
 */
ustatic PSMI_HAL_INLINE psm2_error_t
psmi_init_userinfo_params(psm2_ep_t ep, int unit_id,
			  psm2_uuid_t const unique_job_key,
			  struct hfi1_user_info_dep *user_info)
{
	/* static variables, shared among rails */
	static int shcontexts_enabled = -1, rankid, nranks;

	int avail_contexts = 0, max_contexts, ask_contexts;
	int ranks_per_context = 0;
	psm2_error_t err = PSM2_OK;
	union psmi_envvar_val env_maxctxt, env_ranks_per_context;
	static int subcontext_id_start;

	memset(user_info, 0, sizeof(*user_info));
	user_info->userversion = HFI1_USER_SWMINOR |
		(hfi_get_user_major_version() << HFI1_SWMAJOR_SHIFT);
	user_info->subctxt_id = 0;
	user_info->subctxt_cnt = 0;
	memcpy(user_info->uuid, unique_job_key, sizeof(user_info->uuid));

	if (shcontexts_enabled == -1) {
		shcontexts_enabled =
		    psmi_sharedcontext_params(&nranks, &rankid);
	}
	if (!shcontexts_enabled)
		return err;

	avail_contexts = hfi_get_num_contexts(unit_id);
	if (avail_contexts == 0) {
		err = psmi_handle_error(NULL, PSM2_EP_NO_DEVICE,
					"PSM2 found 0 available contexts on opa device(s).");
		goto fail;
	}

	/* See if the user wants finer control over context assignments */
	if (!psmi_getenv("PSM2_MAX_CONTEXTS_PER_JOB",
			 "Maximum number of contexts for this PSM2 job",
			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
			 (union psmi_envvar_val)avail_contexts,
			 &env_maxctxt)) {
		max_contexts = max(env_maxctxt.e_int, 1); /* needs to be at least 1 */
		ask_contexts = min(max_contexts, avail_contexts); /* needs to be available */
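		/*
		 * Illustrative example: with PSM2_MAX_CONTEXTS_PER_JOB=8 set
		 * in the environment and 16 contexts available on the unit,
		 * this yields max_contexts = 8 and
		 * ask_contexts = min(8, 16) = 8.
		 */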
	} else if (!psmi_getenv("PSM2_SHAREDCONTEXTS_MAX",
				"",	/* deprecated */
				PSMI_ENVVAR_LEVEL_HIDDEN |
				PSMI_ENVVAR_LEVEL_NEVER_PRINT,
				PSMI_ENVVAR_TYPE_INT,
				(union psmi_envvar_val)avail_contexts,
				&env_maxctxt)) {
		_HFI_INFO
		    ("The PSM2_SHAREDCONTEXTS_MAX env variable is deprecated. Please use PSM2_MAX_CONTEXTS_PER_JOB in future.\n");
		max_contexts = max(env_maxctxt.e_int, 1); /* needs to be at least 1 */
		ask_contexts = min(max_contexts, avail_contexts); /* needs to be available */
	} else
		ask_contexts = max_contexts = avail_contexts;

	if (!psmi_getenv("PSM2_RANKS_PER_CONTEXT",
			 "Number of ranks per context",
			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT,
			 (union psmi_envvar_val)1,
			 &env_ranks_per_context)) {
		ranks_per_context = max(env_ranks_per_context.e_int, 1);
		ranks_per_context =
		    min(ranks_per_context, HFI1_MAX_SHARED_CTXTS);
	}

	/*
	 * See if we could get a valid ppn. If not, approximate it to be the
	 * number of cores.
	 */
	if (nranks == -1) {
		long nproc = sysconf(_SC_NPROCESSORS_ONLN);

		if (nproc < 1)
			nranks = 1;
		else
			nranks = nproc;
	}

	/*
	 * Make sure that our guesses are good educated guesses.
	 */
	if (rankid >= nranks) {
		_HFI_PRDBG
		    ("PSM2_SHAREDCONTEXTS disabled because lrank=%d,ppn=%d\n",
		     rankid, nranks);
		goto fail;
	}

	if (ranks_per_context) {
		int contexts =
		    (nranks + ranks_per_context - 1) / ranks_per_context;
		if (contexts > ask_contexts) {
			err = psmi_handle_error(NULL, PSM2_EP_NO_DEVICE,
						"Incompatible settings for "
						"PSM2_MAX_CONTEXTS_PER_JOB and PSM2_RANKS_PER_CONTEXT");
			goto fail;
		}
		ask_contexts = contexts;
	}

	/* group id based on total groups and local rank id */
	user_info->subctxt_id = subcontext_id_start + rankid % ask_contexts;
	/* This is for multi-rail: when we set up a new rail, we cannot
	 * use the same subcontext ID as the previous rail; otherwise,
	 * the driver will match the previous rail and fail.
	 */
	subcontext_id_start += ask_contexts;

	/* Need to compute how many *other* peers we will be sharing the
	 * context with */
	if (nranks > ask_contexts) {
		user_info->subctxt_cnt = nranks / ask_contexts;
		/* If ppn != multiple of contexts, some contexts get an uneven
		 * number of subcontexts */
		if (nranks % ask_contexts > rankid % ask_contexts)
			user_info->subctxt_cnt++;
		/* The case of 1 process "sharing" a context (giving 1
		 * subcontext) is supported by the driver and PSM. However,
		 * there is no need to share in this case, so disable context
		 * sharing. */
		if (user_info->subctxt_cnt == 1)
			user_info->subctxt_cnt = 0;
		if (user_info->subctxt_cnt > HFI1_MAX_SHARED_CTXTS) {
			err = psmi_handle_error(NULL, PSM2_INTERNAL_ERR,
						"Calculation of subcontext count exceeded maximum supported");
			goto fail;
		}
	}
	/* else subcontext_cnt remains 0 and context sharing is disabled.
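	 * Worked example (illustrative numbers): with nranks = 6 and
	 * ask_contexts = 4, the base share is 6 / 4 = 1 with remainder 2,
	 * so ranks with rankid % 4 < 2 compute subctxt_cnt = 2, while the
	 * others compute 1 and are then reset to 0, disabling sharing for
	 * the contexts that host only a single rank.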
*/ _HFI_PRDBG("PSM2_SHAREDCONTEXTS lrank=%d,ppn=%d,avail_contexts=%d," "max_contexts=%d,ask_contexts=%d," "ranks_per_context=%d,id=%u,cnt=%u\n", rankid, nranks, avail_contexts, max_contexts, ask_contexts, ranks_per_context, user_info->subctxt_id, user_info->subctxt_cnt); fail: return err; } ustatic int MOCKABLE(psmi_sharedcontext_params)(int *nranks, int *rankid) { union psmi_envvar_val enable_shcontexts; char *ppn_env = NULL, *lrank_env = NULL, *c; *rankid = -1; *nranks = -1; #if 0 /* DEBUG: Used to selectively test possible shared context and shm-only * settings */ unsetenv("PSC_MPI_NODE_RANK"); unsetenv("PSC_MPI_PPN"); unsetenv("MPI_LOCALRANKID"); unsetenv("MPI_LOCALRANKS"); #endif /* We do not support context sharing for multiple endpoints */ if (psmi_multi_ep_enabled) { return 0; } /* New name in 2.0.1, keep observing old name */ psmi_getenv("PSM2_SHAREDCONTEXTS", "Enable shared contexts", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_YESNO, (union psmi_envvar_val) PSMI_SHARED_CONTEXTS_ENABLED_BY_DEFAULT, &enable_shcontexts); if (!enable_shcontexts.e_int) return 0; /* We support two types of syntaxes to let users give us a hint what * our local rankid is. Moving towards MPI_, but still support PSC_ */ if ((c = getenv("MPI_LOCALRANKID")) && *c != '\0') { lrank_env = "MPI_LOCALRANKID"; ppn_env = "MPI_LOCALNRANKS"; } else if ((c = getenv("PSC_MPI_PPN")) && *c != '\0') { ppn_env = "PSC_MPI_PPN"; lrank_env = "PSC_MPI_NODE_RANK"; } if (ppn_env != NULL && lrank_env != NULL) { union psmi_envvar_val env_rankid, env_nranks; psmi_getenv(lrank_env, "Shared context rankid", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)-1, &env_rankid); psmi_getenv(ppn_env, "Shared context numranks", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)-1, &env_nranks); *rankid = env_rankid.e_int; *nranks = env_nranks.e_int; return 1; } else return 0; } MOCK_DEF_EPILOGUE(psmi_sharedcontext_params); /* moved from ips_subcontext.c */ static PSMI_HAL_INLINE psm2_error_t divvy_shared_mem_ptrs(hfp_gen1_pc_private *pc_private, psmi_context_t *context, const struct hfi1_base_info *base_info) { struct ips_hwcontext_ctrl **hwcontext_ctrl = &pc_private->hwcontext_ctrl; uint32_t subcontext_cnt = pc_private->user_info.subctxt_cnt; struct ips_subcontext_ureg **uregp = &pc_private->subcontext_ureg[0]; uintptr_t all_subcontext_uregbase = (uintptr_t) base_info->subctxt_uregbase; int i; psmi_assert_always(all_subcontext_uregbase != 0); for (i = 0; i < HFI1_MAX_SHARED_CTXTS; i++) { struct ips_subcontext_ureg *subcontext_ureg = (struct ips_subcontext_ureg *)all_subcontext_uregbase; *uregp++ = (i < subcontext_cnt) ? 
subcontext_ureg : NULL; all_subcontext_uregbase += sizeof(struct ips_subcontext_ureg); } *hwcontext_ctrl = (struct ips_hwcontext_ctrl *)all_subcontext_uregbase; all_subcontext_uregbase += sizeof(struct ips_hwcontext_ctrl); context->spio_ctrl = (void *)all_subcontext_uregbase; all_subcontext_uregbase += sizeof(struct ips_spio_ctrl); context->tid_ctrl = (void *)all_subcontext_uregbase; all_subcontext_uregbase += sizeof(struct ips_tid_ctrl); context->tf_ctrl = (void *)all_subcontext_uregbase; all_subcontext_uregbase += sizeof(struct ips_tf_ctrl); psmi_assert((all_subcontext_uregbase - (uintptr_t) base_info->subctxt_uregbase) <= PSMI_PAGESIZE); return PSM2_OK; } static PSMI_HAL_INLINE uint64_t get_cap_mask(uint64_t gen1_mask) { static const struct { uint64_t gen1_bit; uint32_t psmi_hal_bit; } bit_map[] = { { HFI1_CAP_SDMA, PSM_HAL_CAP_SDMA }, { HFI1_CAP_SDMA_AHG, PSM_HAL_CAP_SDMA_AHG }, { HFI1_CAP_EXTENDED_PSN, PSM_HAL_CAP_EXTENDED_PSN }, { HFI1_CAP_HDRSUPP, PSM_HAL_CAP_HDRSUPP }, { HFI1_CAP_USE_SDMA_HEAD, PSM_HAL_CAP_USE_SDMA_HEAD }, { HFI1_CAP_MULTI_PKT_EGR, PSM_HAL_CAP_MULTI_PKT_EGR }, { HFI1_CAP_NODROP_RHQ_FULL, PSM_HAL_CAP_NODROP_RHQ_FULL }, { HFI1_CAP_NODROP_EGR_FULL, PSM_HAL_CAP_NODROP_EGR_FULL }, { HFI1_CAP_TID_UNMAP, PSM_HAL_CAP_TID_UNMAP }, { HFI1_CAP_PRINT_UNIMPL, PSM_HAL_CAP_PRINT_UNIMPL }, { HFI1_CAP_ALLOW_PERM_JKEY, PSM_HAL_CAP_ALLOW_PERM_JKEY }, { HFI1_CAP_NO_INTEGRITY, PSM_HAL_CAP_NO_INTEGRITY }, { HFI1_CAP_PKEY_CHECK, PSM_HAL_CAP_PKEY_CHECK }, { HFI1_CAP_STATIC_RATE_CTRL, PSM_HAL_CAP_STATIC_RATE_CTRL }, { HFI1_CAP_SDMA_HEAD_CHECK, PSM_HAL_CAP_SDMA_HEAD_CHECK }, { HFI1_CAP_EARLY_CREDIT_RETURN, PSM_HAL_CAP_EARLY_CREDIT_RETURN }, #ifdef HFI1_CAP_GPUDIRECT_OT { HFI1_CAP_GPUDIRECT_OT, PSM_HAL_CAP_GPUDIRECT_OT }, #else /* #ifdef HFI1_CAP_GPUDIRECT_OT */ #ifndef PSM_CUDA /* lifted from hfi1_user.h */ { (1UL << 63), PSM_HAL_CAP_GPUDIRECT_OT }, #else /* #ifndef PSM_CUDA */ #error "Inconsistent build. HFI1_CAP_GPUDIRECT_OT must be defined for CUDA builds." 
#endif /* #ifndef PSM_CUDA */ #endif /* #ifdef HFI1_CAP_GPUDIRECT_OT */ }; uint64_t rv = 0; int i; for (i=0;i < sizeof(bit_map)/sizeof(bit_map[0]);i++) { if (bit_map[i].gen1_bit & gen1_mask) rv |= bit_map[i].psmi_hal_bit; } return rv; } /* hfp_gen1_context_open */ static PSMI_HAL_INLINE int hfp_gen1_context_open(int unit, int port, uint64_t open_timeout, psm2_ep_t ep, psm2_uuid_t const job_key, psmi_context_t *psm_ctxt, uint32_t cap_mask, unsigned retryCnt) { int fd = -1; psm2_error_t err = PSM2_OK; hfp_gen1_pc_private *pc_private = psmi_malloc(ep, UNDEFINED, sizeof(hfp_gen1_pc_private)); if_pf (!pc_private) { err = -PSM_HAL_ERROR_CANNOT_OPEN_CONTEXT; goto bail; } memset(pc_private, 0, sizeof(hfp_gen1_pc_private)); char dev_name[PATH_MAX]; fd = hfi_context_open_ex(unit, port, open_timeout, dev_name, sizeof(dev_name)); if (fd < 0) { err = -PSM_HAL_ERROR_CANNOT_OPEN_DEVICE; goto bail; } err = psmi_init_userinfo_params(ep, unit, job_key, &pc_private->user_info); if (err) { err = -PSM_HAL_ERROR_GENERAL_ERROR; goto bail; } /* attempt to assign the context via hfi_userinit_internal() */ int retry = 0; do { if (retry > 0) _HFI_INFO("hfi_userinit_internal: failed, trying again (%d/%d)\n", retry, retryCnt); pc_private->ctrl = hfi_userinit_internal(fd, ep->skip_affinity, &pc_private->user_info); } while (pc_private->ctrl == NULL && ++retry <= retryCnt); if (!pc_private->ctrl) { err = -PSM_HAL_ERROR_CANNOT_OPEN_CONTEXT; goto bail; } else { if (getenv("PSM2_IDENTIFY")) { printf("%s %s run-time driver interface v%d.%d\n", hfi_get_mylabel(), hfi_ident_tag, hfi_get_user_major_version(), hfi_get_user_minor_version()); } struct _hfi_ctrl *ctrl = pc_private->ctrl; int i; if (hfi_get_port_lid(ctrl->__hfi_unit, ctrl->__hfi_port) <= 0) { err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, "Can't get HFI LID in psm2_ep_open: is SMA running?"); goto bail; } uint64_t gid_lo,gid_hi; if (hfi_get_port_gid(ctrl->__hfi_unit, ctrl->__hfi_port, &gid_hi, &gid_lo) == -1) { err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, "Can't get HFI GID in psm2_ep_open: is SMA running?"); goto bail; } ep->unit_id = ctrl->__hfi_unit; ep->portnum = ctrl->__hfi_port; ep->gid_hi = gid_hi; ep->gid_lo = gid_lo; /* Endpoint out_sl contains the default SL to use for this endpoint. */ /* Get the MTU for this SL. */ int sc; if ((sc=hfi_get_port_sl2sc(ep->unit_id, ctrl->__hfi_port, ep->out_sl)) < 0) { sc = PSMI_SC_DEFAULT; } int vl; if ((vl = hfi_get_port_sc2vl(ep->unit_id, ctrl->__hfi_port, sc)) < 0) { vl = PSMI_VL_DEFAULT; } if (sc == PSMI_SC_ADMIN || vl == PSMI_VL_ADMIN) { err = psmi_handle_error(NULL, PSM2_INTERNAL_ERR, "Invalid sl: %d, please specify correct sl via HFI_SL", ep->out_sl); goto bail; } if ((ep->mtu = hfi_get_port_vl2mtu(ep->unit_id, ctrl->__hfi_port, vl)) < 0) { err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, "Can't get MTU for VL %d", vl); goto bail; } get_psm_gen1_hi()->phi.params.cap_mask = cap_mask | get_cap_mask(ctrl->ctxt_info.runtime_flags) | PSM_HAL_CAP_MERGED_TID_CTRLS | PSM_HAL_CAP_RSM_FECN_SUPP; int driver_major = hfi_get_user_major_version(); int driver_minor = hfi_get_user_minor_version(); if ((driver_major > 6) || ((driver_major == 6) && (driver_minor >= 3))) { get_psm_gen1_hi()->phi.params.cap_mask |= PSM_HAL_CAP_DMA_HSUPP_FOR_32B_MSGS; } get_psm_gen1_hi()->hfp_private.sdmahdr_req_size = HFI_SDMA_HDR_SIZE; if (hfi_check_non_dw_mul_sdma()) get_psm_gen1_hi()->phi.params.cap_mask |= PSM_HAL_CAP_NON_DW_MULTIPLE_MSG_SIZE; /* The dma_rtail member is: 1 when the HFI1_CAP_DMA_RTAIL bit is set. 
0 when the HFI1_CAP_DMA_RTAIL bit is NOT set. */ get_psm_gen1_hi()->hfp_private.dma_rtail = 0 != (HFI1_CAP_DMA_RTAIL & ctrl->ctxt_info.runtime_flags); psm_ctxt->psm_hw_ctxt = pc_private; if (pc_private->user_info.subctxt_cnt > 0) divvy_shared_mem_ptrs(pc_private, psm_ctxt, &ctrl->base_info); /* Initialize all of the cl q's. */ get_psm_gen1_hi()->hfp_private.hdrq_rhf_off = (ctrl->ctxt_info.rcvhdrq_entsize - 8) >> BYTE2DWORD_SHIFT; /* The following guard exists to workaround a critical issue flagged by KW to prevent subscripting past the end of the cl_qs[] array in the following for () loop. */ if (pc_private->user_info.subctxt_cnt <= HFI1_MAX_SHARED_CTXTS) { /* Here, we are initializing only the rx hdrq rhf seq for all subcontext cl q's: */ for (i=PSM_HAL_CL_Q_RX_HDR_Q_SC_0; i < PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(pc_private->user_info.subctxt_cnt); i += 2) { psm_hal_gen1_cl_q_t *pcl_q = &(pc_private->cl_qs[i]); pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = &pcl_q->hdr_qe.rx_hdrq_rhf_seq; if (get_psm_gen1_hi()->hfp_private.dma_rtail) pcl_q->hdr_qe.rx_hdrq_rhf_seq = 0; else pcl_q->hdr_qe.rx_hdrq_rhf_seq = 1; } } /* Next, initialize the hw rx hdr q and egr buff q: */ { /* base address of user registers */ volatile uint64_t *uregbase = (volatile uint64_t *)(uintptr_t) (ctrl->base_info.user_regbase); /* hw rx hdr q: */ psm_hal_gen1_cl_q_t *pcl_q = &(pc_private->cl_qs[PSM_HAL_CL_Q_RX_HDR_Q]); pcl_q->cl_q_head = (volatile uint64_t *)&(uregbase[ur_rcvhdrhead]); pcl_q->cl_q_tail = (volatile uint64_t *)&(uregbase[ur_rcvhdrtail]); pcl_q->hdr_qe.hdrq_base_addr = (uint32_t *) (ctrl->base_info.rcvhdr_bufbase); /* Initialize the ptr to the rx hdrq rhf seq: */ if (pc_private->user_info.subctxt_cnt > 0) /* During sharing of a context, the h/w hdrq rhf_seq is placed in shared memory and is shared by all subcontexts: */ pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = &pc_private->hwcontext_ctrl->rx_hdrq_rhf_seq; else pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = &pcl_q->hdr_qe.rx_hdrq_rhf_seq; if (get_psm_gen1_hi()->hfp_private.dma_rtail) *pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = 0; else *pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = 1; /* hw egr buff q: */ pcl_q = &pc_private->cl_qs[PSM_HAL_CL_Q_RX_EGR_Q]; pcl_q->cl_q_head = (volatile uint64_t *)&(uregbase[ur_rcvegrindexhead]); pcl_q->cl_q_tail = (volatile uint64_t *)&(uregbase[ur_rcvegrindextail]); pcl_q->egr_buffs = ips_recvq_egrbuf_table_alloc(ep, (void*)(ctrl->base_info.rcvegr_bufbase), ctrl->ctxt_info.egrtids, ctrl->ctxt_info.rcvegr_size); } /* Next, initialize the subcontext's rx hdr q and egr buff q: */ for (i=0; i < pc_private->user_info.subctxt_cnt;i++) { /* Subcontexts mimic the HW registers but use different addresses * to avoid cache contention. 
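		 * Each subcontext gets its own ips_subcontext_ureg region, so
		 * the head/tail "registers" used below are plain memory words
		 * at the familiar ur_* offsets (scaled by 8) within that
		 * region rather than real MMIO registers.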
*/ volatile uint64_t *subcontext_uregbase; uint32_t *rcv_hdr, *rcv_egr; unsigned hdrsize, egrsize; unsigned pagesize = getpagesize(); uint32_t subcontext = i; unsigned i = pagesize - 1; hdrsize = (ctrl->ctxt_info.rcvhdrq_cnt * ctrl->ctxt_info.rcvhdrq_entsize + i) & ~i; egrsize = (ctrl->ctxt_info.egrtids * ctrl->ctxt_info.rcvegr_size + i) & ~i; subcontext_uregbase = (uint64_t *) (((uintptr_t) (ctrl->base_info.subctxt_uregbase)) + (sizeof(struct ips_subcontext_ureg) * subcontext)); { struct ips_subcontext_ureg *pscureg = (struct ips_subcontext_ureg *)subcontext_uregbase; if (subcontext == ctrl->ctxt_info.subctxt) { memset(pscureg, 0, sizeof(*pscureg)); if (get_psm_gen1_hi()->hfp_private.dma_rtail) pscureg->writeq_state.hdrq_rhf_seq = 0; else pscureg->writeq_state.hdrq_rhf_seq = 1; } } rcv_hdr = (uint32_t *) (((uintptr_t) (ctrl->base_info.subctxt_rcvhdrbuf)) + (hdrsize * subcontext)); rcv_egr = (uint32_t *) (((uintptr_t) ctrl->base_info.subctxt_rcvegrbuf + (egrsize * subcontext))); /* rx hdr q: */ psm_hal_gen1_cl_q_t *pcl_q = &(pc_private->cl_qs[PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(subcontext)]); pcl_q->hdr_qe.hdrq_base_addr = rcv_hdr; pcl_q->cl_q_head = (volatile uint64_t *)&subcontext_uregbase[ur_rcvhdrhead * 8]; pcl_q->cl_q_tail = (volatile uint64_t *)&subcontext_uregbase[ur_rcvhdrtail * 8]; /* egr q: */ pcl_q = &(pc_private->cl_qs[PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(subcontext)]); pcl_q->cl_q_head = (volatile uint64_t *)&subcontext_uregbase[ur_rcvegrindexhead * 8]; pcl_q->cl_q_tail = (volatile uint64_t *)&subcontext_uregbase[ur_rcvegrindextail * 8]; pcl_q->egr_buffs = ips_recvq_egrbuf_table_alloc( ep, (void*)rcv_egr, ctrl->ctxt_info.egrtids, ctrl->ctxt_info.rcvegr_size); } return PSM_HAL_ERROR_OK; } return PSM_HAL_ERROR_OK; bail: if (fd >0) close(fd); if (pc_private) { if (pc_private->ctrl) free(pc_private->ctrl); psmi_free(pc_private); } return -PSM_HAL_ERROR_GENERAL_ERROR; } /* hfp_gen1_get_port_index2pkey */ static PSMI_HAL_INLINE int hfp_gen1_get_port_index2pkey(int unit, int port, int index) { return hfi_get_port_index2pkey(unit, port, index); } static PSMI_HAL_INLINE int hfp_gen1_get_cc_settings_bin(int unit, int port, char *ccabuf, size_t len_ccabuf) { return hfi_get_cc_settings_bin(unit, port, ccabuf, len_ccabuf); } static PSMI_HAL_INLINE int hfp_gen1_get_cc_table_bin(int unit, int port, uint16_t **ccatp) { return hfi_get_cc_table_bin(unit, port, ccatp); } static PSMI_HAL_INLINE int hfp_gen1_get_port_lmc(int unit, int port) { return hfi_get_port_lmc(unit, port); } static PSMI_HAL_INLINE int hfp_gen1_get_port_rate(int unit, int port) { return hfi_get_port_rate(unit, port); } static PSMI_HAL_INLINE int hfp_gen1_get_port_sl2sc(int unit, int port, int sl) { return hfi_get_port_sl2sc(unit, port, sl); } static PSMI_HAL_INLINE int hfp_gen1_get_sc2vl_map(struct ips_proto *proto) { hfp_gen1_pc_private *psm_hw_ctxt = proto->ep->context.psm_hw_ctxt; uint8_t i; /* Get SC2VL table for unit, port */ for (i = 0; i < PSMI_N_SCS; i++) { int ret = hfi_get_port_sc2vl( psmi_hal_get_unit_id( proto->ep->context.psm_hw_ctxt), psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt), i); if (ret < 0) /* Unable to get SC2VL. 
Set it to default */ ret = PSMI_VL_DEFAULT; psm_hw_ctxt->sc2vl[i] = (uint16_t) ret; } return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_set_pkey(psmi_hal_hw_context ctxt, uint16_t pkey) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; return hfi_set_pkey(psm_hw_ctxt->ctrl, pkey); } static PSMI_HAL_INLINE int hfp_gen1_poll_type(uint16_t poll_type, psmi_hal_hw_context ctxt) { if (poll_type == PSMI_HAL_POLL_TYPE_URGENT) poll_type = HFI1_POLL_TYPE_URGENT; else poll_type = 0; hfp_gen1_pc_private *psm_hw_ctxt = ctxt; return hfi_poll_type(psm_hw_ctxt->ctrl, poll_type); } static PSMI_HAL_INLINE int hfp_gen1_get_port_lid(int unit, int port) { return hfi_get_port_lid(unit, port); } static PSMI_HAL_INLINE int hfp_gen1_get_port_gid(int unit, int port, uint64_t *hi, uint64_t *lo) { return hfi_get_port_gid(unit, port, hi, lo); } static PSMI_HAL_INLINE int hfp_gen1_free_tid(psmi_hal_hw_context ctxt, uint64_t tidlist, uint32_t tidcnt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; return hfi_free_tid(psm_hw_ctxt->ctrl, tidlist, tidcnt); } static PSMI_HAL_INLINE int hfp_gen1_get_tidcache_invalidation(psmi_hal_hw_context ctxt, uint64_t tidlist, uint32_t *tidcnt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; return hfi_get_invalidation(psm_hw_ctxt->ctrl, tidlist, tidcnt); } static PSMI_HAL_INLINE int hfp_gen1_update_tid(psmi_hal_hw_context ctxt, uint64_t vaddr, uint32_t *length, uint64_t tidlist, uint32_t *tidcnt, uint16_t flags) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; return hfi_update_tid(psm_hw_ctxt->ctrl, vaddr, length, tidlist, tidcnt, flags); } static PSMI_HAL_INLINE int hfp_gen1_writev(const struct iovec *iov, int iovcnt, struct ips_epinfo *ignored, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = (hfp_gen1_pc_private *)ctxt; return hfi_cmd_writev(psm_hw_ctxt->ctrl->fd, iov, iovcnt); } static PSMI_HAL_INLINE int hfp_gen1_dma_slot_available(int slotidx, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; if (slotidx < 0 || slotidx >= ctrl->ctxt_info.sdma_ring_size) return -1; struct hfi1_sdma_comp_entry *sdma_comp_queue = (struct hfi1_sdma_comp_entry *) ctrl->base_info.sdma_comp_bufbase; return sdma_comp_queue[slotidx].status != QUEUED; } static PSMI_HAL_INLINE int hfp_gen1_get_sdma_ring_slot_status(int slotIdx, psmi_hal_sdma_ring_slot_status *status, uint32_t *errorCode, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; if (slotIdx < 0 || slotIdx >= ctrl->ctxt_info.sdma_ring_size) { *status = PSM_HAL_SDMA_RING_ERROR; return -PSM_HAL_ERROR_GENERAL_ERROR; } struct hfi1_sdma_comp_entry *sdma_comp_queue = (struct hfi1_sdma_comp_entry *) ctrl->base_info.sdma_comp_bufbase; switch (sdma_comp_queue[slotIdx].status) { case FREE: *status = PSM_HAL_SDMA_RING_AVAILABLE; break; case QUEUED: *status = PSM_HAL_SDMA_RING_QUEUED; break; case COMPLETE: *status = PSM_HAL_SDMA_RING_COMPLETE; break; case ERROR: *status = PSM_HAL_SDMA_RING_ERROR; break; default: *status = PSM_HAL_SDMA_RING_ERROR; return -PSM_HAL_ERROR_GENERAL_ERROR; } *errorCode = sdma_comp_queue[slotIdx].errcode; return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_get_hfi_event_bits(uint64_t *event_bits, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; uint64_t *pevents_mask = (uint64_t *)ctrl->base_info.events_bufbase; uint64_t events_mask = *pevents_mask; uint64_t hal_hfi_event_bits = 0; int i; if (!events_mask) { *event_bits = 0; return 
PSM_HAL_ERROR_OK; } /* Encode hfi1_events as HAL event codes here */ for (i = 0; i < sizeof(hfi1_events_map)/sizeof(hfi1_events_map[0]); i++) { if (events_mask & hfi1_events_map[i].hfi1_event_bit) hal_hfi_event_bits |= hfi1_events_map[i].psmi_hal_hfi_event_bit; } *event_bits = hal_hfi_event_bits; return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_ack_hfi_event(uint64_t ack_bits, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; uint64_t hfi1_ack_bits = 0; int i; /* Decode from HAL event codes to hfi1_events */ for (i = 0; i < sizeof(hfi1_events_map)/sizeof(hfi1_events_map[0]); i++) { if (ack_bits & hfi1_events_map[i].psmi_hal_hfi_event_bit) hfi1_ack_bits |= hfi1_events_map[i].hfi1_event_bit; } return hfi_event_ack(ctrl, hfi1_ack_bits); } static PSMI_HAL_INLINE int hfp_gen1_hfi_reset_context(psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; return hfi_reset_context(ctrl); } static PSMI_HAL_INLINE uint64_t hfp_gen1_get_hw_status(psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; struct hfi1_status *status = (struct hfi1_status *) ctrl->base_info.status_bufbase; uint64_t hw_status = 0; int i; static const struct { uint32_t hfi1_status_dev_bit, psmi_hal_status_bit; } status_dev_map[] = { { HFI1_STATUS_INITTED, PSM_HAL_HW_STATUS_INITTED }, { HFI1_STATUS_CHIP_PRESENT, PSM_HAL_HW_STATUS_CHIP_PRESENT }, { HFI1_STATUS_HWERROR, PSM_HAL_HW_STATUS_HWERROR }, }; for (i=0; i < sizeof(status_dev_map)/sizeof(status_dev_map[0]); i++) { if (status->dev &status_dev_map[i].hfi1_status_dev_bit) hw_status |= status_dev_map[i].psmi_hal_status_bit; } static const struct { uint32_t hfi1_status_port_bit, psmi_hal_status_bit; } status_port_map[] = { { HFI1_STATUS_IB_READY, PSM_HAL_HW_STATUS_IB_READY }, { HFI1_STATUS_IB_CONF, PSM_HAL_HW_STATUS_IB_CONF }, }; for (i=0; i < sizeof(status_port_map)/sizeof(status_port_map[0]); i++) { if (status->port &status_port_map[i].hfi1_status_port_bit) hw_status |= status_port_map[i].psmi_hal_status_bit; } return hw_status; } static PSMI_HAL_INLINE int hfp_gen1_get_hw_status_freezemsg(volatile char** msg, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; struct hfi1_status *status = (struct hfi1_status *) ctrl->base_info.status_bufbase; *msg = (volatile char *) status->freezemsg; return PSM2_OK; } static PSMI_HAL_INLINE uint16_t hfp_gen1_get_user_major_bldtime_version() { return HFI1_USER_SWMAJOR; } static PSMI_HAL_INLINE uint16_t hfp_gen1_get_user_minor_bldtime_version() { return HFI1_USER_SWMINOR; } static PSMI_HAL_INLINE uint16_t hfp_gen1_get_user_major_runtime_version(psmi_hal_hw_context ctx) { return hfi_get_user_major_version(); } static PSMI_HAL_INLINE uint16_t hfp_gen1_get_user_minor_runtime_version(psmi_hal_hw_context ctx) { return hfi_get_user_minor_version(); } static inline uint32_t get_ht(volatile uint64_t *ht_register) { uint64_t res = *ht_register; ips_rmb(); return (uint32_t)res; } static inline void set_ht(volatile uint64_t *ht_register, uint64_t new_ht) { *ht_register = new_ht; return; } /* hfp_gen1_get_cl_q_head_index */ static PSMI_HAL_INLINE psmi_hal_cl_idx hfp_gen1_get_cl_q_head_index( psmi_hal_cl_q cl_q, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; return get_ht(psm_hw_ctxt->cl_qs[cl_q].cl_q_head); } /* hfp_gen1_get_cl_q_tail_index */ static PSMI_HAL_INLINE psmi_hal_cl_idx 
hfp_gen1_get_cl_q_tail_index( psmi_hal_cl_q cl_q, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; return get_ht(psm_hw_ctxt->cl_qs[cl_q].cl_q_tail); } /* hfp_gen1_set_cl_q_head_index */ static PSMI_HAL_INLINE void hfp_gen1_set_cl_q_head_index( psmi_hal_cl_idx idx, psmi_hal_cl_q cl_q, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; set_ht(psm_hw_ctxt->cl_qs[cl_q].cl_q_head, idx); return; } /* hfp_gen1_set_cl_q_tail_index */ static PSMI_HAL_INLINE void hfp_gen1_set_cl_q_tail_index( psmi_hal_cl_idx idx, psmi_hal_cl_q cl_q, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; set_ht(psm_hw_ctxt->cl_qs[cl_q].cl_q_tail, idx); return; } /* hfp_gen1_cl_q_empty */ static inline int hfp_gen1_cl_q_empty(psmi_hal_cl_idx head_idx, psmi_hal_cl_q cl_q, psmi_hal_hw_context ctxt) { if (!get_psm_gen1_hi()->hfp_private.dma_rtail) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; psm_hal_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; int seq = hfi_hdrget_seq(pcl_q->hdr_qe.hdrq_base_addr + (head_idx + get_psm_gen1_hi()->hfp_private.hdrq_rhf_off)); return (*pcl_q->hdr_qe.p_rx_hdrq_rhf_seq != seq); } return (head_idx == hfp_gen1_get_cl_q_tail_index(cl_q, ctxt)); } static inline int hfp_gen1_get_rhf(psmi_hal_cl_idx idx, psmi_hal_raw_rhf_t *rhfp, psmi_hal_cl_q cl_q, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; psm_hal_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; uint32_t *pu32 = (pcl_q->hdr_qe.hdrq_base_addr + (idx + get_psm_gen1_hi()->hfp_private.hdrq_rhf_off)); *rhfp = *((psmi_hal_raw_rhf_t*)pu32); return PSM_HAL_ERROR_OK; } static inline int hfp_gen1_get_ips_message_hdr(psmi_hal_cl_idx idx, psmi_hal_raw_rhf_t rhf, struct ips_message_header **imhp, psmi_hal_cl_q cl_q, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; psm_hal_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; uint32_t *pu32 = pcl_q->hdr_qe.hdrq_base_addr + (idx + hfi_hdrget_hdrq_offset((uint32_t *)&rhf)); *imhp = (struct ips_message_header*)pu32; return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_get_receive_event(psmi_hal_cl_idx head_idx, psmi_hal_hw_context ctxt, struct ips_recvhdrq_event *rcv_ev) { int rv; if_pf ((rv=hfp_gen1_get_rhf(head_idx, &rcv_ev->psm_hal_rhf.raw_rhf, rcv_ev->psm_hal_hdr_q, ctxt)) != PSM_HAL_ERROR_OK) return rv; /* here, we turn off the TFSEQ err bit if set: */ rcv_ev->psm_hal_rhf.decomposed_rhf = rcv_ev->psm_hal_rhf.raw_rhf & (~(PSMI_HAL_RHF_ERR_MASK_64(TFSEQ))); /* Now, get the lrh: */ if_pf ((rv=hfp_gen1_get_ips_message_hdr(head_idx, rcv_ev->psm_hal_rhf.raw_rhf, &rcv_ev->p_hdr, rcv_ev->psm_hal_hdr_q, ctxt)) != PSM_HAL_ERROR_OK) return rv; /* If the hdrq_head is before cachedlastscan, that means that we have * already prescanned this for BECNs and FECNs, so we should not check * again */ if_pt((rcv_ev->proto->flags & IPS_PROTO_FLAG_CCA) && (head_idx >= rcv_ev->recvq->state->hdrq_cachedlastscan)) { /* IBTA CCA handling: * If FECN bit set handle IBTA CCA protocol. For the * flow that suffered congestion we flag it to generate * a control packet with the BECN bit set - This is * currently an unsolicited ACK. * * For all MQ packets the FECN processing/BECN * generation is done in the is_expected_or_nak * function as each eager packet is inspected there. * * For TIDFLOW/Expected data transfers the FECN * bit/BECN generation is done in protoexp_data. 
Since * header suppression can result in even FECN packets * being suppressed the expected protocol generated * additional BECN packets if a "large" number of * generations are swapped without progress being made * for receive. "Large" is set empirically to 4. * * FECN packets are ignored for all control messages * (except ACKs and NAKs) since they indicate * congestion on the control path which is not rate * controlled. The CCA specification allows FECN on * ACKs to be disregarded as well. */ rcv_ev->is_congested = _is_cca_fecn_set(rcv_ev-> p_hdr) & IPS_RECV_EVENT_FECN; rcv_ev->is_congested |= (_is_cca_becn_set(rcv_ev->p_hdr) << (IPS_RECV_EVENT_BECN - 1)); } else rcv_ev->is_congested = 0; return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE void *hfp_gen1_get_egr_buff(psmi_hal_cl_idx idx, psmi_hal_cl_q cl_q, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; psm_hal_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; return pcl_q->egr_buffs[idx]; } static PSMI_HAL_INLINE int hfp_gen1_retire_hdr_q_entry(psmi_hal_cl_idx *idx, psmi_hal_cl_q cl_q, psmi_hal_hw_context ctxt, uint32_t elemsz, uint32_t elemlast, int *emptyp) { psmi_hal_cl_idx tmp = *idx + elemsz; hfp_gen1_pc_private *psm_hw_ctxt = ctxt; psm_hal_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; if (!get_psm_gen1_hi()->hfp_private.dma_rtail) { (*pcl_q->hdr_qe.p_rx_hdrq_rhf_seq)++; if (*pcl_q->hdr_qe.p_rx_hdrq_rhf_seq > LAST_RHF_SEQNO) *pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = 1; } if_pf(tmp > elemlast) tmp = 0; *emptyp = hfp_gen1_cl_q_empty(tmp, cl_q, ctxt); *idx = tmp; return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_get_rhf_expected_sequence_number(unsigned int *pseqnum, psmi_hal_cl_q cl_q, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; psm_hal_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; *pseqnum = *pcl_q->hdr_qe.p_rx_hdrq_rhf_seq; return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_set_rhf_expected_sequence_number(unsigned int seqnum, psmi_hal_cl_q cl_q, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; psm_hal_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; *pcl_q->hdr_qe.p_rx_hdrq_rhf_seq = seqnum; return PSM_HAL_ERROR_OK; } /* Get pbc static rate value for flow for a given message length */ PSMI_ALWAYS_INLINE( uint16_t ips_proto_pbc_static_rate(struct ips_proto *proto, struct ips_flow *flow, uint32_t msgLen)) { uint32_t rate = 0; /* The PBC rate is based on which HFI type as different media have different * mechanism for static rate control. */ switch (proto->epinfo.ep_hfi_type) { case PSMI_HFI_TYPE_OPA1: { /* * time_to_send is: * * (packet_length) [bits] / (pkt_egress_rate) [bits/sec] * ----------------------------------------------------- * fabric_clock_period == (1 / 805 * 10^6) [1/sec] * * (where pkt_egress_rate is assumed to be 100 Gbit/s.) */ uint32_t time_to_send = (8 * msgLen * 805) / (100000); rate = (time_to_send >> flow->path->pr_cca_divisor) * (flow->path->pr_active_ipd); if (rate > 65535) rate = 65535; } break; default: rate = 0; } return (uint16_t) rate; } /* This is a helper function to convert Per Buffer Control to little-endian */ PSMI_ALWAYS_INLINE( void ips_proto_pbc_to_le(struct psm_hal_pbc *pbc)) { pbc->pbc0 = __cpu_to_le32(pbc->pbc0); pbc->PbcStaticRateControlCnt = __cpu_to_le16(pbc->PbcStaticRateControlCnt); pbc->fill1 = __cpu_to_le16(pbc->fill1); } /* This is only used for SDMA cases; pbc is really a pointer to * struct ips_pbc_header * or the equivalent un-named structure * in ips_scb. 
Please note pbc will be in little-endian byte * order on return */ PSMI_ALWAYS_INLINE( void ips_proto_pbc_update(struct ips_proto *proto, struct ips_flow *flow, uint32_t isCtrlMsg, struct psm_hal_pbc *pbc, uint32_t hdrlen, uint32_t paylen)) { hfp_gen1_pc_private *psm_hw_ctxt = proto->ep->context.psm_hw_ctxt; int dw = (sizeof(struct psm_hal_pbc) + hdrlen + paylen) >> BYTE2DWORD_SHIFT; int sc = proto->sl2sc[flow->path->pr_sl]; int vl = psm_hw_ctxt->sc2vl[sc]; uint16_t static_rate = 0; if_pf(!isCtrlMsg && flow->path->pr_active_ipd) static_rate = ips_proto_pbc_static_rate(proto, flow, hdrlen + paylen); pbc->pbc0 = (dw & HFI_PBC_LENGTHDWS_MASK) | ((vl & HFI_PBC_VL_MASK) << HFI_PBC_VL_SHIFT) | (((sc >> HFI_PBC_SC4_SHIFT) & HFI_PBC_SC4_MASK) << HFI_PBC_DCINFO_SHIFT); pbc->PbcStaticRateControlCnt = static_rate & HFI_PBC_STATICRCC_MASK; /* Per Buffer Control must be in little-endian */ ips_proto_pbc_to_le(pbc); return; } static PSMI_HAL_INLINE int hfp_gen1_check_rhf_sequence_number(unsigned int seqno) { return (seqno <= LAST_RHF_SEQNO) ? PSM_HAL_ERROR_OK : PSM_HAL_ERROR_GENERAL_ERROR; } static PSMI_HAL_INLINE int hfp_gen1_set_pbc(struct ips_proto *proto, struct ips_flow *flow, uint32_t isCtrlMsg, struct psm_hal_pbc *dest, uint32_t hdrlen, uint32_t paylen) { ips_proto_pbc_update(proto, flow, isCtrlMsg, dest, hdrlen, paylen); return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_tidflow_set_entry(uint32_t flowid, uint32_t genval, uint32_t seqnum, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; hfi_tidflow_set_entry(ctrl, flowid, genval, seqnum); return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_tidflow_reset(psmi_hal_hw_context ctxt, uint32_t flowid, uint32_t genval, uint32_t seqnum) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; hfi_tidflow_reset(ctrl, flowid, genval, seqnum); return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_tidflow_get(uint32_t flowid, uint64_t *ptf, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; *ptf = hfi_tidflow_get(ctrl, flowid); return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_hw(uint32_t flowid, uint64_t *ptf, psmi_hal_hw_context ctxt) { return hfp_gen1_tidflow_get(flowid, ptf, ctxt); } static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_seqnum(uint64_t val, uint32_t *pseqn) { *pseqn = hfi_tidflow_get_seqnum(val); return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_genval(uint64_t val, uint32_t *pgv) { *pgv = hfi_tidflow_get_genval(val); return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_tidflow_check_update_pkt_seq(void *vpprotoexp /* actually a: struct ips_protoexp *protoexp */, psmi_seqnum_t sequence_num, void *vptidrecvc /* actually a: struct ips_tid_recv_desc *tidrecvc */, struct ips_message_header *p_hdr, void (*ips_protoexp_do_tf_generr) (void *vpprotoexp /* actually a: struct ips_protoexp *protoexp */, void *vptidrecvc /* actually a: struct ips_tid_recv_desc *tidrecvc */, struct ips_message_header *p_hdr), void (*ips_protoexp_do_tf_seqerr) (void *vpprotoexp /* actually a: struct ips_protoexp *protoexp */, void *vptidrecvc /* actually a: struct ips_tid_recv_desc *tidrecvc */, struct ips_message_header *p_hdr) ) { struct ips_protoexp *protoexp = (struct ips_protoexp *) vpprotoexp; struct ips_tid_recv_desc *tidrecvc = (struct ips_tid_recv_desc *) vptidrecvc; if_pf(psmi_hal_has_sw_status(PSM_HAL_HDRSUPP_ENABLED))
{ /* Drop packet if generation number does not match. There * is a window that before we program the hardware tidflow * table with new gen/seq, hardware might receive some * packets with the old generation. */ if (sequence_num.psn_gen != tidrecvc->tidflow_genseq.psn_gen) { PSM2_LOG_MSG("leaving"); return PSM_HAL_ERROR_GENERAL_ERROR; } #ifdef PSM_DEBUG /* Check if new packet falls into expected seq range, we need * to deal with wrap around of the seq value from 2047 to 0 * because seq is only 11 bits. */ int16_t seq_off = (int16_t)(sequence_num.psn_seq - tidrecvc->tidflow_genseq.psn_seq); if (seq_off < 0) seq_off += 2048; /* seq is 11 bits */ psmi_assert(seq_off < 1024); #endif /* NOTE: with RSM in use, we should not automatically update * our PSN from the HFI's PSN. The HFI doesn't know about * RSM interceptions. */ /* (DON'T!) Update the shadow tidflow_genseq */ /* tidrecvc->tidflow_genseq.psn_seq = sequence_num.psn_seq + 1; */ } /* Always check the sequence number if we get a header, even if SH. */ if_pt(sequence_num.psn_num == tidrecvc->tidflow_genseq.psn_num) { /* Update the shadow tidflow_genseq */ tidrecvc->tidflow_genseq.psn_seq = sequence_num.psn_seq + 1; /* update the fake tidflow table with new seq, this is for * seqerr and err_chk_gen processing to get the latest * valid sequence number */ hfp_gen1_tidflow_set_entry( tidrecvc->rdescid._desc_idx, tidrecvc->tidflow_genseq.psn_gen, tidrecvc->tidflow_genseq.psn_seq, tidrecvc->context->psm_hw_ctxt); } else { /* Generation mismatch */ if (sequence_num.psn_gen != tidrecvc->tidflow_genseq.psn_gen) { ips_protoexp_do_tf_generr(protoexp, tidrecvc, p_hdr); PSM2_LOG_MSG("leaving"); return PSM_HAL_ERROR_GENERAL_ERROR; } else { /* Possible sequence mismatch error */ /* First, check if this is a recoverable SeqErr - * caused by a good packet arriving in a tidflow that * has had a FECN bit set on some earlier packet. */ /* If this is the first RSM packet, our own PSN state * is probably old. Pull from the HFI if it has * newer data. */ uint64_t tf; psmi_seqnum_t tf_sequence_num; hfp_gen1_tidflow_get(tidrecvc->rdescid._desc_idx, &tf, tidrecvc->context->psm_hw_ctxt); hfp_gen1_tidflow_get_seqnum(tf, &tf_sequence_num.psn_val); if (tf_sequence_num.psn_val > tidrecvc->tidflow_genseq.psn_seq) tidrecvc->tidflow_genseq.psn_seq = tf_sequence_num.psn_seq; /* Now re-check the sequence numbers. */ if (sequence_num.psn_seq > tidrecvc->tidflow_genseq.psn_seq) { /* It really was a sequence error. Restart. */ ips_protoexp_do_tf_seqerr(protoexp, tidrecvc, p_hdr); PSM2_LOG_MSG("leaving"); return PSM_HAL_ERROR_GENERAL_ERROR; } else { /* False SeqErr. We can accept this packet. 
*/ if (sequence_num.psn_seq == tidrecvc->tidflow_genseq.psn_seq) tidrecvc->tidflow_genseq.psn_seq++; } } } return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_flowvalid(uint64_t val, uint32_t *pfv) { *pfv = hfi_tidflow_get_flowvalid(val); return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_enabled(uint64_t val, uint32_t *penabled) { *penabled = hfi_tidflow_get_enabled(val); return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_keep_after_seqerr(uint64_t val, uint32_t *pkase) { *pkase = hfi_tidflow_get_keep_after_seqerr(val); return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_keep_on_generr(uint64_t val, uint32_t *pkoge) { *pkoge = hfi_tidflow_get_keep_on_generr(val); return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_keep_payload_on_generr(uint64_t val, uint32_t *pkpoge) { *pkpoge = hfi_tidflow_get_keep_payload_on_generr(val); return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_seqmismatch(uint64_t val, uint32_t *psmm) { *psmm = hfi_tidflow_get_seqmismatch(val); return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_tidflow_get_genmismatch(uint64_t val, uint32_t *pgmm) { *pgmm = hfi_tidflow_get_genmismatch(val); return PSM_HAL_ERROR_OK; } static inline int hfp_gen1_write_header_to_subcontext(struct ips_message_header *pimh, psmi_hal_cl_idx idx, psmi_hal_raw_rhf_t rhf, psmi_hal_cl_q cl_q, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; psm_hal_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; uint32_t *pu32 = pcl_q->hdr_qe.hdrq_base_addr + (idx + hfi_hdrget_hdrq_offset((uint32_t *)&rhf)); struct ips_message_header *piph_dest = (struct ips_message_header *)pu32; *piph_dest = *pimh; return PSM_HAL_ERROR_OK; } static inline void writehdrq_write_rhf_atomic(uint64_t *rhf_dest, uint64_t rhf_src) { /* * In 64-bit mode, we check in init that the rhf will always be 8-byte * aligned */ *rhf_dest = rhf_src; return; } static inline int hfp_gen1_write_rhf_to_subcontext(psmi_hal_raw_rhf_t rhf, psmi_hal_cl_idx idx, uint32_t *phdrq_rhf_seq, psmi_hal_cl_q cl_q, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; psm_hal_gen1_cl_q_t *pcl_q = &psm_hw_ctxt->cl_qs[cl_q]; if (!get_psm_gen1_hi()->hfp_private.dma_rtail) { uint32_t rhf_seq = *phdrq_rhf_seq; hfi_hdrset_seq((uint32_t *) &rhf, rhf_seq); rhf_seq++; if (rhf_seq > LAST_RHF_SEQNO) rhf_seq = 1; *phdrq_rhf_seq = rhf_seq; } /* Now write the new rhf */ writehdrq_write_rhf_atomic((uint64_t*)(pcl_q->hdr_qe.hdrq_base_addr + (idx + get_psm_gen1_hi()->hfp_private.hdrq_rhf_off)), rhf); return PSM_HAL_ERROR_OK; } static PSMI_HAL_INLINE int hfp_gen1_subcontext_ureg_get(ptl_t *ptl_gen, struct ips_subcontext_ureg **uregp, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; int i; struct ptl_ips *ptl = (struct ptl_ips *) ptl_gen; ptl->recvshc->hwcontext_ctrl = psm_hw_ctxt->hwcontext_ctrl; for (i=0;i < psm_hw_ctxt->user_info.subctxt_cnt; i++) uregp[i] = psm_hw_ctxt->subcontext_ureg[i]; return PSM_HAL_ERROR_OK; } static inline int ips_write_eager_packet(struct ips_writehdrq *writeq, struct ips_recvhdrq_event *rcv_ev, psmi_hal_cl_idx write_hdr_tail, uint32_t subcontext, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; psmi_hal_cl_idx write_egr_tail; write_egr_tail = hfp_gen1_get_cl_q_tail_index( PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(subcontext), ctxt); uint32_t next_write_egr_tail = write_egr_tail; /* checksum is trimmed 
from paylen, we need to add back */ uint32_t rcv_paylen = ips_recvhdrq_event_paylen(rcv_ev) + (rcv_ev->has_cksum ? PSM_CRC_SIZE_IN_BYTES : 0); psmi_assert(rcv_paylen > 0); uint32_t egr_elemcnt = ctrl->ctxt_info.egrtids; uint32_t egr_elemsz = ctrl->ctxt_info.rcvegr_size; /* Loop as long as the write eager queue is NOT full */ while (1) { next_write_egr_tail++; if (next_write_egr_tail >= egr_elemcnt) next_write_egr_tail = 0; psmi_hal_cl_idx egr_head; egr_head = hfp_gen1_get_cl_q_head_index( PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(subcontext), ctxt); if (next_write_egr_tail == egr_head) { break; } /* Move to next eager entry if leftover is not enough */ if ((writeq->state->egrq_offset + rcv_paylen) > egr_elemsz) { writeq->state->egrq_offset = 0; write_egr_tail = next_write_egr_tail; /* Update the eager buffer tail pointer */ hfp_gen1_set_cl_q_tail_index(write_egr_tail, PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(subcontext), ctxt); } else { /* There is enough space in this entry! */ /* Use pre-calculated address from look-up table */ char *write_payload = hfp_gen1_get_egr_buff(write_egr_tail, PSM_HAL_GET_SC_CL_Q_RX_EGR_Q(subcontext), ctxt)+ writeq->state->egrq_offset; const char *rcv_payload = ips_recvhdrq_event_payload(rcv_ev); psmi_assert(write_payload != NULL); psmi_assert(rcv_payload != NULL); psmi_mq_mtucpy(write_payload, rcv_payload, rcv_paylen); /* Fix up the rhf with the subcontext's eager index/offset */ hfi_hdrset_egrbfr_index((uint32_t*)(&rcv_ev->psm_hal_rhf.raw_rhf),write_egr_tail); hfi_hdrset_egrbfr_offset((uint32_t *)(&rcv_ev->psm_hal_rhf.raw_rhf), (writeq->state-> egrq_offset >> 6)); /* Copy the header to the subcontext's header queue */ hfp_gen1_write_header_to_subcontext(rcv_ev->p_hdr, write_hdr_tail, rcv_ev->psm_hal_rhf.raw_rhf, PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(subcontext), ctxt); /* Update offset to next 64B boundary */ writeq->state->egrq_offset = (writeq->state->egrq_offset + rcv_paylen + 63) & (~63); return IPS_RECVHDRQ_CONTINUE; } } /* At this point, the eager queue is full -- drop the packet. 
*/ /* Copy the header to the subcontext's header queue */ /* Mark header with ETIDERR (eager overflow) */ hfi_hdrset_err_flags((uint32_t*) (&rcv_ev->psm_hal_rhf.raw_rhf), HFI_RHF_TIDERR); /* Clear UseEgrBfr bit because payload is dropped */ hfi_hdrset_use_egrbfr((uint32_t *)(&rcv_ev->psm_hal_rhf.raw_rhf), 0); hfp_gen1_write_header_to_subcontext(rcv_ev->p_hdr, write_hdr_tail, rcv_ev->psm_hal_rhf.raw_rhf, PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(subcontext), ctxt); return IPS_RECVHDRQ_BREAK; } static PSMI_HAL_INLINE int hfp_gen1_forward_packet_to_subcontext(struct ips_writehdrq *writeq, struct ips_recvhdrq_event *rcv_ev, uint32_t subcontext, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; psmi_hal_cl_idx write_hdr_head; psmi_hal_cl_idx write_hdr_tail; uint32_t hdrq_elemsz = ctrl->ctxt_info.rcvhdrq_entsize >> BYTE2DWORD_SHIFT; psmi_hal_cl_idx next_write_hdr_tail; int result = IPS_RECVHDRQ_CONTINUE; /* Drop packet if write header queue is disabled */ if_pf (!writeq->state->enabled) { return IPS_RECVHDRQ_BREAK; } write_hdr_head = hfp_gen1_get_cl_q_head_index( PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(subcontext), ctxt); write_hdr_tail = hfp_gen1_get_cl_q_tail_index( PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(subcontext), ctxt); /* Drop packet if write header queue is full */ next_write_hdr_tail = write_hdr_tail + hdrq_elemsz; if (next_write_hdr_tail > writeq->hdrq_elemlast) { next_write_hdr_tail = 0; } if (next_write_hdr_tail == write_hdr_head) { return IPS_RECVHDRQ_BREAK; } if (psmi_hal_rhf_get_use_egr_buff(rcv_ev->psm_hal_rhf)) { result = ips_write_eager_packet(writeq, rcv_ev, write_hdr_tail, subcontext, ctxt); } else { /* Copy the header to the subcontext's header queue */ hfp_gen1_write_header_to_subcontext(rcv_ev->p_hdr, write_hdr_tail, rcv_ev->psm_hal_rhf.raw_rhf, PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(subcontext), ctxt); } /* Ensure previous writes are visible before writing rhf seq or tail */ ips_wmb(); /* The following func call may modify the hdrq_rhf_seq */ hfp_gen1_write_rhf_to_subcontext(rcv_ev->psm_hal_rhf.raw_rhf, write_hdr_tail, &writeq->state->hdrq_rhf_seq, PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(subcontext), ctxt); /* The tail must be updated regardless of PSM_HAL_CAP_DMA_RTAIL * since this tail is also used to keep track of where * ips_writehdrq_append will write to next. For subcontexts there is * no separate shadow copy of the tail. 
*/ hfp_gen1_set_cl_q_tail_index(next_write_hdr_tail, PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(subcontext), ctxt); return result; } static PSMI_HAL_INLINE int hfp_gen1_set_pio_size(uint32_t pio_size, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; ctrl->__hfi_piosize = pio_size; return 0; } static PSMI_HAL_INLINE int hfp_gen1_set_effective_mtu(uint32_t eff_mtu, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; ctrl->__hfi_mtusize = eff_mtu; return 0; } static PSMI_HAL_INLINE int hfp_gen1_set_tf_valid(uint32_t tf_valid, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; ctrl->__hfi_tfvalid = tf_valid; return 0; } static PSMI_HAL_INLINE int hfp_gen1_get_default_pkey(void) { return HFI_DEFAULT_P_KEY; } #include "psm_hal_gen1_spio.c" static PSMI_HAL_INLINE int hfp_gen1_spio_init(const psmi_context_t *context, struct ptl *ptl, void **ctrl) { hfp_gen1_pc_private *psm_hw_ctxt = context->psm_hw_ctxt; #ifdef PSM_AVX512 union psmi_envvar_val env_enable_avx512; psmi_getenv("PSM2_AVX512", "Enable (set envvar to 1) AVX512 code in PSM (Enabled by default)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)1, &env_enable_avx512); int is_avx512_enabled = env_enable_avx512.e_int; int rc = ips_spio_init(context,ptl, &psm_hw_ctxt->spio_ctrl, is_avx512_enabled); #else int rc = ips_spio_init(context,ptl, &psm_hw_ctxt->spio_ctrl); #endif if (rc >= 0) { *ctrl = &psm_hw_ctxt->spio_ctrl; } return rc; } static PSMI_HAL_INLINE int hfp_gen1_spio_fini(void **ctrl, psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; int rc = ips_spio_fini(&psm_hw_ctxt->spio_ctrl); if (!rc) *ctrl = NULL; return rc; } static PSMI_HAL_INLINE int hfp_gen1_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, struct psm_hal_pbc *pbc, uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum, psmi_hal_hw_context ctxt #ifdef PSM_CUDA , uint32_t is_cuda_payload #endif ) { return ips_spio_transfer_frame(proto, flow, pbc, payload, length, isCtrlMsg, cksum_valid, cksum #ifdef PSM_CUDA , is_cuda_payload #endif ); } static PSMI_HAL_INLINE int hfp_gen1_spio_process_events(const struct ptl *ptl) { return ips_spio_process_events(ptl); } static PSMI_HAL_INLINE int hfp_gen1_get_node_id(int unit, int *nodep) { int64_t node_id = hfi_sysfs_unit_read_node_s64(unit); *nodep = (int)node_id; if (node_id != -1) return PSM_HAL_ERROR_OK; else return -PSM_HAL_ERROR_GENERAL_ERROR; } static PSMI_HAL_INLINE int hfp_gen1_get_bthqp(psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; return ctrl->base_info.bthqp; } static PSMI_HAL_INLINE int hfp_gen1_get_context(psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; return ctrl->ctxt_info.ctxt; } static PSMI_HAL_INLINE uint64_t hfp_gen1_get_gid_lo(psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; uint64_t gid_lo, gid_hi; if (hfi_get_port_gid(ctrl->__hfi_unit, ctrl->__hfi_port, &gid_hi, &gid_lo) == -1) { psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, "Can't get HFI GID in psm2_ep_open: is SMA running?"); } return gid_lo; } static PSMI_HAL_INLINE uint64_t hfp_gen1_get_gid_hi(psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = 
psm_hw_ctxt->ctrl; uint64_t gid_lo, gid_hi; if (hfi_get_port_gid(ctrl->__hfi_unit, ctrl->__hfi_port, &gid_hi, &gid_lo) == -1) { psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, "Can't get HFI GID in psm2_ep_open: is SMA running?"); } return gid_hi; } static PSMI_HAL_INLINE int hfp_gen1_get_hfi_type(psmi_hal_hw_context ctxt) { return PSM_HAL_INSTANCE_GEN1; } static PSMI_HAL_INLINE int hfp_gen1_get_jkey(psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; return ctrl->base_info.jkey; } static PSMI_HAL_INLINE int hfp_gen1_get_lid(psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; int lid; if ((lid = hfi_get_port_lid(ctrl->__hfi_unit, ctrl->__hfi_port)) <= 0) { psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, "Can't get HFI LID in psm2_ep_open: is SMA running?"); } return lid; } static PSMI_HAL_INLINE int hfp_gen1_get_pio_size(psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; return (ctrl->ctxt_info.credits / 2) * 64 - (sizeof(struct ips_message_header) + HFI_PCB_SIZE_IN_BYTES); } static PSMI_HAL_INLINE int hfp_gen1_get_port_num(psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; return ctrl->__hfi_port; } static PSMI_HAL_INLINE int hfp_gen1_get_rx_egr_tid_cnt(psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; return ctrl->ctxt_info.egrtids; } static PSMI_HAL_INLINE int hfp_gen1_get_rx_hdr_q_cnt(psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; return ctrl->ctxt_info.rcvhdrq_cnt; } static PSMI_HAL_INLINE int hfp_gen1_get_rx_hdr_q_ent_size(psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; return ctrl->ctxt_info.rcvhdrq_entsize; } static PSMI_HAL_INLINE int hfp_gen1_get_sdma_req_size(psmi_hal_hw_context ctxt) { return get_psm_gen1_hi()->hfp_private.sdmahdr_req_size; } static PSMI_HAL_INLINE int hfp_gen1_get_sdma_ring_size(psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; return ctrl->ctxt_info.sdma_ring_size; } static PSMI_HAL_INLINE int hfp_gen1_get_subctxt(psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; return ctrl->ctxt_info.subctxt; } static PSMI_HAL_INLINE int hfp_gen1_get_subctxt_cnt(psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; return psm_hw_ctxt->user_info.subctxt_cnt; } static PSMI_HAL_INLINE int hfp_gen1_get_tid_exp_cnt(psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; return ctrl->__hfi_tidexpcnt; } static PSMI_HAL_INLINE int hfp_gen1_get_unit_id(psmi_hal_hw_context ctxt) { hfp_gen1_pc_private *psm_hw_ctxt = ctxt; struct _hfi_ctrl *ctrl = psm_hw_ctxt->ctrl; return ctrl->__hfi_unit; } static PSMI_HAL_INLINE int hfp_gen1_get_fd(psmi_hal_hw_context ctxt) { if (!ctxt) return -1; hfp_gen1_pc_private *psm_hw_ctxt = ctxt; return psm_hw_ctxt->ctrl->fd; } static PSMI_HAL_INLINE int hfp_gen1_get_pio_stall_cnt(psmi_hal_hw_context ctxt, uint64_t **pio_stall_cnt) { if (!ctxt) return -PSM_HAL_ERROR_GENERAL_ERROR; hfp_gen1_pc_private *psm_hw_ctxt = ctxt; *pio_stall_cnt = &psm_hw_ctxt->spio_ctrl.spio_num_stall_total; return PSM_HAL_ERROR_OK; } 
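/*
 * Editor's illustrative sketch (not part of the original source): every
 * getter above follows the same pattern -- decode the opaque
 * psmi_hal_hw_context into the gen1-private control structure, then read a
 * single field.  A HAL consumer might snapshot several of these values once
 * at endpoint-open time, as below; hal_link_params_t and
 * snapshot_link_params() are hypothetical names used only for illustration.
 */
typedef struct {
	int unit;		/* HFI unit number */
	int port;		/* port number on the unit */
	int lid;		/* fabric LID for the port */
	int jkey;		/* job key from the base info */
	int pio_size;		/* usable PIO buffer size */
	int rx_hdr_q_cnt;	/* receive header queue entry count */
} hal_link_params_t;

static inline void snapshot_link_params(psmi_hal_hw_context ctxt,
					hal_link_params_t *p)
{
	p->unit = hfp_gen1_get_unit_id(ctxt);
	p->port = hfp_gen1_get_port_num(ctxt);
	p->lid = hfp_gen1_get_lid(ctxt);
	p->jkey = hfp_gen1_get_jkey(ctxt);
	p->pio_size = hfp_gen1_get_pio_size(ctxt);
	p->rx_hdr_q_cnt = hfp_gen1_get_rx_hdr_q_cnt(ctxt);
}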
opa-psm2-PSM2_11.2.185/psm_help.h000066400000000000000000000140611370564314600162030ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/ #ifndef _PSMI_HELP_H #define _PSMI_HELP_H #include "psm_log.h" /* XXX gcc only */ #define PSMI_INLINE(FN) \ static inline FN #define PSMI_ALWAYS_INLINE(FN) \ static __inline__ FN __attribute__((always_inline)); \ static __inline__ FN #define PSMI_NEVER_INLINE(FN) \ static FN __attribute__((noinline)); \ static FN #define _PPragma(x) _Pragma(x) #define STRINGIFY(s) _STRINGIFY(s) #define _STRINGIFY(s) #s #define PSMI_CURLOC __FILE__ ":" STRINGIFY(__LINE__) #define psmi_assert_always_loc(x, curloc) \ do { \ if_pf(!(x)) { \ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ "Assertion failure at %s: %s", curloc, \ STRINGIFY(x)); \ } } while (0) #define psmi_assert_always(x) psmi_assert_always_loc(x, PSMI_CURLOC) #ifdef PSM_DEBUG # define psmi_assert(x) psmi_assert_always(x) # define PSMI_ASSERT_INITIALIZED() psmi_assert_always(psmi_isinitialized()) #else # define psmi_assert(x) # define PSMI_ASSERT_INITIALIZED() #endif #define _PSMI_API_NAME(FN) __ ## FN #define _PSMI_API_STR(FN) _STRINGIFY(__ ## FN) #define PSMI_API_DECL(FN) \ typeof(_PSMI_API_NAME(FN)) FN __attribute__((weak, alias(_PSMI_API_STR(FN)))); #define PSMI_ERR_UNLESS_INITIALIZED(ep) \ do { \ if (!psmi_isinitialized()) { \ PSM2_LOG_MSG("leaving"); \ return psmi_handle_error(ep, PSM2_INIT_NOT_INIT, \ "PSM2 has not been initialized"); \ } \ } while (0) #define PSMI_CHECKMEM(err, mem) \ do { \ if ((mem) == NULL) { \ (err) = PSM2_NO_MEMORY; \ goto fail; \ } \ } while (0) #define PSMI_CACHEALIGN __attribute__((aligned(64))) /* Easy way to ignore the OK_NO_PROGRESS case */ PSMI_ALWAYS_INLINE(psm2_error_t psmi_err_only(psm2_error_t err)) { if (err > PSM2_OK_NO_PROGRESS) return err; else return PSM2_OK; } #ifdef min #undef min #endif #define min(a, b) ((a) < (b) ? (a) : (b)) #ifdef max #undef max #endif #define max(a, b) ((a) > (b) ? (a) : (b)) #define SEC_ULL 1000000000ULL #define MSEC_ULL 1000000ULL #define USEC_ULL 1000ULL #define NSEC_ULL 1ULL #define PSMI_TRUE 1 #define PSMI_FALSE 0 #define PSMI_CYCLES_TO_SECSF(cycles) \ ((double) cycles_to_nanosecs(cycles) / 1.0e9) #define PSMI_PAGESIZE psmi_getpagesize() #define PSMI_POWEROFTWO(P) (((P)&((P)-1)) == 0) #define PSMI_ALIGNDOWN(p, P) (((uintptr_t)(p))&~((uintptr_t)((P)-1))) #define PSMI_ALIGNUP(p, P) (PSMI_ALIGNDOWN((uintptr_t)(p)+((uintptr_t)((P)-1)), (P))) #define PSMI_MAKE_DRIVER_VERSION(major, minor) ((major)<<16 | ((minor) & 0xffff)) #ifdef PSM_DEBUG /* The intent of the following two macros is to emit an internal error if a size of a 'member' is not as expected, violating an assumption in the code. There are some problems with the implementation of this code: The first macro creates a static const variable with ABSOLUTELY NO references to them. For example there are ABSOLUTELY NO uses of the second macro in the PSM code. This is not completely pure. GCC version 5, for example, emits a warning for defining a static const when it is not referenced. A better implementation of the intent of this code is to use static_assert() so that at compile time the violations can be caught and corrected - not at run time. 
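For illustration, a compile-time variant along the lines suggested above
might look like the following (an editor's sketch only, not part of the
original source; PSMI_STRICT_SIZE_ASSERT is a hypothetical name and C11
static_assert() support is assumed):

    #define PSMI_STRICT_SIZE_ASSERT(type, sz) \
        static_assert(sizeof(type) == (sz), \
                      "size of " #type " doesn't match the expected size")

With such a macro, a violated size assumption fails the build instead of
calling exit(-1) at run time.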
*/ #define PSMI_STRICT_SIZE_DECL(member, sz) static const size_t __psm2_ss_ ## member = sz #define PSMI_STRICT_SIZE_VERIFY(member, sz) \ do { \ if (__psm2_ss_ ## member != (sz)) { \ char errmsg[64]; \ snprintf(errmsg, sizeof(errmsg), "Internal error: %s " \ "size doesn't match expected %d bytes", \ STRINGIFY(member), (int) __psm2_ss_ ## member); \ fprintf(stderr, "%s\n", errmsg); \ exit(-1); \ } \ } while (0) #else #define PSMI_STRICT_SIZE_DECL(member, sz) /* nothing */ #define PSMI_STRICT_SIZE_VERIFY(member, sz) /* nothing */ #endif /* PSM_DEBUG */ #endif /* _PSMI_HELP_H */ opa-psm2-PSM2_11.2.185/psm_lock.h000066400000000000000000000157501370564314600162110ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved.
*/ #ifndef _PSMI_IN_USER_H #error psm_lock.h not meant to be included directly, include psm_user.h instead #endif #ifndef _PSMI_LOCK_H #define _PSMI_LOCK_H #ifndef PSMI_USE_PTHREAD_SPINLOCKS #define PSMI_USE_PTHREAD_SPINLOCKS 0 #endif #if PSMI_USE_PTHREAD_SPINLOCKS typedef pthread_spinlock_t psmi_spinlock_t; #define psmi_spin_init(lock) pthread_spin_init(lock, \ PTHREAD_PROCESS_PRIVATE) #define psmi_spin_destroy(lock) pthread_spin_destroy(lock) #define psmi_spin_lock(lock) pthread_spin_lock(lock) #define psmi_spin_trylock(lock) pthread_spin_trylock(lock) #define psmi_spin_unlock(lock) pthread_spin_unlock(lock) #else typedef ips_atomic_t psmi_spinlock_t; #define PSMI_SPIN_INVALID 2 #define PSMI_SPIN_LOCKED 1 #define PSMI_SPIN_UNLOCKED 0 #endif /* psmi_lock_t structure */ typedef struct { #ifdef PSMI_LOCK_IS_SPINLOCK psmi_spinlock_t lock; #elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG) pthread_mutex_t lock; pthread_t lock_owner; #elif defined(PSMI_LOCK_IS_MUTEXLOCK) pthread_mutex_t lock; #endif } psmi_lock_t; #if PSMI_USE_PTHREAD_SPINLOCKS #else PSMI_ALWAYS_INLINE(int psmi_spin_init(psmi_spinlock_t *lock)) { ips_atomic_set(lock, PSMI_SPIN_UNLOCKED); return 0; } PSMI_ALWAYS_INLINE(int psmi_spin_trylock(psmi_spinlock_t *lock)) { if (ips_atomic_cmpxchg(lock, PSMI_SPIN_UNLOCKED, PSMI_SPIN_LOCKED) == PSMI_SPIN_UNLOCKED) { return 0; } return EBUSY; } PSMI_ALWAYS_INLINE(int psmi_spin_destroy(psmi_spinlock_t *lock)) { if (lock == NULL) { return EINVAL; } /* We could just do psmi_spin_trylock() here and dispense with the invalid state */ if (ips_atomic_cmpxchg(lock, PSMI_SPIN_UNLOCKED, PSMI_SPIN_INVALID) == PSMI_SPIN_UNLOCKED) { return 0; } return EBUSY; } PSMI_ALWAYS_INLINE(int psmi_spin_lock(psmi_spinlock_t *lock)) { while (psmi_spin_trylock(lock) == EBUSY) { } return 0; } PSMI_ALWAYS_INLINE(int psmi_spin_unlock(psmi_spinlock_t *lock)) { ips_atomic_set(lock, PSMI_SPIN_UNLOCKED); return 0; } #endif /* PSMI_USE_PTHREAD_SPINLOCKS */ PSMI_ALWAYS_INLINE(void psmi_init_lock(psmi_lock_t *lock)) { #ifdef PSMI_LOCK_IS_SPINLOCK psmi_spin_init(&(lock->lock)); #elif defined(PSMI_LOCK_IS_MUTEXLOCK) pthread_mutex_init(&(lock->lock), NULL); #elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG) pthread_mutexattr_t attr; pthread_mutexattr_init(&attr); pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK_NP); pthread_mutex_init(&(lock->lock), &attr); pthread_mutexattr_destroy(&attr); lock->lock_owner = PSMI_LOCK_NO_OWNER; #endif } PSMI_ALWAYS_INLINE(void psmi_destroy_lock(psmi_lock_t *lock)) { int err; #ifdef PSMI_LOCK_IS_SPINLOCK /* This will map to either pthread_spin_destroy() or our custom psmi_spin_destroy(). * Both their return values can be interpreted by strerror(). */ if ((err = psmi_spin_destroy(&(lock->lock))) != 0) { _HFI_VDBG("Destroying spinlock failed: %s\n", strerror(err)); } /* The same path for both the regular mutex and the debugging mutex */ #elif defined(PSMI_LOCK_IS_MUTEXLOCK) || defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG) if ((err = pthread_mutex_destroy(&(lock->lock))) != 0) { /* strerror_r() may be a better choice here but it is tricky * to reliably detect the XSI vs GNU version, and if hardcoded, * may be inadvertently changed when tampering with headers/makefiles * in the long run. * * This would result in incorrect operation: a segfault from * dereferencing the return value or failure to retrieve the * error string. * * The C11's strerror_s may be an option here too.
*/ _HFI_VDBG("Destroying mutex failed: %s\n", strerror(err)); } #endif } PSMI_ALWAYS_INLINE(int psmi_sem_post(sem_t *sem, const char *name)) { if (sem_post(sem) == -1) { _HFI_VDBG("Semaphore %s: post failed\n", name ? name : "NULL" ); return -1; } _HFI_VDBG("Semaphore %s: post succeeded\n", name ? name : "NULL"); return 0; } PSMI_ALWAYS_INLINE(int psmi_sem_timedwait(sem_t *sem, const char *name)) { /* Wait 5 seconds for shm read-write lock to open */ struct timespec ts; clock_gettime(CLOCK_REALTIME, &ts); ts.tv_sec += 5; if (sem_timedwait(sem, &ts) == -1) { _HFI_VDBG("Semaphore %s: Timedwait failed\n", name ? name : "NULL" ); return -1; } _HFI_VDBG("Semaphore %s: Timedwait succeeded\n", name ? name : "NULL"); return 0; } PSMI_ALWAYS_INLINE(int psmi_init_semaphore(sem_t **sem, const char *name, mode_t mode, int value)) { *sem = sem_open(name, O_CREAT | O_EXCL, mode, value); if ((*sem == SEM_FAILED) && (errno == EEXIST)) { *sem = sem_open(name, O_CREAT, mode, value); if (*sem == SEM_FAILED) { _HFI_VDBG("Cannot open semaphore %s, errno=%d\n", name, errno); return -1; } } else if (*sem == SEM_FAILED) { _HFI_VDBG("Cannot create semaphore %s, errno=%d\n", name, errno); return -1; } return 0; } #endif /* _PSMI_LOCK_H */ opa-psm2-PSM2_11.2.185/psm_log.h000066400000000000000000000260671370564314600160450ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef _PSMI_LOG_H #define _PSMI_LOG_H /* A note about PSM_LOG and PSM_LOG_FAST_IO: By default, the PSM_LOG facility is safe and slow. Log messages are written to a file under /tmp as they're generated. So, if the test case has an abnormal termination such as a segmentation fault or an abort(), the log messages will still be available. However, when debugging timing-sensitive problems, the default PSM_LOG facility is inadequate, as the timing overhead it introduces dominates and the symptoms of the problem being tested may change. When performance is important, you can use BOTH: PSM_LOG and PSM_LOG_FAST_IO. With PSM_LOG_FAST_IO, log messages are written to a memory buffer, and when the program terminates, the log messages are written to a file under /tmp. * How to use basic functionality of PSM LOG: - To use default PSM_LOG, build PSM2 with macro PSM_LOG=1 - To use PSM_LOG when performance is critical, build PSM2 with macros PSM_LOG=1 PSM_LOG_FAST_IO=1 - Insert a log message in code with a PSM2_LOG_MSG() call. Log messages follow the same format as printf(). For example: PSM2_LOG_MSG(" %u", 1); - To filter out log messages, set the environment variable PSM2_LOG_SRCH_FORMAT_STRING to a pattern built from the desired FORMAT string and the wildcard character (*). For example, PSM2_LOG_SRCH_FORMAT_STRING=* - A more detailed explanation of how to use PSM LOG can be found below. * How to get log messages with abnormal termination while using PSM LOG with PSM_LOG_FAST_IO: - Log messages are saved from a memory buffer to a file under /tmp when psmi_log_fini() is called. psmi_log_fini() is exposed to the outside world via the linker script file, so client test code can call psmi_log_fini() on a fatal error. -------------------------------------------------------------------------------- This file (psm_log.h) defines macros for logging messages to assist investigations into the psm library. By default, these macros are not defined when building psm. When not defined, the macros become no-ops in the PSM code. When enabled (by defining the PSM_LOG symbol), the macros present information to the psmi_log_message() facility for processing. See below for more information on the psmi_log_message() facility. The macros are described in the following: PSM2_LOG_MSG(FORMAT,...) Spills a printf-style message to the log. PSM2_LOG_DECLARE_BT_BUFFER() Declares a local back trace buffer for use with the PSM2_LOG_BT() macro. PSM2_LOG_BT(NFRAMES,FORMAT,...) Spills the current backtrace, if it differs from the previous backtrace spilled to the log. The psmi_log_message() facility is the backend for these messages when PSM_LOG is enabled. The psmi_log_message() facility spills messages to unique log files based on the process id and the thread id. So every unique process id and thread id will spill to a unique log file. The psmi_log_message() facility prefixes each message in the log files with a high resolution timer message so that messages from multiple threads and log files can be reconciled to one timeline. It is left as an exercise to the reader to reconcile log messages from different hosts to one timeline. The backtrace capability in the PSM_LOG functionality needs some explanation: often a bug happens only when the code is tickled from a specific call-chain. The PSM2_LOG_BT() macro supports identifying the unique call-chain when a problem occurs. The model is as follows: A unique declaration is made for a backtrace to spill the backtrace information to. This declaration should be made in the same basic block as the use of the PSM2_LOG_BT() macro.
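For instance, a hypothetical call site (the function and variable names
here are illustrative only, not taken from the PSM sources) could look
like:

    void post_send(unsigned len)
    {
        PSM2_LOG_DECLARE_BT_BUFFER();
        PSM2_LOG_BT(8, "posting send of %u bytes", len);
    }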
To make the declaration, use PSM2_LOG_DECLARE_BT_BUFFER(). When PSM_LOG is enabled, at the statement for the macro: PSM2_LOG_BT(NFRAMES,FORMAT,...), the psmi_log_message() facility generates the current backtrace, and compares the first NFRAMES of the current backtrace against the previous backtrace stored in the backtrace buffer declared with the declaration. If the two backtraces differ, the psmi_log_message() code saves the current backtrace into the declared buffer, and then spills the backtrace to the log file. At runtime, setting environment variables can squelch the log file from getting too big: PSM2_LOG_INC_FUNCTION_NAMES is a list of function name lists (abbreviated FNL) (see below) that will INClude the FNL's into the collection of functions to spill log data for. PSM2_LOG_EXC_FUNCTION_NAMES is a list of FNL's (see below) that will EXClude the FNL's from the collection of functions to spill log data for. An FNL is a 'Function Name List' that is defined by the following grammar: # A LINE1 is either a single line number or a range of line numbers: LINE1 :: lineNumber | lineNumber1 '-' lineNumber2 # LINES is a list of LINE1's separated by commas: LINES :: LINE1 | LINE1 ',' LINES # An FN is either a function name, or a function name with a list of lines: FN :: functionName | functionName ';' LINES # A FNL is a list of FN's separated by colons: FNL :: FN | FN ':' FNL # Examples: foo:bar the two functions foo and bar foo;1-10 lines 1 to 10 of function foo. bar;1,3,5 lines 1, 3 and 5 of function bar PSM2_LOG_SRCH_FORMAT_STRING If set, overrides the PSM2_LOG_INC_FUNCTION_NAMES and PSM2_LOG_EXC_FUNCTION_NAMES settings. Causes the psmi_log_message() facility to only emit the log messages that match (using fnmatch()) the message in FORMAT. */ typedef enum { PSM2_LOG_TX = 0, PSM2_LOG_RX = 1, PSM2_LOG_PEND = 2, } psmi_log_tx_rx_t; #ifdef PSM_LOG extern void psmi_log_initialize(void); /* defined in psm_utils.c */ extern void psmi_log_message(const char *fileName, const char *functionName, int lineNumber, const char *format, ...); #ifdef PSM_LOG_FAST_IO extern void psmi_log_fini(void); #else #define psmi_log_fini() /* nothing */ #endif #define PSM2_LOG_MSG(FORMAT , ...) psmi_log_message(__FILE__,__FUNCTION__,__LINE__,FORMAT, ## __VA_ARGS__) #define PSM2_LOG_BT_BUFFER_SIZE 100 #define PSM2_LOG_DECLARE_BT_BUFFER() static void * psm_log_bt_buffer[PSM2_LOG_BT_BUFFER_SIZE] #define PSM2_LOG_DECLARE_BT_BUFFER_SZ(SIZE) static void * psm_log_bt_buffer[SIZE] #define PSM2_LOG_BT_MAGIC ((const char *)-1) #define PSM2_LOG_BT(NFRAMES,FORMAT , ...) psmi_log_message(__FILE__,__FUNCTION__,__LINE__,PSM2_LOG_BT_MAGIC,psm_log_bt_buffer,NFRAMES,FORMAT, ## __VA_ARGS__) #define PSM2_LOG_EPM_MAGIC ((const char *)-2) /* EPM is short for Emit Protocol Message to the log file. OPCODE is an int, and corresponds to one of the OPCODES declared in ptl_ips/ips_proto_header.h TXRX is an int, and should be one of the above two consts (PSM2_LOG_TX, or PSM2_LOG_RX). FROMEPID and TOEPID are uint64_t's and the fromepid should be the epid (end point id) of the sender of the message and the toepid should be the epid (end point id) of the receiver of the message */ #define PSM2_LOG_EPM(OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) \ psmi_log_message(__FILE__,__FUNCTION__,__LINE__, \ PSM2_LOG_EPM_MAGIC,OPCODE,TXRX,FROMEPID,TOEPID,FORMAT, \ ## __VA_ARGS__) /* Just adds a condition to the PSM2_LOG_EPM() macro. */ #define PSM2_LOG_EPM_COND(COND,OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...)
\ if (COND) \ PSM2_LOG_EPM(OPCODE,TXRX,FROMEPID,TOEPID,FORMAT, ## __VA_ARGS__) #define PSM2_LOG_DUMP_MAGIC ((const char *)-3) #define PSM2_LOG_MSG_DUMP(ADDR,SIZE,FORMAT , ...) \ psmi_log_message(__FILE__,__FUNCTION__,__LINE__,PSM2_LOG_DUMP_MAGIC,ADDR,SIZE, \ FORMAT, ## __VA_ARGS__) #define PSM2_LOG_PKT_STRM_MAGIC ((const char *)-4) #define PSM2_LOG_MIN_MAGIC PSM2_LOG_BT_MAGIC #define PSM2_LOG_MAX_MAGIC PSM2_LOG_PKT_STRM_MAGIC #define PSM2_LOG_PKT_STRM(TXRX,IPS_MSG_HDRP,FORMAT, ...) \ psmi_log_message(__FILE__,__FUNCTION__,__LINE__,PSM2_LOG_PKT_STRM_MAGIC,TXRX, \ IPS_MSG_HDRP,FORMAT, ## __VA_ARGS__) #else #define psmi_log_initialize() /* nothing */ #define PSM2_LOG_MSG(FORMAT , ...) /* nothing */ #define psmi_log_fini() /* nothing */ #define PSM2_LOG_DECLARE_BT_BUFFER() /* nothing */ #define PSM2_LOG_DECLARE_BT_BUFFER_SZ(SIZE) /* nothing */ #define PSM2_LOG_BT(NFRAMES,FORMAT , ...) /* nothing */ #define PSM2_LOG_EPM(OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) /* nothing */ #define PSM2_LOG_EPM_COND(COND,OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) /* nothing */ #define PSM2_LOG_MSG_DUMP(ADDR,SIZE,FORMAT , ...) /* nothing */ #define PSM2_LOG_PKT_STRM(TXRX,IPS_MSG_HDRP,FORMAT, ...) /* nothing */ #endif /* #ifdef PSM_LOG */ #endif /* #ifndef _PSMI_LOG_H */ opa-psm2-PSM2_11.2.185/psm_memcpy.c000066400000000000000000000047741370564314600165520ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. 
All rights reserved. */ #include #include #include #include #include "psm_user.h" #include "psm_mq_internal.h" void *psmi_memcpyo(void *dst, const void *src, size_t n) { psmi_mq_mtucpy(dst, src, n); return dst; } opa-psm2-PSM2_11.2.185/psm_mock.c000066400000000000000000000061021370564314600161740ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2017 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "psm_user.h" #include "psm_mq_internal.h" #include "psm2_mock_testing.h" #ifdef PSM2_MOCK_TESTING void MOCKABLE(psmi_mockable_lock_init)(psmi_lock_t *pl) { _PSMI_LOCK_INIT(*pl); } MOCK_DEF_EPILOGUE(psmi_mockable_lock_init); int MOCKABLE(psmi_mockable_lock_try)(psmi_lock_t *pl) { int ret = _PSMI_LOCK_TRY(*pl); return ret; } MOCK_DEF_EPILOGUE(psmi_mockable_lock_try); void MOCKABLE(psmi_mockable_lock)(psmi_lock_t *pl) { _PSMI_LOCK(*pl); } MOCK_DEF_EPILOGUE(psmi_mockable_lock); void MOCKABLE(psmi_mockable_unlock)(psmi_lock_t *pl) { _PSMI_UNLOCK(*pl); } MOCK_DEF_EPILOGUE(psmi_mockable_unlock); void MOCKABLE(psmi_mockable_lock_assert)(psmi_lock_t *pl) { _PSMI_LOCK_ASSERT(*pl); } MOCK_DEF_EPILOGUE(psmi_mockable_lock_assert); void MOCKABLE(psmi_mockable_unlock_assert)(psmi_lock_t *pl) { _PSMI_UNLOCK_ASSERT(*pl); } MOCK_DEF_EPILOGUE(psmi_mockable_unlock_assert); #endif opa-psm2-PSM2_11.2.185/psm_mpool.c000066400000000000000000000356271370564314600164070ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. 
When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ #include "psm_user.h" #define PSMI_MPOOL_ALIGNMENT 64 struct mpool_element { union { SLIST_ENTRY(mpool_element) me_next; mpool_t me_mpool; }; uint32_t me_gen_count; uint32_t me_index; #ifdef PSM_DEBUG uint32_t me_isused; #endif } __attribute__ ((aligned(8))); #ifdef PSM_DEBUG # define me_mark_used(me) ((me)->me_isused = 1) # define me_mark_unused(me) ((me)->me_isused = 0) #else # define me_mark_used(me) # define me_mark_unused(me) #endif struct mpool { int mp_type; int mp_flags; int mp_vector_shift; uint32_t mp_elm_vector_size; uint32_t mp_elm_offset; uint32_t mp_num_obj; uint32_t mp_num_obj_inuse; uint32_t mp_elm_size; uint32_t mp_obj_size; uint32_t mp_num_obj_per_chunk; uint32_t mp_num_obj_max_total; psmi_memtype_t mp_memtype; SLIST_HEAD(, mpool_element) mp_head; struct mpool_element **mp_elm_vector; struct mpool_element **mp_elm_vector_free; non_empty_callback_fn_t mp_non_empty_cb; void *mp_non_empty_cb_context; #ifdef PSM_CUDA alloc_dealloc_callback_fn_t mp_alloc_dealloc_cb; void *mp_alloc_dealloc_cb_context; #endif }; static int psmi_mpool_allocate_chunk(mpool_t); /** * psmi_mpool_create() * * Create a memory pool that allocates objects of size * obj_size. If more memory is needed to accommodate mpool_get() * requests, the memory pool will allocate another chunk of * objects, until it reaches the maximum number of objects * it can allocate.
 * * <obj_size> size of each individual object * <num_obj_per_chunk> number of objects to allocate per chunk (power of two) * <num_obj_max_total> total number of objects that may be allocated * at any given time. Must be a power of two greater than * <num_obj_per_chunk>. * * <flags> flags to be applied on the memory pool (ie. memory * alignment) * * <cb> callback to be called when the memory pool has some * free objects available again (after running out of them). * <context> context pointer for the callback * * Return the mpool on success, NULL on failure. */ mpool_t psmi_mpool_create_inner(size_t obj_size, uint32_t num_obj_per_chunk, uint32_t num_obj_max_total, int flags, psmi_memtype_t statstype, non_empty_callback_fn_t cb, void *context) { mpool_t mp; int s; size_t hdr_size; if (!PSMI_POWEROFTWO(num_obj_per_chunk) || !PSMI_POWEROFTWO(num_obj_max_total) || num_obj_max_total < num_obj_per_chunk) { return NULL; } mp = psmi_calloc(PSMI_EP_NONE, statstype, 1, sizeof(struct mpool)); if (mp == NULL) { fprintf(stderr, "Failed to allocate memory for memory pool: %s\n", strerror(errno)); return NULL; } for (s = 1; s < num_obj_per_chunk; s <<= 1) mp->mp_vector_shift++; mp->mp_flags = flags; mp->mp_num_obj_per_chunk = num_obj_per_chunk; mp->mp_num_obj_max_total = num_obj_max_total; mp->mp_non_empty_cb = cb; mp->mp_non_empty_cb_context = context; mp->mp_memtype = statstype; SLIST_INIT(&mp->mp_head); mp->mp_elm_vector_size = num_obj_max_total / num_obj_per_chunk; mp->mp_elm_vector = psmi_calloc(PSMI_EP_NONE, statstype, mp->mp_elm_vector_size, sizeof(struct mpool_element *)); if (mp->mp_elm_vector == NULL) { fprintf(stderr, "Failed to allocate memory for memory pool vector: " "%s\n", strerror(errno)); psmi_free(mp); return NULL; } mp->mp_elm_vector_free = mp->mp_elm_vector; if (flags & PSMI_MPOOL_ALIGN) { /* User wants its block to start on a PSMI_MPOOL_ALIGNMENT * boundary. */ hdr_size = PSMI_ALIGNUP(sizeof(struct mpool_element), PSMI_MPOOL_ALIGNMENT); mp->mp_obj_size = PSMI_ALIGNUP(obj_size, PSMI_MPOOL_ALIGNMENT); mp->mp_elm_size = hdr_size + mp->mp_obj_size; mp->mp_elm_offset = hdr_size - sizeof(struct mpool_element); } else { hdr_size = sizeof(struct mpool_element); mp->mp_obj_size = PSMI_ALIGNUP(obj_size, 8); mp->mp_elm_size = hdr_size + mp->mp_obj_size; mp->mp_elm_offset = 0; } return mp; } mpool_t MOCKABLE(psmi_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk, uint32_t num_obj_max_total, int flags, psmi_memtype_t statstype, non_empty_callback_fn_t cb, void *context) { mpool_t mp; mp = psmi_mpool_create_inner(obj_size, num_obj_per_chunk, num_obj_max_total, flags, statstype, cb, context); if (mp == NULL) return NULL; if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) { psmi_mpool_destroy(mp); return NULL; } return mp; } MOCK_DEF_EPILOGUE(psmi_mpool_create); #ifdef PSM_CUDA mpool_t psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk, uint32_t num_obj_max_total, int flags, psmi_memtype_t statstype, non_empty_callback_fn_t cb, void *context, alloc_dealloc_callback_fn_t ad_cb, void *ad_context) { mpool_t mp; mp = psmi_mpool_create_inner(obj_size, num_obj_per_chunk, num_obj_max_total, flags, statstype, cb, context); if (mp == NULL) return NULL; mp->mp_alloc_dealloc_cb = ad_cb; mp->mp_alloc_dealloc_cb_context = ad_context; if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) { psmi_mpool_destroy(mp); return NULL; } return mp; } #endif /** * psmi_mpool_get() * * <mp> memory pool * * Requests an object from the memory pool. * * Returns NULL if the maximum number of objects has been allocated (refer to * <num_obj_max_total> in psmi_mpool_create) or if running out of memory.
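 *
 * A minimal usage sketch (the caller and struct name are hypothetical;
 * UNDEFINED and PSMI_MPOOL_ALIGN are existing memtype/flag values used
 * here purely for illustration):
 *
 *   mpool_t pool = psmi_mpool_create(sizeof(struct my_obj), 64, 256,
 *                                    PSMI_MPOOL_ALIGN, UNDEFINED,
 *                                    NULL, NULL);
 *   struct my_obj *o = (struct my_obj *)psmi_mpool_get(pool);
 *   if (o != NULL)
 *           psmi_mpool_put(o);
 *   psmi_mpool_destroy(pool);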
 */ void *psmi_mpool_get(mpool_t mp) { struct mpool_element *me; void *obj; if (SLIST_EMPTY(&mp->mp_head)) { if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) return NULL; } me = SLIST_FIRST(&mp->mp_head); SLIST_REMOVE_HEAD(&mp->mp_head, me_next); psmi_assert(!me->me_isused); me_mark_used(me); /* store a backpointer to the memory pool */ me->me_mpool = mp; mp->mp_num_obj_inuse++; psmi_assert(mp->mp_num_obj_inuse <= mp->mp_num_obj); obj = (void *)((uintptr_t) me + sizeof(struct mpool_element)); return obj; } /** * psmi_mpool_put() * * <obj> object to return to the memory pool * * Returns an <obj> to the memory pool subsystem. This object will be re-used * to fulfill new psmi_mpool_get() requests. */ void psmi_mpool_put(void *obj) { struct mpool_element *me; int was_empty; mpool_t mp; me = (struct mpool_element *) ((uintptr_t) obj - sizeof(struct mpool_element)); me->me_gen_count++; mp = me->me_mpool; psmi_assert(mp != NULL); psmi_assert(mp->mp_num_obj_inuse >= 0); psmi_assert(me->me_isused); me_mark_unused(me); was_empty = mp->mp_num_obj_inuse == mp->mp_num_obj_max_total; SLIST_INSERT_HEAD(&mp->mp_head, me, me_next); mp->mp_num_obj_inuse--; /* tell the user that memory is available */ if (mp->mp_non_empty_cb && was_empty) mp->mp_non_empty_cb(mp->mp_non_empty_cb_context); } /** * psmi_mpool_get_obj_index() * * <obj> object in the memory pool * * Returns the index of the <obj> in the memory pool. */ int psmi_mpool_get_obj_index(void *obj) { struct mpool_element *me = (struct mpool_element *) ((uintptr_t) obj - sizeof(struct mpool_element)); return me->me_index; } /** * psmi_mpool_get_obj_gen_count() * * <obj> object in the memory pool * * Returns the generation count of the <obj>. */ uint32_t psmi_mpool_get_obj_gen_count(void *obj) { struct mpool_element *me = (struct mpool_element *) ((uintptr_t) obj - sizeof(struct mpool_element)); return me->me_gen_count; } /** * psmi_mpool_get_obj_index_gen_count() * * <obj> object in the memory pool * * Returns the index of the <obj> in <index>. * Returns the generation count of the <obj> in <gen_count>. */ int psmi_mpool_get_obj_index_gen_count(void *obj, uint32_t *index, uint32_t *gen_count) { struct mpool_element *me = (struct mpool_element *) ((uintptr_t) obj - sizeof(struct mpool_element)); *index = me->me_index; *gen_count = me->me_gen_count; return 0; } /** * psmi_mpool_find_obj_by_index() * * <mp> memory pool * <index> index of the object * * Returns the object located at <index> in the memory pool or NULL if the * <index> is invalid. */ void *psmi_mpool_find_obj_by_index(mpool_t mp, int index) { struct mpool_element *me; if_pf(index < 0 || index >= mp->mp_num_obj) return NULL; me = (struct mpool_element *) ((uintptr_t) mp->mp_elm_vector[index >> mp->mp_vector_shift] + (index & (mp->mp_num_obj_per_chunk - 1)) * mp->mp_elm_size + mp->mp_elm_offset); /* If this mpool doesn't require generation counts, it's illegal to find a * freed object */ #ifdef PSM_DEBUG if (mp->mp_flags & PSMI_MPOOL_NOGENERATION) psmi_assert(!me->me_isused); #endif return (void *)((uintptr_t) me + sizeof(struct mpool_element)); } #ifdef PSM_CUDA /** * psmi_mpool_chunk_dealloc() * <mp> memory pool * <idx> index * Calls the dealloc function on each element in the chunk. */ void psmi_mpool_chunk_dealloc(mpool_t mp, int idx) { int j; for (j = 0; j < mp->mp_num_obj_per_chunk; j++) mp->mp_alloc_dealloc_cb(0 /* is not alloc */, mp->mp_alloc_dealloc_cb_context, ((void *) mp->mp_elm_vector[idx]) + j * mp->mp_elm_size + sizeof(struct mpool_element)); } #endif /** * psmi_mpool_destroy() * * <mp> memory pool * * Destroy a previously allocated memory pool and reclaim its associated * memory.
The behavior is undefined if some objects have not been returned * to the memory pool with psmi_mpool_put(). */ void psmi_mpool_destroy(mpool_t mp) { int i = 0; size_t nbytes = mp->mp_num_obj * mp->mp_elm_size; for (i = 0; i < mp->mp_elm_vector_size; i++) { if (mp->mp_elm_vector[i]) { #ifdef PSM_CUDA if (mp->mp_alloc_dealloc_cb) psmi_mpool_chunk_dealloc(mp, i); #endif psmi_free(mp->mp_elm_vector[i]); } } psmi_free(mp->mp_elm_vector); nbytes += mp->mp_elm_vector_size * sizeof(struct mpool_element *); psmi_free(mp); nbytes += sizeof(struct mpool); } /** * psmi_mpool_get_max_obj() * * memory pool * * Returns the num-obj-per-chunk * Returns the num-obj-max-total */ void MOCKABLE(psmi_mpool_get_obj_info)(mpool_t mp, uint32_t *num_obj_per_chunk, uint32_t *num_obj_max_total) { *num_obj_per_chunk = mp->mp_num_obj_per_chunk; *num_obj_max_total = mp->mp_num_obj_max_total; return; } MOCK_DEF_EPILOGUE(psmi_mpool_get_obj_info); static int psmi_mpool_allocate_chunk(mpool_t mp) { struct mpool_element *elm; void *chunk; uint32_t i = 0, num_to_allocate; num_to_allocate = mp->mp_num_obj + mp->mp_num_obj_per_chunk > mp->mp_num_obj_max_total ? 0 : mp->mp_num_obj_per_chunk; psmi_assert(mp->mp_num_obj + num_to_allocate <= mp->mp_num_obj_max_total); if (num_to_allocate == 0) return PSM2_NO_MEMORY; #ifdef PSM_CUDA if (mp->mp_alloc_dealloc_cb) chunk = psmi_calloc(PSMI_EP_NONE, mp->mp_memtype, num_to_allocate, mp->mp_elm_size); else chunk = psmi_malloc(PSMI_EP_NONE, mp->mp_memtype, num_to_allocate * mp->mp_elm_size); #else chunk = psmi_malloc(PSMI_EP_NONE, mp->mp_memtype, num_to_allocate * mp->mp_elm_size); #endif if (chunk == NULL) { fprintf(stderr, "Failed to allocate memory for memory pool chunk: %s\n", strerror(errno)); return PSM2_NO_MEMORY; } for (i = 0; i < num_to_allocate; i++) { #ifdef PSM_CUDA if (mp->mp_alloc_dealloc_cb) mp->mp_alloc_dealloc_cb(1 /* is alloc */, mp->mp_alloc_dealloc_cb_context, chunk + i * mp->mp_elm_size + sizeof(struct mpool_element)); #endif elm = (struct mpool_element *)((uintptr_t) chunk + i * mp->mp_elm_size + mp->mp_elm_offset); elm->me_gen_count = 0; elm->me_index = mp->mp_num_obj + i; #ifdef PSM_DEBUG elm->me_isused = 0; #endif SLIST_INSERT_HEAD(&mp->mp_head, elm, me_next); #if 0 fprintf(stderr, "chunk%ld i=%d elm=%p user=%p next=%p\n", (long)(mp->mp_elm_vector_free - mp->mp_elm_vector), (int)i, elm, (void *)((uintptr_t) elm + sizeof(struct mpool_element)), SLIST_NEXT(elm, me_next)); #endif } psmi_assert((uintptr_t) mp->mp_elm_vector_free < ((uintptr_t) mp->mp_elm_vector) + mp->mp_elm_vector_size * sizeof(struct mpool_element *)); mp->mp_elm_vector_free[0] = chunk; mp->mp_elm_vector_free++; mp->mp_num_obj += num_to_allocate; return PSM2_OK; } #if 0 void psmi_mpool_dump(mpool_t mp) { int i, j; struct mpool_element *me; fprintf(stderr, "Memory pool %p has %d elements per chunk.\n", mp, mp->mp_num_obj_per_chunk); for (i = 0; i < mp->mp_elm_vector_size; i++) { if (mp->mp_elm_vector[i] != NULL) { fprintf(stderr, "===========================\n"); fprintf(stderr, "mpool chunk #%d\n", i); for (j = 0, me = mp->mp_elm_vector[i]; j < mp->mp_num_obj_per_chunk; j++, me = (struct mpool_element *) ((uintptr_t) me + mp->mp_elm_size)) { fprintf(stderr, "obj=%p index=%d gen_count=%d\n", (void *)((uintptr_t) me + sizeof(struct mpool_element)), me->me_index, me->me_gen_count); } fprintf(stderr, "===========================\n"); } } } #endif opa-psm2-PSM2_11.2.185/psm_mpool.h000066400000000000000000000075121370564314600164040ustar00rootroot00000000000000/* This file is provided under a dual 
BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/ #ifndef _PSMI_IN_USER_H #error psm_mpool.h not meant to be included directly, include psm_user.h instead #endif #ifndef PSM_MPOOL_H #define PSM_MPOOL_H /* mpool flags */ #define PSMI_MPOOL_ALIGN_CACHE 0x1 #define PSMI_MPOOL_ALIGN_PAGE 0x2 #define PSMI_MPOOL_NOGENERATION 0x4 /* Backwards compatibility */ #define PSMI_MPOOL_ALIGN PSMI_MPOOL_ALIGN_CACHE typedef struct mpool *mpool_t; typedef void (*non_empty_callback_fn_t) (void *context); typedef void (*alloc_dealloc_callback_fn_t) (int is_alloc, void *context, void *chunk); mpool_t MOCKABLE(psmi_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk, uint32_t num_obj_max_total, int flags, psmi_memtype_t statstype, non_empty_callback_fn_t cb, void *context); MOCK_DCL_EPILOGUE(psmi_mpool_create); mpool_t psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk, uint32_t num_obj_max_total, int flags, psmi_memtype_t statstype, non_empty_callback_fn_t cb, void *context, alloc_dealloc_callback_fn_t ad_cb, void *ad_context); void psmi_mpool_destroy(mpool_t mp); void MOCKABLE(psmi_mpool_get_obj_info)(mpool_t mp, uint32_t *num_obj_per_chunk, uint32_t *num_obj_max_total); MOCK_DCL_EPILOGUE(psmi_mpool_get_obj_info); void *psmi_mpool_get(mpool_t mp); void psmi_mpool_put(void *obj); int psmi_mpool_get_obj_index(void *obj); uint32_t psmi_mpool_get_obj_gen_count(void *obj); int psmi_mpool_get_obj_index_gen_count(void *obj, uint32_t *index, uint32_t *gen_count); void *psmi_mpool_find_obj_by_index(mpool_t mp, int index); #endif opa-psm2-PSM2_11.2.185/psm_mq.c000066400000000000000000001204221370564314600156620ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ #include #include "psm_user.h" #include "psm2_hal.h" #include "psm_mq_internal.h" #ifdef PSM_CUDA #include "psm_gdrcpy.h" #endif /* * Functions to manipulate the expected queue in mq_ep. */ /* * Once the linked lists cross the size limit, this function will enable tag * hashing and disable the non-hashing fastpath. We need to go back and insert * reqs into the hash tables where the hashing searches will look for them. */ void psmi_mq_fastpath_disable(psm2_mq_t mq) { psm2_mq_req_t *curp, cur; struct mqq *qp; unsigned hashvals[NUM_HASH_CONFIGS]; int t = PSM2_ANYTAG_ANYSRC; mq->nohash_fastpath = 0; /* Everything in the unexpected_q needs to be duplicated into each of the (three) unexpected hash tables. */ qp = &mq->unexpected_q; for (curp = &qp->first; (cur = *curp) != NULL; curp = &cur->next[t]) { mq->unexpected_hash_len++; hashvals[PSM2_TAG_SRC] = hash_64(*(uint64_t *) cur->req_data.tag.tag) % NUM_HASH_BUCKETS; hashvals[PSM2_TAG_ANYSRC] = hash_32(cur->req_data.tag.tag[0]) % NUM_HASH_BUCKETS; hashvals[PSM2_ANYTAG_SRC] = hash_32(cur->req_data.tag.tag[1]) % NUM_HASH_BUCKETS; for (t = PSM2_TAG_SRC; t < PSM2_ANYTAG_ANYSRC; t++) mq_qq_append_which(mq->unexpected_htab, t, hashvals[t], cur); } /* Everything in the expected_q needs to be moved into the (single) correct expected hash table. */ qp = &mq->expected_q; for (curp = &qp->first; (cur = *curp) != NULL; /*curp = &cur->next*/) { /* must read next ptr before remove */ curp = &cur->next[PSM2_ANYTAG_ANYSRC]; if ((cur->req_data.tagsel.tag[0] == 0xFFFFFFFF) && (cur->req_data.tagsel.tag[1] == 0xFFFFFFFF)) { /* hash tag0 and tag1 */ t = PSM2_TAG_SRC; hashvals[t] = hash_64(*(uint64_t *) cur->req_data.tag.tag) % NUM_HASH_BUCKETS; mq_qq_append_which(mq->expected_htab, t, hashvals[t], cur); } else if (cur->req_data.tagsel.tag[0] == 0xFFFFFFFF) { t = PSM2_TAG_ANYSRC; hashvals[t] = hash_32(cur->req_data.tag.tag[0]) % NUM_HASH_BUCKETS; mq_qq_append_which(mq->expected_htab, t, hashvals[t], cur); } else if (cur->req_data.tagsel.tag[1] == 0xFFFFFFFF) { t = PSM2_ANYTAG_SRC; hashvals[t] = hash_32(cur->req_data.tag.tag[1]) % NUM_HASH_BUCKETS; mq_qq_append_which(mq->expected_htab, t, hashvals[t], cur); } else continue; /* else, req must stay in ANY ANY */ mq->expected_list_len--; mq->expected_hash_len++; mq_qq_remove_which(cur, PSM2_ANYTAG_ANYSRC); } } /* easy threshold to re-enable: if |hash| == 0 && |list| < X aggressive threshold: if |hash| + |list| < X even easier: if |hash| + |list| == 0 might be better approach to avoid constant bouncing between modes */ void psmi_mq_fastpath_try_reenable(psm2_mq_t mq) { if_pf(mq->nohash_fastpath == 0 && mq->unexpected_hash_len == 0 && mq->expected_hash_len == 0 && mq->unexpected_list_len == 0 && mq->expected_list_len == 0){ mq->nohash_fastpath = 1; } } /* * ! @brief PSM exposed version to allow PTLs to match */ /*! 
@brief Try to match against the MQ using a tag and tagsel * * @param[in] mq Message Queue * @param[in] src Source (sender) epaddr, may be PSM2_MQ_ANY_ADDR. * @param[in] tag Input Tag * @param[in] tagsel Input Tag Selector * @param[in] remove Non-zero to remove the req from the queue * * @returns NULL if no match or an mq request if there is a match */ static psm2_mq_req_t mq_req_match_with_tagsel(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel, int remove) { psm2_mq_req_t *curp; psm2_mq_req_t cur; unsigned hashval; int i, j = 0; struct mqq *qp; if_pt (mq->nohash_fastpath) { i = j = PSM2_ANYTAG_ANYSRC; qp = &mq->unexpected_q; } else if ((tagsel->tag[0] == 0xFFFFFFFF) && (tagsel->tag[1] == 0xFFFFFFFF)) { i = PSM2_TAG_SRC; hashval = hash_64(*(uint64_t *) tag->tag) % NUM_HASH_BUCKETS; qp = &mq->unexpected_htab[i][hashval]; } else if (tagsel->tag[0] == 0xFFFFFFFF) { i = PSM2_TAG_ANYSRC; hashval = hash_32(tag->tag[0]) % NUM_HASH_BUCKETS; qp = &mq->unexpected_htab[i][hashval]; } else if (tagsel->tag[1] == 0xFFFFFFFF) { i = PSM2_ANYTAG_SRC; hashval = hash_32(tag->tag[1]) % NUM_HASH_BUCKETS; qp = &mq->unexpected_htab[i][hashval]; } else { /* unhashable tag */ i = PSM2_ANYTAG_ANYSRC; qp = &mq->unexpected_q; } for (curp = &qp->first; (cur = *curp) != NULL; curp = &cur->next[i]) { psmi_assert(cur->req_data.peer != PSM2_MQ_ANY_ADDR); if ((src == PSM2_MQ_ANY_ADDR || src == cur->req_data.peer) && !((tag->tag[0] ^ cur->req_data.tag.tag[0]) & tagsel->tag[0]) && !((tag->tag[1] ^ cur->req_data.tag.tag[1]) & tagsel->tag[1]) && !((tag->tag[2] ^ cur->req_data.tag.tag[2]) & tagsel->tag[2])) { /* match! */ if (remove) { if_pt (i == PSM2_ANYTAG_ANYSRC) mq->unexpected_list_len--; else mq->unexpected_hash_len--; for (; j < NUM_MQ_SUBLISTS; j++) mq_qq_remove_which(cur, j); psmi_mq_fastpath_try_reenable(mq); } return cur; } } return NULL; } static void mq_add_to_expected_hashes(psm2_mq_t mq, psm2_mq_req_t req) { unsigned hashval; int i; req->timestamp = mq->timestamp++; if_pt (mq->nohash_fastpath) { mq_qq_append(&mq->expected_q, req); req->q[PSM2_ANYTAG_ANYSRC] = &mq->expected_q; mq->expected_list_len++; if_pf (mq->expected_list_len >= HASH_THRESHOLD) psmi_mq_fastpath_disable(mq); } else if ((req->req_data.tagsel.tag[0] == 0xFFFFFFFF) && (req->req_data.tagsel.tag[1] == 0xFFFFFFFF)) { i = PSM2_TAG_SRC; hashval = hash_64(*(uint64_t *) req->req_data.tag.tag) % NUM_HASH_BUCKETS; mq_qq_append_which(mq->expected_htab, i, hashval, req); mq->expected_hash_len++; } else if (req->req_data.tagsel.tag[0] == 0xFFFFFFFF) { i = PSM2_TAG_ANYSRC; hashval = hash_32(req->req_data.tag.tag[0]) % NUM_HASH_BUCKETS; mq_qq_append_which(mq->expected_htab, i, hashval, req); mq->expected_hash_len++; } else if (req->req_data.tagsel.tag[1] == 0xFFFFFFFF) { i = PSM2_ANYTAG_SRC; hashval = hash_32(req->req_data.tag.tag[1]) % NUM_HASH_BUCKETS; mq_qq_append_which(mq->expected_htab, i, hashval, req); mq->expected_hash_len++; } else { mq_qq_append(&mq->expected_q, req); req->q[PSM2_ANYTAG_ANYSRC] = &mq->expected_q; mq->expected_list_len++; } } /*! @brief Try to remove the req in the MQ * * @param[in] mq Message Queue * @param[in] req MQ request * * @returns 1 if successfully removed, or 0 if req cannot be found. 
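 *
 * (For reference, mq_req_match_with_tagsel() above applies the tag test
 * per 32-bit word; with made-up values: a posted tag 0x12345678 under
 * tagsel 0xFFFF0000 matches an incoming tag 0x1234ABCD, since
 * ((0x12345678 ^ 0x1234ABCD) & 0xFFFF0000) == 0, while an incoming tag
 * 0x9999ABCD fails the same test.)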
*/ static int mq_req_remove_single(psm2_mq_t mq, psm2_mq_req_t req) { int i; /* item should only exist in one expected queue at a time */ psmi_assert((!!req->q[0] + !!req->q[1] + !!req->q[2] + !!req->q[3]) == 1); for (i = 0; i < NUM_MQ_SUBLISTS; i++) if (req->q[i]) /* found */ break; switch (i) { case PSM2_ANYTAG_ANYSRC: mq->expected_list_len--; break; case PSM2_TAG_SRC: case PSM2_TAG_ANYSRC: case PSM2_ANYTAG_SRC: mq->expected_hash_len--; break; default: return 0; } mq_qq_remove_which(req, i); psmi_mq_fastpath_try_reenable(mq); return 1; } PSMI_ALWAYS_INLINE( psm2_mq_req_t psmi_mq_iprobe_inner(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel, int remove_req)) { psm2_mq_req_t req; PSMI_LOCK(mq->progress_lock); req = mq_req_match_with_tagsel(mq, src, tag, tagsel, remove_req); if (req != NULL) { PSMI_UNLOCK(mq->progress_lock); return req; } psmi_poll_internal(mq->ep, 1); /* try again */ req = mq_req_match_with_tagsel(mq, src, tag, tagsel, remove_req); PSMI_UNLOCK(mq->progress_lock); return req; } psm2_error_t __psm2_mq_iprobe2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel, psm2_mq_status2_t *status) { psm2_mq_req_t req; PSM2_LOG_MSG("entering"); PSMI_ASSERT_INITIALIZED(); req = psmi_mq_iprobe_inner(mq, src, tag, tagsel, 0); psmi_assert_req_not_internal(req); if (req != NULL) { if (status != NULL) { mq_status2_copy(req, status); } PSM2_LOG_MSG("leaving"); return PSM2_OK; } PSM2_LOG_MSG("leaving"); return PSM2_MQ_NO_COMPLETIONS; } PSMI_API_DECL(psm2_mq_iprobe2) psm2_error_t __psm2_mq_iprobe(psm2_mq_t mq, uint64_t tag, uint64_t tagsel, psm2_mq_status_t *status) { psm2_mq_tag_t rtag; psm2_mq_tag_t rtagsel; psm2_mq_req_t req; PSM2_LOG_MSG("entering"); PSMI_ASSERT_INITIALIZED(); *(uint64_t *) rtag.tag = tag; #ifdef PSM_DEBUG rtag.tag[2] = 0; #endif *(uint64_t *) rtagsel.tag = tagsel; rtagsel.tag[2] = 0; req = psmi_mq_iprobe_inner(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel, 0); psmi_assert_req_not_internal(req); if (req != NULL) { if (status != NULL) { mq_status_copy(req, status); } PSM2_LOG_MSG("leaving"); return PSM2_OK; } PSM2_LOG_MSG("leaving"); return PSM2_MQ_NO_COMPLETIONS; } PSMI_API_DECL(psm2_mq_iprobe) psm2_error_t __psm2_mq_improbe2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel, psm2_mq_req_t *reqo, psm2_mq_status2_t *status) { psm2_mq_req_t req; PSM2_LOG_MSG("entering"); PSMI_ASSERT_INITIALIZED(); req = psmi_mq_iprobe_inner(mq, src, tag, tagsel, 1); if (req != NULL) { if (status != NULL) { mq_status2_copy(req, status); } *reqo = req; PSM2_LOG_MSG("leaving"); return PSM2_OK; } *reqo = NULL; PSM2_LOG_MSG("leaving"); return PSM2_MQ_NO_COMPLETIONS; } PSMI_API_DECL(psm2_mq_improbe2) psm2_error_t __psm2_mq_improbe(psm2_mq_t mq, uint64_t tag, uint64_t tagsel, psm2_mq_req_t *reqo, psm2_mq_status_t *status) { psm2_mq_tag_t rtag; psm2_mq_tag_t rtagsel; psm2_mq_req_t req; PSM2_LOG_MSG("entering"); PSMI_ASSERT_INITIALIZED(); *(uint64_t *) rtag.tag = tag; #ifdef PSM_DEBUG rtag.tag[2] = 0; #endif *(uint64_t *) rtagsel.tag = tagsel; rtagsel.tag[2] = 0; req = psmi_mq_iprobe_inner(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel, 1); if (req != NULL) { if (status != NULL) { mq_status_copy(req, status); } *reqo = req; PSM2_LOG_MSG("leaving"); return PSM2_OK; } *reqo = NULL; PSM2_LOG_MSG("leaving"); return PSM2_MQ_NO_COMPLETIONS; } PSMI_API_DECL(psm2_mq_improbe) psm2_error_t __psm2_mq_cancel(psm2_mq_req_t *ireq) { psm2_mq_req_t req = *ireq; psm2_mq_t mq; psm2_error_t err = PSM2_OK; PSM2_LOG_MSG("entering"); PSMI_ASSERT_INITIALIZED(); if 
(req == NULL) { PSM2_LOG_MSG("leaving"); return PSM2_MQ_NO_COMPLETIONS; } /* Cancelling a send is a blocking operation, and expensive. * We only allow cancellation of rendezvous sends, consider the eager sends * as always unsuccessfully cancelled. */ mq = req->mq; PSMI_LOCK(mq->progress_lock); if (MQE_TYPE_IS_RECV(req->type)) { if (req->state == MQ_STATE_POSTED) { int rc; rc = mq_req_remove_single(mq, req); psmi_assert_always(rc); req->state = MQ_STATE_COMPLETE; mq_qq_append(&mq->completed_q, req); err = PSM2_OK; } else err = PSM2_MQ_NO_COMPLETIONS; } else { err = psmi_handle_error(mq->ep, PSM2_PARAM_ERR, "Cannot cancel send requests (req=%p)", req); } PSMI_UNLOCK(mq->progress_lock); PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_mq_cancel) /* This is the only PSM function that blocks. * We handle it in a special manner since we don't know what the user's * execution environment is (threads, oversubscribing processes, etc). * * The status argument can be an instance of either type psm2_mq_status_t or * psm2_mq_status2_t. Depending on the type, a corresponding status copy * routine should be passed in. */ PSMI_ALWAYS_INLINE( psm2_error_t psmi_mq_wait_inner(psm2_mq_req_t *ireq, void *status, psmi_mq_status_copy_t status_copy, int do_lock)) { psm2_error_t err = PSM2_OK; psm2_mq_req_t req = *ireq; if (req == PSM2_MQ_REQINVALID) { return PSM2_OK; } if (do_lock) PSMI_LOCK(req->mq->progress_lock); if (req->state != MQ_STATE_COMPLETE) { psm2_mq_t mq = req->mq; /* We'll be waiting on this req, mark it as so */ req->type |= MQE_TYPE_WAITING; _HFI_VDBG("req=%p, buf=%p, len=%d, waiting\n", req, req->req_data.buf, req->req_data.buf_len); if (req->testwait_callback) { err = req->testwait_callback(ireq); if (do_lock) PSMI_UNLOCK(req->mq->progress_lock); if (status != NULL) { status_copy(req, status); } return err; } PSMI_BLOCKUNTIL(mq->ep, err, req->state == MQ_STATE_COMPLETE); if (err > PSM2_OK_NO_PROGRESS) goto fail_with_lock; else err = PSM2_OK; } if(!psmi_is_req_internal(req)) mq_qq_remove(&req->mq->completed_q, req); if (status != NULL) { status_copy(req, status); } _HFI_VDBG("req=%p complete, buf=%p, len=%d, err=%d\n", req, req->req_data.buf, req->req_data.buf_len, req->req_data.error_code); psmi_mq_req_free(req); *ireq = PSM2_MQ_REQINVALID; fail_with_lock: if (do_lock) PSMI_UNLOCK(req->mq->progress_lock); return err; } psm2_error_t __psm2_mq_wait2(psm2_mq_req_t *ireq, psm2_mq_status2_t *status) { psm2_error_t rv; PSM2_LOG_MSG("entering"); PSMI_ASSERT_INITIALIZED(); psmi_assert_req_not_internal(*ireq); rv = psmi_mq_wait_inner(ireq, status, (psmi_mq_status_copy_t) mq_status2_copy, 1); PSM2_LOG_MSG("leaving"); return rv; } PSMI_API_DECL(psm2_mq_wait2) psm2_error_t __psm2_mq_wait(psm2_mq_req_t *ireq, psm2_mq_status_t *status) { psm2_error_t rv; PSM2_LOG_MSG("entering"); PSMI_ASSERT_INITIALIZED(); psmi_assert_req_not_internal(*ireq); rv = psmi_mq_wait_inner(ireq, status, (psmi_mq_status_copy_t) mq_status_copy, 1); PSM2_LOG_MSG("leaving"); return rv; } PSMI_API_DECL(psm2_mq_wait) psm2_error_t psmi_mq_wait_internal(psm2_mq_req_t *ireq) { return psmi_mq_wait_inner(ireq, NULL, NULL, 0); } /* The status argument can be an instance of either type psm2_mq_status_t or * psm2_mq_status2_t. Depending on the type, a corresponding status copy * routine should be passed in. 
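 *
 * A typical blocking completion of a nonblocking send, sketched with
 * assumed caller variables (mq, dest, buf and len are not defined here);
 * psm2_mq_wait() polls until the request completes, then frees it:
 *
 *   psm2_mq_req_t req;
 *   psm2_mq_status_t status;
 *   psm2_mq_isend(mq, dest, 0, 0xabcdULL, buf, len, NULL, &req);
 *   psm2_mq_wait(&req, &status);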
*/ PSMI_ALWAYS_INLINE( psm2_error_t psmi_mq_test_inner(psm2_mq_req_t *ireq, void *status, psmi_mq_status_copy_t status_copy)) { psm2_mq_req_t req = *ireq; psm2_error_t err = PSM2_OK; PSMI_ASSERT_INITIALIZED(); if (req == PSM2_MQ_REQINVALID) { return PSM2_OK; } if (req->state != MQ_STATE_COMPLETE) { if (req->testwait_callback) { PSMI_LOCK(req->mq->progress_lock); err = req->testwait_callback(ireq); if (status != NULL) { status_copy(req, status); } PSMI_UNLOCK(req->mq->progress_lock); return err; } else return PSM2_MQ_NO_COMPLETIONS; } if (status != NULL) status_copy(req, status); _HFI_VDBG ("req=%p complete, tag=%08x.%08x.%08x buf=%p, len=%d, err=%d\n", req, req->req_data.tag.tag[0], req->req_data.tag.tag[1], req->req_data.tag.tag[2], req->req_data.buf, req->req_data.buf_len, req->req_data.error_code); PSMI_LOCK(req->mq->progress_lock); mq_qq_remove(&req->mq->completed_q, req); psmi_mq_req_free(req); PSMI_UNLOCK(req->mq->progress_lock); *ireq = PSM2_MQ_REQINVALID; return err; } psm2_error_t __psm2_mq_test2(psm2_mq_req_t *ireq, psm2_mq_status2_t *status) { psm2_error_t rv; PSM2_LOG_MSG("entering"); rv = psmi_mq_test_inner(ireq, status, (psmi_mq_status_copy_t) mq_status2_copy); PSM2_LOG_MSG("leaving"); return rv; } PSMI_API_DECL(psm2_mq_test2) psm2_error_t __psm2_mq_test(psm2_mq_req_t *ireq, psm2_mq_status_t *status) { psm2_error_t rv; PSM2_LOG_MSG("entering"); rv = psmi_mq_test_inner(ireq, status, (psmi_mq_status_copy_t) mq_status_copy); PSM2_LOG_MSG("leaving"); return rv; } PSMI_API_DECL(psm2_mq_test) psm2_error_t __psm2_mq_isend2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, psm2_mq_tag_t *stag, const void *buf, uint32_t len, void *context, psm2_mq_req_t *req) { psm2_error_t err; PSM2_LOG_MSG("entering"); PSMI_ASSERT_INITIALIZED(); psmi_assert(stag != NULL); PSMI_LOCK(mq->progress_lock); err = dest->ptlctl->mq_isend(mq, dest, flags, PSMI_REQ_FLAG_NORMAL, stag, buf, len, context, req); PSMI_UNLOCK(mq->progress_lock); psmi_assert(*req != NULL); psmi_assert_req_not_internal(*req); (*req)->req_data.peer = dest; PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_mq_isend2) psm2_error_t __psm2_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, const void *buf, uint32_t len, void *context, psm2_mq_req_t *req) { psm2_error_t err; psm2_mq_tag_t tag; PSM2_LOG_MSG("entering"); *((uint64_t *) tag.tag) = stag; tag.tag[2] = 0; PSMI_ASSERT_INITIALIZED(); PSMI_LOCK(mq->progress_lock); err = dest->ptlctl->mq_isend(mq, dest, flags, PSMI_REQ_FLAG_NORMAL, &tag, buf, len, context, req); PSMI_UNLOCK(mq->progress_lock); psmi_assert(*req != NULL); psmi_assert_req_not_internal(*req); (*req)->req_data.peer = dest; PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_mq_isend) psm2_error_t __psm2_mq_send2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, psm2_mq_tag_t *stag, const void *buf, uint32_t len) { psm2_error_t err; PSM2_LOG_MSG("entering"); PSMI_ASSERT_INITIALIZED(); psmi_assert(stag != NULL); PSMI_LOCK(mq->progress_lock); err = dest->ptlctl->mq_send(mq, dest, flags, stag, buf, len); PSMI_UNLOCK(mq->progress_lock); PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_mq_send2) psm2_error_t __psm2_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, const void *buf, uint32_t len) { psm2_error_t err; psm2_mq_tag_t tag; PSM2_LOG_MSG("entering stag: 0x%" PRIx64, stag); *((uint64_t *) tag.tag) = stag; tag.tag[2] = 0; PSMI_ASSERT_INITIALIZED(); PSMI_LOCK(mq->progress_lock); err = dest->ptlctl->mq_send(mq, dest, flags, &tag, buf, len); 
PSMI_UNLOCK(mq->progress_lock); PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_mq_send) /* * Common subroutine to psm2_mq_irecv2 and psm2_mq_imrecv. This code assumes * that the provided request has been matched, and begins copying message data * that has already arrived to the user's buffer. Any remaining data is copied * by PSM polling until the message is complete. */ static psm2_error_t psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) { uint32_t copysz; PSM2_LOG_MSG("entering"); psmi_assert(MQE_TYPE_IS_RECV(req->type)); psmi_mtucpy_fn_t psmi_mtucpy_fn = psmi_mq_mtucpy; #ifdef PSM_CUDA if (!req->is_buf_gpu_mem) psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem; #endif _HFI_VDBG("(req=%p) buf=%p len=%u req.state=%u\n", req, buf, len, req->state); switch (req->state) { case MQ_STATE_COMPLETE: if (req->req_data.buf != NULL) { /* 0-byte messages don't alloc a sysbuf */ copysz = mq_set_msglen(req, len, req->req_data.send_msglen); void *ubuf = buf; #ifdef PSM_CUDA if (PSMI_USE_GDR_COPY(req, len)) { ubuf = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)buf, len, 1, mq->ep->epaddr->proto); psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem; } #endif psmi_mtucpy_fn(ubuf, (const void *)req->req_data.buf, copysz); psmi_mq_sysbuf_free(mq, req->req_data.buf); } req->req_data.buf = buf; req->req_data.buf_len = len; mq_qq_append(&mq->completed_q, req); break; case MQ_STATE_UNEXP: /* not done yet */ copysz = mq_set_msglen(req, len, req->req_data.send_msglen); /* Copy What's been received so far and make sure we don't receive * any more than copysz. After that, swap system with user buffer */ req->recv_msgoff = min(req->recv_msgoff, copysz); #ifdef PSM_CUDA if (PSMI_USE_GDR_COPY(req, req->req_data.send_msglen)) { buf = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)req->user_gpu_buffer, req->req_data.send_msglen, 1, mq->ep->epaddr->proto); psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem; } #endif if (req->recv_msgoff) { psmi_mtucpy_fn(buf, (const void *)req->req_data.buf, req->recv_msgoff); } psmi_mq_sysbuf_free(mq, req->req_data.buf); req->state = MQ_STATE_MATCHED; req->req_data.buf = buf; req->req_data.buf_len = len; break; case MQ_STATE_UNEXP_RV: /* rendez-vous ... */ copysz = mq_set_msglen(req, len, req->req_data.send_msglen); /* Copy What's been received so far and make sure we don't receive * any more than copysz. 
After that, swap system with user buffer */ req->recv_msgoff = min(req->recv_msgoff, copysz); if (req->recv_msgoff) { psmi_mtucpy_fn(buf, (const void *)req->req_data.buf, req->recv_msgoff); } if (req->send_msgoff) { psmi_mq_sysbuf_free(mq, req->req_data.buf); } req->state = MQ_STATE_MATCHED; req->req_data.buf = buf; req->req_data.buf_len = len; req->rts_callback(req, 0); break; default: fprintf(stderr, "Unexpected state %d in req %p\n", req->state, req); fprintf(stderr, "type=%d, mq=%p, tag=%08x.%08x.%08x\n", req->type, req->mq, req->req_data.tag.tag[0], req->req_data.tag.tag[1], req->req_data.tag.tag[2]); abort(); } PSM2_LOG_MSG("leaving"); return PSM2_OK; } psm2_error_t __psm2_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel, uint32_t flags, void *buf, uint32_t len, void *context, enum psm2_mq_fp_op fp_type, psm2_mq_req_t *req) { psm2_error_t err = PSM2_OK; PSM2_LOG_MSG("entering"); PSMI_ASSERT_INITIALIZED(); PSMI_LOCK_ASSERT(mq->progress_lock); if (fp_type == PSM2_MQ_ISEND_FP) { psmi_assert(tag != NULL); err = addr->ptlctl->mq_isend(mq, addr, flags, PSMI_REQ_FLAG_FASTPATH, tag, buf, len, context, req); psmi_assert(*req != NULL); psmi_assert_req_not_internal(*req); (*req)->req_data.peer = addr; } else if (fp_type == PSM2_MQ_IRECV_FP) { psm2_mq_req_t recv_req; #ifdef PSM_CUDA int gpu_mem = 0; void *gpu_user_buffer = NULL; if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(buf)) { psmi_cuda_set_attr_sync_memops(buf); gpu_mem = 1; gpu_user_buffer = buf; } #endif /* First check unexpected Queue and remove req if found */ recv_req = mq_req_match_with_tagsel(mq, addr, tag, tagsel, REMOVE_ENTRY); if (recv_req == NULL) { /* prepost before arrival, add to expected q */ recv_req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); if_pf(recv_req == NULL) { err = PSM2_NO_MEMORY; goto recv_ret; } recv_req->req_data.peer = addr; recv_req->req_data.tag = *tag; recv_req->req_data.tagsel = *tagsel; recv_req->state = MQ_STATE_POSTED; recv_req->req_data.buf = buf; recv_req->req_data.buf_len = len; recv_req->req_data.recv_msglen = len; recv_req->recv_msgoff = 0; recv_req->req_data.context = context; #ifdef PSM_CUDA recv_req->is_buf_gpu_mem = gpu_mem; recv_req->user_gpu_buffer = gpu_user_buffer; #endif mq_add_to_expected_hashes(mq, recv_req); _HFI_VDBG("buf=%p,len=%d,tag=%08x.%08x.%08x " " tagsel=%08x.%08x.%08x req=%p\n", buf, len, tag->tag[0], tag->tag[1], tag->tag[2], tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], recv_req); } else { _HFI_VDBG("unexpected buf=%p,len=%d,tag=%08x.%08x.%08x" " tagsel=%08x.%08x.%08x req=%p\n", buf, len, tag->tag[0], tag->tag[1], tag->tag[2], tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], recv_req); #ifdef PSM_CUDA recv_req->is_buf_gpu_mem = gpu_mem; recv_req->user_gpu_buffer = gpu_user_buffer; #endif recv_req->req_data.context = context; psm2_mq_irecv_inner(mq, recv_req, buf, len); } recv_ret: psmi_assert_req_not_internal(recv_req); *req = recv_req; } else { err = PSM2_PARAM_ERR; } PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_mq_fp_msg) psm2_error_t __psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel, uint32_t flags, void *buf, uint32_t len, void *context, psm2_mq_req_t *reqo) { psm2_error_t err = PSM2_OK; psm2_mq_req_t req; #ifdef PSM_CUDA int gpu_mem = 0; if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(buf)) { psmi_cuda_set_attr_sync_memops(buf); gpu_mem = 1; } #endif PSM2_LOG_MSG("entering"); PSMI_ASSERT_INITIALIZED(); PSMI_LOCK(mq->progress_lock); /* First check unexpected Queue and remove 
req if found */ req = mq_req_match_with_tagsel(mq, src, tag, tagsel, REMOVE_ENTRY); if (req == NULL) { /* prepost before arrival, add to expected q */ req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); if_pf(req == NULL) { err = PSM2_NO_MEMORY; goto ret; } req->req_data.peer = src; req->req_data.tag = *tag; req->req_data.tagsel = *tagsel; req->state = MQ_STATE_POSTED; req->req_data.buf = buf; req->req_data.buf_len = len; req->req_data.recv_msglen = len; req->recv_msgoff = 0; req->req_data.context = context; #ifdef PSM_CUDA req->is_buf_gpu_mem = gpu_mem; if (gpu_mem) req->user_gpu_buffer = buf; else req->user_gpu_buffer = NULL; #endif mq_add_to_expected_hashes(mq, req); _HFI_VDBG("buf=%p,len=%d,tag=%08x.%08x.%08x " " tagsel=%08x.%08x.%08x req=%p\n", buf, len, tag->tag[0], tag->tag[1], tag->tag[2], tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], req); } else { _HFI_VDBG("unexpected buf=%p,len=%d,tag=%08x.%08x.%08x" " tagsel=%08x.%08x.%08x req=%p\n", buf, len, tag->tag[0], tag->tag[1], tag->tag[2], tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], req); #ifdef PSM_CUDA req->is_buf_gpu_mem = gpu_mem; if (gpu_mem) req->user_gpu_buffer = buf; else req->user_gpu_buffer = NULL; #endif req->req_data.context = context; psm2_mq_irecv_inner(mq, req, buf, len); } ret: PSMI_UNLOCK(mq->progress_lock); psmi_assert_req_not_internal(req); *reqo = req; PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_mq_irecv2) psm2_error_t __psm2_mq_irecv(psm2_mq_t mq, uint64_t tag, uint64_t tagsel, uint32_t flags, void *buf, uint32_t len, void *context, psm2_mq_req_t *reqo) { psm2_error_t rv; psm2_mq_tag_t rtag; psm2_mq_tag_t rtagsel; *reqo = NULL; PSM2_LOG_MSG("entering tag: 0x%" PRIx64, tag); *(uint64_t *) rtag.tag = tag; #ifdef PSM_DEBUG rtag.tag[2] = 0; #endif *(uint64_t *) rtagsel.tag = tagsel; rtagsel.tag[2] = 0; rv = __psm2_mq_irecv2(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel, flags, buf, len, context, reqo); psmi_assert_req_not_internal(*reqo); PSM2_LOG_MSG("leaving"); return rv; } PSMI_API_DECL(psm2_mq_irecv) psm2_error_t __psm2_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len, void *context, psm2_mq_req_t *reqo) { psm2_error_t err = PSM2_OK; psm2_mq_req_t req = *reqo; PSM2_LOG_MSG("entering"); PSMI_ASSERT_INITIALIZED(); if (req == PSM2_MQ_REQINVALID) { err = psmi_handle_error(mq->ep, PSM2_PARAM_ERR, "Invalid request (req=%p)", req); } else { /* Message is already matched -- begin delivering message data to the user's buffer. */ req->req_data.context = context; #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(buf)) { psmi_cuda_set_attr_sync_memops(buf); req->is_buf_gpu_mem = 1; } else { req->is_buf_gpu_mem = 0; } #endif PSMI_LOCK(mq->progress_lock); psm2_mq_irecv_inner(mq, req, buf, len); PSMI_UNLOCK(mq->progress_lock); } PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_mq_imrecv) /* The status argument can be an instance of either type psm2_mq_status_t or * psm2_mq_status2_t. Depending on the type, a corresponding status copy * routine should be passed in. 
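 *
 * A completion-drain sketch using the peek-then-test idiom (variable
 * names are illustrative; passing NULL for the peek status is fine,
 * since psm2_mq_test2() fills the status in when it completes and frees
 * the request):
 *
 *   psm2_mq_req_t req;
 *   psm2_mq_status2_t status;
 *   while (psm2_mq_ipeek2(mq, &req, NULL) == PSM2_OK)
 *           psm2_mq_test2(&req, &status);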
*/ PSMI_ALWAYS_INLINE( psm2_error_t psmi_mq_ipeek_inner(psm2_mq_t mq, psm2_mq_req_t *oreq, void *status, psmi_mq_status_copy_t status_copy)) { psm2_mq_req_t req; PSMI_ASSERT_INITIALIZED(); if ((req = mq->completed_q.first) == NULL) { PSMI_LOCK(mq->progress_lock); psmi_poll_internal(mq->ep, 1); if ((req = mq->completed_q.first) == NULL) { PSMI_UNLOCK(mq->progress_lock); return PSM2_MQ_NO_COMPLETIONS; } PSMI_UNLOCK(mq->progress_lock); } /* something in the queue */ *oreq = req; if (status != NULL) status_copy(req, status); return PSM2_OK; } psm2_error_t __psm2_mq_ipeek2(psm2_mq_t mq, psm2_mq_req_t *oreq, psm2_mq_status2_t *status) { psm2_error_t rv; *oreq = NULL; PSM2_LOG_MSG("entering"); rv = psmi_mq_ipeek_inner(mq, oreq, status, (psmi_mq_status_copy_t) mq_status2_copy); psmi_assert_req_not_internal(*oreq); PSM2_LOG_MSG("leaving"); return rv; } PSMI_API_DECL(psm2_mq_ipeek2) psm2_error_t __psm2_mq_ipeek(psm2_mq_t mq, psm2_mq_req_t *oreq, psm2_mq_status_t *status) { psm2_error_t rv; *oreq = NULL; PSM2_LOG_MSG("entering"); rv = psmi_mq_ipeek_inner(mq, oreq, status, (psmi_mq_status_copy_t) mq_status_copy); psmi_assert_req_not_internal(*oreq); PSM2_LOG_MSG("leaving"); return rv; } PSMI_API_DECL(psm2_mq_ipeek) psm2_error_t __psm2_mq_ipeek_dequeue_multi(psm2_mq_t mq, void *status_array, psmi_mq_status_copy_user_t status_copy, int *count) { psm2_mq_req_t req; int read_count = *count; int ret = 0; PSMI_ASSERT_INITIALIZED(); *count = 0; while (*count < read_count) { PSMI_LOCK(mq->progress_lock); if (mq->completed_q.first == NULL) psmi_poll_internal(mq->ep, 1); if ((req = mq->completed_q.first) == NULL) { PSMI_UNLOCK(mq->progress_lock); return PSM2_MQ_NO_COMPLETIONS; } mq_qq_remove(&mq->completed_q, req); PSMI_UNLOCK(mq->progress_lock); ret = status_copy(&req->req_data, status_array, *count); psm2_mq_req_free(mq, req); if (unlikely(ret < 0)) { *count = ret; return PSM2_INTERNAL_ERR; } else if (ret == 0) { continue; } *count = *count + 1; if (ret > 1) break; } return PSM2_OK; } PSMI_API_DECL(psm2_mq_ipeek_dequeue_multi) psm2_error_t __psm2_mq_ipeek_dequeue(psm2_mq_t mq, psm2_mq_req_t *oreq) { psm2_mq_req_t req; PSMI_ASSERT_INITIALIZED(); PSMI_LOCK(mq->progress_lock); if (mq->completed_q.first == NULL) psmi_poll_internal(mq->ep, 1); if ((req = mq->completed_q.first) == NULL) { PSMI_UNLOCK(mq->progress_lock); return PSM2_MQ_NO_COMPLETIONS; } mq_qq_remove(&mq->completed_q, req); PSMI_UNLOCK(mq->progress_lock); *oreq = req; return PSM2_OK; } PSMI_API_DECL(psm2_mq_ipeek_dequeue) psm2_error_t __psm2_mq_req_free(psm2_mq_t mq, psm2_mq_req_t req) { PSMI_ASSERT_INITIALIZED(); if (req == NULL) return PSM2_OK; PSMI_LOCK(mq->progress_lock); psmi_mq_req_free(req); PSMI_UNLOCK(mq->progress_lock); return PSM2_OK; } PSMI_API_DECL(psm2_mq_req_free) static psm2_error_t psmi_mqopt_ctl(psm2_mq_t mq, uint32_t key, void *value, int get) { psm2_error_t err = PSM2_OK; uint32_t val32; switch (key) { case PSM2_MQ_RNDV_HFI_SZ: if (get) *((uint32_t *) value) = mq->hfi_thresh_rv; else { val32 = *((uint32_t *) value); mq->hfi_thresh_rv = val32; } _HFI_VDBG("RNDV_HFI_SZ = %d (%s)\n", mq->hfi_thresh_rv, get ? "GET" : "SET"); break; case PSM2_MQ_RNDV_SHM_SZ: if (get) *((uint32_t *) value) = mq->shm_thresh_rv; else { val32 = *((uint32_t *) value); mq->shm_thresh_rv = val32; } _HFI_VDBG("RNDV_SHM_SZ = %d (%s)\n", mq->shm_thresh_rv, get ? "GET" : "SET"); break; case PSM2_MQ_MAX_SYSBUF_MBYTES: /* Deprecated: this option no longer does anything. 
*/ break; default: err = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown option key=%u", key); break; } return err; } psm2_error_t __psm2_mq_getopt(psm2_mq_t mq, int key, void *value) { psm2_error_t rv; PSM2_LOG_MSG("entering"); PSMI_ERR_UNLESS_INITIALIZED(mq->ep); rv = psmi_mqopt_ctl(mq, key, value, 1); PSM2_LOG_MSG("leaving"); return rv; } PSMI_API_DECL(psm2_mq_getopt) psm2_error_t __psm2_mq_setopt(psm2_mq_t mq, int key, const void *value) { psm2_error_t rv; PSM2_LOG_MSG("entering"); PSMI_ERR_UNLESS_INITIALIZED(mq->ep); rv = psmi_mqopt_ctl(mq, key, (void *)value, 0); PSM2_LOG_MSG("leaving"); return rv; } PSMI_API_DECL(psm2_mq_setopt) #define TAB_SIZE 16 #define STATS \ STAT(rx_user_num) \ STAT(rx_sys_bytes) \ STAT(rx_sys_num) \ STAT(tx_num) \ STAT(tx_eager_num) \ STAT(tx_eager_bytes) \ STAT(tx_rndv_num) \ STAT(tx_rndv_bytes) \ STAT(tx_shm_num) \ STAT(rx_shm_num) \ STAT(rx_sysbuf_num) \ STAT(rx_sysbuf_bytes) static void psmi_mq_print_stats(psm2_mq_t mq, FILE *perf_stats_fd) { psm2_mq_stats_t stats; char msg_buffer[MSG_BUFFER_LEN]; psm2_mq_get_stats(mq, &stats); #define STAT(x) \ snprintf(msg_buffer, MSG_BUFFER_LEN, "%*lu",TAB_SIZE, stats.x); \ fwrite(msg_buffer, sizeof(char), strlen(msg_buffer), perf_stats_fd); STATS #undef STAT fwrite("\n", sizeof(char), 1, perf_stats_fd); } static void *psmi_mq_print_stats_thread(void *_mq) { psm2_mq_t mq = (psm2_mq_t)_mq; char perf_file_name[MSG_BUFFER_LEN]; char msg_buffer[MSG_BUFFER_LEN]; int delta_t = 0; snprintf(perf_file_name, MSG_BUFFER_LEN, "./psm2-perf-stat-ep-%" PRIu64 "-pid-%d", (uint64_t)(mq->ep->epid), getpid()); FILE *perf_stats_fd = fopen(perf_file_name, "w+"); if (!perf_stats_fd) { _HFI_ERROR("Failed to create fd for performance logging\n"); goto end; } #define STAT(x) \ snprintf(msg_buffer, MSG_BUFFER_LEN, "%*s",TAB_SIZE, #x);\ fwrite(msg_buffer, sizeof(char), strlen(msg_buffer), perf_stats_fd); STAT(delta_t) STATS #undef STAT fwrite("\n", sizeof(char), 1, perf_stats_fd); /* Performance stats will be printed every $PSM2_MQ_PRINT_STATS seconds */ do { snprintf(msg_buffer, MSG_BUFFER_LEN, "%*d",TAB_SIZE, delta_t); fwrite(msg_buffer, sizeof(char), strlen(msg_buffer), perf_stats_fd); psmi_mq_print_stats(mq, perf_stats_fd); fflush(perf_stats_fd); usleep(MICRO_SEC * mq->print_stats); delta_t += mq->print_stats; } while (mq->mq_perf_data.perf_print_stats); fclose(perf_stats_fd); end: pthread_exit(NULL); } static void psmi_mq_print_stats_init(psm2_mq_t mq) { mq->mq_perf_data.perf_print_stats = 1; if (pthread_create(&(mq->mq_perf_data.perf_print_thread), NULL, psmi_mq_print_stats_thread, (void*)mq)) { mq->mq_perf_data.perf_print_stats = 0; _HFI_ERROR("Failed to create logging thread\n"); } } static void psmi_mq_print_stats_finalize(psm2_mq_t mq) { if (mq->mq_perf_data.perf_print_stats) { mq->mq_perf_data.perf_print_stats = 0; pthread_join(mq->mq_perf_data.perf_print_thread, NULL); } } /* * This is the API for the user. 
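 * A typical call from an application, sketched with assumed context (ep
 * comes from a prior psm2_ep_open(); PSM2_MQ_ORDERMASK_ALL is the
 * conventional value for the second argument, which this implementation
 * ignores):
 *
 *   psm2_mq_t mq;
 *   psm2_mq_init(ep, PSM2_MQ_ORDERMASK_ALL, NULL, 0, &mq);
 *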
We actually allocate the MQ much earlier, but * the user can set options after obtaining an endpoint */ psm2_error_t __psm2_mq_init(psm2_ep_t ep, uint64_t ignored, const struct psm2_optkey *opts, int numopts, psm2_mq_t *mqo) { psm2_error_t err = PSM2_OK; if (ep == NULL) { err = PSM2_PARAM_ERR; goto fail; } psm2_mq_t mq = ep->mq; int i; PSM2_LOG_MSG("entering"); PSMI_ERR_UNLESS_INITIALIZED(ep); psmi_assert_always(mq != NULL); psmi_assert_always(mq->ep != NULL); /* Process options */ for (i = 0; err == PSM2_OK && i < numopts; i++) err = psmi_mqopt_ctl(mq, opts[i].key, opts[i].value, 0); if (err != PSM2_OK) /* error already handled */ goto fail; /* Initialize the unexpected system buffer allocator */ psmi_mq_sysbuf_init(mq); char buf[128]; psmi_mq_sysbuf_getinfo(mq, buf, sizeof buf); _HFI_VDBG("%s", buf); *mqo = mq; if (mq->print_stats > 0) psmi_mq_print_stats_init(mq); fail: PSM2_LOG_MSG("leaving"); return err; } PSMI_API_DECL(psm2_mq_init) psm2_error_t __psm2_mq_finalize(psm2_mq_t mq) { psm2_error_t rv = PSM2_OK; PSM2_LOG_MSG("entering"); PSMI_ERR_UNLESS_INITIALIZED(mq->ep); if (mq->print_stats == -1) { mq->print_stats = 1; psmi_mq_print_stats_init(mq); } if (mq->print_stats != 0) psmi_mq_print_stats_finalize(mq); PSM2_LOG_MSG("leaving"); return rv; } PSMI_API_DECL(psm2_mq_finalize) void __psm2_mq_get_stats(psm2_mq_t mq, psm2_mq_stats_t *stats) { PSM2_LOG_MSG("entering"); memcpy(stats, &mq->stats, sizeof(psm2_mq_stats_t)); PSM2_LOG_MSG("leaving"); } PSMI_API_DECL(psm2_mq_get_stats) psm2_error_t psmi_mq_malloc(psm2_mq_t *mqo) { psm2_error_t err = PSM2_OK; psm2_mq_t mq = (psm2_mq_t) psmi_calloc(NULL, UNDEFINED, 1, sizeof(struct psm2_mq)); if (mq == NULL) { err = psmi_handle_error(NULL, PSM2_NO_MEMORY, "Couldn't allocate memory for mq endpoint"); goto fail; } mq->ep = NULL; /*mq->unexpected_callback = NULL; */ mq->memmode = psmi_parse_memmode(); memset(mq->unexpected_htab, 0, NUM_HASH_CONFIGS * NUM_HASH_BUCKETS * sizeof(struct mqq)); memset(mq->expected_htab, 0, NUM_HASH_CONFIGS * NUM_HASH_BUCKETS * sizeof(struct mqq)); memset(&mq->expected_q, 0, sizeof(struct mqq)); memset(&mq->unexpected_q, 0, sizeof(struct mqq)); memset(&mq->completed_q, 0, sizeof(struct mqq)); memset(&mq->outoforder_q, 0, sizeof(struct mqq)); STAILQ_INIT(&mq->eager_q); /* The values are overwritten in initialize_defaults, they're just set to * sensible defaults until then */ if(psmi_cpu_model == CPUID_MODEL_PHI_GEN2 || psmi_cpu_model == CPUID_MODEL_PHI_GEN2M) { mq->hfi_thresh_rv = MQ_HFI_THRESH_RNDV_PHI2; mq->hfi_base_window_rv = MQ_HFI_WINDOW_RNDV_PHI2; } else { mq->hfi_thresh_rv = MQ_HFI_THRESH_RNDV_XEON; mq->hfi_base_window_rv = MQ_HFI_WINDOW_RNDV_XEON; } mq->hfi_thresh_tiny = MQ_HFI_THRESH_TINY; #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED) mq->hfi_base_window_rv = MQ_HFI_WINDOW_RNDV_CUDA; #endif mq->shm_thresh_rv = MQ_SHM_THRESH_RNDV; memset(&mq->stats, 0, sizeof(psm2_mq_stats_t)); err = psmi_mq_req_init(mq); if (err) goto fail; *mqo = mq; return PSM2_OK; fail: if (mq != NULL) psmi_free(mq); return err; } psm2_error_t psmi_mq_initialize_defaults(psm2_mq_t mq) { union psmi_envvar_val env_hfitiny, env_rvwin, env_hfirv, env_shmrv, env_stats; psmi_getenv("PSM2_MQ_TINY_HFI_THRESH", "hfi tiny packet switchover (max 8, default 8)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)mq->hfi_thresh_tiny, &env_hfitiny); mq->hfi_thresh_tiny = min(env_hfitiny.e_uint, 8); psmi_getenv("PSM2_MQ_RNDV_HFI_THRESH", "hfi eager-to-rendezvous switchover", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union 
psmi_envvar_val)mq->hfi_thresh_rv, &env_hfirv); mq->hfi_thresh_rv = env_hfirv.e_uint; psmi_getenv("PSM2_MQ_RNDV_HFI_WINDOW", "hfi rendezvous window size, max 4M", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)mq->hfi_base_window_rv, &env_rvwin); mq->hfi_base_window_rv = min(4 * 1024 * 1024, env_rvwin.e_uint); /* Re-evaluate this since it may have changed after initializing the shm * device */ mq->shm_thresh_rv = psmi_shm_mq_rv_thresh; psmi_getenv("PSM2_MQ_RNDV_SHM_THRESH", "shm eager-to-rendezvous switchover", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)mq->shm_thresh_rv, &env_shmrv); mq->shm_thresh_rv = env_shmrv.e_uint; psmi_getenv("PSM2_MQ_PRINT_STATS", "Prints MQ performance stats every n seconds to file" "./psm2-perf-stat-ep-[epid]-[pid] when set to -1 stats are " "printed only once during finalization", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val) 0, &env_stats); mq->print_stats = env_stats.e_uint; mq->nohash_fastpath = 1; return PSM2_OK; } psm2_error_t MOCKABLE(psmi_mq_free)(psm2_mq_t mq) { psmi_mq_req_fini(mq); psmi_mq_sysbuf_fini(mq); psmi_free(mq); return PSM2_OK; } MOCK_DEF_EPILOGUE(psmi_mq_free); opa-psm2-PSM2_11.2.185/psm_mq_internal.h000066400000000000000000000421321370564314600175640ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
 */ #ifndef MQ_INT_H #define MQ_INT_H /* Ugh. smmintrin.h eventually includes mm_malloc.h, which calls malloc */ #ifdef malloc #undef malloc #endif #ifdef free #undef free #endif #include <smmintrin.h> #include "psm_user.h" #include "psm_sysbuf.h" #include "psm2_mock_testing.h" #if 0 typedef psm2_error_t(*psm_mq_unexpected_callback_fn_t) (psm2_mq_t mq, uint16_t mode, psm2_epaddr_t epaddr, uint64_t tag, uint32_t send_msglen, const void *payload, uint32_t paylen); #endif #define MICRO_SEC 1000000 #define MSG_BUFFER_LEN 100 struct psm2_mq_perf_data { pthread_t perf_print_thread; int perf_print_stats; }; enum psm2_mq_tag_pattern { PSM2_TAG_SRC = 0, PSM2_TAG_ANYSRC, PSM2_ANYTAG_SRC, PSM2_ANYTAG_ANYSRC, }; struct psm2_mq { psm2_ep_t ep; /**> ep back pointer */ mpool_t sreq_pool; mpool_t rreq_pool; struct mqq unexpected_htab[NUM_HASH_CONFIGS][NUM_HASH_BUCKETS]; struct mqq expected_htab[NUM_HASH_CONFIGS][NUM_HASH_BUCKETS]; /* in case the compiler can't figure out how to preserve the hashed values between mq_req_match() and mq_add_to_unexpected_hashes() ... */ unsigned hashvals[NUM_HASH_CONFIGS]; /*psm_mq_unexpected_callback_fn_t unexpected_callback; */ struct mqq expected_q; /**> Preposted (expected) queue */ struct mqq unexpected_q; /**> Unexpected queue */ struct mqq completed_q; /**> Completed queue */ struct mqq outoforder_q; /**> OutofOrder queue */ STAILQ_HEAD(, psm2_mq_req) eager_q; /**> eager request queue */ uint32_t hfi_thresh_tiny; uint32_t hfi_thresh_rv; uint32_t shm_thresh_rv; uint32_t hfi_base_window_rv; /**> this is a base rndv window size, will be further trimmed down per-connection based on the peer's MTU */ int memmode; uint64_t timestamp; psm2_mq_stats_t stats; /**> MQ stats, accumulated by each PTL */ int print_stats; struct psm2_mq_perf_data mq_perf_data; int nohash_fastpath; unsigned unexpected_hash_len; unsigned unexpected_list_len; unsigned expected_hash_len; unsigned expected_list_len; psmi_mem_ctrl_t handler_index[MM_NUM_OF_POOLS]; int mem_ctrl_is_init; uint64_t mem_ctrl_total_bytes; psmi_lock_t progress_lock; }; #define MQE_TYPE_IS_SEND(type) ((type) & MQE_TYPE_SEND) #define MQE_TYPE_IS_RECV(type) ((type) & MQE_TYPE_RECV) #define MQE_TYPE_SEND 0x1000 #define MQE_TYPE_RECV 0x2000 #define MQE_TYPE_FLAGMASK 0x0fff #define MQE_TYPE_WAITING 0x0001 #define MQE_TYPE_WAITING_PEER 0x0004 #define MQE_TYPE_EAGER_QUEUE 0x0008 #define MQ_STATE_COMPLETE 0 #define MQ_STATE_POSTED 1 #define MQ_STATE_MATCHED 2 #define MQ_STATE_UNEXP 3 #define MQ_STATE_UNEXP_RV 4 #define MQ_STATE_FREE 5 /* * These must match the ips protocol message opcode. */ #define MQ_MSG_TINY 0xc1 #define MQ_MSG_SHORT 0xc2 #define MQ_MSG_EAGER 0xc3 #define MQ_MSG_LONGRTS 0xc4 /* * Descriptor allocation limits.
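 *
 * (A note on the .mode initializers in the LIMITS macros below: each
 * pair reads as { objects per chunk, maximum objects }, an
 * interpretation based on how psmi_mpool_create() consumes chunk/max
 * pairs; e.g. in PSMI_MEMMODE_NORMAL the send-request pool would grow
 * 1024 descriptors at a time up to 1048576.)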
* The 'LIMITS' predefines fill in a psmi_rlimits_mpool structure */ #define MQ_SENDREQ_LIMITS { \ .env = "PSM2_MQ_SENDREQS_MAX", \ .descr = "Max num of isend requests in flight", \ .env_level = PSMI_ENVVAR_LEVEL_USER, \ .minval = 1, \ .maxval = ~0, \ .mode[PSMI_MEMMODE_NORMAL] = { 1024, 1048576 }, \ .mode[PSMI_MEMMODE_MINIMAL] = { 1024, 65536 }, \ .mode[PSMI_MEMMODE_LARGE] = { 8192, 16777216 } \ } #define MQ_RECVREQ_LIMITS { \ .env = "PSM2_MQ_RECVREQS_MAX", \ .descr = "Max num of irecv requests in flight", \ .env_level = PSMI_ENVVAR_LEVEL_USER, \ .minval = 1, \ .maxval = ~0, \ .mode[PSMI_MEMMODE_NORMAL] = { 1024, 1048576 }, \ .mode[PSMI_MEMMODE_MINIMAL] = { 1024, 65536 }, \ .mode[PSMI_MEMMODE_LARGE] = { 8192, 16777216 } \ } typedef psm2_error_t(*mq_rts_callback_fn_t) (psm2_mq_req_t req, int was_posted); typedef psm2_error_t(*mq_testwait_callback_fn_t) (psm2_mq_req_t *req); /* If request is marked as internal, then it will not be exposed to the user, will not be added to the mq->completed_q. This flag is set if request is used by e.g. MPI_SEND */ #define PSMI_REQ_FLAG_IS_INTERNAL (1 << 0) /* Identifies req as part of fast path. */ #define PSMI_REQ_FLAG_FASTPATH (1 << 1) /* Identifies req as a NORMAL operation with no special cases.*/ #define PSMI_REQ_FLAG_NORMAL 0 #define psmi_is_req_internal(req) ((req)->flags_internal & PSMI_REQ_FLAG_IS_INTERNAL) #define psmi_assert_req_not_internal(req) psmi_assert(((req) == PSM2_MQ_REQINVALID) || \ (!psmi_is_req_internal(req))) /* receive mq_req, the default */ struct psm2_mq_req { struct psm2_mq_req_user req_data; struct { psm2_mq_req_t next[NUM_MQ_SUBLISTS]; psm2_mq_req_t prev[NUM_MQ_SUBLISTS]; STAILQ_ENTRY(psm2_mq_req) nextq; /* used for eager only */ }; struct mqq *q[NUM_MQ_SUBLISTS]; uint64_t timestamp; uint32_t state; uint32_t type; psm2_mq_t mq; /* Some PTLs want to get notified when there's a test/wait event */ mq_testwait_callback_fn_t testwait_callback; uint16_t msg_seqnum; /* msg seq num for mctxt */ uint32_t recv_msgoff; /* Message offset into req_data.buf */ union { uint32_t send_msgoff; /* Bytes received so far.. can be larger than buf_len */ uint32_t recv_msgposted; }; uint32_t rts_reqidx_peer; uint32_t flags_user; uint32_t flags_internal; /* Used to keep track of unexpected rendezvous */ mq_rts_callback_fn_t rts_callback; psm2_epaddr_t rts_peer; uintptr_t rts_sbuf; #ifdef PSM_CUDA uint8_t* user_gpu_buffer; STAILQ_HEAD(sendreq_spec_, ips_cuda_hostbuf) sendreq_prefetch; uint32_t prefetch_send_msgoff; int cuda_hostbuf_used; CUipcMemHandle cuda_ipc_handle; CUevent cuda_ipc_event; uint8_t cuda_ipc_handle_attached; uint32_t cuda_ipc_offset; /* * is_sendbuf_gpu_mem - Used to always select TID path on the receiver * when send is on a device buffer */ uint8_t is_sendbuf_gpu_mem; #endif /* * is_buf_gpu_mem - used to indicate if the send or receive is issued * on a device/host buffer. */ uint8_t is_buf_gpu_mem; /* PTLs get to store their own per-request data. MQ manages the allocation * by allocating psm2_mq_req so that ptl_req_data has enough space for all * possible PTLs. 
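*
* A sketch of how the union that follows can be used by a PTL (the
* struct and names here are hypothetical, for illustration only):
*
*   struct my_ptl_req {              // per-PTL bookkeeping
*       uint32_t frag_off;
*       uint16_t retries;
*   };
*
*   // overlay per-request state on the inline trailing storage:
*   struct my_ptl_req *pr = (struct my_ptl_req *)req->ptl_req_data;
*   pr->frag_off = 0;
*
*   // ...or keep just a pointer when the state lives elsewhere:
*   req->ptl_req_ptr = some_state_ptr;
*
* Per the comment above, MQ sizes the request allocation so that the
* zero-length ptl_req_data[] area is large enough for any PTL.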
*/ union { void *ptl_req_ptr; /* when used by ptl as pointer */ uint8_t ptl_req_data[0]; /* when used by ptl for "inline" data */ }; }; PSMI_ALWAYS_INLINE( unsigned hash_64(uint64_t a)) { return _mm_crc32_u64(0, a); } PSMI_ALWAYS_INLINE( unsigned hash_32(uint32_t a)) { return _mm_crc32_u32(0, a); } void MOCKABLE(psmi_mq_mtucpy)(void *vdest, const void *vsrc, uint32_t nchars); MOCK_DCL_EPILOGUE(psmi_mq_mtucpy); void psmi_mq_mtucpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars); #if defined(__x86_64__) void psmi_mq_mtucpy_safe(void *vdest, const void *vsrc, uint32_t nchars); #else #define psmi_mq_mtucpy_safe psmi_mq_mtucpy #endif /* * Optimize for 0-8 byte case, but also handle others. */ PSMI_ALWAYS_INLINE( void mq_copy_tiny(uint32_t *dest, uint32_t *src, uint8_t len)) { #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(dest) || PSMI_IS_CUDA_MEM(src))) { PSMI_CUDA_CALL(cuMemcpy, (CUdeviceptr)dest, (CUdeviceptr)src, len); return; } #endif switch (len) { case 8: *dest++ = *src++; case 4: *dest++ = *src++; case 0: return; case 7: case 6: case 5: *dest++ = *src++; len -= 4; case 3: case 2: case 1: break; default: /* greater than 8 */ psmi_mq_mtucpy(dest, src, len); return; } uint8_t *dest1 = (uint8_t *) dest; uint8_t *src1 = (uint8_t *) src; switch (len) { case 3: *dest1++ = *src1++; case 2: *dest1++ = *src1++; case 1: *dest1++ = *src1++; } } typedef void (*psmi_mtucpy_fn_t)(void *dest, const void *src, uint32_t len); #ifdef PSM_CUDA PSMI_ALWAYS_INLINE( void mq_copy_tiny_host_mem(uint32_t *dest, uint32_t *src, uint8_t len)) { switch (len) { case 8: *dest++ = *src++; case 4: *dest++ = *src++; case 0: return; case 7: case 6: case 5: *dest++ = *src++; len -= 4; case 3: case 2: case 1: break; default: /* greater than 8 */ psmi_mq_mtucpy(dest, src, len); return; } uint8_t *dest1 = (uint8_t *) dest; uint8_t *src1 = (uint8_t *) src; switch (len) { case 3: *dest1++ = *src1++; case 2: *dest1++ = *src1++; case 1: *dest1++ = *src1++; } } #endif /* Typedef describing a function to populate a psm2_mq_status(2)_t given a * matched request. The purpose of this typedef is to avoid duplicating * code to handle both PSM v1 and v2 status objects. Outer routines pass in * either mq_status_copy or mq_status2_copy and the inner routine calls that * provided routine to fill in the correct status type. */ typedef void (*psmi_mq_status_copy_t) (psm2_mq_req_t req, void *status); /* * Given an req with buffer ubuf of length ubuf_len, * fill in the req's status and return the amount of bytes the request * can receive. * * The function sets status truncation errors. Basically what MPI_Status does. 
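*
* Worked example of the truncation rule (implemented by
* mq_set_msglen() below): with a posted buffer of recvlen = 64 bytes
* and an incoming sendlen = 100 bytes,
*
*   req->req_data.send_msglen = 100;  // full size the sender sent
*   req->req_data.recv_msglen = 64;   // bytes actually deliverable
*   req->req_data.error_code  = PSM2_MQ_TRUNCATION;
*   // mq_set_msglen() returns 64
*
* whereas for sendlen <= recvlen both lengths are set to sendlen and
* the error code is PSM2_OK.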
*/ PSMI_ALWAYS_INLINE( void mq_status_copy(psm2_mq_req_t req, psm2_mq_status_t *status)) { status->msg_tag = *((uint64_t *) req->req_data.tag.tag); status->msg_length = req->req_data.send_msglen; status->nbytes = req->req_data.recv_msglen; status->error_code = (psm2_error_t)req->req_data.error_code; status->context = req->req_data.context; } PSMI_ALWAYS_INLINE( void mq_status2_copy(psm2_mq_req_t req, psm2_mq_status2_t *status)) { status->msg_peer = req->req_data.peer; status->msg_tag = req->req_data.tag; status->msg_length = req->req_data.send_msglen; status->nbytes = req->req_data.recv_msglen; status->error_code = (psm2_error_t)req->req_data.error_code; status->context = req->req_data.context; } PSMI_ALWAYS_INLINE( uint32_t mq_set_msglen(psm2_mq_req_t req, uint32_t recvlen, uint32_t sendlen)) { req->req_data.send_msglen = sendlen; if (recvlen < sendlen) { req->req_data.recv_msglen = recvlen; req->req_data.error_code = PSM2_MQ_TRUNCATION; return recvlen; } else { req->req_data.recv_msglen = sendlen; req->req_data.error_code = PSM2_OK; return sendlen; } } PSMI_ALWAYS_INLINE( int min_timestamp_4(psm2_mq_req_t *match)) { uint64_t oldest = -1; int which = -1, i; for (i = 0; i < 4; i++) { if (match[i] && (match[i]->timestamp < oldest)) { oldest = match[i]->timestamp; which = i; } } return which; } #ifndef PSM_DEBUG /*! Append to Queue */ PSMI_ALWAYS_INLINE(void mq_qq_append(struct mqq *q, psm2_mq_req_t req)) { req->next[PSM2_ANYTAG_ANYSRC] = NULL; req->prev[PSM2_ANYTAG_ANYSRC] = q->last; if (q->last) q->last->next[PSM2_ANYTAG_ANYSRC] = req; else q->first = req; q->last = req; req->q[PSM2_ANYTAG_ANYSRC] = q; } #else #define mq_qq_append(qq, req) \ do { \ psmi_assert_req_not_internal(req); \ (req)->next[PSM2_ANYTAG_ANYSRC] = NULL; \ (req)->prev[PSM2_ANYTAG_ANYSRC] = (qq)->last; \ if ((qq)->last) \ (qq)->last->next[PSM2_ANYTAG_ANYSRC] = (req); \ else \ (qq)->first = (req); \ (qq)->last = (req); \ (req)->q[PSM2_ANYTAG_ANYSRC] = (qq); \ if (qq == &(req)->mq->completed_q) \ _HFI_VDBG("Moving (req)=%p to completed queue on %s, %d\n", \ (req), __FILE__, __LINE__); \ } while (0) #endif PSMI_ALWAYS_INLINE( void mq_qq_append_which(struct mqq q[NUM_HASH_CONFIGS][NUM_HASH_BUCKETS], int table, int bucket, psm2_mq_req_t req)) { req->next[table] = NULL; req->prev[table] = q[table][bucket].last; if (q[table][bucket].last) q[table][bucket].last->next[table] = req; else q[table][bucket].first = req; q[table][bucket].last = req; req->q[table] = &q[table][bucket]; } PSMI_ALWAYS_INLINE(void mq_qq_remove(struct mqq *q, psm2_mq_req_t req)) { if (req->next[PSM2_ANYTAG_ANYSRC] != NULL) req->next[PSM2_ANYTAG_ANYSRC]->prev[PSM2_ANYTAG_ANYSRC] = req->prev[PSM2_ANYTAG_ANYSRC]; else q->last = req->prev[PSM2_ANYTAG_ANYSRC]; if (req->prev[PSM2_ANYTAG_ANYSRC]) req->prev[PSM2_ANYTAG_ANYSRC]->next[PSM2_ANYTAG_ANYSRC] = req->next[PSM2_ANYTAG_ANYSRC]; else q->first = req->next[PSM2_ANYTAG_ANYSRC]; } PSMI_ALWAYS_INLINE(void mq_qq_remove_which(psm2_mq_req_t req, int table)) { struct mqq *q = req->q[table]; req->q[table] = NULL; if (req->next[table] != NULL) req->next[table]->prev[table] = req->prev[table]; else q->last = req->prev[table]; if (req->prev[table]) req->prev[table]->next[table] = req->next[table]; else q->first = req->next[table]; } psm2_error_t psmi_mq_req_init(psm2_mq_t mq); psm2_error_t psmi_mq_req_fini(psm2_mq_t mq); psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type); MOCK_DCL_EPILOGUE(psmi_mq_req_alloc); #define psmi_mq_req_free(req) psmi_mpool_put(req) /* * Main receive progress engine, for 
shmops and hfi, in mq.c */ psm2_error_t psmi_mq_malloc(psm2_mq_t *mqo); psm2_error_t psmi_mq_initialize_defaults(psm2_mq_t mq); psm2_error_t MOCKABLE(psmi_mq_free)(psm2_mq_t mq); MOCK_DCL_EPILOGUE(psmi_mq_free); /* Three functions that handle all MQ stuff */ #define MQ_RET_MATCH_OK 0 #define MQ_RET_UNEXP_OK 1 #define MQ_RET_UNEXP_NO_RESOURCES 2 #define MQ_RET_DATA_OK 3 #define MQ_RET_DATA_OUT_OF_ORDER 4 void psmi_mq_handle_rts_complete(psm2_mq_req_t req); int psmi_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req, uint32_t offset, const void *payload, uint32_t paylen); int psmi_mq_handle_rts(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, uint32_t msglen, const void *payload, uint32_t paylen, int msgorder, mq_rts_callback_fn_t cb, psm2_mq_req_t *req_o); int psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, uint32_t msglen, uint32_t offset, const void *payload, uint32_t paylen, int msgorder, uint32_t opcode, psm2_mq_req_t *req_o); int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t req); void psmi_mq_stats_register(psm2_mq_t mq, mpspawn_stats_add_fn add_fn); void psmi_mq_fastpath_disable(psm2_mq_t mq); void psmi_mq_fastpath_try_reenable(psm2_mq_t mq); PSMI_ALWAYS_INLINE( psm2_mq_req_t mq_ooo_match(struct mqq *q, void *msgctl, uint16_t msg_seqnum)) { psm2_mq_req_t *curp; psm2_mq_req_t cur; for (curp = &q->first; (cur = *curp) != NULL; curp = &cur->next[PSM2_ANYTAG_ANYSRC]) { if (cur->ptl_req_ptr == msgctl && cur->msg_seqnum == msg_seqnum) { /* match! */ mq_qq_remove(q, cur); return cur; } } return NULL; /* no match */ } PSMI_ALWAYS_INLINE( psm2_mq_req_t mq_eager_match(psm2_mq_t mq, void *peer, uint16_t msg_seqnum)) { psm2_mq_req_t cur; cur = STAILQ_FIRST(&mq->eager_q); while (cur) { if (cur->ptl_req_ptr == peer && cur->msg_seqnum == msg_seqnum) return cur; cur = STAILQ_NEXT(cur, nextq); } return NULL; /* no match */ } #if 0 /* Not exposed in public psm, but may extend parts of PSM 2.1 to support * this feature before 2.3 */ psm_mq_unexpected_callback_fn_t psmi_mq_register_unexpected_callback(psm2_mq_t mq, psm_mq_unexpected_callback_fn_t fn); #endif PSMI_ALWAYS_INLINE(void psmi_mq_stats_rts_account(psm2_mq_req_t req)) { psm2_mq_t mq = req->mq; if (MQE_TYPE_IS_SEND(req->type)) { mq->stats.tx_num++; mq->stats.tx_rndv_num++; mq->stats.tx_rndv_bytes += req->req_data.send_msglen; } else { mq->stats.rx_user_num++; mq->stats.rx_user_bytes += req->req_data.recv_msglen; } return; } #endif opa-psm2-PSM2_11.2.185/psm_mq_recv.c000066400000000000000000000446251370564314600167130ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ #include "psm_user.h" #include "psm2_hal.h" #include "psm_mq_internal.h" #include "ptl_ips/ips_proto_header.h" #ifdef PSM_CUDA #include "psm_gdrcpy.h" #endif #if 0 /* Not exposed in public psm, but may extend parts of PSM 2.1 to support * this feature before 2.3 */ psm_mq_unexpected_callback_fn_t psmi_mq_register_unexpected_callback(psm2_mq_t mq, psm_mq_unexpected_callback_fn_t fn) { psm_mq_unexpected_callback_fn_t old_fn = mq->unexpected_callback; mq->unexpected_callback = fn; return old_fn; } #endif void psmi_mq_handle_rts_complete(psm2_mq_req_t req) { psm2_mq_t mq = req->mq; /* Stats on rendez-vous messages */ psmi_mq_stats_rts_account(req); req->state = MQ_STATE_COMPLETE; ips_barrier(); if(!psmi_is_req_internal(req)) mq_qq_append(&mq->completed_q, req); _HFI_VDBG("RTS complete, req=%p, recv_msglen = %d\n", req, req->req_data.recv_msglen); return; } static void psmi_mq_req_copy(psm2_mq_req_t req, uint32_t offset, const void *buf, uint32_t nbytes) { /* recv_msglen may be changed by unexpected receive req_data.buf. */ uint32_t msglen_this, end; uint8_t *msgptr = (uint8_t *) req->req_data.buf + offset; /* out of receiving range. */ if (offset >= req->req_data.recv_msglen) { req->send_msgoff += nbytes; return; } end = offset + nbytes; if (end > req->req_data.recv_msglen) { msglen_this = req->req_data.recv_msglen - offset; end = req->req_data.recv_msglen; } else { msglen_this = nbytes; } psmi_mq_mtucpy(msgptr, buf, msglen_this); if (req->recv_msgoff < end) { req->recv_msgoff = end; } req->send_msgoff += nbytes; return; } int psmi_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req, uint32_t offset, const void *buf, uint32_t nbytes) { psmi_assert(req != NULL); int rc; if (req->state == MQ_STATE_MATCHED) rc = MQ_RET_MATCH_OK; else { psmi_assert(req->state == MQ_STATE_UNEXP); rc = MQ_RET_UNEXP_OK; } psmi_mq_req_copy(req, offset, buf, nbytes); /* * the reason to use >= is because send_msgoff * may be DW pad included. 
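*
* Concrete illustration (values are only an example): for a
* send_msglen of 10 bytes the payload is carried as 3 dwords, i.e.
* 12 bytes on the wire, so after the last fragment is copied
*
*   req->send_msgoff          == 12   // includes the pad bytes
*   req->req_data.send_msglen == 10
*
* and only ">=" detects completion reliably; "==" would never fire.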
*/ if (req->send_msgoff >= req->req_data.send_msglen) { if (req->type & MQE_TYPE_EAGER_QUEUE) { STAILQ_REMOVE(&mq->eager_q, req, psm2_mq_req, nextq); } if (req->state == MQ_STATE_MATCHED) { req->state = MQ_STATE_COMPLETE; ips_barrier(); mq_qq_append(&mq->completed_q, req); } else { /* MQ_STATE_UNEXP */ req->state = MQ_STATE_COMPLETE; } } return rc; } static void mq_add_to_unexpected_hashes(psm2_mq_t mq, psm2_mq_req_t req) { int table; mq_qq_append(&mq->unexpected_q, req); req->q[PSM2_ANYTAG_ANYSRC] = &mq->unexpected_q; mq->unexpected_list_len++; if_pt (mq->nohash_fastpath) { if_pf (mq->unexpected_list_len >= HASH_THRESHOLD) psmi_mq_fastpath_disable(mq); return; } for (table = PSM2_TAG_SRC; table < PSM2_ANYTAG_ANYSRC; table++) mq_qq_append_which(mq->unexpected_htab, table, mq->hashvals[table], req); mq->unexpected_hash_len++; } psm2_mq_req_t mq_list_scan(struct mqq *q, psm2_epaddr_t src, psm2_mq_tag_t *tag, int which, uint64_t *time_threshold) { psm2_mq_req_t *curp, cur; for (curp = &q->first; ((cur = *curp) != NULL) && (cur->timestamp < *time_threshold); curp = &cur->next[which]) { if ((cur->req_data.peer == PSM2_MQ_ANY_ADDR || src == cur->req_data.peer) && !((tag->tag[0] ^ cur->req_data.tag.tag[0]) & cur->req_data.tagsel.tag[0]) && !((tag->tag[1] ^ cur->req_data.tag.tag[1]) & cur->req_data.tagsel.tag[1]) && !((tag->tag[2] ^ cur->req_data.tag.tag[2]) & cur->req_data.tagsel.tag[2])) { *time_threshold = cur->timestamp; return cur; } } return NULL; } psm2_mq_req_t mq_req_match(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, int remove) { psm2_mq_req_t match[4]; int table; uint64_t best_ts = -1; if (mq->nohash_fastpath) { table = PSM2_ANYTAG_ANYSRC; match[table] = mq_list_scan(&mq->expected_q, src, tag, PSM2_ANYTAG_ANYSRC, &best_ts); if (match[table] && remove) { mq->expected_list_len--; mq_qq_remove_which(match[table], table); } return match[table]; } mq->hashvals[PSM2_TAG_SRC] = hash_64(*(uint64_t *) tag->tag) % NUM_HASH_BUCKETS; mq->hashvals[PSM2_TAG_ANYSRC] = hash_32(tag->tag[0]) % NUM_HASH_BUCKETS; mq->hashvals[PSM2_ANYTAG_SRC] = hash_32(tag->tag[1]) % NUM_HASH_BUCKETS; for (table = PSM2_TAG_SRC; table < PSM2_ANYTAG_ANYSRC; table++) match[table] = mq_list_scan(&mq->expected_htab[table][mq->hashvals[table]], src, tag, table, &best_ts); table = PSM2_ANYTAG_ANYSRC; match[table] = mq_list_scan(&mq->expected_q, src, tag, table, &best_ts); table = min_timestamp_4(match); if (table == -1) return NULL; if (remove) { if_pt (table == PSM2_ANYTAG_ANYSRC) mq->expected_list_len--; else mq->expected_hash_len--; mq_qq_remove_which(match[table], table); psmi_mq_fastpath_try_reenable(mq); } return match[table]; } /* * This handles the rendezvous MPI envelopes, the packet might have the whole * message payload, or zero payload. 
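*
* The surrounding rendezvous exchange, as a rough sketch (only the
* RTS step is handled here; the grant and bulk transfer are driven
* by the PTL-specific rts_callback saved below):
*
*   sender                           receiver
*   ------                           --------
*   MQ_MSG_LONGRTS envelope    ->    psmi_mq_handle_rts():
*   (tag, msglen, optional           match a posted receive, or queue
*    inline payload)                 the request as MQ_STATE_UNEXP_RV
*   grant, bulk data (PTL)    <->    ...
*   last byte lands            ->    psmi_mq_handle_rts_complete()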
*/ int psmi_mq_handle_rts(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, uint32_t send_msglen, const void *payload, uint32_t paylen, int msgorder, mq_rts_callback_fn_t cb, psm2_mq_req_t *req_o) { psm2_mq_req_t req; uint32_t msglen; int rc; PSMI_LOCK_ASSERT(mq->progress_lock); if (msgorder && (req = mq_req_match(mq, src, tag, 1))) { /* we have a match, no need to callback */ msglen = mq_set_msglen(req, req->req_data.buf_len, send_msglen); /* reset send_msglen because sender only sends this many */ req->req_data.send_msglen = msglen; req->state = MQ_STATE_MATCHED; req->req_data.peer = src; req->req_data.tag = *tag; if (paylen > msglen) paylen = msglen; if (paylen) { psmi_mq_mtucpy(req->req_data.buf, payload, paylen); } req->recv_msgoff = req->send_msgoff = paylen; *req_o = req; /* yes match */ PSM2_LOG_EPM(OPCODE_LONG_RTS,PSM2_LOG_RX,src->epid,mq->ep->epid, "req->rts_reqidx_peer: %d",req->rts_reqidx_peer); rc = MQ_RET_MATCH_OK; } else if (msgorder > 1) { /* There is NO request match, and this is the first time * to try to process this packet, we leave the packet in * hardware queue for retry in hope there is a request * match next time, this is for performance * consideration. */ rc = MQ_RET_UNEXP_NO_RESOURCES; } else { /* No match, keep track of callback */ req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); psmi_assert(req != NULL); /* We don't know recv_msglen yet but we set it here for * mq_iprobe */ req->req_data.send_msglen = req->req_data.recv_msglen = send_msglen; PSM2_LOG_EPM_COND(req->req_data.send_msglen > mq->hfi_thresh_rv, OPCODE_LONG_RTS,PSM2_LOG_RX,src->epid,mq->ep->epid, "req->rts_reqidx_peer: %d",req->rts_reqidx_peer); req->state = MQ_STATE_UNEXP_RV; req->req_data.peer = src; req->req_data.tag = *tag; req->rts_callback = cb; if (paylen > send_msglen) paylen = send_msglen; if (paylen) { req->req_data.buf = psmi_mq_sysbuf_alloc(mq, paylen); psmi_assert(paylen == 0 || req->req_data.buf != NULL); mq->stats.rx_sysbuf_num++; mq->stats.rx_sysbuf_bytes += paylen; psmi_mq_mtucpy(req->req_data.buf, payload, paylen); } req->recv_msgoff = req->send_msgoff = paylen; if (msgorder) { mq_add_to_unexpected_hashes(mq, req); } /* caller will handle out of order case */ *req_o = req; /* no match, will callback */ rc = MQ_RET_UNEXP_OK; } #ifdef PSM_DEBUG if (req) _HFI_VDBG("match=%s (req=%p) src=%s mqtag=%08x.%08x.%08x recvlen=%d " "sendlen=%d errcode=%d\n", rc == MQ_RET_MATCH_OK ? "YES" : "NO", req, psmi_epaddr_get_name(src->epid), req->req_data.tag.tag[0], req->req_data.tag.tag[1], req->req_data.tag.tag[2], req->req_data.recv_msglen, req->req_data.send_msglen, req->req_data.error_code); else _HFI_VDBG("match=%s (req=%p) src=%s\n", rc == MQ_RET_MATCH_OK ? "YES" : "NO", req, psmi_epaddr_get_name(src->epid)); #endif /* #ifdef PSM_DEBUG */ return rc; } /* * This handles the regular (i.e. 
non-rendezvous MPI envelopes) */ int psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, uint32_t send_msglen, uint32_t offset, const void *payload, uint32_t paylen, int msgorder, uint32_t opcode, psm2_mq_req_t *req_o) { psm2_mq_req_t req; uint32_t msglen; psmi_mtucpy_fn_t psmi_mtucpy_fn; if (msgorder && (req = mq_req_match(mq, src, tag, 1))) { /* we have a match */ void *user_buffer = req->req_data.buf; psmi_assert(MQE_TYPE_IS_RECV(req->type)); req->req_data.peer = src; req->req_data.tag = *tag; msglen = mq_set_msglen(req, req->req_data.buf_len, send_msglen); _HFI_VDBG("match=YES (req=%p) opcode=%x src=%s mqtag=%x.%x.%x" " msglen=%d paylen=%d\n", req, opcode, psmi_epaddr_get_name(src->epid), tag->tag[0], tag->tag[1], tag->tag[2], msglen, paylen); switch (opcode) { case MQ_MSG_TINY: /* mq_copy_tiny() can handle zero byte */ #ifdef PSM_CUDA if (PSMI_USE_GDR_COPY(req, msglen)) { user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)req->req_data.buf, msglen, 1, src->proto); } #endif mq_copy_tiny((uint32_t *) user_buffer, (uint32_t *) payload, msglen); req->state = MQ_STATE_COMPLETE; ips_barrier(); mq_qq_append(&mq->completed_q, req); break; case MQ_MSG_SHORT: /* message fits in 1 payload */ psmi_mtucpy_fn = psmi_mq_mtucpy; #ifdef PSM_CUDA if (PSMI_USE_GDR_COPY(req, msglen)) { user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)req->req_data.buf, msglen, 1, src->proto); psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem; } #endif if (msglen <= paylen) { psmi_mtucpy_fn(user_buffer, payload, msglen); } else { psmi_assert((msglen & ~0x3) == paylen); psmi_mtucpy_fn(user_buffer, payload, paylen); /* * there are nonDW bytes attached in header, * copy after the DW payload. */ mq_copy_tiny((uint32_t *)(user_buffer+paylen), (uint32_t *)&offset, msglen & 0x3); } req->state = MQ_STATE_COMPLETE; ips_barrier(); mq_qq_append(&mq->completed_q, req); break; case MQ_MSG_EAGER: req->state = MQ_STATE_MATCHED; req->type |= MQE_TYPE_EAGER_QUEUE; req->send_msgoff = req->recv_msgoff = 0; STAILQ_INSERT_TAIL(&mq->eager_q, req, nextq); _HFI_VDBG("exp MSG_EAGER of length %d bytes pay=%d\n", msglen, paylen); #ifdef PSM_CUDA if (PSMI_USE_GDR_COPY(req, req->req_data.send_msglen)) { req->req_data.buf = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)req->user_gpu_buffer, req->req_data.send_msglen, 1, src->proto); } #endif if (paylen > 0) psmi_mq_handle_data(mq, req, offset, payload, paylen); break; default: psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Internal error, unknown packet 0x%x", opcode); } mq->stats.rx_user_bytes += msglen; mq->stats.rx_user_num++; *req_o = req; /* yes match */ return MQ_RET_MATCH_OK; } /* unexpected message or out of order message. */ #if 0 /* * Keep a callback here in case we want to fit some other high-level * protocols over MQ (i.e. shmem). These protocols would bypass the * normal message handling and go to higher-level message handlers. */ if (msgorder && mq->unexpected_callback) { mq->unexpected_callback(mq, opcode, epaddr, req_data.tag, send_msglen, payload, paylen); *req_o = NULL; return MQ_RET_UNEXP_OK; } #endif if (msgorder > 1) { /* There is NO request match, and this is the first time * to try to process this packet, we leave the packet in * hardware queue for retry in hope there is a request * match next time, this is for performance * consideration.
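*
* Sketch of the resulting caller contract (the progress-engine code
* shown here is illustrative, not an actual PSM2 function):
*
*   switch (psmi_mq_handle_envelope(mq, src, &tag, msglen, offset,
*                                   payload, paylen, msgorder,
*                                   opcode, &req)) {
*   case MQ_RET_MATCH_OK:           // delivered into a posted recv
*   case MQ_RET_UNEXP_OK:           // buffered; packet can be freed
*       break;
*   case MQ_RET_UNEXP_NO_RESOURCES: // leave packet queued, retry on
*       break;                      // a later progress call
*   }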
*/ return MQ_RET_UNEXP_NO_RESOURCES; } req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); psmi_assert(req != NULL); req->req_data.peer = src; req->req_data.tag = *tag; req->recv_msgoff = 0; req->req_data.recv_msglen = req->req_data.send_msglen = req->req_data.buf_len = msglen = send_msglen; _HFI_VDBG("match=NO (req=%p) opcode=%x src=%s mqtag=%08x.%08x.%08x" " send_msglen=%d\n", req, opcode, psmi_epaddr_get_name(src->epid), tag->tag[0], tag->tag[1], tag->tag[2], send_msglen); switch (opcode) { case MQ_MSG_TINY: if (msglen > 0) { req->req_data.buf = psmi_mq_sysbuf_alloc(mq, msglen); psmi_assert(msglen == 0 || req->req_data.buf != NULL); mq->stats.rx_sysbuf_num++; mq->stats.rx_sysbuf_bytes += paylen; mq_copy_tiny((uint32_t *) req->req_data.buf, (uint32_t *) payload, msglen); } else req->req_data.buf = NULL; req->state = MQ_STATE_COMPLETE; break; case MQ_MSG_SHORT: req->req_data.buf = psmi_mq_sysbuf_alloc(mq, msglen); psmi_assert(msglen == 0 || req->req_data.buf != NULL); mq->stats.rx_sysbuf_num++; mq->stats.rx_sysbuf_bytes += paylen; if (msglen <= paylen) { psmi_mq_mtucpy(req->req_data.buf, payload, msglen); } else { psmi_assert((msglen & ~0x3) == paylen); psmi_mq_mtucpy(req->req_data.buf, payload, paylen); /* * there are nonDW bytes attached in header, * copy after the DW payload. */ mq_copy_tiny((uint32_t *)(req->req_data.buf+paylen), (uint32_t *)&offset, msglen & 0x3); } req->state = MQ_STATE_COMPLETE; break; case MQ_MSG_EAGER: req->send_msgoff = 0; req->req_data.buf = psmi_mq_sysbuf_alloc(mq, msglen); psmi_assert(msglen == 0 || req->req_data.buf != NULL); mq->stats.rx_sysbuf_num++; mq->stats.rx_sysbuf_bytes += paylen; req->state = MQ_STATE_UNEXP; req->type |= MQE_TYPE_EAGER_QUEUE; STAILQ_INSERT_TAIL(&mq->eager_q, req, nextq); _HFI_VDBG("unexp MSG_EAGER of length %d bytes pay=%d\n", msglen, paylen); if (paylen > 0) psmi_mq_handle_data(mq, req, offset, payload, paylen); break; default: psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Internal error, unknown packet 0x%x", opcode); } mq->stats.rx_sys_bytes += msglen; mq->stats.rx_sys_num++; if (msgorder) { mq_add_to_unexpected_hashes(mq, req); } /* caller will handle out of order case */ *req_o = req; /* no match, will callback */ return MQ_RET_UNEXP_OK; } int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq) { psm2_mq_req_t ereq; uint32_t msglen; ereq = mq_req_match(mq, ureq->req_data.peer, &ureq->req_data.tag, 1); if (ereq == NULL) { mq_add_to_unexpected_hashes(mq, ureq); return 0; } psmi_assert(MQE_TYPE_IS_RECV(ereq->type)); ereq->req_data.peer = ureq->req_data.peer; ereq->req_data.tag = ureq->req_data.tag; msglen = mq_set_msglen(ereq, ereq->req_data.buf_len, ureq->req_data.send_msglen); switch (ureq->state) { case MQ_STATE_COMPLETE: if (ureq->req_data.buf != NULL) { /* 0-byte don't alloc a sysreq_data.buf */ psmi_mq_mtucpy(ereq->req_data.buf, (const void *)ureq->req_data.buf, msglen); psmi_mq_sysbuf_free(mq, ureq->req_data.buf); } ereq->state = MQ_STATE_COMPLETE; ips_barrier(); mq_qq_append(&mq->completed_q, ereq); break; case MQ_STATE_UNEXP: /* not done yet */ ereq->state = MQ_STATE_MATCHED; ereq->msg_seqnum = ureq->msg_seqnum; ereq->ptl_req_ptr = ureq->ptl_req_ptr; ereq->send_msgoff = ureq->send_msgoff; ereq->recv_msgoff = min(ureq->recv_msgoff, msglen); if (ereq->recv_msgoff) { psmi_mq_mtucpy(ereq->req_data.buf, (const void *)ureq->req_data.buf, ereq->recv_msgoff); } psmi_mq_sysbuf_free(mq, ureq->req_data.buf); ereq->type = ureq->type; STAILQ_INSERT_AFTER(&mq->eager_q, ureq, ereq, nextq); STAILQ_REMOVE(&mq->eager_q, ureq, 
psm2_mq_req, nextq); break; case MQ_STATE_UNEXP_RV: /* rendez-vous ... */ ereq->state = MQ_STATE_MATCHED; ereq->rts_peer = ureq->rts_peer; ereq->rts_sbuf = ureq->rts_sbuf; ereq->send_msgoff = ureq->send_msgoff; ereq->recv_msgoff = min(ureq->recv_msgoff, msglen); if (ereq->recv_msgoff) { psmi_mq_mtucpy(ereq->req_data.buf, (const void *)ureq->req_data.buf, ereq->recv_msgoff); } if (ereq->send_msgoff) { psmi_mq_sysbuf_free(mq, ureq->req_data.buf); } ereq->rts_callback = ureq->rts_callback; ereq->rts_reqidx_peer = ureq->rts_reqidx_peer; ereq->type = ureq->type; ereq->rts_callback(ereq, 0); break; default: fprintf(stderr, "Unexpected state %d in req %p\n", ureq->state, ureq); fprintf(stderr, "type=%d, mq=%p, tag=%08x.%08x.%08x\n", ureq->type, ureq->mq, ureq->req_data.tag.tag[0], ureq->req_data.tag.tag[1], ureq->req_data.tag.tag[2]); abort(); } psmi_mq_req_free(ureq); return 0; } opa-psm2-PSM2_11.2.185/psm_mq_utils.c000066400000000000000000000175341370564314600171130ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. 
*/ #include "psm_user.h" #include "psm_mq_internal.h" /* * * MQ request allocator * */ psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type) { psm2_mq_req_t req; psmi_assert(type == MQE_TYPE_RECV || type == MQE_TYPE_SEND); if (type == MQE_TYPE_SEND) req = psmi_mpool_get(mq->sreq_pool); else req = psmi_mpool_get(mq->rreq_pool); if_pt(req != NULL) { /* A while ago there were issues about forgetting to zero-out parts of the * structure, I'm leaving this as a debug-time option */ #ifdef PSM_DEBUG memset(req, 0, sizeof(struct psm2_mq_req)); #endif req->type = type; req->state = MQ_STATE_FREE; memset(req->next, 0, NUM_MQ_SUBLISTS * sizeof(psm2_mq_req_t)); memset(req->prev, 0, NUM_MQ_SUBLISTS * sizeof(psm2_mq_req_t)); memset(req->q, 0, NUM_MQ_SUBLISTS * sizeof(struct mqq *)); req->req_data.error_code = PSM2_OK; req->mq = mq; req->testwait_callback = NULL; req->rts_peer = NULL; req->req_data.peer = NULL; req->ptl_req_ptr = NULL; #ifdef PSM_CUDA req->is_buf_gpu_mem = 0; req->user_gpu_buffer = NULL; #endif req->flags_user = 0; req->flags_internal = 0; return req; } else { /* we're out of reqs */ int issend = (type == MQE_TYPE_SEND); uint32_t reqmax, reqchunk; psmi_mpool_get_obj_info(issend ? mq->sreq_pool : mq->rreq_pool, &reqchunk, &reqmax); psmi_handle_error(PSMI_EP_NORETURN, PSM2_PARAM_ERR, "Exhausted %d MQ %s request descriptors, which usually indicates " "a user program error or insufficient request descriptors (%s=%d)", reqmax, issend ? "isend" : "irecv", issend ? "PSM2_MQ_SENDREQS_MAX" : "PSM2_MQ_RECVREQS_MAX", reqmax); return NULL; } } MOCK_DEF_EPILOGUE(psmi_mq_req_alloc); #ifdef PSM_CUDA void psmi_cuda_recvreq_alloc_func(int is_alloc, void* context, void* obj) { psm2_mq_req_t recvreq = (psm2_mq_req_t)obj; if (PSMI_IS_CUDA_ENABLED) { if (is_alloc) PSMI_CUDA_CALL(cuEventCreate, &recvreq->cuda_ipc_event, CU_EVENT_DEFAULT); else PSMI_CUDA_CALL(cuEventDestroy, recvreq->cuda_ipc_event); } return; } #endif psm2_error_t psmi_mq_req_init(psm2_mq_t mq) { psm2_mq_req_t warmup_req; psm2_error_t err = PSM2_OK; _HFI_VDBG("mq element sizes are %d bytes\n", (int)sizeof(struct psm2_mq_req)); /* * Send MQ requests */ { struct psmi_rlimit_mpool rlim = MQ_SENDREQ_LIMITS; uint32_t maxsz, chunksz; if ((err = psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz))) goto fail; if ((mq->sreq_pool = psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz, maxsz, 0, DESCRIPTORS, NULL, NULL)) == NULL) { err = PSM2_NO_MEMORY; goto fail; } } /* * Receive MQ requests */ { struct psmi_rlimit_mpool rlim = MQ_RECVREQ_LIMITS; uint32_t maxsz, chunksz; if ((err = psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz))) goto fail; /* Have a callback function for receive req mpool which creates * and destroy events. 
*/ #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED) { if ((mq->rreq_pool = psmi_mpool_create_for_cuda(sizeof(struct psm2_mq_req), chunksz, maxsz, 0, DESCRIPTORS, NULL, NULL, psmi_cuda_recvreq_alloc_func, NULL)) == NULL) { err = PSM2_NO_MEMORY; goto fail; } } else { if ((mq->rreq_pool = psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz, maxsz, 0, DESCRIPTORS, NULL, NULL)) == NULL) { err = PSM2_NO_MEMORY; goto fail; } } #else if ((mq->rreq_pool = psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz, maxsz, 0, DESCRIPTORS, NULL, NULL)) == NULL) { err = PSM2_NO_MEMORY; goto fail; } #endif } /* Warm up the allocators */ warmup_req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); psmi_assert_always(warmup_req != NULL); psmi_mq_req_free(warmup_req); warmup_req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); psmi_assert_always(warmup_req != NULL); psmi_mq_req_free(warmup_req); fail: return err; } psm2_error_t psmi_mq_req_fini(psm2_mq_t mq) { psmi_mpool_destroy(mq->rreq_pool); psmi_mpool_destroy(mq->sreq_pool); return PSM2_OK; } /* * Hooks to plug into QLogic MPI stats */ static void psmi_mq_stats_callback(struct mpspawn_stats_req_args *args) { uint64_t *entry = args->stats; psm2_mq_t mq = (psm2_mq_t) args->context; psm2_mq_stats_t mqstats; psm2_mq_get_stats(mq, &mqstats); if (args->num < 8) return; entry[0] = mqstats.tx_eager_num; entry[1] = mqstats.tx_eager_bytes; entry[2] = mqstats.tx_rndv_num; entry[3] = mqstats.tx_rndv_bytes; entry[4] = mqstats.rx_user_num; entry[5] = mqstats.rx_user_bytes; entry[6] = mqstats.rx_sys_num; entry[7] = mqstats.rx_sys_bytes; } void psmi_mq_stats_register(psm2_mq_t mq, mpspawn_stats_add_fn add_fn) { char *desc[8]; uint16_t flags[8]; int i; struct mpspawn_stats_add_args mp_add; /* * Hardcode flags until we correctly move mpspawn to its own repo. * flags[i] = MPSPAWN_REDUCTION_MAX | MPSPAWN_REDUCTION_MIN; */ for (i = 0; i < 8; i++) flags[i] = MPSPAWN_STATS_REDUCTION_ALL; desc[0] = "Eager count sent"; desc[1] = "Eager bytes sent"; desc[2] = "Rendezvous count sent"; desc[3] = "Rendezvous bytes sent"; desc[4] = "Expected count received"; desc[5] = "Expected bytes received"; desc[6] = "Unexpect count received"; desc[7] = "Unexpect bytes received"; mp_add.version = MPSPAWN_STATS_VERSION; mp_add.num = 8; mp_add.header = "MPI Statistics Summary (max,min @ rank)"; mp_add.req_fn = psmi_mq_stats_callback; mp_add.desc = desc; mp_add.flags = flags; mp_add.context = mq; add_fn(&mp_add); } opa-psm2-PSM2_11.2.185/psm_perf.c000066400000000000000000000166441370564314600162130ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2017 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef RDPMC_PERF_FRAMEWORK #include "psm_user.h" #include #include #include #include #include #include #include #include #include /* Configuration */ #define RDPMC_PERF_DEFAULT_TYPE (PERF_TYPE_HARDWARE) #define RDPMC_PERF_DEFAULT_CONFIG (PERF_COUNT_HW_CPU_CYCLES) __thread struct rdpmc_ctx global_rdpmc_ctx; u64 global_rdpmc_begin[RDPMC_PERF_MAX_SLOT_NUMBER]; u64 global_rdpmc_summ[RDPMC_PERF_MAX_SLOT_NUMBER]; u64 global_rdpmc_number[RDPMC_PERF_MAX_SLOT_NUMBER]; char global_rdpmc_slot_name[RDPMC_PERF_MAX_SLOT_NUMBER][RDPMC_PERF_MAX_SLOT_NAME]; __thread unsigned int global_rdpmc_type = RDPMC_PERF_DEFAULT_TYPE; __thread unsigned int global_rdpmc_config = RDPMC_PERF_DEFAULT_CONFIG; struct rdpmc_ctx { int fd; struct perf_event_mmap_page *buf; }; typedef unsigned long long u64; #if defined(__ICC) || defined(__INTEL_COMPILER) #include "immintrin.h" #endif /** * DOC: Ring 3 counting for CPU performance counters * * This library allows accessing CPU performance counters from ring 3 * using the perf_events subsystem. This is useful to measure specific * parts of programs (e.g. excluding initialization code) * * Requires a Linux 3.3+ kernel */ /** * rdpmc_open_attr - initialize a raw ring 3 readable performance counter * @attr: perf struct %perf_event_attr for the counter * @ctx: Pointer to struct %rdpmc_ctx that is initialized. * @leader_ctx: context of group leader or NULL * * This allows more flexible setup with a custom &perf_event_attr. * For simple uses rdpmc_open() should be used instead. * Must be called for each thread using the counter. * Must be closed with rdpmc_close() */ PSMI_ALWAYS_INLINE(int rdpmc_open_attr(struct perf_event_attr *attr, struct rdpmc_ctx *ctx, struct rdpmc_ctx *leader_ctx)) { ctx->fd = syscall(__NR_perf_event_open, attr, 0, -1, leader_ctx ? leader_ctx->fd : -1, 0); if (ctx->fd < 0) { perror("perf_event_open"); return -1; } ctx->buf = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, ctx->fd, 0); if (ctx->buf == MAP_FAILED) { close(ctx->fd); perror("mmap on perf fd"); return -1; } return 0; } /** * rdpmc_open - initialize a simple ring 3 readable performance counter * @counter: Raw event descriptor (UUEE UU unit mask EE event) * @ctx: Pointer to struct &rdpmc_ctx that is initialized * * The counter will be set up to count CPU events excluding the kernel. * Must be called for each thread using the counter. 
* The caller must make sure counter is suitable for the running CPU. * Only works in 3.3+ kernels. * Must be closed with rdpmc_close() */ PSMI_ALWAYS_INLINE(int rdpmc_open(unsigned counter, struct rdpmc_ctx *ctx)) { struct perf_event_attr attr = { .type = counter > 10 ? PERF_TYPE_RAW : PERF_TYPE_HARDWARE, .size = PERF_ATTR_SIZE_VER0, .config = counter, .sample_type = PERF_SAMPLE_READ, .exclude_kernel = 1, }; return rdpmc_open_attr(&attr, ctx, NULL); } /** * rdpmc_close: free a ring 3 readable performance counter * @ctx: Pointer to &rdpmc_ctx context. * * Must be called by each thread for each context it initialized. */ PSMI_ALWAYS_INLINE(void rdpmc_close(struct rdpmc_ctx *ctx)) { close(ctx->fd); munmap(ctx->buf, sysconf(_SC_PAGESIZE)); } static void psmi_rdpmc_perf_framework_init() { int rdpmc_retval; struct rdpmc_ctx *leader = NULL; int env_result = 1; char * env_type = NULL; char * env_config = NULL; env_type = getenv("RDPMC_PERF_TYPE"); if (env_type) { global_rdpmc_type = (int)strtoll(env_type, NULL, 16); } else { env_result = 0; } env_config = getenv("RDPMC_PERF_CONFIG"); if (env_config) { global_rdpmc_config = (int)strtoll(env_config, NULL, 16); } else { env_result = 0; } if (env_result != 1) { global_rdpmc_type = RDPMC_PERF_DEFAULT_TYPE; global_rdpmc_config = RDPMC_PERF_DEFAULT_CONFIG; } struct perf_event_attr attr = { .type = global_rdpmc_type, .size = sizeof(struct perf_event_attr), .config = global_rdpmc_config, .sample_type = PERF_SAMPLE_READ, }; rdpmc_retval = rdpmc_open_attr(&attr, &global_rdpmc_ctx, leader); if (rdpmc_retval < 0) { printf("Unable to initialize RDPMC. Error: %d\n", rdpmc_retval); exit(-1); } } /** * rdpmc_read: read a ring 3 readable performance counter * @ctx: Pointer to initialized &rdpmc_ctx structure. * * Read the current value of a running performance counter. */ unsigned long long rdpmc_read(struct rdpmc_ctx *ctx) { static __thread int rdpmc_perf_initialized = 0; if_pf(!rdpmc_perf_initialized) { psmi_rdpmc_perf_framework_init(); rdpmc_perf_initialized = 1; } u64 val; unsigned seq; u64 offset = 0; typeof (ctx->buf) buf = ctx->buf; do { seq = buf->lock; ips_rmb(); if (buf->index <= 0) return buf->offset; #if defined(__ICC) || defined(__INTEL_COMPILER) val = _rdpmc(buf->index - 1); #else /* GCC */ val = __builtin_ia32_rdpmc(buf->index - 1); #endif offset = buf->offset; ips_rmb(); } while (buf->lock != seq); return val + offset; } #endif /* RDPMC_PERF_FRAMEWORK */ opa-psm2-PSM2_11.2.185/psm_perf.h000066400000000000000000000126451370564314600162150ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2017 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define PSM_TX_SPEEDPATH_CTR 0 #define PSM_RX_SPEEDPATH_CTR 1 #ifdef RDPMC_PERF_FRAMEWORK /* Configuration */ #define RDPMC_PERF_MAX_SLOT_NUMBER (8) #define RDPMC_PERF_MAX_SLOT_NAME (256) /* RDPMC infrastructure */ extern __thread struct rdpmc_ctx global_rdpmc_ctx; typedef unsigned long long u64; extern u64 global_rdpmc_begin[RDPMC_PERF_MAX_SLOT_NUMBER]; extern u64 global_rdpmc_summ[RDPMC_PERF_MAX_SLOT_NUMBER]; extern u64 global_rdpmc_number[RDPMC_PERF_MAX_SLOT_NUMBER]; extern char global_rdpmc_slot_name[RDPMC_PERF_MAX_SLOT_NUMBER][RDPMC_PERF_MAX_SLOT_NAME]; extern __thread unsigned int global_rdpmc_type; extern __thread unsigned int global_rdpmc_config; extern unsigned long long rdpmc_read(struct rdpmc_ctx *ctx); #define RDPMC_PERF_INIT() \ { \ int i; \ for (i = 0; i < RDPMC_PERF_MAX_SLOT_NUMBER; i++) \ { \ global_rdpmc_begin[i] = 0; \ global_rdpmc_summ[i] = 0; \ global_rdpmc_number[i] = 0; \ global_rdpmc_slot_name[i][0] = '\0'; \ } \ } /* There is no slot_number max range check */ #define RDPMC_PERF_SET_SLOT_NAME(slot_number, name) \ { \ strncpy(global_rdpmc_slot_name[(slot_number)], (name), RDPMC_PERF_MAX_SLOT_NAME - 1); \ global_rdpmc_slot_name[(slot_number)][RDPMC_PERF_MAX_SLOT_NAME - 1] = '\0'; \ } #define RDPMC_PERF_BEGIN(slot_number) \ { \ global_rdpmc_begin[(slot_number)] = rdpmc_read(&global_rdpmc_ctx); \ } #define RDPMC_PERF_END(slot_number) \ { \ global_rdpmc_summ[(slot_number)] += (rdpmc_read(&global_rdpmc_ctx) - global_rdpmc_begin[(slot_number)]); \ global_rdpmc_number[(slot_number)]++; \ } #define RDPMC_PERF_DUMP(stream) \ { \ int i; \ for (i = 0; i < RDPMC_PERF_MAX_SLOT_NUMBER; i++) \ { \ if (global_rdpmc_slot_name[i][0]) \ { \ fprintf((stream), "RDPMC [%s] (%x, %04x) avg = %g (%llu times)\n", \ global_rdpmc_slot_name[i], global_rdpmc_type, global_rdpmc_config, \ (double)global_rdpmc_summ[i] / global_rdpmc_number[i], global_rdpmc_number[i]); \ fflush((stream)); \ } \ } \ } #define GENERIC_PERF_INIT() RDPMC_PERF_INIT() #define GENERIC_PERF_SET_SLOT_NAME(slot_number, name) RDPMC_PERF_SET_SLOT_NAME(slot_number, name) #define GENERIC_PERF_BEGIN(slot_number) RDPMC_PERF_BEGIN(slot_number) #define GENERIC_PERF_END(slot_number) RDPMC_PERF_END(slot_number) #define GENERIC_PERF_DUMP(stream) RDPMC_PERF_DUMP(stream) #else /* RDPMC_PERF_FRAMEWORK */ #define GENERIC_PERF_INIT() #define GENERIC_PERF_SET_SLOT_NAME(slot_number, name) #define 
GENERIC_PERF_BEGIN(slot_number) #define GENERIC_PERF_END(slot_number) #define GENERIC_PERF_DUMP(stream) #endif /* RDPMC_PERF_FRAMEWORK */ opa-psm2-PSM2_11.2.185/psm_stats.c000066400000000000000000000452231370564314600164100ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ #include "psm_user.h" #include "psm_mq_internal.h" struct psmi_stats_type { STAILQ_ENTRY(psmi_stats_type) next; struct psmi_stats_entry *entries; int num_entries; void *heading; uint32_t statstype; void *context; }; static STAILQ_HEAD(, psmi_stats_type) psmi_stats = STAILQ_HEAD_INITIALIZER(psmi_stats); psm2_error_t psmi_stats_register_type(const char *heading, uint32_t statstype, const struct psmi_stats_entry *entries_i, int num_entries, void *context) { struct psmi_stats_entry *entries; struct psmi_stats_type *type; int i; psm2_error_t err = PSM2_OK; entries = psmi_calloc(PSMI_EP_NONE, STATS, num_entries, sizeof(struct psmi_stats_entry)); type = psmi_calloc(PSMI_EP_NONE, STATS, 1, sizeof(struct psmi_stats_type)); PSMI_CHECKMEM(err, entries); PSMI_CHECKMEM(err, type); type->entries = entries; type->num_entries = num_entries; type->statstype = statstype; type->context = context; type->heading = (char *)heading; for (i = 0; i < num_entries; i++) { type->entries[i].desc = entries_i[i].desc; type->entries[i].flags = entries_i[i].flags; type->entries[i].getfn = entries_i[i].getfn; type->entries[i].u.val = entries_i[i].u.val; } STAILQ_INSERT_TAIL(&psmi_stats, type, next); return err; fail: if (entries) psmi_free(entries); if (type) psmi_free(type); return err; } psm2_error_t psmi_stats_deregister_all(void) { struct psmi_stats_type *type; /* Currently our mpi still reads stats after finalize so this isn't safe * yet */ while ((type = STAILQ_FIRST(&psmi_stats)) != NULL) { STAILQ_REMOVE_HEAD(&psmi_stats, next); psmi_free(type->entries); psmi_free(type); } return PSM2_OK; } static uint32_t typestring_to_type(const char *typestr) { if (strncasecmp(typestr, "all", 4) == 0) return PSMI_STATSTYPE_ALL; else if (strncasecmp(typestr, "p2p", 4) == 0) return PSMI_STATSTYPE_P2P; else if (strncasecmp(typestr, "hfi", 6) == 0) return PSMI_STATSTYPE_HFI; else if (strncasecmp(typestr, "ips", 4) == 0) return PSMI_STATSTYPE_IPSPROTO; else if ((strncasecmp(typestr, "intr", 5) == 0) || (strncasecmp(typestr, "thread", 7) == 0) || (strncasecmp(typestr, "rcvthread", 10) == 0)) return PSMI_STATSTYPE_RCVTHREAD; else if ((strncasecmp(typestr, "mq", 3) == 0) || (strncasecmp(typestr, "mpi", 4) == 0)) return PSMI_STATSTYPE_MQ; else if ((strncasecmp(typestr, "tid", 4) == 0) || (strncasecmp(typestr, "tids", 5) == 0)) return PSMI_STATSTYPE_TIDS; else if ((strncasecmp(typestr, "counter", 8) == 0) || (strncasecmp(typestr, "counters", 9) == 0)) return PSMI_STATSTYPE_DEVCOUNTERS; else if (strncasecmp(typestr, "devstats", 9) == 0) return PSMI_STATSTYPE_DEVSTATS; else if ((strncasecmp(typestr, "memory", 7) == 0) || (strncasecmp(typestr, "alloc", 6) == 0) || (strncasecmp(typestr, "malloc", 7) == 0)) return PSMI_STATSTYPE_MEMORY; else return 0; } static uint32_t stats_parse_enabled_mask(const char *stats_string) { char *b = (char *)stats_string; char *e = b; char buf[128]; uint32_t stats_enabled_mask = 0; while (*e) { b = e; while (*e && *e != ',' && *e != '+' && *e != '.' && *e != '|' && *e != ':') e++; if (e > b) { /* something new to parse */ int len = ((e - b) > (sizeof(buf) - 1)) ? 
(sizeof(buf) - 1) : (e - b); strncpy(buf, b, len); buf[len] = '\0'; stats_enabled_mask |= typestring_to_type(buf); } if (*e) e++; /* skip delimiter */ } return stats_enabled_mask; } static void psmi_stats_mpspawn_callback(struct mpspawn_stats_req_args *args) { const struct psmi_stats_entry *entry; struct psmi_stats_type *type = (struct psmi_stats_type *)args->context; int i, num = args->num; uint64_t *stats = args->stats; uint64_t *c = NULL; uint64_t *s = NULL; psmi_assert(num == type->num_entries); if (type->statstype == PSMI_STATSTYPE_DEVCOUNTERS || type->statstype == PSMI_STATSTYPE_DEVSTATS) { int unit_id = ((psm2_ep_t) type->context)->unit_id; int portno = ((psm2_ep_t) type->context)->portnum; uintptr_t off; uint8_t *p = NULL; int nc, npc, ns; int nstats = hfi_get_stats_names_count(); int nctrs = hfi_get_ctrs_unit_names_count(unit_id); int npctrs = hfi_get_ctrs_port_names_count(unit_id); if (nctrs != -1 && npctrs != -1) c = psmi_calloc(PSMI_EP_NONE, STATS, nctrs + npctrs, sizeof(uint64_t)); if (nstats != -1) s = psmi_calloc(PSMI_EP_NONE, STATS, nstats, sizeof(uint64_t)); /* * If hfifs is not loaded, we set NAN everywhere. We don't want * stats to break just because 1 node didn't have hfi-stats */ if (type->statstype == PSMI_STATSTYPE_DEVCOUNTERS && c != NULL) { nc = hfi_get_ctrs_unit(unit_id, c, nctrs); if (nc != -1 && nc == nctrs) p = (uint8_t *) c; if (nc == -1) nc = 0; npc = hfi_get_ctrs_port(unit_id, portno, c + nc, npctrs); if (!p && npc > 0 && npc == npctrs) p = (uint8_t *) c; } else if (s != NULL) { ns = hfi_get_stats(s, nstats); if (ns != -1) p = (uint8_t *) s; } for (i = 0; i < num; i++) { entry = &type->entries[i]; if (p) { off = (uintptr_t) entry->u.off; stats[i] = *((uint64_t *) (p + off)); } else stats[i] = MPSPAWN_NAN_U64; } } else if (type->statstype == PSMI_STATSTYPE_MEMORY) { for (i = 0; i < num; i++) { entry = &type->entries[i]; stats[i] = *(uint64_t *) ((uintptr_t) &psmi_stats_memory + (uintptr_t) entry->u.off); } } else { for (i = 0; i < num; i++) { entry = &type->entries[i]; if (entry->getfn != NULL) stats[i] = entry->getfn(type->context); else stats[i] = *entry->u.val; } } if (c != NULL) psmi_free(c); if (s != NULL) psmi_free(s); } static void stats_register_mpspawn_single(mpspawn_stats_add_fn add_fn, char *heading, int num_entries, struct psmi_stats_entry *entries, mpspawn_stats_req_fn req_fn, void *context) { int i; struct mpspawn_stats_add_args mp_add; mp_add.version = MPSPAWN_STATS_VERSION; mp_add.num = num_entries; mp_add.header = heading; mp_add.req_fn = req_fn; mp_add.context = context; mp_add.desc = (char **)alloca(sizeof(char *) * num_entries); mp_add.flags = (uint16_t *) alloca(sizeof(uint16_t *) * num_entries); for (i = 0; i < num_entries; i++) { mp_add.desc[i] = (char *)entries[i].desc; mp_add.flags[i] = entries[i].flags; } /* Ignore return code, doesn't matter to *us* if register failed */ add_fn(&mp_add); return; } static void stats_register_hfi_counters(psm2_ep_t ep); static void stats_register_hfi_stats(psm2_ep_t ep); static void stats_register_mem_stats(psm2_ep_t ep); static psm2_error_t psmi_stats_epaddr_register(struct mpspawn_stats_init_args *args); /* * Downcall from QLogic MPI into PSM, so we can register stats */ void *psmi_stats_register(struct mpspawn_stats_init_args *args) { struct psmi_stats_type *type; uint32_t statsmask; /* * Args has a version string in it, but we can ignore it since mpspawn * will decide if it supports *our* version */ /* * Eventually, parse the stats_types to add various "flavours" of stats */ if (args->stats_types 
== NULL) return NULL; statsmask = stats_parse_enabled_mask(args->stats_types); /* MQ (MPI-level) statistics */ if (statsmask & PSMI_STATSTYPE_MQ) psmi_mq_stats_register(args->mq, args->add_fn); /* PSM and hfi level statistics */ if (statsmask & PSMI_STATSTYPE_DEVCOUNTERS) stats_register_hfi_counters(args->mq->ep); if (statsmask & PSMI_STATSTYPE_DEVSTATS) stats_register_hfi_stats(args->mq->ep); if (statsmask & PSMI_STATSTYPE_MEMORY) stats_register_mem_stats(args->mq->ep); /* * At this point all PSM and hfi-level components have registered stats * with the PSM stats interface. We register with the mpspawn stats * interface with an upcall in add_fn */ STAILQ_FOREACH(type, &psmi_stats, next) { if (type->statstype & statsmask) stats_register_mpspawn_single(args->add_fn, type->heading, type->num_entries, type->entries, psmi_stats_mpspawn_callback, type); } /* * Special handling for per-endpoint statistics. * Only MPI knows the endpoint addresses in the running program; * PSM has no notion of MPI worlds. In stats register, MPI tells PSM how * many endpoints it anticipates having, and PSM simply reserves that many * endpoints times the number of per-endpoint stats entries. */ if (statsmask & PSMI_STATSTYPE_P2P) psmi_stats_epaddr_register(args); return NULL; } struct stats_epaddr { psm2_ep_t ep; mpspawn_map_epaddr_fn epaddr_map_fn; int num_ep; int num_ep_stats; }; static void psmi_stats_epaddr_callback(struct mpspawn_stats_req_args *args) { int i, num, off; uint64_t *statsp; struct stats_epaddr *stats_ctx = (struct stats_epaddr *)args->context; psm2_ep_t ep = stats_ctx->ep; psm2_epaddr_t epaddr; num = stats_ctx->num_ep * stats_ctx->num_ep_stats; /* First always NAN the entire stats request */ for (i = 0; i < num; i++) { if (args->flags[i] & MPSPAWN_STATS_TYPE_DOUBLE) args->stats[i] = MPSPAWN_NAN; else args->stats[i] = MPSPAWN_NAN_U64; } for (i = 0; i < stats_ctx->num_ep; i++) { statsp = args->stats + i * stats_ctx->num_ep_stats; off = 0; epaddr = stats_ctx->epaddr_map_fn(i); if (epaddr == NULL) continue; /* Self */ if (&ep->ptl_self == epaddr->ptlctl) { if (ep->ptl_self.epaddr_stats_get != NULL) off += ep->ptl_self.epaddr_stats_get(epaddr, statsp + off); } else { if (ep->ptl_self.epaddr_stats_num != NULL) off += ep->ptl_self.epaddr_stats_num(); } /* Shm */ if (&ep->ptl_amsh == epaddr->ptlctl) { if (ep->ptl_amsh.epaddr_stats_get != NULL) off += ep->ptl_amsh.epaddr_stats_get(epaddr, statsp + off); } else { if (ep->ptl_amsh.epaddr_stats_num != NULL) off += ep->ptl_amsh.epaddr_stats_num(); } /* ips */ if (&ep->ptl_ips == epaddr->ptlctl) { if (ep->ptl_ips.epaddr_stats_get != NULL) off += ep->ptl_ips.epaddr_stats_get(epaddr, statsp + off); } else { if (ep->ptl_ips.epaddr_stats_num != NULL) off += ep->ptl_ips.epaddr_stats_num(); } } return; } static psm2_error_t psmi_stats_epaddr_register(struct mpspawn_stats_init_args *args) { int i = 0, j; int num_ep = args->num_epaddr; int num_ep_stats = 0; int nz; char **desc, **desc_i; uint16_t *flags, *flags_i; char *p; char buf[128]; psm2_ep_t ep; struct mpspawn_stats_add_args mp_add; struct stats_epaddr *stats_ctx; psm2_error_t err = PSM2_OK; if (args->mq == NULL) return PSM2_OK; ep = args->mq->ep; /* Figure out how many stats there are in an endpoint from all devices */ if (ep->ptl_self.epaddr_stats_num != NULL) num_ep_stats += ep->ptl_self.epaddr_stats_num(); if (ep->ptl_amsh.epaddr_stats_num != NULL) num_ep_stats += ep->ptl_amsh.epaddr_stats_num(); if (ep->ptl_ips.epaddr_stats_num != NULL) num_ep_stats += ep->ptl_ips.epaddr_stats_num(); /* Allocate desc and
flags and let each device initialize their * descriptions and flags */ desc = psmi_malloc(ep, STATS, sizeof(char *) * num_ep_stats * (num_ep + 1)); if (desc == NULL) return PSM2_NO_MEMORY; flags = psmi_malloc(ep, STATS, sizeof(uint16_t) * num_ep_stats * (num_ep + 1)); if (flags == NULL) { psmi_free(desc); return PSM2_NO_MEMORY; } /* Get the descriptions/flags from each device */ i = 0; i += ep->ptl_self.epaddr_stats_num != NULL ? ep->ptl_self.epaddr_stats_init(desc + i, flags + i) : 0; i += ep->ptl_amsh.epaddr_stats_num != NULL ? ep->ptl_amsh.epaddr_stats_init(desc + i, flags + i) : 0; i += ep->ptl_ips.epaddr_stats_num != NULL ? ep->ptl_ips.epaddr_stats_init(desc + i, flags + i) : 0; psmi_assert_always(i == num_ep_stats); /* * Clone the descriptions for each endpoint but append "rank %d" to it * beforehand. */ nz = (num_ep < 10 ? 1 : (num_ep < 100 ? 2 : /* cheap log */ (num_ep < 1000 ? 3 : (num_ep < 10000 ? 4 : (num_ep < 100000 ? 5 : 6))))); desc_i = desc + num_ep_stats; flags_i = flags + num_ep_stats; memset(desc_i, 0, sizeof(char *) * num_ep * num_ep_stats); for (i = 0; i < num_ep; i++) { for (j = 0; j < num_ep_stats; j++) { snprintf(buf, sizeof(buf) - 1, "<%*d> %s", nz, i, desc[j]); buf[sizeof(buf) - 1] = '\0'; p = psmi_strdup(ep, buf); if (p == NULL) { err = PSM2_NO_MEMORY; goto clean; } desc_i[i * num_ep_stats + j] = p; flags_i[i * num_ep_stats + j] = flags[j]; } } mp_add.version = MPSPAWN_STATS_VERSION; mp_add.num = num_ep_stats * num_ep; mp_add.header = "Endpoint-to-Endpoint Stats (by <rank>)"; mp_add.req_fn = psmi_stats_epaddr_callback; mp_add.desc = desc_i; mp_add.flags = flags_i; stats_ctx = psmi_malloc(ep, STATS, sizeof(struct stats_epaddr)); if (stats_ctx == NULL) { err = PSM2_NO_MEMORY; goto clean; } stats_ctx->ep = ep; stats_ctx->epaddr_map_fn = args->epaddr_map_fn; stats_ctx->num_ep = num_ep; stats_ctx->num_ep_stats = num_ep_stats; mp_add.context = stats_ctx; args->add_fn(&mp_add); clean: /* Now we can free all the descriptions */ for (i = 0; i < num_ep; i++) { for (j = 0; j < num_ep_stats; j++) if (desc_i[i * num_ep_stats + j]) psmi_free(desc_i[i * num_ep_stats + j]); } psmi_free(desc); psmi_free(flags); return err; } static void stats_register_hfi_counters(psm2_ep_t ep) { int i, nc, npc; char *cnames = NULL, *pcnames = NULL; struct psmi_stats_entry *entries = NULL; nc = hfi_get_ctrs_unit_names(ep->unit_id, &cnames); if (nc == -1 || cnames == NULL) goto bail; npc = hfi_get_ctrs_port_names(ep->unit_id, &pcnames); if (npc == -1 || pcnames == NULL) goto bail; entries = psmi_calloc(ep, STATS, nc + npc, sizeof(struct psmi_stats_entry)); if (entries == NULL) goto bail; for (i = 0; i < nc; i++) { entries[i].desc = hfi_get_next_name(&cnames); entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO; entries[i].getfn = NULL; entries[i].u.off = i * sizeof(uint64_t); } for (i = nc; i < nc + npc; i++) { entries[i].desc = hfi_get_next_name(&pcnames); entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO; entries[i].getfn = NULL; entries[i].u.off = i * sizeof(uint64_t); } psmi_stats_register_type("OPA device counters", PSMI_STATSTYPE_DEVCOUNTERS, entries, nc + npc, ep); // psmi_stats_register_type makes its own copy of entries // so we should free the entries buffer. // The counter name strings will be freed when we deregister the hfi.
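// For reference, a minimal caller-side sketch of the simpler, value-based
// registration path (the names here are illustrative, not part of this file):
//
//   static uint64_t my_counter;
//   static struct psmi_stats_entry e[] = {
//           PSMI_STATS_DECLU64("My counter", &my_counter),
//   };
//   psmi_stats_register_type("My statistics", PSMI_STATSTYPE_MQ,
//                            e, PSMI_STATS_HOWMANY(e), NULL);
//
// With .getfn == NULL, the request callback above simply dereferences
// entry->u.val at report time.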
psmi_free(entries); return; bail: if (cnames != NULL) hfi_release_names(cnames); if (pcnames != NULL) hfi_release_names(pcnames); if (entries != NULL) psmi_free(entries); } static void stats_register_hfi_stats(psm2_ep_t ep) { int i, ns; char *snames = NULL; struct psmi_stats_entry *entries = NULL; ns = hfi_get_stats_names(&snames); if (ns <= 0 || snames == NULL) goto bail; entries = psmi_calloc(ep, STATS, ns, sizeof(struct psmi_stats_entry)); if (entries == NULL) goto bail; for (i = 0; i < ns; i++) { entries[i].desc = hfi_get_next_name(&snames); entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO; entries[i].getfn = NULL; entries[i].u.off = i * sizeof(uint64_t); } psmi_stats_register_type("OPA device statistics", PSMI_STATSTYPE_DEVSTATS, entries, ns, ep); // psmi_stats_register_type makes its own copy of entries // so we should free the entries buffer. // The snames will be freed when we deregister the hfi. psmi_free(entries); return; bail: if (snames != NULL) hfi_release_names(snames); if (entries != NULL) psmi_free(entries); } #undef _SDECL #define _SDECL(_desc, _param) { \ .desc = _desc, \ .flags = MPSPAWN_STATS_REDUCTION_ALL \ | MPSPAWN_STATS_SKIP_IF_ZERO, \ .getfn = NULL, \ .u.off = offsetof(struct psmi_stats_malloc, _param) \ } static void stats_register_mem_stats(psm2_ep_t ep) { struct psmi_stats_entry entries[] = { _SDECL("Total (current)", m_all_total), _SDECL("Total (max)", m_all_max), _SDECL("All Peers (current)", m_perpeer_total), _SDECL("All Peers (max)", m_perpeer_max), _SDECL("Network Buffers (current)", m_netbufs_total), _SDECL("Network Buffers (max)", m_netbufs_max), _SDECL("PSM descriptors (current)", m_descriptors_total), _SDECL("PSM descriptors (max)", m_descriptors_max), _SDECL("Unexp. buffers (current)", m_unexpbufs_total), _SDECL("Unexp. buffers (max)", m_unexpbufs_max), _SDECL("Other (current)", m_undefined_total), _SDECL("Other (max)", m_undefined_max), }; psmi_stats_register_type("PSM memory allocation statistics", PSMI_STATSTYPE_MEMORY, entries, PSMI_STATS_HOWMANY(entries), ep); } opa-psm2-PSM2_11.2.185/psm_stats.h000066400000000000000000000102551370564314600164120ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef _PSMI_IN_USER_H #error psm_stats.h not meant to be included directly, include psm_user.h instead #endif #ifndef _PSM_STATS_H #define _PSM_STATS_H #include "mpspawn_stats.h" #define PSMI_STATSTYPE_MQ 0x00001 #define PSMI_STATSTYPE_RCVTHREAD 0x00100 /* num_wakups, ratio, etc. */ #define PSMI_STATSTYPE_IPSPROTO 0x00200 /* acks,naks,err_chks */ #define PSMI_STATSTYPE_TIDS 0x00400 #define PSMI_STATSTYPE_MEMORY 0x01000 #define PSMI_STATSTYPE_HFI (PSMI_STATSTYPE_RCVTHREAD| \ PSMI_STATSTYPE_IPSPROTO | \ PSMI_STATSTYPE_MEMORY | \ PSMI_STATSTYPE_TIDS) #define PSMI_STATSTYPE_P2P 0x00800 /* ep-to-ep details */ #define PSMI_STATSTYPE_DEVCOUNTERS 0x10000 #define PSMI_STATSTYPE_DEVSTATS 0x20000 #define PSMI_STATSTYPE_ALL 0xfffff #define _PSMI_STATSTYPE_DEVMASK 0xf0000 /* Used to determine how many stats in static array decl. */ #define PSMI_STATS_HOWMANY(entries) \ (sizeof(entries)/sizeof(entries[0])) #define PSMI_STATS_NO_HEADING NULL #define PSMI_STATS_DECL(_desc, _flags, _getfn, _val) \ { .desc = _desc, \ .flags = _flags, \ .getfn = _getfn, \ .u.val = _val, \ } #define PSMI_STATS_DECLU64(_desc, _val) \ PSMI_STATS_DECL(_desc, \ MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO, \ NULL, \ _val) struct psmi_stats_entry { const char *desc; uint16_t flags; uint64_t(*getfn) (void *context); /* optional fn ptr to get value */ union { uint64_t *val; /* where value is stored if getfn is NULL */ uint64_t off; /* of offset if that makes more sense */ } u; }; /* * Copy the array of entries and keep track of the context */ psm2_error_t psmi_stats_register_type(const char *heading, uint32_t statstype, const struct psmi_stats_entry *entries, int num_entries, void *context); psm2_error_t psmi_stats_deregister_all(void); #endif /* PSM_STATS_H */ opa-psm2-PSM2_11.2.185/psm_sysbuf.c000066400000000000000000000154011370564314600165600ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include "psm_user.h" #include "psm_mq_internal.h" /* * * System buffer (unexpected message) allocator * */ #define MM_FLAG_NONE 0 #define MM_FLAG_TRANSIENT 0x1 struct psmi_mem_block_ctrl { union { psmi_mem_ctrl_t *mem_handler; struct psmi_mem_block_ctrl *next; }; }; /* Per MQ allocators */ void psmi_mq_sysbuf_init(psm2_mq_t mq) { int i; uint32_t block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, (uint32_t)-1}; uint32_t replenishing_rate[] = {128, 64, 32, 16, 8, 4, 0}; if (mq->mem_ctrl_is_init) return; mq->mem_ctrl_is_init = 1; for (i=0; i < MM_NUM_OF_POOLS; i++) { mq->handler_index[i].block_size = block_sizes[i]; mq->handler_index[i].current_available = 0; mq->handler_index[i].free_list = NULL; mq->handler_index[i].total_alloc = 0; mq->handler_index[i].replenishing_rate = replenishing_rate[i]; if (block_sizes[i] == -1) { psmi_assert_always(replenishing_rate[i] == 0); mq->handler_index[i].flags = MM_FLAG_TRANSIENT; } else { psmi_assert_always(replenishing_rate[i] > 0); mq->handler_index[i].flags = MM_FLAG_NONE; } } /* Hit once on each block size so we have a pool that's allocated */ for (i=0; i < MM_NUM_OF_POOLS; i++) { void *ptr; if (block_sizes[i] == -1) continue; ptr = psmi_mq_sysbuf_alloc(mq, block_sizes[i]); psmi_mq_sysbuf_free(mq, ptr); } } void psmi_mq_sysbuf_fini(psm2_mq_t mq) // free all buffers that is currently not used { struct psmi_mem_block_ctrl *block; int i; if (mq->mem_ctrl_is_init == 0) return; for (i=0; i < MM_NUM_OF_POOLS; i++) { while ((block = mq->handler_index[i].free_list) != NULL) { mq->handler_index[i].free_list = block->next; psmi_free(block); } } mq->mem_ctrl_is_init = 0; } void psmi_mq_sysbuf_getinfo(psm2_mq_t mq, char *buf, size_t len) { snprintf(buf, len-1, "Sysbuf consumption: %"PRIu64" bytes\n", mq->mem_ctrl_total_bytes); buf[len-1] = '\0'; return; } void *psmi_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size) { psmi_mem_ctrl_t *mm_handler = mq->handler_index; struct psmi_mem_block_ctrl *new_block; int replenishing; /* There is a timing race with ips initialization, fix later. 
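 *
 * For reference, how the pools below behave, as a small illustrative
 * sketch (pool sizes come from the tables in psmi_mq_sysbuf_init; the
 * request size is made up):
 *
 *   void *p = psmi_mq_sysbuf_alloc(mq, 300); // 300 > 256, so the 512-byte
 *                                            // pool is chosen; if its free
 *                                            // list is empty it is refilled
 *                                            // with up to 64 blocks first
 *   psmi_mq_sysbuf_free(mq, p);              // back onto the 512-byte
 *                                            // pool's free list
 *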
* * XXX */ if (!mq->mem_ctrl_is_init) psmi_mq_sysbuf_init(mq); mq->stats.rx_sysbuf_num++; mq->stats.rx_sysbuf_bytes += alloc_size; while (mm_handler->block_size < alloc_size) mm_handler++; replenishing = mm_handler->replenishing_rate; if (mm_handler->current_available == 0) { // allocate more buffers if (mm_handler->flags & MM_FLAG_TRANSIENT) { uint32_t newsz = alloc_size + sizeof(struct psmi_mem_block_ctrl); new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz); if (new_block) { new_block->mem_handler = mm_handler; new_block++; mm_handler->total_alloc++; mq->mem_ctrl_total_bytes += newsz; } return new_block; } do { uint32_t newsz = mm_handler->block_size + sizeof(struct psmi_mem_block_ctrl); new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz); if (new_block) { /* only count bytes for allocations that succeeded */ mq->mem_ctrl_total_bytes += newsz; mm_handler->current_available++; mm_handler->total_alloc++; new_block->next = mm_handler->free_list; mm_handler->free_list = new_block; } } while (--replenishing && new_block); } if (mm_handler->current_available) { mm_handler->current_available--; new_block = mm_handler->free_list; mm_handler->free_list = new_block->next; new_block->mem_handler = mm_handler; new_block++; return new_block; } return NULL; } void psmi_mq_sysbuf_free(psm2_mq_t mq, void * mem_to_free) { struct psmi_mem_block_ctrl * block_to_free; psmi_mem_ctrl_t *mm_handler; psmi_assert_always(mq->mem_ctrl_is_init); block_to_free = (struct psmi_mem_block_ctrl *)mem_to_free - 1; mm_handler = block_to_free->mem_handler; if (mm_handler->flags & MM_FLAG_TRANSIENT) { psmi_free(block_to_free); } else { block_to_free->next = mm_handler->free_list; mm_handler->free_list = block_to_free; mm_handler->current_available++; } return; } opa-psm2-PSM2_11.2.185/psm_sysbuf.h000066400000000000000000000056271370564314600165740ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef SYSBUF_INT_H #define SYSBUF_INT_H #include "psm_user.h" #define MM_NUM_OF_POOLS 7 typedef struct psmi_mem_ctrl { struct psmi_mem_block_ctrl *free_list; uint32_t total_alloc; uint32_t current_available; uint32_t block_size; uint32_t flags; uint32_t replenishing_rate; } psmi_mem_ctrl_t; /* * MQ unexpected buffer management */ void psmi_mq_sysbuf_init(psm2_mq_t mq); void psmi_mq_sysbuf_fini(psm2_mq_t mq); void* psmi_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t nbytes); void psmi_mq_sysbuf_free(psm2_mq_t mq, void *); void psmi_mq_sysbuf_getinfo(psm2_mq_t mq, char *buf, size_t len); #endif /* SYSBUF_INT_H */ opa-psm2-PSM2_11.2.185/psm_timer.c000066400000000000000000000142241370564314600163670ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ #include "psm_user.h" #if PSMI_TIMER_STATS # define PSMI_TIMER_STATS_ADD_INSERTION(ctrl) ((ctrl)->num_insertions++) # define PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl) ((ctrl)->num_traversals++) #else # define PSMI_TIMER_STATS_ADD_INSERTION(ctrl) # define PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl) #endif psm2_error_t psmi_timer_init(struct psmi_timer_ctrl *ctrl) { ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE; #if PSMI_TIMER_STATS ctrl->num_insertions = 0; ctrl->num_traversals = 0; #endif TAILQ_INIT(&ctrl->timerq); return PSM2_OK; } psm2_error_t psmi_timer_fini(struct psmi_timer_ctrl *ctrl) { #if PSMI_TIMER_STATS if (ctrl->num_insertions > 0) { _HFI_INFO("avg elem traversals/insertion = %3.2f %%\n", 100.0 * (double)ctrl->num_traversals / ctrl->num_insertions); } #endif return PSM2_OK; } void psmi_timer_request_always(struct psmi_timer_ctrl *ctrl, struct psmi_timer *t_insert, uint64_t t_cyc_expire) { struct psmi_timer *t_cursor; psmi_assert(!(t_insert->flags & PSMI_TIMER_FLAG_PENDING)); t_insert->t_timeout = t_cyc_expire; t_insert->flags |= PSMI_TIMER_FLAG_PENDING; /* * We keep the list from oldest (head) to newest (tail), with the * assumption that insert and remove occur much more often than search * (when the timer expires). Newly added timers are more likely to expire * later rather than sooner, which is why the head is older. */ PSMI_TIMER_STATS_ADD_INSERTION(ctrl); if (TAILQ_EMPTY(&ctrl->timerq)) { /* Common case */ TAILQ_INSERT_TAIL(&ctrl->timerq, t_insert, timer); ctrl->t_cyc_next_expire = t_cyc_expire; PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl); return; } else if (t_cyc_expire > PSMI_TIMER_PRIO_LAST) { TAILQ_FOREACH(t_cursor, &ctrl->timerq, timer) { if (t_cursor->t_timeout <= t_cyc_expire) { TAILQ_INSERT_BEFORE(t_cursor, t_insert, timer); return; } PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl); } /* Got to the end of the list -- We're the next to expire */ ctrl->t_cyc_next_expire = t_cyc_expire; TAILQ_INSERT_TAIL(&ctrl->timerq, t_insert, timer); return; } else { TAILQ_FOREACH_REVERSE(t_cursor, &ctrl->timerq, timerq, timer) { if (t_cursor->t_timeout >= t_cyc_expire) { TAILQ_INSERT_AFTER(&ctrl->timerq, t_cursor, t_insert, timer); ctrl->t_cyc_next_expire = min(t_cyc_expire, ctrl->t_cyc_next_expire); return; } PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl); } TAILQ_INSERT_HEAD(&ctrl->timerq, t_insert, timer); /* No need to check if we inserted last, given first branch case */ /* if (TAILQ_LAST(&ctrl->timerq, timerq) == t_insert) */ /* ctrl->t_cyc_next_expire = t_cyc_expire; */ return; } return; } psm2_error_t psmi_timer_process_expired(struct psmi_timer_ctrl *ctrl, uint64_t t_cyc_expire) { psm2_error_t err = PSM2_OK_NO_PROGRESS; struct psmi_timer *t_cursor = TAILQ_LAST(&ctrl->timerq, timerq); PSM2_LOG_MSG("entering"); while (t_cursor) { if (t_cursor->t_timeout > t_cyc_expire) break; err = PSM2_OK; psmi_assert(t_cursor->flags & PSMI_TIMER_FLAG_PENDING); t_cursor->flags &= ~PSMI_TIMER_FLAG_PENDING; TAILQ_REMOVE(&ctrl->timerq, t_cursor, timer); t_cursor->expire_callback(t_cursor, t_cyc_expire); t_cursor = TAILQ_PREV(t_cursor, timerq, timer); } if (TAILQ_EMPTY(&ctrl->timerq)) ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE; else ctrl->t_cyc_next_expire = TAILQ_LAST(&ctrl->timerq, timerq)->t_timeout; PSM2_LOG_MSG("leaving"); return err; } void psmi_timer_cancel_inner(struct psmi_timer_ctrl *ctrl, struct psmi_timer *t_remove) { psmi_assert(t_remove->flags & PSMI_TIMER_FLAG_PENDING); t_remove->flags &= ~PSMI_TIMER_FLAG_PENDING; TAILQ_REMOVE(&ctrl->timerq, t_remove, timer); /* * If we're removing the last entry, we need to reset 
the * expiration cycle time. */ if (TAILQ_EMPTY(&ctrl->timerq)) ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE; else ctrl->t_cyc_next_expire = TAILQ_LAST(&ctrl->timerq, timerq)->t_timeout; return; } opa-psm2-PSM2_11.2.185/psm_timer.h000066400000000000000000000123311370564314600163710ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef _PSMI_IN_USER_H #error psm_timer.h not meant to be included directly, include psm_user.h instead #endif #ifndef _PSMI_TIMER_H #define _PSMI_TIMER_H typedef struct psmi_timer psmi_timer; typedef psm2_error_t(*psmi_timer_expire_callback_t) (struct psmi_timer *, uint64_t); struct psmi_timer { TAILQ_ENTRY(psmi_timer) timer; /* opaque */ uint64_t t_timeout; /* opaque */ uint8_t flags; /* opaque */ psmi_timer_expire_callback_t expire_callback; /* user -- callback fn */ void *context; /* user -- callback param */ }; struct psmi_timer_ctrl { uint64_t t_cyc_next_expire; TAILQ_HEAD(timerq, psmi_timer) timerq; #if PSMI_TIMER_STATS uint64_t num_insertions; uint64_t num_traversals; #endif }; /* * Some events need to be unconditionally enqueued at the beginning of the * timerq -- they are not timers meant to expire but merely operations that * need to be delayed. For delayed operations, there are 5 levels of * priority. 
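 *
 * A minimal usage sketch (the callback name, context and cycle-count
 * source are placeholders for whatever the caller uses):
 *
 *   struct psmi_timer t;
 *   psmi_timer_entry_init(&t, my_expire_fn, my_context);
 *   psmi_timer_request(&ctrl, &t, now_cycles + timeout_cycles);
 *   ...
 *   psmi_timer_process_if_expired(&ctrl, now_cycles);
 *
 * An already-pending timer is not re-inserted by psmi_timer_request(),
 * and an expired timer is handed to its expire_callback.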
*/ #define PSMI_TIMER_PRIO_0 0ULL #define PSMI_TIMER_PRIO_1 1ULL #define PSMI_TIMER_PRIO_2 2ULL #define PSMI_TIMER_PRIO_3 3ULL #define PSMI_TIMER_PRIO_4 4ULL #define PSMI_TIMER_PRIO_LAST PSMI_TIMER_PRIO_4 #define PSMI_TIMER_INFINITE 0xFFFFFFFFFFFFFFFFULL #define PSMI_TIMER_FLAG_PENDING 0x01 /* * Timer control initialization and finalization */ psm2_error_t psmi_timer_init(struct psmi_timer_ctrl *ctrl); psm2_error_t psmi_timer_fini(struct psmi_timer_ctrl *ctrl); /* * Timer entry initialization (a timer must be initialized before it can be * added to the timer request queue). */ PSMI_ALWAYS_INLINE( void psmi_timer_entry_init(struct psmi_timer *t_init, psmi_timer_expire_callback_t expire_fn, void *context)) { t_init->flags = 0; t_init->expire_callback = expire_fn; t_init->context = context; return; } /* * Timer requests, conditional (macro) or unconditional */ #define psmi_timer_request(ctrl, t_insert, t_cyc) \ if (!((t_insert)->flags & PSMI_TIMER_FLAG_PENDING)) \ psmi_timer_request_always((ctrl), (t_insert), (t_cyc)) void psmi_timer_request_always(struct psmi_timer_ctrl *ctrl, struct psmi_timer *t_insert, uint64_t t_cyc_expire); /* * Timer cancelations, conditional (macro) only (cancel_inner is internal) */ #define psmi_timer_cancel(ctrl, t_remove) \ if ((t_remove)->flags & PSMI_TIMER_FLAG_PENDING) \ psmi_timer_cancel_inner(ctrl, t_remove) void psmi_timer_cancel_inner(struct psmi_timer_ctrl *ctrl, struct psmi_timer *t_remove); /* * Timer processing, conditional or unconditional. */ #define psmi_timer_process_if_expired(ctrl, t_cyc_expire) \ (((ctrl)->t_cyc_next_expire <= (t_cyc_expire)) ? \ psmi_timer_process_expired(ctrl, t_cyc_expire) : \ PSM2_OK_NO_PROGRESS) #define psmi_timer_is_expired(ctrl, t_cyc_expire) \ ((ctrl)->t_cyc_next_expire <= (t_cyc_expire)) psm2_error_t psmi_timer_process_expired(struct psmi_timer_ctrl *ctrl, uint64_t t_cyc_expire); #endif /* _PSMI_TIMER_H */ opa-psm2-PSM2_11.2.185/psm_user.h000066400000000000000000000441101370564314600162270ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ #ifndef _PSMI_USER_H #define _PSMI_USER_H #ifdef __cplusplus extern "C" { #endif #include "psm_config.h" #include #include #include #include #include #include #include #include "psm2.h" #include "psm2_mq.h" #include "ptl.h" #include "opa_user.h" #include "opa_queue.h" #include "psm_log.h" #include "psm_perf.h" #define PSMI_LOCK_NO_OWNER ((pthread_t)(-1)) #define _PSMI_IN_USER_H /* Opaque hw context pointer used in HAL, and defined by each HAL instance. */ typedef void *psmi_hal_hw_context; #include "psm_help.h" #include "psm_error.h" #include "psm_context.h" #include "psm_utils.h" #include "psm_timer.h" #include "psm_mpool.h" #include "psm_ep.h" #include "psm_lock.h" #include "psm_stats.h" #include "psm2_mock_testing.h" #undef _PSMI_IN_USER_H #define PSMI_VERNO_MAKE(major, minor) ((((major)&0xff)<<8)|((minor)&0xff)) #define PSMI_VERNO PSMI_VERNO_MAKE(PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR) #define PSMI_VERNO_GET_MAJOR(verno) (((verno)>>8) & 0xff) #define PSMI_VERNO_GET_MINOR(verno) (((verno)>>0) & 0xff) int psmi_verno_client(); int psmi_verno_isinteroperable(uint16_t verno); int MOCKABLE(psmi_isinitialized)(); MOCK_DCL_EPILOGUE(psmi_isinitialized); psm2_error_t psmi_poll_internal(psm2_ep_t ep, int poll_amsh); psm2_error_t psmi_mq_wait_internal(psm2_mq_req_t *ireq); int psmi_get_current_proc_location(); extern int psmi_epid_ver; extern uint32_t non_dw_mul_sdma; extern psmi_lock_t psmi_creation_lock; extern psm2_ep_t psmi_opened_endpoint; extern int psmi_affinity_shared_file_opened; extern uint64_t *shared_affinity_ptr; extern char *affinity_shm_name; extern sem_t *sem_affinity_shm_rw; extern int psmi_affinity_semaphore_open; extern char *sem_affinity_shm_rw_name; PSMI_ALWAYS_INLINE( int _psmi_get_epid_version()) { return psmi_epid_ver; } #define PSMI_EPID_VERSION_SHM 0 #define PSMI_EPID_SHM_ONLY 1 #define PSMI_EPID_IPS_SHM 0 #define PSMI_EPID_VERSION _psmi_get_epid_version() #define PSMI_MAX_EPID_VERNO_SUPPORTED 2 #define PSMI_MIN_EPID_VERNO_SUPPORTED 1 #define PSMI_EPID_VERNO_DEFAULT 2 #define PSMI_EPID_V1 1 #define PSMI_EPID_V2 2 #define PSMI_EPID_GET_LID(epid) (PSMI_EPID_VERSION == PSMI_EPID_V1) ? \ (int)PSMI_EPID_GET_LID_V1(epid) \ : (int)PSMI_EPID_GET_LID_V2(epid) #define PSMI_GET_SUBNET_ID(gid_hi) (gid_hi & 0xffff) /* * Following is the definition of various lock implementations. 
The choice is * made by defining specific lock type in relevant section of psm_config.h */ #ifdef PSMI_LOCK_IS_SPINLOCK #define _PSMI_LOCK_INIT(pl) psmi_spin_init(&((pl).lock)) #define _PSMI_LOCK_TRY(pl) psmi_spin_trylock(&((pl).lock)) #define _PSMI_LOCK(pl) psmi_spin_lock(&((pl).lock)) #define _PSMI_UNLOCK(pl) psmi_spin_unlock(&((pl).lock)) #define _PSMI_LOCK_ASSERT(pl) #define _PSMI_UNLOCK_ASSERT(pl) #define PSMI_LOCK_DISABLED 0 #elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG) PSMI_ALWAYS_INLINE( int _psmi_mutex_trylock_inner(pthread_mutex_t *mutex, const char *curloc, pthread_t *lock_owner)) { psmi_assert_always_loc(*lock_owner != pthread_self(), curloc); int ret = pthread_mutex_trylock(mutex); if (ret == 0) *lock_owner = pthread_self(); return ret; } PSMI_ALWAYS_INLINE( int _psmi_mutex_lock_inner(pthread_mutex_t *mutex, const char *curloc, pthread_t *lock_owner)) { psmi_assert_always_loc(*lock_owner != pthread_self(), curloc); int ret = pthread_mutex_lock(mutex); psmi_assert_always_loc(ret != EDEADLK, curloc); *lock_owner = pthread_self(); return ret; } PSMI_ALWAYS_INLINE( void _psmi_mutex_unlock_inner(pthread_mutex_t *mutex, const char *curloc, pthread_t *lock_owner)) { psmi_assert_always_loc(*lock_owner == pthread_self(), curloc); *lock_owner = PSMI_LOCK_NO_OWNER; psmi_assert_always_loc(pthread_mutex_unlock(mutex) != EPERM, curloc); return; } #define _PSMI_LOCK_INIT(pl) /* static initialization */ #define _PSMI_LOCK_TRY(pl) \ _psmi_mutex_trylock_inner(&((pl).lock), PSMI_CURLOC, \ &((pl).lock_owner)) #define _PSMI_LOCK(pl) \ _psmi_mutex_lock_inner(&((pl).lock), PSMI_CURLOC, \ &((pl).lock_owner)) #define _PSMI_UNLOCK(pl) \ _psmi_mutex_unlock_inner(&((pl).lock), PSMI_CURLOC, \ &((pl).lock_owner)) #define _PSMI_LOCK_ASSERT(pl) \ psmi_assert_always((pl).lock_owner == pthread_self()); #define _PSMI_UNLOCK_ASSERT(pl) \ psmi_assert_always((pl).lock_owner != pthread_self()); #define PSMI_LOCK_DISABLED 0 #elif defined(PSMI_LOCK_IS_MUTEXLOCK) #define _PSMI_LOCK_INIT(pl) /* static initialization */ #define _PSMI_LOCK_TRY(pl) pthread_mutex_trylock(&((pl).lock)) #define _PSMI_LOCK(pl) pthread_mutex_lock(&((pl).lock)) #define _PSMI_UNLOCK(pl) pthread_mutex_unlock(&((pl).lock)) #define PSMI_LOCK_DISABLED 0 #define _PSMI_LOCK_ASSERT(pl) #define _PSMI_UNLOCK_ASSERT(pl) #elif defined(PSMI_PLOCK_IS_NOLOCK) #define _PSMI_LOCK_TRY(pl) 0 /* 0 *only* so progress thread never succeeds */ #define _PSMI_LOCK(pl) #define _PSMI_UNLOCK(pl) #define PSMI_LOCK_DISABLED 1 #define _PSMI_LOCK_ASSERT(pl) #define _PSMI_UNLOCK_ASSERT(pl) #else #error No LOCK lock type declared #endif #define PSMI_YIELD(pl) \ do { _PSMI_UNLOCK((pl)); sched_yield(); _PSMI_LOCK((pl)); } while (0) #ifdef PSM2_MOCK_TESTING /* If this is a mocking tests build, all the operations on the locks * are routed through functions which may be mocked, if necessary. 
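 *
 * Call sites look the same in either build; for example (the lock member
 * shown is illustrative):
 *
 *   PSMI_LOCK(mq->progress_lock);
 *   ... critical section ...
 *   PSMI_UNLOCK(mq->progress_lock);
 *
 * which a PSM2_MOCK_TESTING build can interpose on through the functions
 * declared below.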
*/ void MOCKABLE(psmi_mockable_lock_init)(psmi_lock_t *pl); MOCK_DCL_EPILOGUE(psmi_mockable_lock_init); int MOCKABLE(psmi_mockable_lock_try)(psmi_lock_t *pl); MOCK_DCL_EPILOGUE(psmi_mockable_lock_try); void MOCKABLE(psmi_mockable_lock)(psmi_lock_t *pl); MOCK_DCL_EPILOGUE(psmi_mockable_lock); void MOCKABLE(psmi_mockable_unlock)(psmi_lock_t *pl); MOCK_DCL_EPILOGUE(psmi_mockable_unlock); void MOCKABLE(psmi_mockable_lock_assert)(psmi_lock_t *pl); MOCK_DCL_EPILOGUE(psmi_mockable_lock_assert); void MOCKABLE(psmi_mockable_unlock_assert)(psmi_lock_t *pl); MOCK_DCL_EPILOGUE(psmi_mockable_unlock_assert); #define PSMI_LOCK_INIT(pl) psmi_mockable_lock_init(&(pl)) #define PSMI_LOCK_TRY(pl) psmi_mockable_lock_try(&(pl)) #define PSMI_LOCK(pl) psmi_mockable_lock(&(pl)) #define PSMI_UNLOCK(pl) psmi_mockable_unlock(&(pl)) #define PSMI_LOCK_ASSERT(pl) psmi_mockable_lock_assert(&(pl)) #define PSMI_UNLOCK_ASSERT(pl) psmi_mockable_unlock_assert(&(pl)) #else #define PSMI_LOCK_INIT(pl) _PSMI_LOCK_INIT(pl) #define PSMI_LOCK_TRY(pl) _PSMI_LOCK_TRY(pl) #define PSMI_LOCK(pl) _PSMI_LOCK(pl) #define PSMI_UNLOCK(pl) _PSMI_UNLOCK(pl) #define PSMI_LOCK_ASSERT(pl) _PSMI_LOCK_ASSERT(pl) #define PSMI_UNLOCK_ASSERT(pl) _PSMI_UNLOCK_ASSERT(pl) #endif #ifdef PSM_PROFILE void psmi_profile_block() __attribute__ ((weak)); void psmi_profile_unblock() __attribute__ ((weak)); void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak)); #define PSMI_PROFILE_BLOCK() psmi_profile_block() #define PSMI_PROFILE_UNBLOCK() psmi_profile_unblock() #define PSMI_PROFILE_REBLOCK(noprog) psmi_profile_reblock(noprog) #else #define PSMI_PROFILE_BLOCK() #define PSMI_PROFILE_UNBLOCK() #define PSMI_PROFILE_REBLOCK(noprog) #endif #ifdef PSM_CUDA #include #include #if CUDA_VERSION < 7000 #error Please update CUDA driver, required minimum version is 7.0 #endif extern int is_cuda_enabled; extern int is_gdr_copy_enabled; extern int device_support_gpudirect; extern int gpu_p2p_supported; extern int my_gpu_device; extern int cuda_lib_version; extern CUcontext ctxt; extern void *psmi_cuda_lib; extern CUresult (*psmi_cuInit)(unsigned int Flags ); extern CUresult (*psmi_cuCtxDetach)(CUcontext c); extern CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c); extern CUresult (*psmi_cuCtxSetCurrent)(CUcontext c); extern CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); extern CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); extern CUresult (*psmi_cuDeviceCanAccessPeer)(int *canAccessPeer, CUdevice dev, CUdevice peerDev); extern CUresult (*psmi_cuDeviceGet)(CUdevice* device, int ordinal); extern CUresult (*psmi_cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev); extern CUresult (*psmi_cuDriverGetVersion)(int* driverVersion); extern CUresult (*psmi_cuDeviceGetCount)(int* count); extern CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags); extern CUresult (*psmi_cuStreamDestroy)(CUstream phStream); extern CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags); extern CUresult (*psmi_cuEventDestroy)(CUevent hEvent); extern CUresult (*psmi_cuEventQuery)(CUevent hEvent); extern CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); extern CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); extern CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); extern CUresult (*psmi_cuMemFreeHost)(void* p); extern CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); extern 
CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); extern CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); extern CUresult (*psmi_cuMemcpyHtoD)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); extern CUresult (*psmi_cuMemcpyDtoHAsync)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); extern CUresult (*psmi_cuMemcpyHtoDAsync)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream); extern CUresult (*psmi_cuIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr); extern CUresult (*psmi_cuIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags); extern CUresult (*psmi_cuIpcCloseMemHandle)(CUdeviceptr dptr); extern CUresult (*psmi_cuMemGetAddressRange)(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr); extern CUresult (*psmi_cuDevicePrimaryCtxGetState)(CUdevice dev, unsigned int* flags, int* active); extern CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev); extern CUresult (*psmi_cuCtxGetDevice)(CUdevice* device); extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); #define PSMI_CUDA_CALL(func, args...) do { \ CUresult cudaerr; \ cudaerr = psmi_##func(args); \ if (cudaerr != CUDA_SUCCESS) { \ if (ctxt == NULL) \ _HFI_ERROR( \ "Check if CUDA is initialized " \ "before psm2_ep_open call\n"); \ _HFI_ERROR( \ "CUDA failure: %s() (at %s:%d) " \ "returned %d\n", \ #func, __FILE__, __LINE__, cudaerr); \ psmi_handle_error( \ PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ "Error returned from CUDA function.\n");\ } \ } while (0) /** * Similar to PSMI_CUDA_CALL() except does not error out * if func(args) returns CUDA_SUCCESS or except_err * * Invoker must provide 'CUresult cudaerr' in invoked scope * so invoker can inspect whether cudaerr == CUDA_SUCCESS or * cudaerr == except_err after expanded code is executed. * * As except_err is an allowed value, message is printed at * DBG level. */ #define PSMI_CUDA_CALL_EXCEPT(except_err, func, args...) \
do { \ cudaerr = psmi_##func(args); \ if (cudaerr != CUDA_SUCCESS && cudaerr != except_err) { \ if (ctxt == NULL) \ _HFI_ERROR( \ "Check if CUDA is initialized " \ "before psm2_ep_open call\n"); \ _HFI_ERROR( \ "CUDA failure: %s() (at %s:%d) " \ "returned %d\n", \ #func, __FILE__, __LINE__, cudaerr); \ psmi_handle_error( \ PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ "Error returned from CUDA function.\n");\ } else if (cudaerr == except_err) { \ _HFI_DBG( \ "CUDA non-zero return value: %s() (at %s:%d) " \ "returned %d\n", \ #func, __FILE__, __LINE__, cudaerr); \ } \ } while (0) #define PSMI_CUDA_CHECK_EVENT(event, cudaerr) do { \ cudaerr = psmi_cuEventQuery(event); \ if ((cudaerr != CUDA_SUCCESS) && \ (cudaerr != CUDA_ERROR_NOT_READY)) { \ _HFI_ERROR( \ "CUDA failure: %s() returned %d\n", \ "cuEventQuery", cudaerr); \ psmi_handle_error( \ PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ "Error returned from CUDA function.\n");\ } \ } while (0) #define PSMI_CUDA_DLSYM(psmi_cuda_lib,func) do { \ psmi_##func = dlsym(psmi_cuda_lib, STRINGIFY(func)); \ if (!psmi_##func) { \ psmi_handle_error(PSMI_EP_NORETURN, \ PSM2_INTERNAL_ERR, \ " Unable to resolve %s symbol" \ " in CUDA libraries.\n",STRINGIFY(func));\ } \ } while (0) PSMI_ALWAYS_INLINE( int _psmi_is_cuda_mem(const void *ptr)) { CUresult cres; CUmemorytype mt; unsigned uvm = 0; cres = psmi_cuPointerGetAttribute( &mt, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr) ptr); if ((cres == CUDA_SUCCESS) && (mt == CU_MEMORYTYPE_DEVICE)) { cres = psmi_cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr); if ((cres == CUDA_SUCCESS) && (uvm == 0)) return 1; else return 0; } else return 0; } #define PSMI_IS_CUDA_ENABLED likely(is_cuda_enabled) #define PSMI_IS_CUDA_DISABLED unlikely(!is_cuda_enabled) PSMI_ALWAYS_INLINE( int _psmi_is_gdr_copy_enabled()) { return is_gdr_copy_enabled; } #define PSMI_IS_GDR_COPY_ENABLED _psmi_is_gdr_copy_enabled() #define PSMI_IS_CUDA_MEM(p) _psmi_is_cuda_mem(p) struct ips_cuda_hostbuf { STAILQ_ENTRY(ips_cuda_hostbuf) req_next; STAILQ_ENTRY(ips_cuda_hostbuf) next; uint32_t size, offset, bytes_read; /* This flag indicates whether a chb is * pulled from an mpool or dynamically * allocated using calloc. */ uint8_t is_tempbuf; CUevent copy_status; psm2_mq_req_t req; void *host_buf; CUdeviceptr gpu_buf; }; struct ips_cuda_hostbuf_mpool_cb_context { unsigned bufsz; }; void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj); #define CUDA_HOSTBUFFER_LIMITS { \ .env = "PSM_CUDA_BOUNCEBUFFERS_MAX", \ .descr = "Max CUDA bounce buffers (in MB)", \ .env_level = PSMI_ENVVAR_LEVEL_HIDDEN, \ .minval = 1, \ .maxval = 1<<30, \ .mode[PSMI_MEMMODE_NORMAL] = { 16, 256 }, \ .mode[PSMI_MEMMODE_MINIMAL] = { 1, 1 }, \ .mode[PSMI_MEMMODE_LARGE] = { 32, 512 } \ } extern uint32_t gpudirect_send_threshold; extern uint32_t gpudirect_recv_threshold; extern uint32_t cuda_thresh_rndv; /* This threshold dictates when the sender turns off * GDR Copy. The threshold needs to be less than * CUDA RNDV threshold. */ extern uint32_t gdr_copy_threshold_send; /* This threshold dictates when the receiver turns off * GDR Copy. The threshold needs to be less than * CUDA RNDV threshold.
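 *
 * Worked example with hypothetical values: if cuda_thresh_rndv were 8192
 * and gdr_copy_threshold_recv were 64, a 48-byte receive landing in GPU
 * memory would be eligible for GDR copy, a 1 KB receive would take the
 * regular eager path, and a 64 KB receive would go rendezvous.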
*/ extern uint32_t gdr_copy_threshold_recv; #define PSMI_USE_GDR_COPY(req, len) req->is_buf_gpu_mem && \ PSMI_IS_GDR_COPY_ENABLED && \ len >=1 && len <= gdr_copy_threshold_recv enum psm2_chb_match_type { /* Complete data found in a single chb */ PSMI_CUDA_FULL_MATCH_FOUND = 0, /* Data is spread across two chb's */ PSMI_CUDA_SPLIT_MATCH_FOUND = 1, /* Data is only partially prefetched */ PSMI_CUDA_PARTIAL_MATCH_FOUND = 2, PSMI_CUDA_CONTINUE = 3 }; typedef enum psm2_chb_match_type psm2_chb_match_type_t; /* * CUDA documentation dictates the use of SYNC_MEMOPS attribute * when the buffer pointer received into PSM has been allocated * by the application. This guarantees that all memory operations * to this region of memory (used by multiple layers of the stack) * always synchronize. */ static inline void psmi_cuda_set_attr_sync_memops(const void *ubuf) { int true_flag = 1; PSMI_CUDA_CALL(cuPointerSetAttribute, &true_flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr) ubuf); } #endif /* PSM_CUDA */ #define COMPILE_TIME_ASSERT(NAME,COND) extern char NAME[1/ COND] #ifdef __cplusplus } /* extern "C" */ #endif #endif /* _PSMI_USER_H */ opa-psm2-PSM2_11.2.185/psm_utils.c000066400000000000000000002157071370564314600164200ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include /* gethostbyname */ #include /* malloc_usable_size */ #include "psm_user.h" #include "psm2_hal.h" #include "psm_am_internal.h" #include "psm_mq_internal.h" int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid); struct psmi_epid_table psmi_epid_table; /* Iterator to access the epid table. * 'ep' can be NULL if remote endpoints from all endpoint handles are requested */ void psmi_epid_itor_init(struct psmi_eptab_iterator *itor, psm2_ep_t ep) { itor->i = 0; itor->ep = ep; pthread_mutex_lock(&psmi_epid_table.tablock); } void *psmi_epid_itor_next(struct psmi_eptab_iterator *itor) { int i; struct psmi_epid_tabentry *e; if (itor->i >= psmi_epid_table.tabsize) return NULL; for (i = itor->i; i < psmi_epid_table.tabsize; i++) { e = &psmi_epid_table.table[i]; if (!e->entry || e->entry == EPADDR_DELETED) continue; if (itor->ep && e->ep != itor->ep) continue; itor->i = i + 1; return e->entry; } itor->i = psmi_epid_table.tabsize; /* put at end of table */ return NULL; } void psmi_epid_itor_fini(struct psmi_eptab_iterator *itor) { pthread_mutex_unlock(&psmi_epid_table.tablock); itor->i = 0; } #define mix64(a, b, c) \ { \ a -= b; a -= c; a ^= (c>>43); \ b -= c; b -= a; b ^= (a<<9); \ c -= a; c -= b; c ^= (b>>8); \ a -= b; a -= c; a ^= (c>>38); \ b -= c; b -= a; b ^= (a<<23); \ c -= a; c -= b; c ^= (b>>5); \ a -= b; a -= c; a ^= (c>>35); \ b -= c; b -= a; b ^= (a<<49); \ c -= a; c -= b; c ^= (b>>11); \ a -= b; a -= c; a ^= (c>>12); \ b -= c; b -= a; b ^= (a<<18); \ c -= a; c -= b; c ^= (b>>22); \ } psm2_error_t psmi_epid_init() { pthread_mutexattr_t attr; psmi_epid_table.table = NULL, psmi_epid_table.tabsize = 0; psmi_epid_table.tabsize_used = 0; pthread_mutexattr_init(&attr); pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); pthread_mutex_init(&psmi_epid_table.tablock, &attr); pthread_mutexattr_destroy(&attr); return PSM2_OK; }; psm2_error_t psmi_epid_fini() { if (psmi_epid_table.table != NULL) { psmi_free(psmi_epid_table.table); psmi_epid_table.table = NULL; } psmi_epid_table.tabsize = 0; psmi_epid_table.tabsize_used = 0; return PSM2_OK; } PSMI_ALWAYS_INLINE( uint64_t hash_this(const psm2_ep_t ep, const psm2_epid_t epid)) { uint64_t ep_i = (uint64_t) (uintptr_t) ep; uint64_t epid_i = (uint64_t) epid; uint64_t hash = 0x9e3779b97f4a7c13LL; mix64(ep_i, epid_i, hash); return hash; } PSMI_ALWAYS_INLINE( void * psmi_epid_lookup_inner(psm2_ep_t ep, psm2_epid_t epid, int remove)) { uint64_t key = hash_this(ep, epid); struct psmi_epid_tabentry *e; void *entry = NULL; int idx; pthread_mutex_lock(&psmi_epid_table.tablock); if (!psmi_epid_table.table) goto ret; idx = (int)(key % psmi_epid_table.tabsize); while (psmi_epid_table.table[idx].entry != NULL) { /* An epid can be added twice if there's more than one opened endpoint, * but really we match on epid *and* on endpoint */ e = &psmi_epid_table.table[idx]; if (e->entry != EPADDR_DELETED && e->key == key) { entry = e->entry; if (remove) psmi_epid_table.table[idx].entry = EPADDR_DELETED; goto ret; } if (++idx == psmi_epid_table.tabsize) idx = 0; } ret: pthread_mutex_unlock(&psmi_epid_table.tablock); return entry; } void *psmi_epid_lookup(psm2_ep_t ep, psm2_epid_t epid) { void *entry = psmi_epid_lookup_inner(ep, epid, 0); if (PSMI_EP_HOSTNAME != ep) _HFI_VDBG("lookup of (%p,%" PRIx64 ") returns %p\n", ep, epid, entry); return entry; } void *psmi_epid_remove(psm2_ep_t ep, psm2_epid_t epid) { if (PSMI_EP_HOSTNAME != ep) _HFI_VDBG("remove of (%p,%" PRIx64 ")\n", ep, epid); return psmi_epid_lookup_inner(ep, epid, 1); } void 
psmi_epid_remove_all(psm2_ep_t ep) { size_t i; struct psmi_epid_tabentry *e; pthread_mutex_lock(&psmi_epid_table.tablock); for (i = 0; i < psmi_epid_table.tabsize; i++) { e = &psmi_epid_table.table[i]; if (e->entry == NULL || e->entry == EPADDR_DELETED) continue; if (e->ep == ep) { /* unspecified fields implicitly zeroed */ *e = (struct psmi_epid_tabentry) { .entry = EPADDR_DELETED }; } } pthread_mutex_unlock(&psmi_epid_table.tablock); } psm2_error_t psmi_epid_add(psm2_ep_t ep, psm2_epid_t epid, void *entry) { uint64_t key; int idx, i, newsz; struct psmi_epid_tabentry *e; psm2_error_t err = PSM2_OK; if (PSMI_EP_HOSTNAME != ep) _HFI_VDBG("add of (%p,%" PRIx64 ") with entry %p\n", ep, epid, entry); pthread_mutex_lock(&psmi_epid_table.tablock); /* Leave this here, mostly for sanity and for the fact that the epid * table is currently not used in the critical path */ if (++psmi_epid_table.tabsize_used > (int)(psmi_epid_table.tabsize * PSMI_EPID_TABLOAD_FACTOR)) { struct psmi_epid_tabentry *newtab; newsz = psmi_epid_table.tabsize + PSMI_EPID_TABSIZE_CHUNK; newtab = (struct psmi_epid_tabentry *) psmi_calloc(ep, PER_PEER_ENDPOINT, newsz, sizeof(struct psmi_epid_tabentry)); if (newtab == NULL) { err = PSM2_NO_MEMORY; goto fail; } if (psmi_epid_table.table) { /* rehash the table */ for (i = 0; i < psmi_epid_table.tabsize; i++) { e = &psmi_epid_table.table[i]; if (e->entry == NULL) continue; /* When rehashing, mark deleted as free again */ if (e->entry == EPADDR_DELETED) { psmi_epid_table.tabsize_used--; continue; } idx = (int)(e->key % newsz); while (newtab[idx].entry != NULL) if (++idx == newsz) idx = 0; newtab[idx].entry = e->entry; newtab[idx].key = e->key; newtab[idx].ep = e->ep; newtab[idx].epid = e->epid; } psmi_free(psmi_epid_table.table); } psmi_epid_table.table = newtab; psmi_epid_table.tabsize = newsz; } key = hash_this(ep, epid); idx = (int)(key % psmi_epid_table.tabsize); e = &psmi_epid_table.table[idx]; while (e->entry && e->entry != EPADDR_DELETED) { if (++idx == psmi_epid_table.tabsize) idx = 0; e = &psmi_epid_table.table[idx]; } e->entry = entry; e->key = key; e->epid = epid; e->ep = ep; fail: pthread_mutex_unlock(&psmi_epid_table.tablock); return err; } static psmi_lock_t psmi_gethostname_lock; static void __attribute__ ((constructor)) __psmi_gethostname_lock_constructor(void) { psmi_init_lock(&psmi_gethostname_lock); } char *psmi_gethostname(void) { static char hostname[80] = { '\0' }; char *c; if (hostname[0] == '\0') { PSMI_LOCK(psmi_gethostname_lock); /* CRITICAL SECTION START */ if (hostname[0] == '\0') { gethostname(hostname, sizeof(hostname)); hostname[sizeof(hostname) - 1] = '\0'; /* no guarantee of nul termination */ if ((c = strchr(hostname, '.'))) *c = '\0'; } PSMI_UNLOCK(psmi_gethostname_lock); /* CRITICAL SECTION END */ } return hostname; } /* * Hostname stuff. We really only register the network portion of the epid * since all epids from the same nid are assumed to have the same hostname. 
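 *
 * Intended use, as a sketch (the hostname literal is illustrative): once
 * a peer's epid is known, register its hostname against the nid so that
 * later diagnostics can name the node:
 *
 *   psmi_epid_set_hostname(psm2_epid_nid(epid), "node042", 0);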
*/ psm2_error_t psmi_epid_set_hostname(uint64_t nid, const char *hostname, int overwrite) { size_t hlen; char *h; psm2_error_t err = PSM2_OK; if (hostname == NULL) return PSM2_OK; /* First see if a hostname already exists */ if ((h = psmi_epid_lookup(PSMI_EP_HOSTNAME, nid)) != NULL) { if (!overwrite) return PSM2_OK; h = psmi_epid_remove(PSMI_EP_HOSTNAME, nid); if (h != NULL) /* free the previous hostname if so exists */ psmi_free(h); } hlen = min(PSMI_EP_HOSTNAME_LEN, strlen(hostname) + 1); h = (char *)psmi_malloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, hlen); if (h == NULL) return PSM2_NO_MEMORY; snprintf(h, hlen, "%s", hostname); h[hlen - 1] = '\0'; err = psmi_epid_add(PSMI_EP_HOSTNAME, nid, h); return err; } /* XXX These two functions are not thread safe, we'll use a rotating buffer * trick whenever we need to make them thread safe */ const char *psmi_epaddr_get_hostname(psm2_epid_t epid) { static char hostnamebufs[4][PSMI_EP_HOSTNAME_LEN]; static int bufno; uint64_t nid = psm2_epid_nid(epid); char *h, *hostname; hostname = hostnamebufs[bufno]; bufno = (bufno + 1) % 4; /* First, if we have registered a host for this epid, just return that, or * else try to return something with lid and context */ h = psmi_epid_lookup(PSMI_EP_HOSTNAME, nid); if (h != NULL) return h; else { snprintf(hostname, PSMI_EP_HOSTNAME_LEN - 1, "LID=%d:%d.%d", (int)PSMI_EPID_GET_LID(epid), (int)PSMI_EPID_GET_CONTEXT(epid), (int)PSMI_EPID_GET_SUBCONTEXT(epid)); hostname[PSMI_EP_HOSTNAME_LEN - 1] = '\0'; return hostname; } } /* This one gives the hostname with a lid */ const char *psmi_epaddr_get_name(psm2_epid_t epid) { static char hostnamebufs[4][PSMI_EP_HOSTNAME_LEN]; static int bufno; char *h, *hostname; hostname = hostnamebufs[bufno]; bufno = (bufno + 1) % 4; h = psmi_epid_lookup(PSMI_EP_HOSTNAME, psm2_epid_nid(epid)); if (h == NULL) return psmi_epaddr_get_hostname(epid); else { snprintf(hostname, PSMI_EP_HOSTNAME_LEN - 1, "%s (LID=%d:%d.%d)", h, (int)PSMI_EPID_GET_LID(epid), (int)PSMI_EPID_GET_CONTEXT(epid), (int)PSMI_EPID_GET_SUBCONTEXT(epid)); hostname[PSMI_EP_HOSTNAME_LEN - 1] = '\0'; } return hostname; } /* Wrapper, in case we port to OS xyz that doesn't have sysconf */ uintptr_t psmi_getpagesize(void) { static uintptr_t pagesz = (uintptr_t) -1; long sz; if (pagesz != (uintptr_t) -1) return pagesz; sz = sysconf(_SC_PAGESIZE); if (sz == -1) { psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Can't query system page size"); } pagesz = (uintptr_t) sz; return pagesz; } /* If PSM2_VERBOSE_ENV is set in the environment, we determine * what its verbose level is and print the environment at "INFO" * level if the environment's level matches the desired printlevel. */ static int psmi_getenv_verblevel = -1; static int psmi_getenv_is_verblevel(int printlevel) { if (psmi_getenv_verblevel == -1) { char *env = getenv("PSM2_VERBOSE_ENV"); if (env && *env) { char *ep; int val = (int)strtol(env, &ep, 0); if (ep == env) psmi_getenv_verblevel = 0; else if (val == 2) psmi_getenv_verblevel = 2; else psmi_getenv_verblevel = 1; } else psmi_getenv_verblevel = 0; } return (printlevel <= psmi_getenv_verblevel); } #define GETENV_PRINTF(_level, _fmt, ...) 
\ do { \ if ((_level & PSMI_ENVVAR_LEVEL_NEVER_PRINT) == 0) \ { \ int nlevel = _level; \ if (psmi_getenv_is_verblevel(nlevel)) \ nlevel = 0; \ _HFI_ENVDBG(nlevel, _fmt, ##__VA_ARGS__); \ } \ } while (0) int MOCKABLE(psmi_getenv)(const char *name, const char *descr, int level, int type, union psmi_envvar_val defval, union psmi_envvar_val *newval) { int used_default = 0; union psmi_envvar_val tval; char *env = getenv(name); #if _HFI_DEBUGGING int ishex = (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS || type == PSMI_ENVVAR_TYPE_UINT_FLAGS); #endif /* If we're not using the default, always reset the print * level to '1' so the changed value gets seen at low * verbosity */ #define _GETENV_PRINT(used_default, fmt, val, defval) \ do { \ if (used_default) \ GETENV_PRINTF(level, "%s%-25s %-40s =>%s" fmt \ "\n", level > 1 ? "*" : " ", name, \ descr, ishex ? "0x" : " ", val); \ else \ GETENV_PRINTF(1, "%s%-25s %-40s =>%s" \ fmt " (default was%s" fmt ")\n", \ level > 1 ? "*" : " ", name, descr, \ ishex ? " 0x" : " ", val, \ ishex ? " 0x" : " ", defval); \ } while (0) /* _CONSUMED_ALL() is a macro which indicates if strtol() consumed all of the input passed to it. */ #define _CONSUMED_ALL(CHAR_PTR) (((CHAR_PTR) != NULL) && (*(CHAR_PTR) == 0)) #define _CONVERT_TO_NUM(DEST,TYPE,STRTOL) \ do { \ char *ep; \ /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ \ DEST = (TYPE)STRTOL(env, &ep, 10); \ if (! _CONSUMED_ALL(ep)) { \ DEST = (TYPE)STRTOL(env, &ep, 16); \ if (! _CONSUMED_ALL(ep)) { \ used_default = 1; \ tval = defval; \ } \ } \ } while (0) switch (type) { case PSMI_ENVVAR_TYPE_YESNO: if (!env || *env == '\0') { tval = defval; used_default = 1; } else if (env[0] == 'Y' || env[0] == 'y') tval.e_int = 1; else if (env[0] == 'N' || env[0] == 'n') tval.e_int = 0; else { char *ep; tval.e_ulong = strtoul(env, &ep, 0); if (ep == env) { used_default = 1; tval = defval; } else if (tval.e_ulong != 0) tval.e_ulong = 1; } _GETENV_PRINT(used_default, "%s", tval.e_long ? "YES" : "NO", defval.e_int ? 
"YES" : "NO"); break; case PSMI_ENVVAR_TYPE_STR: if (!env || *env == '\0') { tval = defval; used_default = 1; } else tval.e_str = env; _GETENV_PRINT(used_default, "%s", tval.e_str, defval.e_str); break; case PSMI_ENVVAR_TYPE_INT: if (!env || *env == '\0') { tval = defval; used_default = 1; } else { _CONVERT_TO_NUM(tval.e_int,int,strtol); } _GETENV_PRINT(used_default, "%d", tval.e_int, defval.e_int); break; case PSMI_ENVVAR_TYPE_UINT: case PSMI_ENVVAR_TYPE_UINT_FLAGS: if (!env || *env == '\0') { tval = defval; used_default = 1; } else { _CONVERT_TO_NUM(tval.e_int,unsigned int,strtoul); } if (type == PSMI_ENVVAR_TYPE_UINT_FLAGS) _GETENV_PRINT(used_default, "%x", tval.e_uint, defval.e_uint); else _GETENV_PRINT(used_default, "%u", tval.e_uint, defval.e_uint); break; case PSMI_ENVVAR_TYPE_LONG: if (!env || *env == '\0') { tval = defval; used_default = 1; } else { _CONVERT_TO_NUM(tval.e_long,long,strtol); } _GETENV_PRINT(used_default, "%ld", tval.e_long, defval.e_long); break; case PSMI_ENVVAR_TYPE_ULONG_ULONG: if (!env || *env == '\0') { tval = defval; used_default = 1; } else { _CONVERT_TO_NUM(tval.e_ulonglong,unsigned long long,strtoull); } _GETENV_PRINT(used_default, "%llu", tval.e_ulonglong, defval.e_ulonglong); break; case PSMI_ENVVAR_TYPE_ULONG: case PSMI_ENVVAR_TYPE_ULONG_FLAGS: default: if (!env || *env == '\0') { tval = defval; used_default = 1; } else { _CONVERT_TO_NUM(tval.e_ulong,unsigned long,strtoul); } if (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS) _GETENV_PRINT(used_default, "%lx", tval.e_ulong, defval.e_ulong); else _GETENV_PRINT(used_default, "%lu", tval.e_ulong, defval.e_ulong); break; } #undef _GETENV_PRINT *newval = tval; return used_default; } MOCK_DEF_EPILOGUE(psmi_getenv); /* * Parsing int parameters set in string tuples. * Output array int *vals should be able to store 'ntup' elements. * Values are only overwritten if they are parsed. * Tuples are always separated by colons ':' */ int psmi_parse_str_tuples(const char *string, int ntup, int *vals) { char *b = (char *)string; char *e = b; int tup_i = 0; int n_parsed = 0; char *buf = psmi_strdup(NULL, string); psmi_assert_always(buf != NULL); while (*e && tup_i < ntup) { b = e; while (*e && *e != ':') e++; if (e > b) { /* something to parse */ char *ep; int len = e - b; long int l; strncpy(buf, b, len); buf[len] = '\0'; l = strtol(buf, &ep, 0); if (ep != buf) { /* successful conversion */ vals[tup_i] = (int)l; n_parsed++; } } if (*e == ':') e++; /* skip delimiter */ tup_i++; } psmi_free(buf); return n_parsed; } /* * Memory footprint/usage mode. * * This can be used for debug or for separating large installations from * small/medium ones. The default is to assume a medium installation. Large * is not that much larger in memory footprint, but we make a conscious effort * an consuming only the amount of memory we need. 
*/ int psmi_parse_memmode(void) { union psmi_envvar_val env_mmode; int used_default = psmi_getenv("PSM2_MEMORY", "Memory usage mode (normal or large)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)"normal", &env_mmode); if (used_default || !strcasecmp(env_mmode.e_str, "normal")) return PSMI_MEMMODE_NORMAL; else if (!strcasecmp(env_mmode.e_str, "min")) return PSMI_MEMMODE_MINIMAL; else if (!strcasecmp(env_mmode.e_str, "large") || !strcasecmp(env_mmode.e_str, "big")) return PSMI_MEMMODE_LARGE; else { _HFI_PRDBG("PSM2_MEMORY env value %s unrecognized, " "using 'normal' memory mode instead\n", env_mmode.e_str); return PSMI_MEMMODE_NORMAL; } } static const char *psmi_memmode_string(int mode) { psmi_assert(mode >= PSMI_MEMMODE_NORMAL && mode < PSMI_MEMMODE_NUM); switch (mode) { case PSMI_MEMMODE_NORMAL: return "normal"; case PSMI_MEMMODE_MINIMAL: return "minimal"; case PSMI_MEMMODE_LARGE: return "large"; default: return "unknown"; } } psm2_error_t psmi_parse_mpool_env(const psm2_mq_t mq, int level, const struct psmi_rlimit_mpool *rlim, uint32_t *valo, uint32_t *chunkszo) { uint32_t val; const char *env = rlim->env; int mode = mq->memmode; psm2_error_t err = PSM2_OK; union psmi_envvar_val env_val; psmi_assert_always(mode >= PSMI_MEMMODE_NORMAL && mode < PSMI_MEMMODE_NUM); psmi_getenv(rlim->env, rlim->descr, rlim->env_level, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)rlim->mode[mode].obj_max, &env_val); val = env_val.e_uint; if (val < rlim->minval || val > rlim->maxval) { err = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Env. var %s=%u is invalid (valid settings in mode PSM2_MEMORY=%s" " are inclusively between %u and %u)", env, val, psmi_memmode_string(mode), rlim->minval, rlim->maxval); goto fail; } _HFI_VDBG("%s max=%u,chunk=%u (mode=%s(%u),min=%u,max=%u)\n", env, val, rlim->mode[mode].obj_chunk, psmi_memmode_string(mode), mode, rlim->minval, rlim->maxval); *valo = val; *chunkszo = rlim->mode[mode].obj_chunk; fail: return err; } uint64_t psmi_cycles_left(uint64_t start_cycles, int64_t timeout_ns) { if (timeout_ns < 0) return 0ULL; else if (timeout_ns == 0ULL || timeout_ns == ~0ULL) return ~0ULL; else { uint64_t t_end = nanosecs_to_cycles(timeout_ns); uint64_t t_now = get_cycles() - start_cycles; if (t_now >= t_end) return 0ULL; else return (t_end - t_now); } } uint32_t psmi_get_ipv4addr() { struct hostent *he; uint32_t addr = 0; he = gethostbyname(psmi_gethostname()); if (he != NULL && he->h_addrtype == AF_INET && he->h_addr != NULL) { memcpy(&addr, he->h_addr, sizeof(uint32_t)); return addr; } else return 0; } #define PSMI_EP_IS_PTR(ptr) ((ptr) != NULL && (ptr) < PSMI_EP_LOGEVENT) void psmi_syslog(psm2_ep_t ep, int to_console, int level, const char *format, ...) { va_list ap; /* If we've never syslogged anything from this ep at the PSM level, make * sure we log context information */ if (PSMI_EP_IS_PTR(ep) && !ep->did_syslog) { char uuid_str[64]; ep->did_syslog = 1; memset(&uuid_str, 0, sizeof(uuid_str)); psmi_uuid_unparse(ep->uuid, uuid_str); hfi_syslog("PSM", 0, LOG_WARNING, "uuid_key=%s,unit=%d,context=%d,subcontext=%d", uuid_str, psmi_hal_get_unit_id(ep->context.psm_hw_ctxt), psmi_hal_get_context(ep->context.psm_hw_ctxt), psmi_hal_get_subctxt(ep->context.psm_hw_ctxt)); } va_start(ap, format); hfi_vsyslog("PSM", to_console, level, format, ap); va_end(ap); } /* Table of CRCs of all 8-bit messages. */ static uint32_t crc_table[256]; /* Flag: has the table been computed? Initially false. */ static int crc_table_computed; /* Make the table for a fast CRC. 
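 * (A usage example of the resulting psmi_crc() routine follows.)
 */

/*
 * Illustrative usage (not part of PSM2): psmi_crc() computes the
 * standard reflected CRC-32 (polynomial 0xedb88320, initial value and
 * final xor 0xffffffff), so it matches zlib's crc32() for the same
 * bytes.  ex_crc_of_string is invented for this example:
 */
uint32_t psmi_crc(unsigned char *buf, int len);	/* defined below */

static inline uint32_t ex_crc_of_string(const char *s)
{
	/* e.g. ex_crc_of_string("123456789") == 0xcbf43926 */
	return psmi_crc((unsigned char *)s, (int)strlen(s));
}

/* (end of illustrative example; the table builder follows)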
*/ static void make_crc_table(void) { uint32_t c; int n, k; for (n = 0; n < 256; n++) { c = (uint32_t) n; for (k = 0; k < 8; k++) { if (c & 1) c = 0xedb88320 ^ (c >> 1); else c = c >> 1; } crc_table[n] = c; } crc_table_computed = 1; } /* Update a running CRC with the bytes buf[0..len-1]--the CRC * should be initialized to all 1's, and the transmitted value * is the 1's complement of the final running CRC (see the * crc() routine below)). */ static uint32_t update_crc(uint32_t crc, unsigned char *buf, int len) { uint32_t c = crc; int n; if_pf(!crc_table_computed) make_crc_table(); for (n = 0; n < len; n++) { c = crc_table[(c ^ buf[n]) & 0xff] ^ (c >> 8); } return c; } /* Return the CRC of the bytes buf[0..len-1]. */ uint32_t psmi_crc(unsigned char *buf, int len) { return update_crc(0xffffffff, buf, len) ^ 0xffffffff; } struct psmi_faultinj_spec { STAILQ_ENTRY(psmi_faultinj_spec) next; char spec_name[PSMI_FAULTINJ_SPEC_NAMELEN]; unsigned long long num_faults; unsigned long long num_calls; struct drand48_data drand48_data; int num; int denom; }; int psmi_multi_ep_enabled = 0; void psmi_multi_ep_init() { union psmi_envvar_val env_fi; psmi_getenv("PSM2_MULTI_EP", "PSM2 Multiple Endpoints (yes/no)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_YESNO, PSMI_ENVVAR_VAL_NO, &env_fi); psmi_multi_ep_enabled = env_fi.e_uint; } #ifdef PSM_FI int psmi_faultinj_enabled = 0; int psmi_faultinj_verbose = 0; char *psmi_faultinj_outfile = NULL; static struct psmi_faultinj_spec psmi_faultinj_dummy; static STAILQ_HEAD(, psmi_faultinj_spec) psmi_faultinj_head = STAILQ_HEAD_INITIALIZER(psmi_faultinj_head); void psmi_faultinj_init() { union psmi_envvar_val env_fi; psmi_getenv("PSM2_FI", "PSM Fault Injection (yes/no)", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_YESNO, PSMI_ENVVAR_VAL_NO, &env_fi); psmi_faultinj_enabled = !!env_fi.e_uint; if (psmi_faultinj_enabled) { char *def = NULL; if (!psmi_getenv ("PSM2_FI_TRACEFILE", "PSM Fault Injection output file", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)def, &env_fi)) { psmi_faultinj_outfile = psmi_strdup(NULL, env_fi.e_str); } } return; } void psmi_faultinj_fini() { struct psmi_faultinj_spec *fi; FILE *fp; int do_fclose = 0; if (!psmi_faultinj_enabled || psmi_faultinj_outfile == NULL) return; if (strncmp(psmi_faultinj_outfile, "stdout", 7) == 0) fp = stdout; else if (strncmp(psmi_faultinj_outfile, "stderr", 7) == 0) fp = stderr; else { char *c = psmi_faultinj_outfile; char buf[192]; int append = 0; if (*c == '+') { append = 1; ++c; } do_fclose = 1; snprintf(buf, sizeof(buf) - 1, "%s.%s", c, hfi_get_mylabel()); buf[sizeof(buf) - 1] = '\0'; fp = fopen(buf, append ? 
"a" : "w"); } if (fp != NULL) { STAILQ_FOREACH(fi, &psmi_faultinj_head, next) { fprintf(fp, "%s:%s PSM2_FI_%-12s %2.3f%% => " "%2.3f%% %10lld faults/%10lld events\n", __progname, hfi_get_mylabel(), fi->spec_name, (double)fi->num * 100.0 / fi->denom, (double)fi->num_faults * 100.0 / fi->num_calls, fi->num_faults, fi->num_calls); } fflush(fp); if (do_fclose) fclose(fp); } psmi_free(psmi_faultinj_outfile); return; } /* * Intended to be used only once, not in the critical path */ struct psmi_faultinj_spec *psmi_faultinj_getspec(char *spec_name, int num, int denom) { struct psmi_faultinj_spec *fi; if (!psmi_faultinj_enabled) return &psmi_faultinj_dummy; STAILQ_FOREACH(fi, &psmi_faultinj_head, next) { if (strcmp(fi->spec_name, spec_name) == 0) return fi; } /* We got here, so no spec -- allocate one */ fi = psmi_malloc(PSMI_EP_NONE, UNDEFINED, sizeof(struct psmi_faultinj_spec)); psmi_assert_always(fi != NULL); strncpy(fi->spec_name, spec_name, PSMI_FAULTINJ_SPEC_NAMELEN - 1); fi->spec_name[PSMI_FAULTINJ_SPEC_NAMELEN - 1] = '\0'; fi->num = num; fi->denom = denom; fi->num_faults = 0; fi->num_calls = 0; /* * See if we get a hint from the environment. * Format is * * * By default, we chose the initial seed to be the 'pid'. If users need * repeatability, they should set initial_seed to be the 'pid' when the * error was observed or force the initial_seed to be a constant number in * each running process. Using 'pid' is useful because core dumps store * pids and our backtrace format does as well so if a crash is observed for * a specific seed, programs can reuse the 'pid' to regenerate the same * error condition. */ { int fvals[3] = { num, denom, (int)getpid() }; union psmi_envvar_val env_fi; char fvals_str[128]; char fname[128]; char fdesc[300]; snprintf(fvals_str, sizeof(fvals_str) - 1, "%d:%d:1", num, denom); fvals_str[sizeof(fvals_str) - 1] = '\0'; snprintf(fname, sizeof(fname) - 1, "PSM2_FI_%s", spec_name); fname[sizeof(fname) - 1] = '\0'; snprintf(fdesc, sizeof(fdesc) - 1, "Fault Injection %s <%s>", fname, fvals_str); if (!psmi_getenv(fname, fdesc, PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)fvals_str, &env_fi)) { /* not using default values */ int n_parsed = psmi_parse_str_tuples(env_fi.e_str, 3, fvals); if (n_parsed >= 1) fi->num = fvals[0]; if (n_parsed >= 2) fi->denom = fvals[1]; if (n_parsed >= 3) srand48_r((long int) fvals[2], &fi->drand48_data); } } STAILQ_INSERT_TAIL(&psmi_faultinj_head, fi, next); return fi; } int psmi_faultinj_is_fault(struct psmi_faultinj_spec *fi) { if (!psmi_faultinj_enabled) /* never fault if disabled */ return 0; if (fi->num == 0) return 0; fi->num_calls++; long int rnum; lrand48_r(&fi->drand48_data, &rnum); if (((int) (rnum % INT_MAX)) % fi->denom <= fi->num) { fi->num_faults++; return 1; } else return 0; } #endif /* #ifdef PSM_FI */ /* For memory allocation, we kind of break the PSM error handling rules. * If the caller gets NULL, it has to assume that the error has been handled * and should always return PSM2_NO_MEMORY */ /* * Log memory increments or decrements of type memstats_t. 
*/ struct psmi_memtype_hdr { struct { uint64_t size:48; uint64_t magic:8; uint64_t type:8; }; void *original_allocation; }; struct psmi_stats_malloc psmi_stats_memory; void psmi_log_memstats(psmi_memtype_t type, int64_t nbytes) { #define _add_max_total(type, nbytes) \ psmi_stats_memory.m_ ## type ## _total += (nbytes); \ psmi_stats_memory.m_ ## type ## _max = max( \ psmi_stats_memory.m_ ## type ## _total, \ psmi_stats_memory.m_ ## type ## _max); switch (type) { case PER_PEER_ENDPOINT: _add_max_total(perpeer, nbytes); break; case NETWORK_BUFFERS: _add_max_total(netbufs, nbytes); break; case DESCRIPTORS: _add_max_total(descriptors, nbytes); break; case UNEXPECTED_BUFFERS: _add_max_total(unexpbufs, nbytes); break; case STATS: _add_max_total(stats, nbytes); break; case UNDEFINED: _add_max_total(undefined, nbytes); break; default: psmi_assert_always(type == TOTAL); break; } _add_max_total(all, nbytes); psmi_stats_memory.m_all_max++; #undef _add_max_total return; } // Memory stats will only be collected under debug builds #ifdef PSM_DEBUG #define psmi_stats_mask PSMI_STATSTYPE_MEMORY #else #define psmi_stats_mask 0 #endif #ifdef malloc #undef malloc #endif #ifdef PSM_HEAP_DEBUG /* PSM HEAP DEBUG documentation: In the following code, the acronym: 'HD' is short for "Heap Debug". Each actual heap allocation will have a header and a trailer surrounding it, and the header itself may have some vacant space preceding it due to alignment needs: 0. This area is the actual return value of posix_memalign and is due to alignment requirements. (This area does not exist for heap allocations from malloc()). 1. HD HEADER 2. Actual allocation 3. HD TRAILER malloc() / posix_memalign returns area 0 through 3 to the Heap Debug (HD) code, then the HD code writes areas 1 and 3, and then returns a pointer to area 2 to the caller. Thereafter, the HD code will inspect areas 1 and 3 of all heap allocations to make sure they have retained their integrity. Surrounding the actual allocation like this enables: 1. Checking for heap overrun / underrun of all allocations. 2. Checking for double frees. 3. Use of an area that has been freed. 4. Identifying orphaned heap allocations. Constant no-mans-land written to areas that no-one should be writing to: */ #define HD_NO_MANS_LAND -15 /* The following is the declaration of the HD header. */ /* Heap debug header magic number type: */ typedef char HD_Hdr_Magic_Type[8]; typedef struct HD_Header_Struct { HD_Hdr_Magic_Type magic1; /* Magic number to ensure this allocation has integrity. (guards against heap overrun from above). */ const char *allocLoc; /* Source file name/line number where this heap allocation was made. */ const char *freeLoc; /* Source filename/line number where this heap allocation was freed. */ struct HD_Header_Struct *nextHD_header; /* Creates a singly-linked list of all heap allocations. */ uint64_t sizeOfAlloc; /* size of this heap allocation. */ void *systemAlloc; /* The actual return value from malloc()/posix_memaligh(). */ uint64_t systemAllocSize;/* The size that is actually allocated by malloc()/posix_memalign(). */ HD_Hdr_Magic_Type magic2; /* Second magic number to ensure this allocation has integrity. (guards against heap underrun from the actual allocation that follows). 
*/ } __attribute__ ((packed)) HD_Header_Type; typedef struct HD_free_list_struct { HD_Header_Type *freedStuct; struct HD_free_list_struct *next_free_struct; } HD_Free_Struct_Type; static HD_Free_Struct_Type *HD_free_list_root = NULL; static HD_Free_Struct_Type **HD_free_list_bottom = &HD_free_list_root; typedef char HD_Trlr_Magic_Type[16]; static const HD_Hdr_Magic_Type HD_HDR_MGC_1 = "Eric"; static const HD_Hdr_Magic_Type HD_HDR_MGC_2 = "Emily"; static const HD_Trlr_Magic_Type HD_TRLR_MGC = "Erin&Elaine"; /* Convert a pointer of an actual allocation to a pointer to its HD header: */ static inline HD_Header_Type *HD_AA_TO_HD_HDR(void *aa) { char *p = (char*)aa; return (HD_Header_Type*)(p - sizeof(HD_Header_Type)); } /* Convert a pointer to an HD header to the actual allocation: */ static inline void *HD_HDR_TO_AA(HD_Header_Type *phdHdr) { char *p = (char*)phdHdr; return p + sizeof(HD_Header_Type); } /* Get the address of the trailer that follows the actual allocation: */ static inline void *HD_GET_HD_TRLR(HD_Header_Type *phdr) { char *p = (char*)HD_HDR_TO_AA(phdr); return p + phdr->sizeOfAlloc; } static HD_Header_Type * HD_root_of_list = NULL; /* Root of singly linked list of all heap allocations */ static HD_Header_Type **HD_end_of_list = &HD_root_of_list; /* Pointer to the last pointer of the singly linked list of all heap allocations. */ /* Number of allocations in the list. Maintained to assert the integrity of the singly linked list of heap allocations. */ static int n_allocations = 0; /* HD_check_one_struct() checks one heap allocation for integrity. */ static inline void HD_check_one_struct(HD_Header_Type *p, int checkAA,const char *curloc) { int s=0; /* First check the magic values in the header and trailer: */ s |= memcmp(p->magic1,HD_HDR_MGC_1,sizeof(HD_HDR_MGC_1)) ? 1 : 0; s |= memcmp(p->magic2,HD_HDR_MGC_2,sizeof(HD_HDR_MGC_2)) ? 2 : 0; s |= memcmp(HD_GET_HD_TRLR(p),HD_TRLR_MGC,sizeof(HD_TRLR_MGC)) ? 4 : 0; if (s != 0) { fprintf(stderr,"header/trailer error: checking location: %s, s: %d, p: %p, " "p->allocLoc: %s\n",curloc,s,p,p->allocLoc); fprintf(stderr,"actual allocation starts at: %p, length: %" PRIu64 "\n", (char*)HD_HDR_TO_AA(p),p->sizeOfAlloc); fflush(0); abort(); } /* Next, check the area between systemAlloc and the start of the header */ signed char *pchr = (signed char *)p->systemAlloc; while (pchr < (signed char*)p) { psmi_assert_always(*pchr == (signed char) HD_NO_MANS_LAND); pchr++; } /* Lastly, check the actual allocation area if directed to do so: */ if (checkAA) { uint64_t i; signed char *pchr = HD_HDR_TO_AA(p); for (i=0;i < p->sizeOfAlloc;i++) if (pchr[i] != (signed char) HD_NO_MANS_LAND) { fprintf(stderr, "use after free; ptr: %p,\n" " allocated from: %s,\n" " validated from: %s\n" " freed from: %s\n", pchr+i,p->allocLoc,curloc,p->freeLoc); fflush(0); psmi_assert_always(0); } } } /* _psmi_heapdebug_val_heapallocs() walks the singly linked list and inspects all * heap allocations to ensure all of them have integrity still. 
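 * (A condensed model of the use-after-free scan follows.)
 */

/*
 * Illustrative sketch (not part of PSM2): freed regions are filled
 * with HD_NO_MANS_LAND, so HD_check_one_struct() can detect a write
 * after free by rescanning those bytes, essentially
 * (ex_freed_area_intact is invented for this example):
 */
static inline int ex_freed_area_intact(const signed char *p, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		if (p[i] != (signed char)HD_NO_MANS_LAND)
			return 0;	/* someone wrote here after free() */
	return 1;
}

/* (end of illustrative example)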
*/ void _psmi_heapdebug_val_heapallocs(const char *curloc) { /* first check current allocation list: */ HD_Header_Type *p = HD_root_of_list; int cnt = 0; while (p) { HD_check_one_struct(p,0,curloc); p = p->nextHD_header; cnt++; } psmi_assert_always(cnt == n_allocations); /* Next check free list */ HD_Free_Struct_Type *pfreestruct = HD_free_list_root; while (pfreestruct) { HD_check_one_struct(pfreestruct->freedStuct,1,curloc); pfreestruct = pfreestruct->next_free_struct; } } /* psmi_heapdebug_finalize() validates the heap and then emits all of the allocations to stdout. to help debug heap memory leaks. */ void psmi_heapdebug_finalize(void) { /* First validate the existing heap allocations: */ psmi_heapdebug_val_heapallocs(); printf("orphaned heap allocations: %d\n", n_allocations); if (n_allocations > 0) { /* Now, emit all of the alloations to stdout. */ HD_Header_Type *p = HD_root_of_list; while (p) { printf("orphaned heap allocation: %p allocated at: %s, size: %lu\n", p, p->allocLoc, p->sizeOfAlloc); p = p->nextHD_header; } fflush(0); /* Abort if any allocations still exist: */ abort(); } } /* hd_est_hdr_trlr() establishes the new allocation to the singly linked list, and adds * the header and trailer to the allocation. Lastly, it validates the existing singly-linked * list for integrity. */ static void hd_est_hdr_trlr(HD_Header_Type *hd_alloc, void *systemAlloc, uint64_t systemSize, uint64_t actualSize, const char *curloc) { /* First, write HD_NO_MANS_LAND to the entire allocation: */ memset(systemAlloc,HD_NO_MANS_LAND,systemSize); /* Write the HD header info: */ memcpy(hd_alloc->magic1,HD_HDR_MGC_1,sizeof(HD_HDR_MGC_1)); hd_alloc->allocLoc = curloc; hd_alloc->freeLoc = NULL; hd_alloc->nextHD_header = NULL; hd_alloc->sizeOfAlloc = actualSize; hd_alloc->systemAlloc = systemAlloc; hd_alloc->systemAllocSize = systemSize; memcpy(hd_alloc->magic2,HD_HDR_MGC_2,sizeof(HD_HDR_MGC_2)); memcpy(HD_GET_HD_TRLR(hd_alloc),HD_TRLR_MGC,sizeof(HD_TRLR_MGC)); *HD_end_of_list = hd_alloc; HD_end_of_list = &hd_alloc->nextHD_header; n_allocations++; psmi_heapdebug_val_heapallocs(); } /* hd_malloc() is the heap debug version of malloc that will create the header and trailer * and link the allocation into the singly linked list. */ static inline void *hd_malloc(size_t sz, const char *curloc) { const uint64_t wholeSize = sizeof(HD_Header_Type) + sz + sizeof(HD_TRLR_MGC); HD_Header_Type *hd_alloc = (HD_Header_Type*)malloc(wholeSize); hd_est_hdr_trlr(hd_alloc,hd_alloc,wholeSize,sz,curloc); return HD_HDR_TO_AA(hd_alloc); } /* hd_memalign() is the heap debug version of posix_memalign(). 
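 * (The alignment arithmetic it relies on is sketched below.)
 */

/*
 * Illustrative sketch (not part of PSM2): hd_memalign() reserves room
 * for the HD header inside an oversized system allocation, then rounds
 * the address up with the usual power-of-two mask trick (ex_align_up
 * is invented for this example):
 */
static inline uint64_t ex_align_up(uint64_t addr, uint64_t alignment)
{
	const uint64_t mask = alignment - 1;	/* alignment is a power of 2 */

	return (addr + mask) & ~mask;	/* e.g. ex_align_up(13, 8) == 16 */
}

/* (end of illustrative example)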
*/ static inline int hd_memalign(void **ptr,uint64_t alignment, size_t sz, const char *curloc) { void *systemAlloc = NULL; const uint64_t alignMask = alignment - 1; uint64_t systemSize = sizeof(HD_Header_Type) + alignMask + sz + sizeof(HD_TRLR_MGC); int rv = posix_memalign(&systemAlloc,alignment,systemSize); char *actualAlloc = NULL; const char *endOfSystemAlloc = ((char*)systemAlloc) + systemSize; if (rv) return rv; uint64_t actualAllocu64 = (uint64_t) systemAlloc; actualAllocu64 += sizeof(HD_Header_Type) + alignMask; actualAllocu64 &= ~ alignMask; actualAlloc = (char*)actualAllocu64; psmi_assert_always((actualAllocu64 & alignMask) == 0); psmi_assert_always((actualAlloc+sz+sizeof(HD_TRLR_MGC)) <= endOfSystemAlloc); psmi_assert_always((actualAlloc - (char*)systemAlloc) >= sizeof(HD_Header_Type)); hd_est_hdr_trlr(HD_AA_TO_HD_HDR(actualAlloc),systemAlloc,systemSize,sz,curloc); *ptr = actualAlloc; return rv; } /* hd_free() is the heap debug version of free(). First, hd_free() ensures that the ptr to be * freed in fact is known by the HD code. Next, hd_free() removes the ptr from the list. Then, * hd_free scribbles to the ptr's area and actually frees the heap space. */ static inline void hd_free(void *ptr,const char *curloc) { HD_Header_Type *hd_alloc = HD_AA_TO_HD_HDR(ptr); HD_Header_Type *p = HD_root_of_list, *q = NULL; psmi_heapdebug_val_heapallocs(); while (p) { if (p == hd_alloc) { /* first, fix the next pointers: */ if (q) { q->nextHD_header = p->nextHD_header; } else { psmi_assert_always(p == HD_root_of_list); HD_root_of_list = p->nextHD_header; } /* Now, handle the case of removing the last entry in the list. */ if (&p->nextHD_header == HD_end_of_list) { if (q) { q->nextHD_header = NULL; HD_end_of_list = &q->nextHD_header; } else { HD_root_of_list = NULL; HD_end_of_list = &HD_root_of_list; } } /* Scribble to the actual allocation to make further access to the heap area unusable. */ n_allocations--; memset(HD_HDR_TO_AA(hd_alloc),HD_NO_MANS_LAND,hd_alloc->sizeOfAlloc); hd_alloc->freeLoc = curloc; /* Add this allocation to the free list. */ HD_Free_Struct_Type *pfreestruct = (HD_Free_Struct_Type*)malloc(sizeof(HD_Free_Struct_Type)); *HD_free_list_bottom = pfreestruct; HD_free_list_bottom = &pfreestruct->next_free_struct; pfreestruct->freedStuct = hd_alloc; pfreestruct->next_free_struct = NULL; psmi_heapdebug_val_heapallocs(); return; } q = p; p = p->nextHD_header; } /* trying to free a heap allocation that we did not allocate. */ psmi_assert_always(0); } size_t hd_malloc_usable_size(void *ptr,const char *curloc) { HD_Header_Type *hd_alloc = HD_AA_TO_HD_HDR(ptr); return hd_alloc->systemAllocSize; } #endif #ifdef PSM_HEAP_DEBUG /* For HD code, we retarget the malloc, memaligh and free calls to the hd versions * of the code. 
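 * (The location-tracking idea behind these wrappers is sketched below.)
 */

/*
 * Illustrative sketch (not part of PSM2): routing one call site to
 * either the debug or the standard allocator hinges on threading a
 * "file:line" string through every allocation call, in the spirit of
 * PSMI_CURLOC (ex_tracked_malloc is invented for this example):
 */
static inline void *ex_tracked_malloc(size_t sz, const char *curloc)
{
	void *p = malloc(sz);

	if (p == NULL)
		fprintf(stderr, "allocation of %zu bytes failed at %s\n",
			sz, curloc);
	return p;
}

/* (end of illustrative example)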
*/ #define my_malloc(SZ,CURLOC) hd_malloc(SZ,CURLOC) #define my_memalign(PTR,ALIGN,SZ,CURLOC) hd_memalign(PTR,ALIGN,SZ,CURLOC) #define my_free(PTR,CURLOC) hd_free(PTR,CURLOC) #define my_malloc_usable_size(PTR,CURLOC) hd_malloc_usable_size(PTR,CURLOC) #else /* For non-HD code, we target the code to the usual functions: */ #define my_malloc(SZ,CURLOC) malloc(SZ) #define my_memalign(PTR,ALIGN,SZ,CURLOC) posix_memalign(PTR,ALIGN,SZ) #define my_free(PTR,CURLOC) free(PTR) #define my_malloc_usable_size(PTR,CURLOC) malloc_usable_size(PTR) #endif void *psmi_malloc_internal(psm2_ep_t ep, psmi_memtype_t type, size_t sz, const char *curloc) { size_t newsz = sz; void *newa; if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) newsz += sizeof(struct psmi_memtype_hdr); newa = my_malloc(newsz,curloc); if (newa == NULL) { psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, "Out of memory for malloc at %s", curloc); return NULL; } if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) { struct psmi_memtype_hdr *hdr = (struct psmi_memtype_hdr *)newa; hdr->size = newsz; hdr->type = type; hdr->magic = 0x8c; hdr->original_allocation = newa; psmi_log_memstats(type, newsz); newa = (void *)(hdr + 1); /* _HFI_INFO("alloc is %p\n", newa); */ } return newa; } void *psmi_realloc_internal(psm2_ep_t ep, psmi_memtype_t type, void *ptr, size_t nsz, const char *curloc) { if (ptr) { size_t existingSize = psmi_malloc_usable_size_internal(ptr,curloc); if (nsz > existingSize) { void *newPtr = psmi_malloc_internal(ep,type,nsz,curloc); memcpy(newPtr,ptr,existingSize); psmi_free_internal(ptr,curloc); return newPtr; } else /* We will not support shrinking virtual space for performance reasons. */ return ptr; } else return psmi_malloc_internal(ep,type,nsz,curloc); } #ifdef memalign #undef memalign #endif void *psmi_memalign_internal(psm2_ep_t ep, psmi_memtype_t type, size_t alignment, size_t sz, const char *curloc) { size_t newsz = sz; void *newa; int ret, preambleSize = 0; if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) { if (sizeof(struct psmi_memtype_hdr) > alignment) { int n = sizeof(struct psmi_memtype_hdr) / alignment; int r = sizeof(struct psmi_memtype_hdr) % alignment; if (r) n++; preambleSize = n * alignment; } else preambleSize = alignment; newsz += preambleSize; } ret = my_memalign(&newa, alignment, newsz, curloc); if (ret) { psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, "Out of memory for malloc at %s", curloc); return NULL; } if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) { void *rv = newa + preambleSize; struct psmi_memtype_hdr *hdr = (struct psmi_memtype_hdr *)(rv-sizeof(struct psmi_memtype_hdr)); hdr->size = newsz; hdr->type = type; hdr->magic = 0x8c; hdr->original_allocation = newa; psmi_log_memstats(type, newsz); newa = rv; /* _HFI_INFO("alloc is %p\n", newa); */ } return newa; } #ifdef calloc #undef calloc #endif void *psmi_calloc_internal(psm2_ep_t ep, psmi_memtype_t type, size_t nelem, size_t elemsz, const char *curloc) { void *newa = psmi_malloc_internal(ep, type, nelem * elemsz, curloc); if (newa == NULL) /* error handled above */ return NULL; memset(newa, 0, nelem * elemsz); return newa; } #ifdef strdup #undef strdup #endif void *psmi_strdup_internal(psm2_ep_t ep, const char *string, const char *curloc) { size_t len = strlen(string) + 1; void *newa = psmi_malloc_internal(ep, UNDEFINED, len, curloc); if (newa == NULL) return NULL; memcpy(newa, string, len); /* copy with \0 */ return newa; } #ifdef free #undef free #endif void MOCKABLE(psmi_free_internal)(void *ptr,const char *curloc) { if_pf(psmi_stats_mask & 
PSMI_STATSTYPE_MEMORY) { struct psmi_memtype_hdr *hdr = (struct psmi_memtype_hdr *)ptr - 1; /* _HFI_INFO("hdr is %p, ptr is %p\n", hdr, ptr); */ psmi_memtype_t type = hdr->type; int64_t size = hdr->size; int magic = (int)hdr->magic; psmi_log_memstats(type, -size); psmi_assert_always(magic == 0x8c); ptr = hdr->original_allocation; } my_free(ptr,curloc); } MOCK_DEF_EPILOGUE(psmi_free_internal); #ifdef malloc_usable_size #undef malloc_usable_size #endif size_t psmi_malloc_usable_size_internal(void *ptr, const char *curLoc) { return my_malloc_usable_size(ptr,curLoc); } PSMI_ALWAYS_INLINE( psm2_error_t psmi_coreopt_ctl(const void *core_obj, int optname, void *optval, uint64_t *optlen, int get)) { psm2_error_t err = PSM2_OK; switch (optname) { case PSM2_CORE_OPT_DEBUG: /* Sanity check length */ if (*optlen < sizeof(unsigned)) { err = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Option value length error"); *optlen = sizeof(unsigned); return err; } if (get) { *((unsigned *)optval) = hfi_debug; } else hfi_debug = *(unsigned *)optval; break; case PSM2_CORE_OPT_EP_CTXT: { /* core object is epaddr */ psm2_epaddr_t epaddr = (psm2_epaddr_t) core_obj; /* Sanity check epaddr */ if (!epaddr) { return psmi_handle_error(NULL, PSM2_PARAM_ERR, "Invalid endpoint address"); } /* Sanity check length */ if (*optlen < sizeof(unsigned long)) { err = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Option value length error"); *optlen = sizeof(void *); return err; } if (get) { *((unsigned long *)optval) = (unsigned long)epaddr->usr_ep_ctxt; } else epaddr->usr_ep_ctxt = optval; } break; default: /* Unknown/unrecognized option */ err = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown PSM2_CORE option %u.", optname); break; } return err; } psm2_error_t psmi_core_setopt(const void *core_obj, int optname, const void *optval, uint64_t optlen) { return psmi_coreopt_ctl(core_obj, optname, (void *)optval, &optlen, 0); } psm2_error_t psmi_core_getopt(const void *core_obj, int optname, void *optval, uint64_t *optlen) { return psmi_coreopt_ctl(core_obj, optname, optval, optlen, 1); } /* PSM AM component option handling */ PSMI_ALWAYS_INLINE( psm2_error_t psmi_amopt_ctl(const void *am_obj, int optname, void *optval, uint64_t *optlen, int get)) { psm2_error_t err = PSM2_OK; /* AM object is a psm2_epaddr (or NULL for global minimum sz) */ /* psm2_epaddr_t epaddr = (psm2_epaddr_t) am_obj; */ /* All AM options are read-only. */ if (!get) { return err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_OPT_READONLY, "Attempted to set read-only option value"); } /* Sanity check length -- all AM options are uint32_t. 
*/
	if (*optlen < sizeof(uint32_t)) {
		*optlen = sizeof(uint32_t);
		return err = psmi_handle_error(PSMI_EP_LOGEVENT,
					       PSM2_PARAM_ERR,
					       "Option value length error");
	}

	switch (optname) {
	case PSM2_AM_OPT_FRAG_SZ:
		*((uint32_t *) optval) = psmi_am_parameters.max_request_short;
		break;
	case PSM2_AM_OPT_NARGS:
		*((uint32_t *) optval) = psmi_am_parameters.max_nargs;
		break;
	case PSM2_AM_OPT_HANDLERS:
		*((uint32_t *) optval) = psmi_am_parameters.max_handlers;
		break;
	default:
		err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
					"Unknown PSM2_AM option %u.", optname);
	}
	return err;
}

psm2_error_t psmi_am_setopt(const void *am_obj, int optname,
			    const void *optval, uint64_t optlen)
{
	return psmi_amopt_ctl(am_obj, optname, (void *)optval, &optlen, 0);
}

psm2_error_t psmi_am_getopt(const void *am_obj, int optname,
			    void *optval, uint64_t *optlen)
{
	return psmi_amopt_ctl(am_obj, optname, optval, optlen, 1);
}

#ifdef PSM_LOG

#include <execinfo.h>	/* backtrace, backtrace_symbols */
#include <fnmatch.h>	/* fnmatch */
#include <alloca.h>	/* alloca */
#include <limits.h>	/* INT_MAX */
#include "ptl_ips/ips_proto_header.h"

/* A treeNode is used to store the list of Function Name Lists that
   are passed to the PSM_LOG facility via environment variables.
   See psm_log.h for more information.

   Note that treeNode is a node in a binary tree data structure. */
typedef struct _treeNode {
	const char *name;
	int line1, line2;
	struct _treeNode *left, *right;
} treeNode;

/* An epmTreeNode is used to track the number of protocol packets
   that are sent/received, for a given opcode, and source epid
   to another epid. */
typedef struct _epmTreeNode {
	int opcode, count, txrx;
	uint64_t fromepid, toepid;
	struct _epmTreeNode *left, *right;
} epmTreeNode;

/* given a line range: [*line1 .. *line2], and another line, line,
   'join' the line range to the new line if the line immediately abuts
   the line range.  If the new line does not abut the existing range,
   return 0.  Else, return 1.

   For example, take the line range [ 20 .. 30 ] and the line: 19.
   Since 19 comes immediately before 20, the line range can be joined,
   resulting in the line range: [ 19 .. 30 ].  The function returns 1
   for this case.

   The following other examples give the new line range given the new
   line and range [ 20 .. 30 ], and give the return value:

   31 [ 20 .. 31 ] 1
   18 [ 20 .. 30 ] 0
   32 [ 20 .. 30 ] 0
   25 [ 20 .. 30 ] 1
*/
static int joinOverlap(int *line1, int *line2, int line)
{
	long long ll_line = line;

	if (ll_line+1 >= *line1 && ll_line-1 <= *line2) {
		*line1 = min(*line1, line);
		*line2 = max(*line2, line);
		return 1;
	}
	return 0;
}

/* given two line ranges, determine the range that encompasses both line
   ranges if an overlap has occurred.  Returns 0 if the two ranges do not
   overlap and do not abut.

   Some examples, if line1=20 and line2=30:

   [20 30] [20 30] 2
   [19 30] [19 30] 2
   [19 20] [19 30] 2
   [10 15] [20 30] 0
   [40 50] [20 30] 0
*/
static int joinOverlapRange(int *line1, int *line2, int l1, int l2)
{
	return joinOverlap(line1, line2, l1) + joinOverlap(line1, line2, l2);
}

/* inserts a new treeNode into the FNL tree, or, merges the lines that are
   already present in the tree.
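   (A usage sketch of the parser that feeds this tree follows; the
   insert routine itself comes right after.)
 */

/*
 * Illustrative usage (not part of PSM2): with the FNL grammar described
 * below (see also psm_log.h), "foo;1-10:bar;1,3,5" selects lines 1-10
 * of foo() plus lines 1, 3 and 5 of bar(); membership then reduces to
 * a tree lookup.  ex_fnl_demo and the names foo/bar are invented for
 * this example:
 */
static void parseAndInsertInTree(const char *buf, treeNode **root);
static int lookupNodeInTree(const treeNode *root, const char *name, int line);

static inline int ex_fnl_demo(void)
{
	treeNode *root = NULL;

	parseAndInsertInTree("foo;1-10:bar;1,3,5", &root);
	/* 0 means "present"; line 4 of bar was never requested: */
	return lookupNodeInTree(root, "bar", 3) == 0 &&
	       lookupNodeInTree(root, "bar", 4) != 0;
}

/* (end of illustrative example)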
*/ static void insertNodeInTree(treeNode **root,const char *name,int line1,int line2) { if (*root) { int c = strcmp(name,(*root)->name); if (c < 0) insertNodeInTree(&((*root)->left),name,line1,line2); else if (c > 0) insertNodeInTree(&((*root)->right),name,line1,line2); else { if (joinOverlapRange(&(*root)->line1,&(*root)->line2,line1,line2)) return; else if (line1 < (*root)->line1) insertNodeInTree(&((*root)->left),name,line1,line2); else if (line2 > (*root)->line2) insertNodeInTree(&((*root)->right),name,line1,line2); else psmi_assert_always(0); /* should never happen. */ } } else { *root = malloc(sizeof(treeNode)); (*root)->name = strdup(name); (*root)->line1 = line1; (*root)->line2 = line2; (*root)->left = (*root)->right = NULL; } } /* Returns -1 if the data in the node is less than the data supplied as parameter, else Returns 1 if the data in the node is greater than the data supplied as parameter, else Returns 0. */ static int compareEpmNode(epmTreeNode *node,int opcode,int txrx,uint64_t fromepid,uint64_t toepid) { #define COMPARE_ONE(X) if (node->X != X) return node->X < X ? -1 : 1 COMPARE_ONE(opcode); COMPARE_ONE(txrx); COMPARE_ONE(fromepid); COMPARE_ONE(toepid); return 0; } /* Inserts a new node in the tree corresponding to the parameters, or, retrieves the node in the tree. In either case, this code returns a pointer to the count in the node. */ static int *insertNodeInEpmTree(epmTreeNode **root,int opcode,int txrx,uint64_t fromepid,uint64_t toepid) { if (*root) { int a = compareEpmNode((*root),opcode,txrx,fromepid,toepid); if (a < 0) return insertNodeInEpmTree(&((*root)->left),opcode,txrx,fromepid,toepid); else if (a > 0) return insertNodeInEpmTree(&((*root)->right),opcode,txrx,fromepid,toepid); else return &((*root)->count); } else { *root = malloc(sizeof(epmTreeNode)); (*root)->opcode = opcode; (*root)->txrx = txrx; (*root)->count = 0; (*root)->fromepid = fromepid; (*root)->toepid = toepid; (*root)->left = (*root)->right = NULL; return &((*root)->count); } } /* returns 0, if the node is present, non-zero if it is absent. */ static int lookupNodeInTree(const treeNode *root,const char *name,int line) { if (root) { int c = strcmp(name,root->name); if (c < 0) return lookupNodeInTree(root->left,name,line); else if (c > 0) return lookupNodeInTree(root->right,name,line); else { if (line < root->line1) return lookupNodeInTree(root->left,name,line); else if (line > root->line2) return lookupNodeInTree(root->right,name,line); else /* line must be >= root->line1 and line must be <= root->line2. */ return 0; } } else { return 1; } } /* Declare a prototype for a parserFunc - referenced in the following code: */ typedef void parserFunc(char *,int,int,void *); /* breaks down a string into 'c'-delimited substrings, and calls the parser func for each substring. */ static void parseString(char *ps,char c,parserFunc pf,void *ctx) { int idx,n=0; char *p; /* first, count the number of instances of c in ps, for use by the parser function: */ for (idx=0;ps[idx];idx++) if (ps[idx] == c) n++; /* next, break down ps into 'c'-delimited substrings, and call parser function, pf for each substring: */ for (idx=0,p=ps;p && *p;idx++) { char *t = strchr(p,c); if (!t) { break; } else { *t = 0; pf(p,idx,n,ctx); p = t+1; } } /* finally, call pf on the final substring. 
*/ pf(p,idx,n,ctx); } /* fncNameCtx is the context used while parsing FNL's (see psm_log.h for more info) from the environment: */ typedef struct { const char *currentFuncName; int firstLineNumber; treeNode **root; } funcNameCtx; /* This is the start of the parser code for parsing FNL's. Here is the grammar: An FNL is a 'Function Name List' that is defined by the following grammar: # A LINE1 is either a single line number of a range of line numbers: (1) LINE1 :: lineNumber | (2) lineNumber1 '-' lineNumber2 # LINES is a list of LINE1's separated by commas: (3) LINES :: LINE1 | (4) LINE1 ',' LINES # An FN is either a function name, or a function name with a list of lines: (5) FN :: functionName | (6) functionName ';' LINES # A FNL is a list of FN's separated by colons: (7) FNL :: FN | (8) FN ':' FNL # Examples: foo:bar the two functions foo and bar foo;1-10 lines 1 to 10 of function foo. bar;1,3,5 lines 1, 3 and 5 of function bar */ /* p4() inserts a (function name and line number) pair into the FNL tree or a (function name and line number range) in the FNL tree. */ static void p4(char *s,int idx,int n,void *ctx) { funcNameCtx *pfnc = (funcNameCtx *)ctx; if (n == 0) /* production (1) */ { pfnc->firstLineNumber = atoi(s); insertNodeInTree(pfnc->root,pfnc->currentFuncName,pfnc->firstLineNumber,pfnc->firstLineNumber); } else if (n == 1) /* production (2) */ { if (idx == 0) /* lhs of production (2) */ pfnc->firstLineNumber = atoi(s); else /* rhs of production (2). */ insertNodeInTree(pfnc->root,pfnc->currentFuncName,pfnc->firstLineNumber,atoi(s)); } } /* p3 puts an entry into the FNL tree for all of the lines of a given functionname, or, it parses the list of line number ranges and uses p4 to spill each individual range (or just one line number) into the tree */ static void p3(char *s,int idx,int n,void *ctx) { funcNameCtx *pfnc = (funcNameCtx *)ctx; if (n == 0 && *s == 0) /* production (5)/(7) */ { insertNodeInTree(pfnc->root,pfnc->currentFuncName,0,INT_MAX); } else if (*s) /* production (2) */ { /* breakdown the string into hyphen-delimited substrings, and further parses each substring with p4: */ parseString(s,'-',p4,ctx); } } /* p2 parses the function name, and caches it into the context, and thereafter uses p3 to parse the line number range list. */ static void p2(char *s,int idx,int n,void *ctx) { funcNameCtx *pfnc = (funcNameCtx *)ctx; if (n) { if (idx == 0) pfnc->currentFuncName = s; else { /* production (4) */ /* breakdown the string into comma-delimited substrings, and further parses each substring with p3: */ parseString(s,',',p3,ctx); } } else { /* production (7)/(5). */ insertNodeInTree(pfnc->root,pfnc->currentFuncName=s,0,INT_MAX); } } /* p1 parses each function name and line range list. */ static void p1(char *s,int idx,int n,void *ctx) { /* production (5)/(6)) */ /* breakdown the string into semi-colon-delimited substrings, and further parses each substring with p2: */ parseString(s,';',p2,ctx); } static void parseAndInsertInTree(const char *buf,treeNode **root) { funcNameCtx t; t.root = root; char *p = alloca(strlen(buf)+1); strcpy(p,buf); /* productions (7)/(8) */ /* separates string into colon-separated strings, and then parses each substring in p1: */ parseString(p,':',p1,(void*)&t); } /* initialization code for the psmi log mechanism. 
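   (It uses the double-checked-locking idiom sketched below.)
 */

/*
 * Illustrative sketch (not part of PSM2): psmi_initialize() below
 * tests the flag once without the mutex (the cheap steady-state path)
 * and re-tests it under the mutex so exactly one thread runs the
 * setup.  In miniature (ex_init_once is invented for this example;
 * strictly portable code could use pthread_once() instead):
 */
static inline void ex_init_once(volatile int *done, pthread_mutex_t *mx,
				void (*setup)(void))
{
	if (!*done) {			/* unlocked fast path */
		pthread_mutex_lock(mx);
		if (!*done) {		/* re-check under the lock */
			setup();
			*done = 1;
		}
		pthread_mutex_unlock(mx);
	}
}

/* (end of illustrative example)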
*/ static inline void psmi_initialize(const char **plmf_fileName_kernel, const char **plmf_search_format_string, treeNode **includeFunctionNamesTreeRoot, treeNode **excludeFunctionNamesTreeRoot) { static volatile int plmf_initialized = 0; if (!plmf_initialized) { static pthread_mutex_t plmf_init_mutex = PTHREAD_MUTEX_INITIALIZER; if (pthread_mutex_lock(&plmf_init_mutex)) { perror("cannot lock mutex for psmi_log_message facility"); return; } /* CRITICAL SECTION BEGIN */ if (!plmf_initialized) { /* initializing psmi log message facility here. */ const char *env = getenv("PSM2_LOG_FILENAME"); if (env) *plmf_fileName_kernel = env; env = getenv("PSM2_LOG_SRCH_FORMAT_STRING"); if (env) { *plmf_search_format_string = env; } else { env = getenv("PSM2_LOG_INC_FUNCTION_NAMES"); if (env) { parseAndInsertInTree(env,includeFunctionNamesTreeRoot); } env = getenv("PSM2_LOG_EXC_FUNCTION_NAMES"); if (env) { parseAndInsertInTree(env,excludeFunctionNamesTreeRoot); } } /* initialization of psmi log message facility is completed. */ plmf_initialized = 1; } /* CRITICAL SECTION END */ if (pthread_mutex_unlock(&plmf_init_mutex)) { perror("cannot unlock mutex for psmi_log_message facility"); return; } } } /* Utility function to map the integer txrx value to the given strings for emitting to the log file. */ static const char * const TxRxString(int txrx) { switch(txrx) { case PSM2_LOG_TX: return "Sent"; case PSM2_LOG_RX: return "Received"; case PSM2_LOG_PEND: return "Pending"; default: return "Unknown"; } } /* Utility function to map an integer opcode value to the given strings for emitting to the log file. */ static const char * const OpcodeString(int opcode) { switch(opcode) { case OPCODE_LONG_RTS: return "RTS"; case OPCODE_LONG_CTS: return "CTS"; case OPCODE_LONG_DATA: return "DATA"; case OPCODE_EXPTID: return "EXPTID"; case OPCODE_EXPTID_COMPLETION: return "EXPTID_COMPLETION"; default: return "UNKNOWN"; } } static const char *plmf_fileName_kernel = "/tmp/psm2_log"; static const char *plmf_search_format_string = NULL; static treeNode *includeFunctionNamesTreeRoot = NULL; static treeNode *excludeFunctionNamesTreeRoot = NULL; void psmi_log_initialize(void) { /* If not initialized, then, initialize in a single thread of execution. */ psmi_initialize(&plmf_fileName_kernel, &plmf_search_format_string, &includeFunctionNamesTreeRoot, &excludeFunctionNamesTreeRoot); } #ifdef PSM_LOG_FAST_IO struct psmi_log_io_thread_info { pthread_t thread_id; char *buff; unsigned long max_buff_length, curr_buff_length; pthread_mutex_t flags_mutex; volatile int flags; #define PSMI_LOG_IO_FLAG_IO_IN_PROGRESS 1 /* io is currently in progress */ #define PSMI_LOG_IO_FLAG_IO_SHUTDOWN 2 /* we are shutting down logging. */ }; /* Please note that psmi_log_io_info is in thread local storage. */ static __thread struct psmi_log_io_thread_info psmi_log_io_info = { .thread_id = 0, .buff = NULL, .max_buff_length = 0, .curr_buff_length = 0, .flags_mutex = PTHREAD_MUTEX_INITIALIZER, .flags = 0 }; static struct { unsigned int nTableEntries,maxTableEntries; pthread_mutex_t table_mutex; struct psmi_log_io_thread_info **table; } psmi_log_io_table = { .nTableEntries = 0, .maxTableEntries = 0, .table_mutex = PTHREAD_MUTEX_INITIALIZER, .table = NULL }; void psmi_log_fini() { if (pthread_mutex_lock(&psmi_log_io_table.table_mutex)) { perror("Cannot lock mutex for psmi_log_io_table"); return; } /* Start critical section. 
*/ unsigned int i; for (i=0;i < psmi_log_io_table.nTableEntries;i++) { if (psmi_log_io_table.table[i]) { struct psmi_log_io_thread_info *pti = psmi_log_io_table.table[i]; int flags; if (pthread_mutex_lock(&pti->flags_mutex)) { perror("can't lock the flags mutex."); continue; } /* critical section */ flags = (pti->flags |= PSMI_LOG_IO_FLAG_IO_SHUTDOWN); /* end critical section */ pthread_mutex_unlock(&pti->flags_mutex); /* if io is currenctly in progress, allow it to complete. */ while (flags & PSMI_LOG_IO_FLAG_IO_IN_PROGRESS) { sleep(1); if (pthread_mutex_lock(&pti->flags_mutex)) { perror("can't lock the flags mutex."); continue; } flags = pti->flags; pthread_mutex_unlock(&pti->flags_mutex); } if (pti->buff) { char logFileName[256]; FILE *fout; snprintf(logFileName,sizeof(logFileName),"%s.%d.%ld", plmf_fileName_kernel,getpid(),pti->thread_id); fout = fopen(logFileName,"w"); if (!fout) { perror(logFileName); continue; } fwrite(pti->buff,pti->curr_buff_length,1,fout); fclose(fout); } } psmi_log_io_table.table[i] = NULL; } psmi_log_io_table.nTableEntries = 0; psmi_free(psmi_log_io_table.table); psmi_log_io_table.table = NULL; psmi_log_io_table.maxTableEntries = 0; /* End critical section. */ pthread_mutex_unlock(&psmi_log_io_table.table_mutex); } static int psmi_log_register_tls(void) { if (psmi_log_io_info.thread_id != pthread_self()) { psmi_log_io_info.thread_id = pthread_self(); if (pthread_mutex_lock(&psmi_log_io_table.table_mutex)) { perror("cannot lock table mutex"); return -1; } /* critical section start. */ if (psmi_log_io_table.maxTableEntries < psmi_log_io_table.nTableEntries+1) { if (psmi_log_io_table.maxTableEntries == 0) { psmi_log_io_table.maxTableEntries = 2; psmi_log_io_table.table = psmi_malloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, psmi_log_io_table.maxTableEntries * sizeof(struct psmi_log_io_thread_info *)); } else { psmi_log_io_table.maxTableEntries *= 2; psmi_log_io_table.table = psmi_realloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, psmi_log_io_table.table, psmi_log_io_table.maxTableEntries * sizeof(struct psmi_log_io_thread_info *)); } } psmi_log_io_table.table[psmi_log_io_table.nTableEntries] = &psmi_log_io_info; psmi_log_io_table.nTableEntries++; /* critical section end. */ pthread_mutex_unlock(&psmi_log_io_table.table_mutex); } if (pthread_mutex_lock(&psmi_log_io_info.flags_mutex)) { perror("cannot lock table mutex"); return -1; } /* critical section start. */ int old_flags = psmi_log_io_info.flags; int new_flags = old_flags; if (0 == (old_flags & PSMI_LOG_IO_FLAG_IO_SHUTDOWN)) new_flags |= PSMI_LOG_IO_FLAG_IO_IN_PROGRESS; psmi_log_io_info.flags = new_flags; /* critical section end. */ pthread_mutex_unlock(&psmi_log_io_info.flags_mutex); if (new_flags & PSMI_LOG_IO_FLAG_IO_IN_PROGRESS) return 0; return -1; } static void psmi_buff_fclose(int port) { if (pthread_mutex_lock(&psmi_log_io_info.flags_mutex)) { perror("cannot lock table mutex"); return; } /* critical section start. */ psmi_log_io_info.flags &= ~PSMI_LOG_IO_FLAG_IO_IN_PROGRESS; /* critical section end. 
*/ pthread_mutex_unlock(&psmi_log_io_info.flags_mutex); } static void growBuff(size_t minExcess) { while (psmi_log_io_info.curr_buff_length+minExcess > psmi_log_io_info.max_buff_length) { if (!psmi_log_io_info.buff) psmi_log_io_info.buff = (char *)psmi_malloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, psmi_log_io_info.max_buff_length = 1 << 20); else { psmi_log_io_info.max_buff_length *= 2; psmi_log_io_info.buff = (char *)psmi_realloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, psmi_log_io_info.buff, psmi_log_io_info.max_buff_length); } } } static int psmi_buff_vfprintf(int port, const char *format, va_list ap) { int done = 0; size_t excess = 1024; int length; while (!done) { growBuff(excess); length = vsnprintf(psmi_log_io_info.buff + psmi_log_io_info.curr_buff_length, excess, format, ap); if (length >= excess) excess *= 2; else done = 1; } psmi_log_io_info.curr_buff_length += length; return length; } static int psmi_buff_fprintf(int port,const char *format, ...) { int length; va_list ap; va_start(ap, format); length = psmi_buff_vfprintf(port,format,ap); va_end(ap); return length; } static int psmi_buff_fputc(int c, int port) { growBuff(1024); psmi_log_io_info.buff[psmi_log_io_info.curr_buff_length] = c; psmi_log_io_info.curr_buff_length++; return 1; } #endif #define IS_PSMI_LOG_MAGIC(S) ((((uint64_t)(S)) <= ((uint64_t)PSM2_LOG_MIN_MAGIC)) && \ (((uint64_t)(S)) >= ((uint64_t)PSM2_LOG_MAX_MAGIC))) /* plmf is short for 'psm log message facility. All of the PSM_LOG macros defined in psm_log.h are serviced from this back end. */ void psmi_log_message(const char *fileName, const char *functionName, int lineNumber, const char *format, ...) { va_list ap; va_start(ap, format); /* Next, determine if this log message is signal or noise. */ if (plmf_search_format_string) { if (!IS_PSMI_LOG_MAGIC(format)) { if (fnmatch(plmf_search_format_string, format, 0)) { va_end(ap); /* tis noise, return. */ return; } } } else { if (includeFunctionNamesTreeRoot) { if (lookupNodeInTree(includeFunctionNamesTreeRoot,functionName,lineNumber)) { va_end(ap); /* tis noise, return. */ return; } } if (excludeFunctionNamesTreeRoot) { if (!lookupNodeInTree(excludeFunctionNamesTreeRoot,functionName,lineNumber)) { va_end(ap); /* tis noise, return. */ return; } } } /* At this point, we think that this may be a message that we want to emit to the log. But, there is one more test, to apply to the cases where the format is one of the special formats for backtrack, and packet stream for example. 
*/ { void **voidarray = NULL; int nframes = 0; const char *newFormat = format; int opcode = 0; psmi_log_tx_rx_t txrx = 0; uint64_t fromepid = 0; uint64_t toepid = 0; void *dumpAddr[2] = {0}; size_t dumpSize[2] = {0}; #ifdef PSM_LOG_FAST_IO #define IO_PORT 0 #define MY_FPRINTF psmi_buff_fprintf #define MY_VFPRINTF psmi_buff_vfprintf #define MY_FPUTC psmi_buff_fputc #define MY_FCLOSE psmi_buff_fclose #else char logFileName[256]; FILE *fout; #define IO_PORT fout #define MY_FPRINTF fprintf #define MY_VFPRINTF vfprintf #define MY_FPUTC fputc #define MY_FCLOSE fclose #endif struct timespec tp; /* Pop arguments for the alternative forms of PSM_LOG functionality: */ if (format == PSM2_LOG_BT_MAGIC) { voidarray = va_arg(ap,void **); nframes = va_arg(ap,int); newFormat = va_arg(ap,const char *); } else if (format == PSM2_LOG_EPM_MAGIC) { opcode = va_arg(ap,int); txrx = va_arg(ap,psmi_log_tx_rx_t); fromepid = va_arg(ap,uint64_t); toepid = va_arg(ap,uint64_t); newFormat = va_arg(ap,const char *); } else if (format == PSM2_LOG_DUMP_MAGIC) { dumpAddr[0] = va_arg(ap,void*); dumpSize[0] = va_arg(ap,size_t); newFormat = va_arg(ap,const char *); } else if (format == PSM2_LOG_PKT_STRM_MAGIC) { txrx = va_arg(ap,psmi_log_tx_rx_t); dumpAddr[0] = va_arg(ap,struct ips_message_header *); if (txrx == PSM2_LOG_RX) { dumpAddr[1] = va_arg(ap,uint32_t *); dumpSize[1] = sizeof(uint64_t); } newFormat = va_arg(ap,const char *); dumpSize[0] = sizeof(struct ips_message_header); } /* One last test to make sure that this message is signal: */ if (plmf_search_format_string && newFormat) { if (fnmatch(plmf_search_format_string, newFormat, 0)) { va_end(ap); /* tis noise, return. */ return; } } #ifdef PSM_LOG_FAST_IO if (psmi_log_register_tls() != 0) { va_end(ap); return; } #else /* At this point we know that the message is not noise, and it is going to be emitted to the log. 
*/ snprintf(logFileName,sizeof(logFileName),"%s.%d.%ld", plmf_fileName_kernel,getpid(), pthread_self()); fout = fopen(logFileName,"a"); if (!fout) { va_end(ap); return; } #endif #define M1() clock_gettime(CLOCK_REALTIME, &tp); \ MY_FPRINTF(IO_PORT,"%f %s %s:%d: ", \ (double)tp.tv_sec + ((double)tp.tv_nsec/1000000000.0), \ functionName,fileName,lineNumber) M1(); if (!IS_PSMI_LOG_MAGIC(format)) { MY_VFPRINTF(IO_PORT,format,ap); MY_FPUTC('\n',IO_PORT); } else if (format == PSM2_LOG_BT_MAGIC) { void *newframes[nframes]; int newframecnt = backtrace(newframes,nframes); int pframes = min(newframecnt,nframes); MY_VFPRINTF(IO_PORT,newFormat,ap); MY_FPUTC('\n',IO_PORT); if (memcmp(voidarray,newframes,pframes * sizeof(void*))) { int i; char **strings; memcpy(voidarray,newframes,sizeof(newframes)); M1(); MY_FPRINTF(IO_PORT, "backtrace() returned %d addresses\n", newframecnt); strings = backtrace_symbols(voidarray, pframes); if (strings == NULL) { perror("backtrace_symbols"); exit(EXIT_FAILURE); } for (i = 0; i < pframes; i++) { M1(); MY_FPRINTF(IO_PORT,"%s\n", strings[i]); } #undef free free(strings); } } else if (format == PSM2_LOG_EPM_MAGIC) { static epmTreeNode *root = 0; static pthread_mutex_t plmf_epm_mutex = PTHREAD_MUTEX_INITIALIZER; int *pcount = 0; if (pthread_mutex_lock(&plmf_epm_mutex)) { perror("cannot lock mutex for " "psmi_log_message facility"); va_end(ap); return; } /* START OF CRITICAL SECTION */ pcount = insertNodeInEpmTree(&root,opcode,txrx, fromepid,toepid); /* END OF CRITICAL SECTION */ if (pthread_mutex_unlock(&plmf_epm_mutex)) { perror("cannot unlock mutex for " "psmi_log_message facility"); va_end(ap); return; } (*pcount)++; MY_FPRINTF(IO_PORT,"%s %s from: %" PRIx64 ", to: %" PRIx64 ", count: %d, ", TxRxString(txrx),OpcodeString(opcode), fromepid,toepid,*pcount); MY_VFPRINTF(IO_PORT,newFormat,ap); MY_FPUTC('\n',IO_PORT); } else if (format == PSM2_LOG_PKT_STRM_MAGIC) { MY_FPRINTF(IO_PORT,"PKT_STRM: %s: imh: %p%s ", TxRxString(txrx), dumpAddr[0], (txrx == PSM2_LOG_RX) ? "," : ""); if (txrx == PSM2_LOG_RX) MY_FPRINTF(IO_PORT,"rhf: %p ", dumpAddr[1]); goto dumpit; } else if (format == PSM2_LOG_DUMP_MAGIC) { MY_VFPRINTF(IO_PORT,newFormat,ap); MY_FPUTC('\n',IO_PORT); dumpit: M1(); uint8_t *pu8 = (uint8_t *)dumpAddr[0]; size_t i,cnt=0; for (i=0;i < dumpSize[0];i++) { if ((i != 0) && ((i % 8) == 0)) { MY_FPRINTF(IO_PORT," (%d)\n",(int)(i-8)); M1(); cnt = 0; } else if (cnt) MY_FPUTC(',',IO_PORT); MY_FPRINTF(IO_PORT,"0x%02x", pu8[i]); cnt++; } if (cnt) MY_FPRINTF(IO_PORT," (%d)\n",(int)(i-8)); if (dumpSize[1]) { dumpSize[0] = dumpSize[1]; dumpAddr[0] = dumpAddr[1]; dumpSize[1] = 0; goto dumpit; } } MY_FCLOSE(IO_PORT); } va_end(ap); } #endif /* #ifdef PSM_LOG */ opa-psm2-PSM2_11.2.185/psm_utils.h000066400000000000000000000305171370564314600164170ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _PSMI_IN_USER_H #error psm_utils.h not meant to be included directly, include psm_user.h instead #endif #ifndef _PSMI_UTILS_H #define _PSMI_UTILS_H #include /* ipv4addr */ #include /* malloc/free */ /* * Endpoint 'id' hash table, with iterator interface */ struct psmi_epid_table { struct psmi_epid_tabentry *table; int tabsize; int tabsize_used; pthread_mutex_t tablock; }; /* * Endpoint address hash table */ struct psmi_epid_tabentry { void *entry; uint64_t key; psm2_ep_t ep; psm2_epid_t epid; }; extern struct psmi_epid_table psmi_epid_table; #define EPADDR_DELETED ((void *)-1) /* tag used to mark deleted entries */ psm2_error_t psmi_epid_init(); psm2_error_t psmi_epid_fini(); void *psmi_epid_lookup(psm2_ep_t ep, psm2_epid_t epid); void *psmi_epid_remove(psm2_ep_t ep, psm2_epid_t epid); void psmi_epid_remove_all(psm2_ep_t ep); psm2_error_t psmi_epid_add(psm2_ep_t ep, psm2_epid_t epid, void *entry); #define PSMI_EP_HOSTNAME ((psm2_ep_t) -1) /* Special endpoint handle we use * to register hostnames */ #define PSMI_EP_CROSSTALK ((psm2_ep_t) -2) /* Second special endpoint handle * to log which nodes we've seen * crosstalk from */ struct psmi_eptab_iterator { int i; /* last index looked up */ psm2_ep_t ep; }; void psmi_epid_itor_init(struct psmi_eptab_iterator *itor, psm2_ep_t ep); void *psmi_epid_itor_next(struct psmi_eptab_iterator *itor); void psmi_epid_itor_fini(struct psmi_eptab_iterator *itor); uint64_t psmi_epid_version(psm2_epid_t epid); /* * Hostname manipulation */ char *psmi_gethostname(void); const char *psmi_epaddr_get_hostname(psm2_epid_t epid); const char *psmi_epaddr_get_name(psm2_epid_t epid); psm2_error_t psmi_epid_set_hostname(uint64_t nid, const char *hostname, int overwrite); /* * Memory allocation, use macros only. * * In all calls, ep can be a specific endpoint (valid psm2_ep_t) or PSMI_EP_NONE * if no endpoint is available. 
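 *
 * A hedged usage sketch (the buffer and its size are hypothetical,
 * for illustration only):
 *
 *   uint8_t *buf = psmi_malloc(ep, NETWORK_BUFFERS, bufsz);
 *   if (buf == NULL)
 *       return PSM2_NO_MEMORY;
 *   ... use buf ...
 *   psmi_free(buf);
 *
 * The available macros are: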
* * psmi_malloc_usable_size(void *ptr) * psmi_malloc(ep, memtype, size) * psmi_realloc(ep, memtype, ptr, newsize) * psmi_memalign(ep, memtype, alignment, size) * psmi_calloc(ep, memtype, elemsz, numelems) * psmi_strdup(ep, memtype, ptr) * psmi_free(ptr) * */ typedef enum psmi_memtype { TOTAL = 0, /* Logged automatically by malloc/calloc */ UNDEFINED, /* For tracking "other types" of allocations */ PER_PEER_ENDPOINT, /* For tracking "per peer" allocations */ NETWORK_BUFFERS, /* For tracking network buffers */ DESCRIPTORS, /* For tracking send/recv descriptors */ UNEXPECTED_BUFFERS, /* For tracking unexpected recv buffers */ STATS, /* For tracking stats-related allocs */ } psmi_memtype_t; /* * We track allocation stats. */ struct psmi_stats_malloc { int64_t m_all_total; int64_t m_all_max; int64_t m_perpeer_total; int64_t m_perpeer_max; int64_t m_netbufs_total; int64_t m_netbufs_max; int64_t m_descriptors_total; int64_t m_descriptors_max; int64_t m_unexpbufs_total; int64_t m_unexpbufs_max; int64_t m_undefined_total; int64_t m_undefined_max; int64_t m_stats_total; int64_t m_stats_max; }; extern struct psmi_stats_malloc psmi_stats_memory; void *psmi_malloc_internal(psm2_ep_t ep, psmi_memtype_t mt, size_t sz, const char *curloc); void *psmi_realloc_internal(psm2_ep_t ep, psmi_memtype_t mt, void *ptr, size_t newSz, const char *curloc); void *psmi_memalign_internal(psm2_ep_t ep, psmi_memtype_t mt, size_t alignment, size_t sz, const char *curloc); void *psmi_calloc_internal(psm2_ep_t ep, psmi_memtype_t mt, size_t num, size_t sz, const char *curloc); void *psmi_strdup_internal(psm2_ep_t ep, const char *string, const char *curloc); void MOCKABLE(psmi_free_internal)(void *ptr, const char *curLoc); MOCK_DCL_EPILOGUE(psmi_free_internal); size_t psmi_malloc_usable_size_internal(void *ptr, const char *curLoc); #ifdef PSM_HEAP_DEBUG /* In heap debug builds, we can sprinkle function calls: psmi_heapdebug_val_heapallocs(), which will examine all of the heap allocations to ensure integrity. */ void _psmi_heapdebug_val_heapallocs(const char *curloc); #define psmi_heapdebug_val_heapallocs() _psmi_heapdebug_val_heapallocs(PSMI_CURLOC) /* Finalize the heapdebug functionality after tear down of the psm session when you are certain that all heap allocations have been freed. psmi_heapdebug_finalize() will emit all of the extant heap allocations and abort if there are any. This is to aid in debug of heap leaks.
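 *
 * A hedged sketch of how the two hooks might be used in a
 * PSM_HEAP_DEBUG build (the surrounding teardown code is hypothetical):
 *
 *   psmi_heapdebug_val_heapallocs();  // validate heap integrity now
 *   ... free all remaining allocations ...
 *   psmi_heapdebug_finalize();        // emits leaks and aborts if any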
*/ void psmi_heapdebug_finalize(void); #else #define psmi_heapdebug_val_heapallocs() /* nothing */ #define psmi_heapdebug_finalize() /* nothing */ #endif #define psmi_strdup(ep, string) psmi_strdup_internal(ep, string, PSMI_CURLOC) #define psmi_calloc(ep, mt, nelem, elemsz) \ psmi_calloc_internal(ep, mt, nelem, elemsz, PSMI_CURLOC) #define psmi_malloc(ep, mt, sz) psmi_malloc_internal(ep, mt, sz, PSMI_CURLOC) #define psmi_realloc(ep, mt, ptr, nsz) psmi_realloc_internal(ep, mt, ptr, nsz, PSMI_CURLOC) #define psmi_memalign(ep, mt, al, sz) \ psmi_memalign_internal(ep, mt, al, sz, PSMI_CURLOC) #define psmi_free(ptr) psmi_free_internal(ptr, PSMI_CURLOC) #define psmi_malloc_usable_size(ptr) psmi_malloc_usable_size_internal(ptr, PSMI_CURLOC) #ifndef PSM_IS_TEST #define malloc(sz) _use_psmi_malloc_instead_of_plain_malloc #define realloc(ptr,nsz) _use_psmi_realloc_instead_of_plain_realloc #define memalign(algn,sz) _use_psmi_memalign_instead_of_plain_memalign #define calloc(sz, nelm) _use_psmi_calloc_instead_of_plain_calloc #ifdef strdup #undef strdup #endif #define strdup(ptr) _use_psmi_strdup_instead_of_plain_strdup #define free(ptr) _use_psmi_free_instead_of_plain_free #define malloc_usable_size(ptr) _use_psmi_malloc_usable_size_instead_of_plain_malloc_usable_size #endif /* PSM_IS_TEST */ void psmi_log_memstats(psmi_memtype_t type, int64_t nbytes); /* * Parsing int parameters set in string tuples. */ int psmi_parse_str_tuples(const char *str, int ntup, int *vals); /* * Resource Limiting based on PSM memory mode. */ #define PSMI_MEMMODE_NORMAL 0 #define PSMI_MEMMODE_MINIMAL 1 #define PSMI_MEMMODE_LARGE 2 #define PSMI_MEMMODE_NUM 3 struct psmi_rlimit_mpool { const char *env; const char *descr; int env_level; uint32_t minval; uint32_t maxval; struct { uint32_t obj_chunk; uint32_t obj_max; } mode[PSMI_MEMMODE_NUM]; }; psm2_error_t psmi_parse_mpool_env(const psm2_mq_t mq, int level, const struct psmi_rlimit_mpool *rlim, uint32_t *valo, uint32_t *chunkszo); int psmi_parse_memmode(void); /* * Parsing environment variables */ union psmi_envvar_val { void *e_void; char *e_str; int e_int; unsigned int e_uint; long e_long; unsigned long e_ulong; unsigned long long e_ulonglong; }; #define PSMI_ENVVAR_LEVEL_USER 1 #define PSMI_ENVVAR_LEVEL_HIDDEN 2 #define PSMI_ENVVAR_LEVEL_NEVER_PRINT 4 #define PSMI_ENVVAR_TYPE_YESNO 0 #define PSMI_ENVVAR_TYPE_STR 1 #define PSMI_ENVVAR_TYPE_INT 2 #define PSMI_ENVVAR_TYPE_UINT 3 #define PSMI_ENVVAR_TYPE_UINT_FLAGS 4 #define PSMI_ENVVAR_TYPE_LONG 5 #define PSMI_ENVVAR_TYPE_ULONG 6 #define PSMI_ENVVAR_TYPE_ULONG_FLAGS 7 #define PSMI_ENVVAR_TYPE_ULONG_ULONG 8 #define PSMI_ENVVAR_VAL_YES ((union psmi_envvar_val) 1) #define PSMI_ENVVAR_VAL_NO ((union psmi_envvar_val) 0) int MOCKABLE(psmi_getenv)(const char *name, const char *descr, int level, int type, union psmi_envvar_val defval, union psmi_envvar_val *newval); MOCK_DCL_EPILOGUE(psmi_getenv); /* * Misc functionality */ uintptr_t psmi_getpagesize(void); uint64_t psmi_cycles_left(uint64_t start_cycles, int64_t timeout_ns); uint32_t psmi_get_ipv4addr(); void psmi_syslog(psm2_ep_t ep, int to_console, int level, const char *format, ...); void psmi_uuid_unparse(const psm2_uuid_t uuid, char *out); int psmi_uuid_compare(const psm2_uuid_t uuA, const psm2_uuid_t uuB); void *psmi_memcpyo(void *dst, const void *src, size_t n); uint32_t psmi_crc(unsigned char *buf, int len); /* * Internal CPUID detection */ #define CPUID_FAMILY_MASK 0x00000f00 #define CPUID_MODEL_MASK 0x000000f0 #define CPUID_EXMODEL_MASK 0x000f0000 /* * CPUID return values 
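 *
 * (A hedged reference sketch: family/model are conventionally derived
 * from CPUID leaf 1 EAX with the masks above, e.g.
 *
 *   family = (eax & CPUID_FAMILY_MASK) >> 8;
 *   model  = ((eax & CPUID_EXMODEL_MASK) >> 12) |
 *            ((eax & CPUID_MODEL_MASK) >> 4);
 *
 * so CPUID_MODEL_PHI_GEN2 == 87 corresponds to model 0x57.)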
*/ #define CPUID_FAMILY_XEON 0x00000600 #define CPUID_MODEL_PHI_GEN2 87 #define CPUID_MODEL_PHI_GEN2M 133 /* * cpuid function 0, returns "GenuineIntel" in EBX,ECX,EDX * due to Little Endian and Hex it is not so obvious */ #define CPUID_GENUINE_INTEL_EBX 0x756e6547 /* "uneG" - Little Endian "Genu" */ #define CPUID_GENUINE_INTEL_ECX 0x6c65746e /* "letn" - Little Endian "ntel" */ #define CPUID_GENUINE_INTEL_EDX 0x49656e69 /* "Ieni" - Little Endian "ineI" */ /* * These values are internal only, not real register values */ #define CPUID_GENUINE_INTEL 0xf0000000 #define CPUID_MODEL_UNDEFINED -1 /* * Global model so we can tune defaults better for specific CPUs */ extern uint32_t psmi_cpu_model; /* * Diagnostics, all in psm_diags.c */ int psmi_diags(void); /* * Multiple Endpoints */ extern int psmi_multi_ep_enabled; void psmi_multi_ep_init(); #ifdef PSM_FI /* * Fault injection */ struct psmi_faultinj_spec; int psmi_faultinj_enabled; /* use macro to test */ #if 1 /* possible to disable at compile time */ #define PSMI_FAULTINJ_ENABLED() (!!psmi_faultinj_enabled) #else #define PSMI_FAULTINJ_ENABLED() 0 #endif void psmi_faultinj_init(); void psmi_faultinj_fini(); struct psmi_faultinj_spec *psmi_faultinj_getspec(char *spec_name, int num, int denom); #define PSMI_FAULTINJ_STATIC_DECL(var, spec_name, num, denom) \ static struct psmi_faultinj_spec *var; \ if (PSMI_FAULTINJ_ENABLED() && (var) == NULL) \ (var) = psmi_faultinj_getspec((spec_name), (num), (denom)); int psmi_faultinj_is_fault(struct psmi_faultinj_spec *spec); #endif /* #ifdef PSM_FI */ /* * PSM core component set/get options */ psm2_error_t psmi_core_setopt(const void *core_obj, int optname, const void *optval, uint64_t optlen); psm2_error_t psmi_core_getopt(const void *core_obj, int optname, void *optval, uint64_t *optlen); /* * PSM AM component set/get options */ psm2_error_t psmi_am_setopt(const void *am_obj, int optname, const void *optval, uint64_t optlen); psm2_error_t psmi_am_getopt(const void *am_obj, int optname, void *optval, uint64_t *optlen); #endif /* _PSMI_UTILS_H */ opa-psm2-PSM2_11.2.185/psmi_wrappers.c000066400000000000000000000060761370564314600172670ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include "psmi_wrappers.h" #include /* The following indirection wrappers for external functions * are only created if this is a mocking tests build */ #ifdef PSM2_MOCK_TESTING void MOCKABLE(psmi_exit)(int status) { exit(status); } MOCK_DEF_EPILOGUE(psmi_exit); ssize_t MOCKABLE(psmi_write)(int fd, const void *buf, size_t count) { return write(fd, buf, count); } MOCK_DEF_EPILOGUE(psmi_write); int MOCKABLE(psmi_ioctl)(int fd, unsigned int cmd, unsigned long arg) { return ioctl(fd, cmd, arg); } MOCK_DEF_EPILOGUE(psmi_ioctl); int MOCKABLE(psmi_sigaction)(int signum, const struct sigaction *act, struct sigaction *oldact) { return sigaction(signum, act, oldact); } MOCK_DEF_EPILOGUE(psmi_sigaction); void MOCKABLE(psmi_rmb)(void) { return ips_rmb(); } MOCK_DEF_EPILOGUE(psmi_rmb); #endif /* def PSM2_MOCK_TESTING */ opa-psm2-PSM2_11.2.185/psmi_wrappers.h000066400000000000000000000064631370564314600172760ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _PSMI_WRAPPERS_H #define _PSMI_WRAPPERS_H #include #include "psm2_mock_testing.h" #include "opa_intf.h" #if defined( IB_IOCTL_MAGIC ) #include #endif /* If this is a mocking tests build, we introduce "incision points" * through which we can easily mock external dependencies. * For non-mocking-tests build, we bypass those indirections * for performance reasons. */ #ifdef PSM2_MOCK_TESTING void MOCKABLE(psmi_exit)(int status); MOCK_DCL_EPILOGUE(psmi_exit); ssize_t MOCKABLE(psmi_write)(int fd, const void *buf, size_t count); MOCK_DCL_EPILOGUE(psmi_write); int MOCKABLE(psmi_ioctl)(int fd, unsigned int cmd, unsigned long arg); MOCK_DCL_EPILOGUE(psmi_ioctl); int MOCKABLE(psmi_sigaction)(int signum, const struct sigaction *act, struct sigaction *oldact); MOCK_DCL_EPILOGUE(psmi_sigaction); void MOCKABLE(psmi_rmb)(void); MOCK_DCL_EPILOGUE(psmi_rmb); #else /* def PSM2_MOCK_TESTING */ #define psmi_exit exit #define psmi_write write #define psmi_ioctl ioctl #define psmi_sigaction sigaction #define psmi_rmb ips_rmb #endif /* def PSM2_MOCK_TESTING */ #endif // _PSMI_WRAPPERS_H opa-psm2-PSM2_11.2.185/ptl.h000066400000000000000000000162621370564314600152000ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ /* Interface implemented by Packet Transport layers such as * ips and active messages. * * This interface can be volatile; it is never seen by PSM clients, and it will * probably change as the AM ptl is developed. */ #ifndef PSM_PTL_H #define PSM_PTL_H #include #include #include #include /* We currently have 3 PTLs, 0 is reserved. */ #define PTL_DEVID_IPS 1 #define PTL_DEVID_AMSH 2 #define PTL_DEVID_SELF 3 /* We can currently initialize up to 3 PTLs */ #define PTL_MAX_INIT 3 /* struct ptl is an incomplete type, and it serves as a generic or opaque container. It should remain an incomplete type in the entire psm source base. Concrete ptl types need to have a suffix such as ptl_self, ptl_ips. */ struct ptl; typedef struct ptl ptl_t; struct ptl_ctl; typedef struct ptl_ctl ptl_ctl_t; struct ptl_mq_req; typedef struct ptl_mq_req ptl_mq_req_t; struct ips_proto; typedef struct ips_proto ips_proto_t; /* To be filled in statically by all PTLs */ struct ptl_ctl_init { size_t(*sizeof_ptl) (void); psm2_error_t(*init) (const psm2_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl); psm2_error_t(*fini) (ptl_t *ptl, int force, uint64_t timeout_ns); psm2_error_t (*setopt) (const void *component_obj, int optname, const void *optval, uint64_t optlen); psm2_error_t (*getopt) (const void *component_obj, int optname, void *optval, uint64_t *optlen); }; struct ptl_ctl_rcvthread { uint32_t(*is_enabled) (const ptl_t *ptl); void(*transfer_ownership) (ptl_t *from_ptl, ptl_t *to_ptl); }; typedef struct ptl_arg { union { struct { uint16_t u16w3; uint16_t u16w2; uint16_t u16w1; uint16_t u16w0; }; struct { uint32_t u32w1; uint32_t u32w0; }; uint64_t u64w0; uint64_t u64; void *uptr; }; } ptl_arg_t; #include "ptl_self/ptl_fwd.h" #include "ptl_ips/ptl_fwd.h" #include "ptl_am/ptl_fwd.h" /* To be filled in as part of ptl_init */ struct ptl_ctl { ptl_t *ptl; /* pointer to ptl */ psm2_ep_t ep; /* pointer to ep */ /* EP-specific stuff */ psm2_error_t(*ep_poll) (ptl_t *ptl, int replyonly); /* PTL-level connect * * This PTL-level connect is slightly different from the top-level PSM connect. * * pre 1: Caller has masked off epids in epid array that are already * connected at the PSM level. * * post 0: PTL has allocated all epaddrs and whatever internal ptladdr * that ptl needs. * post 1: PTL marks error[i] as UNREACHABLE if PTL can't get to epid[i] * post 2: PTL marks error[i] as UNKNOWN for all epid[i] that couldn't * be connected before a timeout occurred. * post 3: PTL returns OK if all epids are either OK or UNREACHABLE * post 4: PTL defines content of epaddr[i] only if epaddr[i] is OK.
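 *
 * A hedged caller-side sketch of the convention above (variable
 * names hypothetical):
 *
 *   err = ctl->ep_connect(ctl->ptl, nep, epids, mask, errs, epaddrs, to);
 *   for (i = 0; i < nep; i++)
 *       if (mask[i] && errs[i] == PSM2_OK)
 *           use(epaddrs[i]);  // defined only when errs[i] is OK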
*/ psm2_error_t(*ep_connect) (ptl_t *ptl, int num_ep, const psm2_epid_t input_array_of_epid[], const int array_of_epid_mask[], psm2_error_t output_array_of_errors[], psm2_epaddr_t output_array_of_epddr[], uint64_t timeout_ns); psm2_error_t (*ep_disconnect)(ptl_t *ptl, int force, int num_ep, psm2_epaddr_t input_array_of_epaddr[], const int array_of_epaddr_mask[], psm2_error_t output_array_of_errors[], uint64_t timeout_ns); /* MQ stuff */ psm2_error_t(*mq_send) (psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, psm2_mq_tag_t *stag, const void *buf, uint32_t len); psm2_error_t(*mq_isend) (psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags_user, uint32_t flags_internal, psm2_mq_tag_t *stag, const void *buf, uint32_t len, void *ctxt, psm2_mq_req_t *req); int (*epaddr_stats_num) (void); int (*epaddr_stats_init) (char *desc[], uint16_t *flags); int (*epaddr_stats_get) (psm2_epaddr_t epaddr, uint64_t *stats); /* AM stuff */ psm2_error_t(*am_get_parameters) (psm2_ep_t ep, struct psm2_am_parameters * parameters); psm2_error_t(*am_short_request) (psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, int flags, psm2_am_completion_fn_t completion_fn, void *completion_ctxt); psm2_error_t(*am_short_reply) (psm2_am_token_t token, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, int flags, psm2_am_completion_fn_t completion_fn, void *completion_ctxt); /* Long messages currently unsupported */ #if 0 psm2_error_t(*am_long_request) (psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, void *dest, int flags); psm2_error_t(*am_long_reply) (psm2_am_token_t token, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, void *dest, int flags); #endif psm2_error_t (*msg_size_thresh_query) (enum psm2_info_query_thresh_et, uint32_t *out, psm2_mq_t mq, psm2_epaddr_t); }; #endif opa-psm2-PSM2_11.2.185/ptl_am/000077500000000000000000000000001370564314600154755ustar00rootroot00000000000000opa-psm2-PSM2_11.2.185/ptl_am/Makefile000066400000000000000000000063261370564314600171440ustar00rootroot00000000000000# # This file is provided under a dual BSD/GPLv2 license. When using or # redistributing this file, you may do so under either license. # # GPL LICENSE SUMMARY # # Copyright(c) 2015 Intel Corporation. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # Contact Information: # Intel Corporation, www.intel.com # # BSD LICENSE # # Copyright(c) 2015 Intel Corporation. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. 
# * Neither the name of Intel Corporation nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # Copyright (c) 2003-2014 Intel Corporation. All rights reserved. # OUTDIR = . this_srcdir := $(shell readlink -m .) top_srcdir := $(this_srcdir)/.. INCLUDES += -I$(top_srcdir) ${TARGLIB}-objs := am_reqrep.o am_reqrep_shmem.o ptl.o cmarwu.o ${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs}) DEPS := $(${TARGLIB}-objs:.o=.d) .PHONY: all clean IGNORE_DEP_TARGETS = clean all .DEFAULT: ${${TARGLIB}-objs} $(OUTDIR)/%.d: $(this_srcdir)/%.c $(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o) $(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS} $(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -c $< -o $@ clean: @if [ -d $(OUTDIR) ]; then \ cd $(OUTDIR); \ rm -f *.o *.d *.gcda *.gcno; \ cd -; \ fi #ifeq prevents the deps from being included during clean #-include line is required to pull in auto-dependencies during 2nd pass ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),) -include ${DEPS} endif install: @echo "Nothing to do for install." opa-psm2-PSM2_11.2.185/ptl_am/am_config.h000066400000000000000000000061141370564314600175720ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2018 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2018 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef PTL_AM_AM_CONFIG_H #define PTL_AM_AM_CONFIG_H #include "psm_config.h" /* * Can change the rendezvous threshold based on usage of cma (or not) */ #define PSMI_MQ_RV_THRESH_CMA 16000 /* If no kernel assisted copy is available this is the rendezvous threshold */ #define PSMI_MQ_RV_THRESH_NO_KASSIST 16000 #define AMSH_HAVE_CMA 0x1 #define AMSH_HAVE_KASSIST 0x1 /* Each block reserves some space at the beginning to store auxiliary data */ #define AMSH_BLOCK_HEADER_SIZE 4096 /* AMLONG_SZ is the total size in memory of a bulk packet, including an * am_pkt_bulk_t header struct. * AMLONG_MTU is the number of bytes available in a bulk packet for payload. */ #define AMLONG_SZ 8192 #define AMLONG_MTU (AMLONG_SZ-sizeof(am_pkt_bulk_t)) #define PSMI_KASSIST_MODE_DEFAULT PSMI_KASSIST_CMA_GET #define PSMI_KASSIST_MODE_DEFAULT_STRING "cma-get" #endif /* PTL_AM_AM_CONFIG_H */ opa-psm2-PSM2_11.2.185/ptl_am/am_cuda_memhandle_cache.c000066400000000000000000000350351370564314600223750ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef PSM_CUDA #include "psm_user.h" #include "am_cuda_memhandle_cache.h" /* * rbtree cruft */ struct _cl_map_item; typedef struct { unsigned long start; /* start virtual address */ CUipcMemHandle cuda_ipc_handle; /* cuda ipc mem handle */ CUdeviceptr cuda_ipc_dev_ptr;/* Cuda device pointer */ uint16_t length; /* length */ psm2_epid_t epid; struct _cl_map_item* i_prev; /* idle queue previous */ struct _cl_map_item* i_next; /* idle queue next */ }__attribute__ ((aligned (128))) rbtree_cuda_memhandle_cache_mapitem_pl_t; typedef struct { uint32_t nelems; /* number of elements in the cache */ } rbtree_cuda_memhandle_cache_map_pl_t; static psm2_error_t am_cuda_memhandle_mpool_init(uint32_t memcache_size); /* * Custom comparator */ typedef rbtree_cuda_memhandle_cache_mapitem_pl_t cuda_cache_item; static int cuda_cache_key_cmp(const cuda_cache_item *a, const cuda_cache_item *b) { // When multi-ep is disabled, cache can assume // 1 epid == 1 remote process == 1 CUDA address space // But when multi-ep is enabled, one process can have many epids, so in this case // cannot use epid as part of cache key. if (!psmi_multi_ep_enabled) { if (a->epid < b->epid) return -1; if (a->epid > b->epid) return 1; } unsigned long a_end, b_end; // normalize into inclusive upper bounds to handle // 0-length entries a_end = (a->start + a->length); b_end = (b->start + b->length); if (a->length > 0) a_end--; if (b->length > 0) b_end--; if (a_end < b->start) return -1; if (b_end < a->start) return 1; return 0; } /* * Necessary rbtree cruft */ #define RBTREE_MI_PL rbtree_cuda_memhandle_cache_mapitem_pl_t #define RBTREE_MAP_PL rbtree_cuda_memhandle_cache_map_pl_t #define RBTREE_CMP(a,b) cuda_cache_key_cmp((a), (b)) #define RBTREE_ASSERT psmi_assert #define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->nelems) #define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR #include "rbtree.h" #include "rbtree.c" /* * Convenience rbtree cruft */ #define NELEMS cuda_memhandle_cachemap.payload.nelems #define IHEAD cuda_memhandle_cachemap.root #define LAST IHEAD->payload.i_prev #define FIRST IHEAD->payload.i_next #define INEXT(x) x->payload.i_next #define IPREV(x) x->payload.i_prev /* * Actual module data */ static cl_qmap_t cuda_memhandle_cachemap; /* Global cache */ static uint8_t cuda_memhandle_cache_enabled; static mpool_t cuda_memhandle_mpool; static uint32_t cuda_memhandle_cache_size; static uint64_t cache_hit_counter; static uint64_t cache_miss_counter; static uint64_t cache_evict_counter; static uint64_t cache_collide_counter; static uint64_t cache_clear_counter; static void print_cuda_memhandle_cache_stats(void) { _HFI_DBG("enabled=%u,size=%u,hit=%lu,miss=%lu,evict=%lu,collide=%lu,clear=%lu\n", cuda_memhandle_cache_enabled, cuda_memhandle_cache_size, cache_hit_counter, cache_miss_counter, cache_evict_counter, cache_collide_counter, cache_clear_counter); } /* * This is the callback function invoked when the mempool is resized or destroyed.
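 * (It is registered below, in am_cuda_memhandle_mpool_init(), as the
 * mpool's object init/teardown hook; on teardown it runs with
 * is_alloc == 0 for each cached item.)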
* Upon calling cache fini, the mpool is destroyed, which in turn invokes this callback * to close all memhandles. */ static void psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj) { cl_map_item_t* memcache_item = (cl_map_item_t*)obj; if (!is_alloc) { if(memcache_item->payload.start) PSMI_CUDA_CALL(cuIpcCloseMemHandle, memcache_item->payload.cuda_ipc_dev_ptr); } } /* * Creating mempool for cuda memhandle cache nodes. */ static psm2_error_t am_cuda_memhandle_mpool_init(uint32_t memcache_size) { psm2_error_t err; if (memcache_size < 1) return PSM2_PARAM_ERR; cuda_memhandle_cache_size = memcache_size; /* Creating a memory pool of size PSM2_CUDA_MEMCACHE_SIZE * which includes the Root and NIL items */ cuda_memhandle_mpool = psmi_mpool_create_for_cuda(sizeof(cl_map_item_t), cuda_memhandle_cache_size, cuda_memhandle_cache_size, 0, UNDEFINED, NULL, NULL, psmi_cuda_memhandle_cache_alloc_func, NULL); if (cuda_memhandle_mpool == NULL) { err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, "Couldn't allocate CUDA host receive buffer pool"); return err; } return PSM2_OK; } /* * Initialize rbtree. */ psm2_error_t am_cuda_memhandle_cache_init(uint32_t memcache_size) { psm2_error_t err = am_cuda_memhandle_mpool_init(memcache_size); if (err != PSM2_OK) return err; cl_map_item_t *root, *nil_item; root = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); if (root == NULL) return PSM2_NO_MEMORY; nil_item = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); if (nil_item == NULL) { psmi_free(root); return PSM2_NO_MEMORY; } nil_item->payload.start = 0; nil_item->payload.epid = 0; nil_item->payload.length = 0; cuda_memhandle_cache_enabled = 1; ips_cl_qmap_init(&cuda_memhandle_cachemap,root,nil_item); NELEMS = 0; cache_hit_counter = 0; cache_miss_counter = 0; cache_evict_counter = 0; cache_collide_counter = 0; cache_clear_counter = 0; return PSM2_OK; } void am_cuda_memhandle_cache_map_fini() { print_cuda_memhandle_cache_stats(); if (cuda_memhandle_cachemap.nil_item) { psmi_free(cuda_memhandle_cachemap.nil_item); cuda_memhandle_cachemap.nil_item = NULL; } if (cuda_memhandle_cachemap.root) { psmi_free(cuda_memhandle_cachemap.root); cuda_memhandle_cachemap.root = NULL; } if (cuda_memhandle_cache_enabled) { psmi_mpool_destroy(cuda_memhandle_mpool); cuda_memhandle_cache_enabled = 0; } cuda_memhandle_cache_size = 0; } /* * Insert at the head of Idleq. */ static void am_cuda_idleq_insert(cl_map_item_t* memcache_item) { if (FIRST == NULL) { FIRST = memcache_item; LAST = memcache_item; return; } INEXT(FIRST) = memcache_item; IPREV(memcache_item) = FIRST; FIRST = memcache_item; INEXT(FIRST) = NULL; return; } /* * Remove the least recently used element.
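 * The idle queue is a doubly-linked LRU list threaded through the map
 * items: FIRST points at the most recently used entry and LAST at the
 * least recently used one, so eviction always detaches LAST.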
*/ static void am_cuda_idleq_remove_last(cl_map_item_t* memcache_item) { if (!INEXT(memcache_item)) { LAST = NULL; FIRST = NULL; } else { LAST = INEXT(memcache_item); IPREV(LAST) = NULL; } // Null-out now-removed memcache_item's next and prev pointers out of // an abundance of caution INEXT(memcache_item) = IPREV(memcache_item) = NULL; } static void am_cuda_idleq_remove(cl_map_item_t* memcache_item) { if (LAST == memcache_item) { am_cuda_idleq_remove_last(memcache_item); } else if (FIRST == memcache_item) { FIRST = IPREV(memcache_item); INEXT(FIRST) = NULL; } else { INEXT(IPREV(memcache_item)) = INEXT(memcache_item); IPREV(INEXT(memcache_item)) = IPREV(memcache_item); } // Null-out now-removed memcache_item's next and prev pointers out of // an abundance of caution INEXT(memcache_item) = IPREV(memcache_item) = NULL; } static void am_cuda_idleq_reorder(cl_map_item_t* memcache_item) { if (FIRST == memcache_item && LAST == memcache_item ) { return; } am_cuda_idleq_remove(memcache_item); am_cuda_idleq_insert(memcache_item); return; } /* * After a successful cache hit, the item is validated by doing a * memcmp on the handle stored and the handle we receive from the * sender. If the validation fails, the item is removed from the idleq * and the rbtree, put back into the mpool, and the IpcCloseMemHandle function * is called. */ static psm2_error_t am_cuda_memhandle_cache_validate(cl_map_item_t* memcache_item, uintptr_t sbuf, CUipcMemHandle* handle, uint32_t length, psm2_epid_t epid) { if ((0 == memcmp(handle, &memcache_item->payload.cuda_ipc_handle, sizeof(CUipcMemHandle))) && sbuf == memcache_item->payload.start && epid == memcache_item->payload.epid) { return PSM2_OK; } _HFI_DBG("cache collision: new entry start=%lu,length=%u\n", sbuf, length); cache_collide_counter++; ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, memcache_item); PSMI_CUDA_CALL(cuIpcCloseMemHandle, memcache_item->payload.cuda_ipc_dev_ptr); am_cuda_idleq_remove(memcache_item); memset(memcache_item, 0, sizeof(*memcache_item)); psmi_mpool_put(memcache_item); return PSM2_OK_NO_PROGRESS; } /* * Current eviction policy: Least Recently Used. */ static void am_cuda_memhandle_cache_evict(void) { cache_evict_counter++; cl_map_item_t *p_item = LAST; _HFI_VDBG("Removing (epid=%lu,start=%lu,length=%u,dev_ptr=0x%llX,it=%p) from cuda_memhandle_cachemap.\n", p_item->payload.epid, p_item->payload.start, p_item->payload.length, p_item->payload.cuda_ipc_dev_ptr, p_item); ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, p_item); PSMI_CUDA_CALL(cuIpcCloseMemHandle, p_item->payload.cuda_ipc_dev_ptr); am_cuda_idleq_remove_last(p_item); memset(p_item, 0, sizeof(*p_item)); psmi_mpool_put(p_item); } static psm2_error_t am_cuda_memhandle_cache_register(uintptr_t sbuf, CUipcMemHandle* handle, uint32_t length, psm2_epid_t epid, CUdeviceptr cuda_ipc_dev_ptr) { if (NELEMS == cuda_memhandle_cache_size) am_cuda_memhandle_cache_evict(); cl_map_item_t* memcache_item = psmi_mpool_get(cuda_memhandle_mpool); /* memcache_item cannot be NULL as we evict * before the call to mpool_get. Check has * been added to help with Klocwork analysis.
*/ if (memcache_item == NULL) return PSM2_NO_MEMORY; memcache_item->payload.start = sbuf; memcache_item->payload.cuda_ipc_handle = *handle; memcache_item->payload.cuda_ipc_dev_ptr = cuda_ipc_dev_ptr; memcache_item->payload.length = length; memcache_item->payload.epid = epid; ips_cl_qmap_insert_item(&cuda_memhandle_cachemap, memcache_item); am_cuda_idleq_insert(memcache_item); return PSM2_OK; } static void am_cuda_memhandle_cache_clear(void) { _HFI_DBG("Closing all handles, clearing cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS); while (NELEMS) { am_cuda_memhandle_cache_evict(); } _HFI_DBG("Closed all handles, cleared cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS); } /* * The key used to search the cache is the sender's buf address pointer. * Upon a successful hit in the cache, additional validation is required * as multiple senders could potentially send the same buf address value. */ CUdeviceptr am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle, uint32_t length, psm2_epid_t epid) { _HFI_VDBG("sbuf=%lu,handle=%p,length=%u,epid=%lu\n", sbuf, handle, length, epid); CUdeviceptr cuda_ipc_dev_ptr; if(!cuda_memhandle_cache_enabled) { PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); return cuda_ipc_dev_ptr; } cuda_cache_item key = { .start = (unsigned long) sbuf, .length= length, .epid = epid }; /* * preconditions: * 1) newrange [start,end) may or may not be in cachemap already * 2) there are no overlapping address ranges in cachemap * postconditions: * 1) newrange is in cachemap * 2) there are no overlapping address ranges in cachemap * * The key used to search the cache is the sender's buf address pointer. * Upon a successful hit in the cache, additional validation is required * as multiple senders could potentially send the same buf address value. */ cl_map_item_t *p_item = ips_cl_qmap_searchv(&cuda_memhandle_cachemap, &key); while (p_item->payload.start) { // Since a precondition is that there are no overlapping ranges in cachemap, // an exact match implies no need to check further if (am_cuda_memhandle_cache_validate(p_item, sbuf, handle, length, epid) == PSM2_OK) { cache_hit_counter++; am_cuda_idleq_reorder(p_item); return p_item->payload.cuda_ipc_dev_ptr; } // newrange is not in the cache and overlaps at least one existing range. // am_cuda_memhandle_cache_validate() closed and removed existing range. // Continue searching for more overlapping ranges p_item = ips_cl_qmap_searchv(&cuda_memhandle_cachemap, &key); } cache_miss_counter++; CUresult cudaerr; PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_ALREADY_MAPPED, cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); if (cudaerr == CUDA_ERROR_ALREADY_MAPPED) { // remote memory already mapped. Close all handles, clear cache, // and try again am_cuda_memhandle_cache_clear(); cache_clear_counter++; PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); } am_cuda_memhandle_cache_register(sbuf, handle, length, epid, cuda_ipc_dev_ptr); return cuda_ipc_dev_ptr; } void am_cuda_memhandle_release(CUdeviceptr cuda_ipc_dev_ptr) { if(!cuda_memhandle_cache_enabled) PSMI_CUDA_CALL(cuIpcCloseMemHandle, cuda_ipc_dev_ptr); return; } #endif opa-psm2-PSM2_11.2.185/ptl_am/am_cuda_memhandle_cache.h000066400000000000000000000054511370564314600224010ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license.
GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef PSM_CUDA #ifndef _AM_CUDA_MEMHANDLE_CACHE_H #define _AM_CUDA_MEMHANDLE_CACHE_H #include "psm_user.h" #include #ifdef __cplusplus extern "C" { #endif #define CUDA_MEMHANDLE_CACHE_SIZE 64 psm2_error_t am_cuda_memhandle_cache_init(uint32_t memcache_size); CUdeviceptr am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle, uint32_t length, psm2_epid_t epid); void am_cuda_memhandle_release(CUdeviceptr cuda_ipc_dev_ptr); void am_cuda_memhandle_cache_map_fini(); #ifdef __cplusplus } /* extern "C" */ #endif #endif /* _AM_CUDA_MEMHANDLE_CACHE_H */ #endif /* PSM_CUDA */ opa-psm2-PSM2_11.2.185/ptl_am/am_reqrep.c000066400000000000000000000102321370564314600176120ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include "psm_user.h" #include "psm2_am.h" #include "psm_mq_internal.h" #include "psm_am_internal.h" psm2_error_t psmi_amsh_am_short_request(psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, int flags, psm2_am_completion_fn_t completion_fn, void *completion_ctxt) { psm2_amarg_t req_args[NSHORT_ARGS + NBULK_ARGS]; /* All sends are synchronous. Ignore PSM2_AM_FLAG_ASYNC. * Treat PSM2_AM_FLAG_NOREPLY as "advisory". This was mainly * used to optimize the IPS path though we could put a stricter interpretation * on it to disallow any replies. */ /* For now less than NSHORT_ARGS+NBULK_ARGS-1. We use the first arg to carry * the handler index. */ psmi_assert(nargs <= (NSHORT_ARGS + NBULK_ARGS - 1)); psmi_assert(epaddr->ptlctl->ptl != NULL); req_args[0].u32w0 = (uint32_t) handler; psmi_mq_mtucpy((void *)&req_args[1], (const void *)args, (nargs * sizeof(psm2_amarg_t))); psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr, am_handler_hidx, req_args, nargs + 1, src, len, 0); if (completion_fn) completion_fn(completion_ctxt); return PSM2_OK; } psm2_error_t psmi_amsh_am_short_reply(psm2_am_token_t tok, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, int flags, psm2_am_completion_fn_t completion_fn, void *completion_ctxt) { psm2_amarg_t rep_args[NSHORT_ARGS + NBULK_ARGS]; /* For now less than NSHORT_ARGS+NBULK_ARGS-1. We use the first arg to carry * the handler index. */ psmi_assert(nargs <= (NSHORT_ARGS + NBULK_ARGS - 1)); rep_args[0].u32w0 = (uint32_t) handler; psmi_mq_mtucpy((void *)&rep_args[1], (const void *)args, (nargs * sizeof(psm2_amarg_t))); psmi_amsh_short_reply((amsh_am_token_t *) tok, am_handler_hidx, rep_args, nargs + 1, src, len, 0); if (completion_fn) completion_fn(completion_ctxt); return PSM2_OK; } opa-psm2-PSM2_11.2.185/ptl_am/am_reqrep_shmem.c000066400000000000000000002245611370564314600210170ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. 
When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/ #include /* shm_open and signal handling */ #include #include #include #include #include "psm_user.h" #include "psm_mq_internal.h" #include "psm_am_internal.h" #include "cmarw.h" #include "psmi_wrappers.h" #ifdef PSM_CUDA #include "am_cuda_memhandle_cache.h" #endif int psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST; static const amsh_qinfo_t amsh_qcounts = { .qreqFifoShort = 1024, .qreqFifoLong = 256, .qrepFifoShort = 1024, .qrepFifoLong = 256 }; static const amsh_qinfo_t amsh_qelemsz = { .qreqFifoShort = sizeof(am_pkt_short_t), .qreqFifoLong = AMLONG_SZ, .qrepFifoShort = sizeof(am_pkt_short_t), .qrepFifoLong = AMLONG_SZ }; ustatic struct { void *addr; size_t len; struct sigaction SIGSEGV_old_act; struct sigaction SIGBUS_old_act; } action_stash; static psm2_error_t amsh_poll(ptl_t *ptl, int replyonly); static void process_packet(ptl_t *ptl, am_pkt_short_t *pkt, int isreq); static void amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len); /* Kassist helper functions */ #if _HFI_DEBUGGING static const char *psmi_kassist_getmode(int mode); #endif static int psmi_get_kassist_mode(); int psmi_epaddr_pid(psm2_epaddr_t epaddr); static inline void am_ctl_qhdr_init(volatile am_ctl_qhdr_t *q, int elem_cnt, int elem_sz) { pthread_spin_init(&q->lock, PTHREAD_PROCESS_SHARED); q->head = 0; q->tail = 0; q->elem_cnt = elem_cnt; q->elem_sz = elem_sz; } static void am_ctl_bulkpkt_init(am_pkt_bulk_t *base_ptr, size_t elemsz, int nelems) { int i; am_pkt_bulk_t *bulkpkt; uintptr_t bulkptr = (uintptr_t) base_ptr; for (i = 0; i < nelems; i++, bulkptr += elemsz) { bulkpkt = (am_pkt_bulk_t *) bulkptr; bulkpkt->idx = i; } } #define _PA(type) PSMI_ALIGNUP(amsh_qcounts.q ## type * amsh_qelemsz.q ## type, \ PSMI_PAGESIZE) static inline uintptr_t am_ctl_sizeof_block() { return PSMI_ALIGNUP( PSMI_ALIGNUP(AMSH_BLOCK_HEADER_SIZE, PSMI_PAGESIZE) + /* reqctrl block */ PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) + _PA(reqFifoShort) + _PA(reqFifoLong) + /*reqctrl block */ PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) + /* align to page size */ _PA(repFifoShort) + _PA(repFifoLong), PSMI_PAGESIZE); } #undef _PA static uint32_t create_extra_ep_data() { uint32_t ret = getpid(); #ifdef PSM_CUDA /* PID is at maximum 22 bits */ ret |= my_gpu_device << 22; #endif return ret; } static void read_extra_ep_data(uint32_t data, uint32_t *pid, uint32_t *gpu) { uint32_t pid_mask = (1 << 22) - 1; *pid = data & pid_mask; *gpu = (data & ~pid_mask) >> 22; } static void am_update_directory(struct am_ctl_nodeinfo *); static void amsh_atexit() { static ips_atomic_t atexit_once = { 0 }; psm2_ep_t ep; struct ptl_am *ptl; /* bail out if previous value is non-zero */ if (ips_atomic_cmpxchg(&atexit_once, 0, 1) != 0) return; ep = psmi_opened_endpoint; while (ep) { ptl = (struct ptl_am *)(ep->ptl_amsh.ptl); if (ptl->self_nodeinfo && ptl->amsh_keyname != NULL) { _HFI_VDBG("unlinking shm file %s\n", ptl->amsh_keyname); shm_unlink(ptl->amsh_keyname); } ep = ep->user_ep_next; } return; } ustatic void amsh_mmap_fault(int signo, siginfo_t *siginfo, void *context) { if ((unsigned long int) siginfo->si_addr >= (unsigned long int) action_stash.addr && (unsigned long int) siginfo->si_addr < (unsigned long int) action_stash.addr + (unsigned long int) action_stash.len) { static char shm_errmsg[256]; snprintf(shm_errmsg, sizeof(shm_errmsg), "%s: Unable to allocate shared memory for intra-node messaging.\n" "%s: Delete stale shared memory files in /dev/shm.\n", psmi_gethostname(), psmi_gethostname()); 
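/* Best-effort cleanup before exiting: amsh_atexit() below unlinks this
 * process's shm file(s) so stale segments are not left behind in
 * /dev/shm after the fatal fault. */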
amsh_atexit(); if (psmi_write(2, shm_errmsg, strlen(shm_errmsg) + 1) == -1) psmi_exit(2); else psmi_exit(1); /* XXX revisit this... there's probably a better way to exit */ } else { if (signo == SIGSEGV) { if (action_stash.SIGSEGV_old_act.sa_sigaction == (void*) SIG_DFL) { psmi_sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL); raise(SIGSEGV); struct sigaction act; act.sa_sigaction = amsh_mmap_fault; act.sa_flags = SA_SIGINFO; psmi_sigaction(SIGSEGV, &act, NULL); } else if (action_stash.SIGSEGV_old_act.sa_sigaction == (void*) SIG_IGN) { return; } else { action_stash.SIGSEGV_old_act.sa_sigaction(signo, siginfo, context); } } else if (signo == SIGBUS) { if (action_stash.SIGBUS_old_act.sa_sigaction == (void*) SIG_DFL) { psmi_sigaction(SIGBUS, &action_stash.SIGBUS_old_act, NULL); raise(SIGBUS); struct sigaction act; act.sa_sigaction = amsh_mmap_fault; act.sa_flags = SA_SIGINFO; psmi_sigaction(SIGBUS, &act, NULL); } else if (action_stash.SIGBUS_old_act.sa_sigaction == (void*) SIG_IGN) { return; } else { action_stash.SIGBUS_old_act.sa_sigaction(signo, siginfo, context); } } else { psmi_exit(signo); } } } /** * Create endpoint shared-memory object, containing ep's info * and message queues. */ psm2_error_t psmi_shm_create(ptl_t *ptl_gen) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; psm2_ep_t ep = ptl->ep; char shmbuf[256]; void *mapptr; size_t segsz; psm2_error_t err = PSM2_OK; int shmfd = -1; char *amsh_keyname = NULL; int iterator; /* Get which kassist mode to use. */ ptl->psmi_kassist_mode = psmi_get_kassist_mode(); if (_HFI_PRDBG_ON) { _HFI_PRDBG_ALWAYS ("kassist_mode %d %s use_kassist %d\n", ptl->psmi_kassist_mode, psmi_kassist_getmode(ptl->psmi_kassist_mode), (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF)); } segsz = am_ctl_sizeof_block(); for (iterator = 0; iterator <= INT_MAX; iterator++) { snprintf(shmbuf, sizeof(shmbuf), "/psm2_shm.%ld%016lx%d", (long int) getuid(), ep->epid, iterator); amsh_keyname = psmi_strdup(NULL, shmbuf); if (amsh_keyname == NULL) { err = PSM2_NO_MEMORY; goto fail; } shmfd = shm_open(amsh_keyname, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); if (shmfd < 0) { psmi_free(amsh_keyname); amsh_keyname = NULL; if (errno == EACCES && iterator < INT_MAX) continue; else { err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error creating shared " "memory object in " "shm_open: %s", strerror(errno)); goto fail; } } else { struct stat st; if (fstat(shmfd, &st) == -1) { err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error validating " "shared memory object " "with fstat: %s", strerror(errno)); goto fail; } if (getuid() == st.st_uid) { err = PSM2_OK; break; } else { err = PSM2_SHMEM_SEGMENT_ERR; close(shmfd); } } } if (err) { if (amsh_keyname) psmi_free(amsh_keyname); err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error creating shared memory object " "in shm_open: namespace exhausted."); goto fail; } /* Now register the atexit handler for cleanup, whether master or slave */ atexit(amsh_atexit); _HFI_PRDBG("Opened shmfile %s\n", amsh_keyname); if (ftruncate(shmfd, segsz) != 0) { err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error setting size of shared memory object to %u bytes in " "ftruncate: %s\n", (uint32_t) segsz, strerror(errno)); goto fail; } mapptr = mmap(NULL, segsz, PROT_READ | PROT_WRITE, MAP_SHARED, shmfd, 0); if (mapptr == MAP_FAILED) { err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error mmapping shared memory: %s", strerror(errno)); psmi_free(amsh_keyname); goto fail; } memset((void *) mapptr, 0, segsz); /* touch all of my 
pages */ /* Our own ep's info for ptl_am resides at the start of the shm object. Other processes need some of this info to understand the rest of the queue structure and other details. */ ptl->self_nodeinfo = (struct am_ctl_nodeinfo *) mapptr; ptl->amsh_keyname = amsh_keyname; ptl->self_nodeinfo->amsh_shmbase = (uintptr_t) mapptr; fail: if (shmfd >= 0) close(shmfd); return err; } psm2_error_t psmi_epdir_extend(ptl_t *ptl_gen) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; struct am_ctl_nodeinfo *new = NULL; new = (struct am_ctl_nodeinfo *) psmi_memalign(ptl->ep, PER_PEER_ENDPOINT, 64, (ptl->am_ep_size + AMSH_DIRBLOCK_SIZE) * sizeof(struct am_ctl_nodeinfo)); if (new == NULL) return PSM2_NO_MEMORY; memcpy(new, ptl->am_ep, ptl->am_ep_size * sizeof(struct am_ctl_nodeinfo)); memset(new + ptl->am_ep_size, 0, AMSH_DIRBLOCK_SIZE * sizeof(struct am_ctl_nodeinfo)); psmi_free(ptl->am_ep); ptl->am_ep = new; ptl->am_ep_size += AMSH_DIRBLOCK_SIZE; return PSM2_OK; } /** * Unmap shm regions upon proper disconnect with other processes */ psm2_error_t psmi_do_unmap(uintptr_t shmbase) { psm2_error_t err = PSM2_OK; if (munmap((void *)shmbase, am_ctl_sizeof_block())) { err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error with munmap of shared segment: %s", strerror(errno)); } return err; } /** * Map a remote process' shared memory object. * * If the remote process has a shared memory object available, add it to our own * directory and return the shmidx. If the shared memory object does not exist, * return -1, and the connect poll function will try to map again later. * * If force_remap is true, then clear the entry that matches the epid. */ psm2_error_t psmi_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shmidx_o, int force_remap) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; int i; int use_kassist; uint16_t shmidx; char shmbuf[256]; void *dest_mapptr; size_t segsz; psm2_error_t err = PSM2_OK; int dest_shmfd; struct am_ctl_nodeinfo *dest_nodeinfo; int iterator; shmidx = *shmidx_o = -1; for (i = 0; i <= ptl->max_ep_idx; i++) { if (ptl->am_ep[i].epid == epid) { if (force_remap) { ptl->am_ep[i].epaddr = NULL; ptl->am_ep[i].epid = 0; break; } *shmidx_o = shmidx = i; return err; } } use_kassist = (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF); segsz = am_ctl_sizeof_block(); for (iterator = 0; iterator <= INT_MAX; iterator++) { snprintf(shmbuf, sizeof(shmbuf), "/psm2_shm.%ld%016lx%d", (long int) getuid(), epid, iterator); dest_shmfd = shm_open(shmbuf, O_RDWR, S_IRWXU); if (dest_shmfd < 0) { if (errno == EACCES && iterator < INT_MAX) continue; else { err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error opening remote " "shared memory object " "in shm_open: %s", strerror(errno)); goto fail; } } else { struct stat st; if (fstat(dest_shmfd, &st) == -1) { err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error validating " "shared memory object " "with fstat: %s", strerror(errno)); close(dest_shmfd); goto fail; } if (getuid() == st.st_uid) { err = PSM2_OK; break; } else { err = PSM2_SHMEM_SEGMENT_ERR; close(dest_shmfd); } } } if (err) { err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error opening remote shared " "memory object in shm_open: " "namespace exhausted."); goto fail; } dest_mapptr = mmap(NULL, segsz, PROT_READ | PROT_WRITE, MAP_SHARED, dest_shmfd, 0); if (dest_mapptr == MAP_FAILED) { err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error mmapping remote shared memory: %s", strerror(errno)); close(dest_shmfd); goto fail; } close(dest_shmfd); dest_nodeinfo = 
	    (struct am_ctl_nodeinfo *)dest_mapptr;

	/* We core dump right after here if we don't check the mmap */
	action_stash.addr = dest_mapptr;
	action_stash.len = segsz;

	struct sigaction act = {
		.sa_sigaction = amsh_mmap_fault,
		.sa_flags = SA_SIGINFO
	};

	sigaction(SIGSEGV, &act, &action_stash.SIGSEGV_old_act);
	sigaction(SIGBUS, &act, &action_stash.SIGBUS_old_act);

	{
		volatile uint16_t *is_init = &dest_nodeinfo->is_init;
		while (*is_init == 0)
			usleep(1);
		ips_sync_reads();
		_HFI_PRDBG("Got a published remote dirpage at "
			   "%p, size=%d\n", dest_mapptr, (int)segsz);
	}

	shmidx = -1;
	if ((ptl->max_ep_idx + 1) == ptl->am_ep_size) {
		err = psmi_epdir_extend(ptl_gen);
		if (err)
			goto fail;

		for (i = 0; i <= ptl->max_ep_idx; i++) {
			if (ptl->am_ep[i].epid != 0)
				am_update_directory(&ptl->am_ep[i]);
		}
	}
	for (i = 0; i < ptl->am_ep_size; i++) {
		psmi_assert(ptl->am_ep[i].epid != epid);
		if (ptl->am_ep[i].epid == 0) {
			ptl->am_ep[i].epid = epid;
			ptl->am_ep[i].psm_verno = dest_nodeinfo->psm_verno;
			ptl->am_ep[i].pid = dest_nodeinfo->pid;
			if (use_kassist) {
				/* If we are able to use CMA assume everyone
				 * else on the node can also use it.
				 * Advertise that CMA is active via the
				 * feature flag. */
				if (cma_available()) {
					ptl->am_ep[i].amsh_features |=
					    AMSH_HAVE_CMA;
					psmi_shm_mq_rv_thresh =
					    PSMI_MQ_RV_THRESH_CMA;
				} else {
					ptl->psmi_kassist_mode =
					    PSMI_KASSIST_OFF;
					use_kassist = 0;
					psmi_shm_mq_rv_thresh =
					    PSMI_MQ_RV_THRESH_NO_KASSIST;
				}
			} else
				psmi_shm_mq_rv_thresh =
				    PSMI_MQ_RV_THRESH_NO_KASSIST;
			_HFI_PRDBG("KASSIST MODE: %s\n",
				   psmi_kassist_getmode(ptl->psmi_kassist_mode));
			shmidx = *shmidx_o = i;
			_HFI_PRDBG("Mapped epid %lx into shmidx %d\n",
				   epid, shmidx);
			ptl->am_ep[i].amsh_shmbase = (uintptr_t) dest_mapptr;
			ptl->am_ep[i].amsh_qsizes = dest_nodeinfo->amsh_qsizes;
			if (i > ptl->max_ep_idx)
				ptl->max_ep_idx = i;
			break;
		}
	}

	/* install the old sighandler back */
	sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL);
	sigaction(SIGBUS, &action_stash.SIGBUS_old_act, NULL);

	if (shmidx == (uint16_t)-1)
		err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR,
					"Could not connect to local endpoint");
fail:
	return err;
}

/**
 * Initialize pointer structure and locks for endpoint shared-memory AM.
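 *
 * Informal sketch of the segment layout implied by am_ctl_sizeof_block()
 * above and walked by am_update_directory() below (each piece is rounded
 * up to PSMI_PAGESIZE):
 *
 *   AMSH_BLOCK_HEADER_SIZE   (self_nodeinfo lives at offset 0)
 *   qreqH block header | qreqFifoShort | qreqFifoLong
 *   qrepH block header | qrepFifoShort | qrepFifoLong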
*/ #define AMSH_QSIZE(type) \ PSMI_ALIGNUP(amsh_qelemsz.q ## type * amsh_qcounts.q ## type, \ PSMI_PAGESIZE) static psm2_error_t amsh_init_segment(ptl_t *ptl_gen) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; psm2_error_t err = PSM2_OK; /* Preconditions */ psmi_assert_always(ptl != NULL); psmi_assert_always(ptl->ep != NULL); psmi_assert_always(ptl->epaddr != NULL); psmi_assert_always(ptl->ep->epid != 0); if ((err = psmi_shm_create(ptl_gen))) goto fail; ptl->self_nodeinfo->amsh_qsizes.qreqFifoShort = AMSH_QSIZE(reqFifoShort); ptl->self_nodeinfo->amsh_qsizes.qreqFifoLong = AMSH_QSIZE(reqFifoLong); ptl->self_nodeinfo->amsh_qsizes.qrepFifoShort = AMSH_QSIZE(repFifoShort); ptl->self_nodeinfo->amsh_qsizes.qrepFifoLong = AMSH_QSIZE(repFifoLong); /* We core dump right after here if we don't check the mmap */ struct sigaction act = { .sa_sigaction = amsh_mmap_fault, .sa_flags = SA_SIGINFO }; sigaction(SIGSEGV, &act, &action_stash.SIGSEGV_old_act); sigaction(SIGBUS, &act, &action_stash.SIGBUS_old_act); /* * Now that we know our epid, update it in the shmidx array */ ptl->reqH.base = ptl->reqH.head = ptl->reqH.end = NULL; ptl->repH.base = ptl->repH.head = ptl->repH.end = NULL; am_update_directory(ptl->self_nodeinfo); ptl->reqH.head = ptl->reqH.base = (am_pkt_short_t *) (((uintptr_t)ptl->self_nodeinfo->qdir.qreqFifoShort)); ptl->reqH.end = (am_pkt_short_t *) (((uintptr_t)ptl->self_nodeinfo->qdir.qreqFifoShort) + amsh_qcounts.qreqFifoShort * amsh_qelemsz.qreqFifoShort); ptl->repH.head = ptl->repH.base = (am_pkt_short_t *) (((uintptr_t)ptl->self_nodeinfo->qdir.qrepFifoShort)); ptl->repH.end = (am_pkt_short_t *) (((uintptr_t)ptl->self_nodeinfo->qdir.qrepFifoShort) + amsh_qcounts.qrepFifoShort * amsh_qelemsz.qrepFifoShort); am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qreqH->shortq, amsh_qcounts.qreqFifoShort, amsh_qelemsz.qreqFifoShort); am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qreqH->longbulkq, amsh_qcounts.qreqFifoLong, amsh_qelemsz.qreqFifoLong); am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qrepH->shortq, amsh_qcounts.qrepFifoShort, amsh_qelemsz.qrepFifoShort); am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qrepH->longbulkq, amsh_qcounts.qrepFifoLong, amsh_qelemsz.qrepFifoLong); /* Set bulkidx in every bulk packet */ am_ctl_bulkpkt_init(ptl->self_nodeinfo->qdir.qreqFifoLong, amsh_qelemsz.qreqFifoLong, amsh_qcounts.qreqFifoLong); am_ctl_bulkpkt_init(ptl->self_nodeinfo->qdir.qrepFifoLong, amsh_qelemsz.qrepFifoLong, amsh_qcounts.qrepFifoLong); /* install the old sighandler back */ sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL); sigaction(SIGBUS, &action_stash.SIGBUS_old_act, NULL); fail: return err; } psm2_error_t psmi_shm_detach(ptl_t *ptl_gen) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; psm2_error_t err = PSM2_OK; uintptr_t shmbase; if (ptl->self_nodeinfo == NULL) return err; _HFI_VDBG("unlinking shm file %s\n", ptl->amsh_keyname + 1); shmbase = ptl->self_nodeinfo->amsh_shmbase; shm_unlink(ptl->amsh_keyname); psmi_free(ptl->amsh_keyname); if (munmap((void *)shmbase, am_ctl_sizeof_block())) { err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error with munmap of shared segment: %s", strerror(errno)); goto fail; } ptl->self_nodeinfo = NULL; return PSM2_OK; fail: return err; } /** * Update locally shared-pointer directory. The directory must be * updated when a new epaddr is connected to or on every epaddr already * connected to whenever the shared memory segment is relocated via mremap. * * @param epaddr Endpoint address for which to update local directory. 
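 *
 * (All six qdir pointers are recomputed from nodeinfo->amsh_shmbase, so
 * refreshing an entry after a remap only requires re-running this function
 * once the new base has been stored.)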
*/ static void am_update_directory(struct am_ctl_nodeinfo *nodeinfo) { uintptr_t base_this; base_this = nodeinfo->amsh_shmbase + AMSH_BLOCK_HEADER_SIZE; /* Request queues */ nodeinfo->qdir.qreqH = (am_ctl_blockhdr_t *) base_this; nodeinfo->qdir.qreqFifoShort = (am_pkt_short_t *) ((uintptr_t) nodeinfo->qdir.qreqH + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE)); nodeinfo->qdir.qreqFifoLong = (am_pkt_bulk_t *) ((uintptr_t) nodeinfo->qdir.qreqFifoShort + nodeinfo->amsh_qsizes.qreqFifoShort); /* Reply queues */ nodeinfo->qdir.qrepH = (am_ctl_blockhdr_t *) ((uintptr_t) nodeinfo->qdir.qreqFifoLong + nodeinfo->amsh_qsizes.qreqFifoLong); nodeinfo->qdir.qrepFifoShort = (am_pkt_short_t *) ((uintptr_t) nodeinfo->qdir.qrepH + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE)); nodeinfo->qdir.qrepFifoLong = (am_pkt_bulk_t *) ((uintptr_t) nodeinfo->qdir.qrepFifoShort + nodeinfo->amsh_qsizes.qrepFifoShort); _HFI_VDBG("epaddr=%p Request Hdr=%p,Pkt=%p,Long=%p\n", nodeinfo->epaddr, nodeinfo->qdir.qreqH, nodeinfo->qdir.qreqFifoShort, nodeinfo->qdir.qreqFifoLong); _HFI_VDBG("epaddr=%p Reply Hdr=%p,Pkt=%p,Long=%p\n", nodeinfo->epaddr, nodeinfo->qdir.qrepH, nodeinfo->qdir.qrepFifoShort, nodeinfo->qdir.qrepFifoLong); /* Sanity check */ uintptr_t base_next = (uintptr_t) nodeinfo->qdir.qrepFifoLong + nodeinfo->amsh_qsizes.qrepFifoLong; psmi_assert_always(base_next - base_this <= am_ctl_sizeof_block()); } /* ep_epid_share_memory wrapper */ static int amsh_epid_reachable(ptl_t *ptl_gen, psm2_epid_t epid) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; int result; psm2_error_t err; err = psm2_ep_epid_share_memory(ptl->ep, epid, &result); psmi_assert_always(err == PSM2_OK); return result; } static psm2_error_t amsh_epaddr_add(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t shmidx, psm2_epaddr_t *epaddr_o) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; psm2_epaddr_t epaddr; am_epaddr_t *amaddr; psm2_error_t err = PSM2_OK; psmi_assert(psmi_epid_lookup(ptl->ep, epid) == NULL); /* The self PTL handles loopback communication. 
*/ psmi_assert(epid != ptl->epid); /* note the size of the memory is am_epaddr_t */ epaddr = (psm2_epaddr_t) psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, 1, sizeof(am_epaddr_t)); if (epaddr == NULL) { return PSM2_NO_MEMORY; } psmi_assert_always(ptl->am_ep[shmidx].epaddr == NULL); if ((err = psmi_epid_set_hostname(psm2_epid_nid(epid), psmi_gethostname(), 0))) goto fail; epaddr->ptlctl = ptl->ctl; epaddr->epid = epid; /* convert to am_epaddr_t */ amaddr = (am_epaddr_t *) epaddr; /* tell the other endpoint their location in our directory */ amaddr->shmidx = shmidx; /* we haven't connected yet, so we can't give them the same hint */ amaddr->return_shmidx = -1; amaddr->cstate_outgoing = AMSH_CSTATE_OUTGOING_NONE; amaddr->cstate_incoming = AMSH_CSTATE_INCOMING_NONE; /* other setup */ ptl->am_ep[shmidx].epaddr = epaddr; am_update_directory(&ptl->am_ep[shmidx]); /* Finally, add to table */ if ((err = psmi_epid_add(ptl->ep, epid, epaddr))) goto fail; _HFI_VDBG("epaddr=%s added to ptl=%p\n", psmi_epaddr_get_name(epid), ptl); *epaddr_o = epaddr; return PSM2_OK; fail: if (epaddr != ptl->epaddr) psmi_free(epaddr); return err; } static void amsh_epaddr_update(ptl_t *ptl_gen, psm2_epaddr_t epaddr) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; am_epaddr_t *amaddr; uint16_t shmidx; struct am_ctl_nodeinfo *nodeinfo; amaddr = (am_epaddr_t *) epaddr; shmidx = amaddr->shmidx; nodeinfo = (struct am_ctl_nodeinfo *) ptl->am_ep[shmidx].amsh_shmbase; /* restart the connection process */ amaddr->return_shmidx = -1; amaddr->cstate_outgoing = AMSH_CSTATE_OUTGOING_NONE; /* wait for the other process to init again */ { volatile uint16_t *is_init = &nodeinfo->is_init; while (*is_init == 0) usleep(1); ips_sync_reads(); } /* get the updated values from the new nodeinfo page */ ptl->am_ep[shmidx].psm_verno = nodeinfo->psm_verno; ptl->am_ep[shmidx].pid = nodeinfo->pid; ptl->am_ep[shmidx].amsh_qsizes = nodeinfo->amsh_qsizes; am_update_directory(&ptl->am_ep[shmidx]); return; } struct ptl_connection_req { int isdone; int op; /* connect or disconnect */ int numep; int numep_left; int phase; int *epid_mask; const psm2_epid_t *epids; /* input epid list */ psm2_epaddr_t *epaddr; psm2_error_t *errors; /* inout errors */ /* Used for connect/disconnect */ psm2_amarg_t args[4]; }; static void amsh_free_epaddr(psm2_epaddr_t epaddr) { psmi_epid_remove(epaddr->ptlctl->ep, epaddr->epid); psmi_free(epaddr); return; } #define PTL_OP_CONNECT 0 #define PTL_OP_DISCONNECT 1 #define PTL_OP_ABORT 2 static psm2_error_t amsh_ep_connreq_init(ptl_t *ptl_gen, int op, /* connect, disconnect or abort */ int numep, const psm2_epid_t *array_of_epid, /* non-NULL on connect */ const int array_of_epid_mask[], psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr, struct ptl_connection_req **req_o) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; int i, cstate; psm2_epaddr_t epaddr; psm2_epid_t epid; struct ptl_connection_req *req = NULL; req = (struct ptl_connection_req *) psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, 1, sizeof(struct ptl_connection_req)); if (req == NULL) return PSM2_NO_MEMORY; req->isdone = 0; req->op = op; req->numep = numep; req->numep_left = 0; req->phase = ptl->connect_phase; req->epid_mask = (int *) psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, numep, sizeof(int)); if (req->epid_mask == NULL) { psmi_free(req); return PSM2_NO_MEMORY; } req->epaddr = array_of_epaddr; req->epids = array_of_epid; req->errors = array_of_errors; /* First check if there's really something to connect/disconnect * for this PTL */ for (i = 0; i < numep; i++) { 
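		/* Classify this epid into one of the AMSH_CMASK_* states:
		 * NONE (nothing to do for this PTL), PREREQ (the peer's shm
		 * segment still has to be located and mapped), and later,
		 * in the poll loop, POSTREQ (request sent, awaiting reply)
		 * and DONE. */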
req->epid_mask[i] = AMSH_CMASK_NONE; /* no connect by default */ if (!array_of_epid_mask[i]) continue; if (op == PTL_OP_CONNECT) { epid = array_of_epid[i]; /* Connect only to other processes reachable by shared memory. The self PTL handles loopback communication, so explicitly refuse to connect to self. */ if (!amsh_epid_reachable(ptl_gen, epid) || epid == ptl->epid) { array_of_errors[i] = PSM2_EPID_UNREACHABLE; array_of_epaddr[i] = NULL; continue; } _HFI_VDBG("looking at epid %llx\n", (unsigned long long)epid); epaddr = psmi_epid_lookup(ptl->ep, epid); if (epaddr != NULL) { if (epaddr->ptlctl->ptl != ptl_gen) { array_of_errors[i] = PSM2_EPID_UNREACHABLE; array_of_epaddr[i] = NULL; continue; } cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; if (cstate == AMSH_CSTATE_OUTGOING_ESTABLISHED) { array_of_epaddr[i] = epaddr; array_of_errors[i] = PSM2_OK; } else { psmi_assert(cstate == AMSH_CSTATE_OUTGOING_NONE); array_of_errors[i] = PSM2_TIMEOUT; array_of_epaddr[i] = epaddr; req->epid_mask[i] = AMSH_CMASK_PREREQ; } } else { req->epid_mask[i] = AMSH_CMASK_PREREQ; array_of_epaddr[i] = NULL; } } else { /* disc or abort */ epaddr = array_of_epaddr[i]; if (epaddr->ptlctl->ptl != ptl_gen) continue; psmi_assert(epaddr != NULL); cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; if (cstate == AMSH_CSTATE_OUTGOING_ESTABLISHED) { req->epid_mask[i] = AMSH_CMASK_PREREQ; _HFI_VDBG ("Just set index %d to AMSH_CMASK_PREREQ\n", i); } /* XXX undef ? */ } if (req->epid_mask[i] != AMSH_CMASK_NONE) req->numep_left++; } if (req->numep_left == 0) { /* nothing to do */ psmi_free(req->epid_mask); psmi_free(req); _HFI_VDBG("Nothing to connect, bump up phase\n"); ptl->connect_phase++; *req_o = NULL; return PSM2_OK; } else { *req_o = req; return PSM2_OK_NO_PROGRESS; } } static psm2_error_t amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; int i, j, cstate; uint16_t shmidx = (uint16_t)-1; psm2_error_t err = PSM2_OK; psm2_epid_t epid; psm2_epaddr_t epaddr; if (req == NULL || req->isdone) return PSM2_OK; psmi_assert_always(ptl->connect_phase == req->phase); if (req->op == PTL_OP_DISCONNECT || req->op == PTL_OP_ABORT) { for (i = 0; i < req->numep; i++) { if (req->epid_mask[i] == AMSH_CMASK_NONE || req->epid_mask[i] == AMSH_CMASK_DONE) continue; epaddr = req->epaddr[i]; psmi_assert(epaddr != NULL); if (req->epid_mask[i] == AMSH_CMASK_PREREQ) { shmidx = ((am_epaddr_t *) epaddr)->shmidx; /* Make sure the target of the disconnect is still there */ if (ptl->am_ep[shmidx]. epid != epaddr->epid) { req->numep_left--; req->epid_mask[i] = AMSH_CMASK_DONE; ((am_epaddr_t *) epaddr)->cstate_outgoing = AMSH_CSTATE_OUTGOING_NONE; } } if (req->epid_mask[i] == AMSH_CMASK_PREREQ) { req->args[0].u16w0 = PSMI_AM_DISC_REQ; req->args[0].u16w1 = shmidx; req->args[0].u32w1 = ptl->connect_phase; req->args[1].u64w0 = (uint64_t) ptl->epid; psmi_assert(shmidx != (uint16_t)-1); req->args[2].u32w0 = create_extra_ep_data(); req->args[2].u32w1 = PSM2_OK; req->args[3].u64w0 = (uint64_t) (uintptr_t) &req->errors[i]; psmi_amsh_short_request(ptl_gen, epaddr, amsh_conn_handler_hidx, req->args, 4, NULL, 0, 0); ((am_epaddr_t *) epaddr)->cstate_outgoing = AMSH_CSTATE_OUTGOING_DISC_REQUESTED; /** * Only munmap if we have nothing more to * communicate with the other node, i.e. we * already recieved a disconnect req from the * other node. 
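			 * Each side unmaps only once both its incoming and
			 * outgoing halves of the connection have reached a
			 * DISC_REQUESTED state; the mirror-image check lives
			 * in amsh_conn_handler()'s PSMI_AM_DISC_REQ case.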
*/ if (((am_epaddr_t *) epaddr)->cstate_incoming == AMSH_CSTATE_INCOMING_DISC_REQUESTED) err = psmi_do_unmap(ptl->am_ep[shmidx].amsh_shmbase); req->epid_mask[i] = AMSH_CMASK_POSTREQ; } else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) { cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; if (cstate == AMSH_CSTATE_OUTGOING_DISC_REPLIED) { req->numep_left--; req->epid_mask[i] = AMSH_CMASK_DONE; ((am_epaddr_t *) epaddr)->cstate_outgoing = AMSH_CSTATE_OUTGOING_NONE; } } } } else { /* First see if we've made progress on any postreqs */ int n_prereq = 0; for (i = 0; i < req->numep; i++) { int cstate; if (req->epid_mask[i] != AMSH_CMASK_POSTREQ) { if (req->epid_mask[i] == AMSH_CMASK_PREREQ) n_prereq++; continue; } epaddr = req->epaddr[i]; psmi_assert(epaddr != NULL); /* detect if a race has occurred on due to re-using an * old shm file - if so, restart the connection */ shmidx = ((am_epaddr_t *) epaddr)->shmidx; if (ptl->am_ep[shmidx].pid != ((struct am_ctl_nodeinfo *) ptl->am_ep[shmidx].amsh_shmbase)->pid) { req->epid_mask[i] = AMSH_CMASK_PREREQ; ((am_epaddr_t *) epaddr)->cstate_outgoing = AMSH_CSTATE_OUTGOING_NONE; n_prereq++; amsh_epaddr_update(ptl_gen, epaddr); continue; } cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; if (cstate == AMSH_CSTATE_OUTGOING_REPLIED) { req->numep_left--; ((am_epaddr_t *) epaddr)->cstate_outgoing = AMSH_CSTATE_OUTGOING_ESTABLISHED; req->epid_mask[i] = AMSH_CMASK_DONE; continue; } } if (n_prereq > 0) { psmi_assert(req->numep_left > 0); /* Go through the list of peers we need to connect to and find out * if they each shared ep is mapped into shm */ for (i = 0; i < req->numep; i++) { if (req->epid_mask[i] != AMSH_CMASK_PREREQ) continue; epid = req->epids[i]; epaddr = req->epaddr[i]; /* Go through mapped epids and find the epid we're looking for */ for (shmidx = -1, j = 0; j <= ptl->max_ep_idx; j++) { /* epid is connected and ready to go */ if (ptl->am_ep[j]. epid == epid) { shmidx = j; break; } } if (shmidx == (uint16_t)-1) { /* Couldn't find peer's epid in dirpage. Check shmdir to see if epid is up now. */ if ((err = psmi_shm_map_remote(ptl_gen, epid, &shmidx, 0))) { return err; } continue; } /* Before we even send the request out, check to see if * versions are interoperable */ if (!psmi_verno_isinteroperable (ptl->am_ep[shmidx]. psm_verno)) { char buf[32]; uint16_t their_verno = ptl->am_ep[shmidx]. 
psm_verno; snprintf(buf, sizeof(buf), "%d.%d", PSMI_VERNO_GET_MAJOR (their_verno), PSMI_VERNO_GET_MINOR (their_verno)); _HFI_INFO("Local endpoint id %" PRIx64 " has version %s " "which is not supported by library version %d.%d", epid, buf, PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR); req->errors[i] = PSM2_EPID_INVALID_VERSION; req->numep_left--; req->epid_mask[i] = AMSH_CMASK_DONE; continue; } if (epaddr != NULL) { psmi_assert(((am_epaddr_t *) epaddr)-> shmidx == shmidx); } else if ((epaddr = psmi_epid_lookup(ptl->ep, epid)) == NULL) { if ((err = amsh_epaddr_add(ptl_gen, epid, shmidx, &epaddr))) { return err; } /* Remote pid is unknown at the moment */ ((am_epaddr_t *) epaddr)->pid = AMSH_PID_UNKNOWN; } req->epaddr[i] = epaddr; req->args[0].u16w0 = PSMI_AM_CONN_REQ; /* tell the other process its shmidx here */ req->args[0].u16w1 = shmidx; req->args[0].u32w1 = ptl->connect_phase; req->args[1].u64w0 = (uint64_t) ptl->epid; req->args[2].u32w0 = create_extra_ep_data(); req->args[2].u32w1 = PSM2_OK; req->args[3].u64w0 = (uint64_t) (uintptr_t) &req->errors[i]; req->epid_mask[i] = AMSH_CMASK_POSTREQ; psmi_amsh_short_request(ptl_gen, epaddr, amsh_conn_handler_hidx, req->args, 4, NULL, 0, 0); _HFI_PRDBG("epaddr=%p, epid=%" PRIx64 " at shmidx=%d\n", epaddr, epid, shmidx); } } } if (req->numep_left == 0) { /* we're all done */ req->isdone = 1; return PSM2_OK; } else { sched_yield(); return PSM2_OK_NO_PROGRESS; } } static psm2_error_t amsh_ep_connreq_fini(ptl_t *ptl_gen, struct ptl_connection_req *req) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; psm2_error_t err = PSM2_OK; int i; /* Wherever we are at in our connect process, we've been instructed to * finish the connection process */ if (req == NULL) return PSM2_OK; /* This prevents future connect replies from referencing data structures * that disappeared */ ptl->connect_phase++; /* First process any leftovers in postreq or prereq */ for (i = 0; i < req->numep; i++) { if (req->epid_mask[i] == AMSH_CMASK_NONE) continue; else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) { int cstate; req->epid_mask[i] = AMSH_CMASK_DONE; cstate = ((am_epaddr_t *) req->epaddr[i])->cstate_outgoing; if (cstate == AMSH_CSTATE_OUTGOING_REPLIED) { req->numep_left--; ((am_epaddr_t *) req->epaddr[i])->cstate_outgoing = AMSH_CSTATE_OUTGOING_ESTABLISHED; } else { /* never actually got reply */ req->errors[i] = PSM2_TIMEOUT; } } /* If we couldn't go from prereq to postreq, that means we couldn't * find the shmidx for an epid in time. This can only be a case of * time out */ else if (req->epid_mask[i] == AMSH_CMASK_PREREQ) { req->errors[i] = PSM2_TIMEOUT; req->numep_left--; req->epid_mask[i] = AMSH_CMASK_DONE; } } /* Whatever is left can only be in DONE or NONE state */ for (i = 0; i < req->numep; i++) { if (req->epid_mask[i] == AMSH_CMASK_NONE) continue; psmi_assert(req->epid_mask[i] == AMSH_CMASK_DONE); err = psmi_error_cmp(err, req->errors[i]); /* XXX TODO: Report errors in connection. */ /* Only free epaddr if they have disconnected from us */ int cstate = ((am_epaddr_t *) req->epaddr[i])->cstate_incoming; if (cstate == AMSH_CSTATE_INCOMING_DISC_REQUESTED) { if (req->op == PTL_OP_DISCONNECT || req->op == PTL_OP_ABORT) { psmi_assert(req->epaddr[i] != NULL); amsh_free_epaddr(req->epaddr[i]); req->epaddr[i] = NULL; } } } psmi_free(req->epid_mask); psmi_free(req); return err; } /* Wrapper for 2.0's use of connect/disconnect. 
 * The plan is to move the init/poll/fini interface up to the PTL level for 2.2 */
#define CONNREQ_ZERO_POLLS_BEFORE_YIELD 20
static
psm2_error_t
amsh_ep_connreq_wrap(ptl_t *ptl_gen, int op, int numep,
		     const psm2_epid_t *array_of_epid,
		     const int array_of_epid_mask[],
		     psm2_error_t *array_of_errors,
		     psm2_epaddr_t *array_of_epaddr, uint64_t timeout_ns)
{
	struct ptl_am *ptl = (struct ptl_am *)ptl_gen;
	psm2_error_t err;
	uint64_t t_start;
	struct ptl_connection_req *req;
	int num_polls_noprogress = 0;
	static int shm_polite_attach = -1;

	if (shm_polite_attach == -1) {
		char *p = getenv("PSM2_SHM_POLITE_ATTACH");
		if (p && *p && atoi(p) != 0) {
			fprintf(stderr, "%s: Using Polite SHM segment attach\n",
				psmi_gethostname());
			shm_polite_attach = 1;
		} else
			shm_polite_attach = 0;
	}

	/* Initialize */
	err = amsh_ep_connreq_init(ptl_gen, op, numep,
				   array_of_epid, array_of_epid_mask,
				   array_of_errors, array_of_epaddr, &req);
	if (err != PSM2_OK_NO_PROGRESS)	/* Either we're all done with connect or
					 * there was an error */
		return err;

	/* Poll until either
	 * 1. We time out
	 * 2. We are done with connecting
	 */
	t_start = get_cycles();
	do {
		psmi_poll_internal(ptl->ep, 1);
		err = amsh_ep_connreq_poll(ptl_gen, req);
		if (err == PSM2_OK)
			break;	/* Finished before timeout */
		else if (err != PSM2_OK_NO_PROGRESS) {
			psmi_free(req->epid_mask);
			psmi_free(req);
			goto fail;
		} else if (shm_polite_attach &&
			   ++num_polls_noprogress ==
			   CONNREQ_ZERO_POLLS_BEFORE_YIELD) {
			num_polls_noprogress = 0;
			PSMI_YIELD(ptl->ep->mq->progress_lock);
		}
	} while (psmi_cycles_left(t_start, timeout_ns));

	err = amsh_ep_connreq_fini(ptl_gen, req);

fail:
	return err;
}

static
psm2_error_t
amsh_ep_connect(ptl_t *ptl, int numep, const psm2_epid_t *array_of_epid,
		const int array_of_epid_mask[],
		psm2_error_t *array_of_errors,
		psm2_epaddr_t *array_of_epaddr, uint64_t timeout_ns)
{
	return amsh_ep_connreq_wrap(ptl, PTL_OP_CONNECT, numep, array_of_epid,
				    array_of_epid_mask, array_of_errors,
				    array_of_epaddr, timeout_ns);
}

static
psm2_error_t
amsh_ep_disconnect(ptl_t *ptl, int force, int numep,
		   psm2_epaddr_t array_of_epaddr[],
		   const int array_of_epaddr_mask[],
		   psm2_error_t array_of_errors[], uint64_t timeout_ns)
{
	return amsh_ep_connreq_wrap(ptl,
				    force ? PTL_OP_ABORT : PTL_OP_DISCONNECT,
				    numep, NULL, array_of_epaddr_mask,
				    array_of_errors, array_of_epaddr,
				    timeout_ns);
}

#undef CSWAP
PSMI_ALWAYS_INLINE(
int32_t
cswap(volatile int32_t *p, int32_t old_value, int32_t new_value))
{
	asm volatile ("lock cmpxchg %2, %0" :
		      "+m" (*p), "+a"(old_value) : "r"(new_value) : "memory");
	return old_value;
}

PSMI_ALWAYS_INLINE(
am_pkt_short_t *
am_ctl_getslot_pkt_inner(volatile am_ctl_qhdr_t *shq, am_pkt_short_t *pkt0))
{
	am_pkt_short_t *pkt;
	uint32_t idx;
#ifndef CSWAP
	pthread_spin_lock(&shq->lock);
	idx = shq->tail;
	pkt = (am_pkt_short_t *) ((uintptr_t) pkt0 + idx * shq->elem_sz);
	if (pkt->flag == QFREE) {
		ips_sync_reads();
		pkt->flag = QUSED;
		shq->tail += 1;
		if (shq->tail == shq->elem_cnt)
			shq->tail = 0;
	} else {
		pkt = 0;
	}
	pthread_spin_unlock(&shq->lock);
#else
	uint32_t idx_next;
	do {
		idx = shq->tail;
		idx_next = (idx + 1 == shq->elem_cnt) ?
0 : idx + 1; } while (cswap(&shq->tail, idx, idx_next) != idx); pkt = (am_pkt_short_t *) ((uintptr_t) pkt0 + idx * shq->elem_sz); while (cswap(&pkt->flag, QFREE, QUSED) != QFREE); #endif return pkt; } /* This is safe because 'flag' is at the same offset on both pkt and bulkpkt */ #define am_ctl_getslot_bulkpkt_inner(shq, pkt0) ((am_pkt_bulk_t *) \ am_ctl_getslot_pkt_inner(shq, (am_pkt_short_t *)(pkt0))) PSMI_ALWAYS_INLINE( am_pkt_short_t * am_ctl_getslot_pkt(ptl_t *ptl_gen, uint16_t shmidx, int is_reply)) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; volatile am_ctl_qhdr_t *shq; am_pkt_short_t *pkt0; if (!is_reply) { shq = &(ptl->am_ep[shmidx].qdir.qreqH->shortq); pkt0 = ptl->am_ep[shmidx].qdir.qreqFifoShort; } else { shq = &(ptl->am_ep[shmidx].qdir.qrepH->shortq); pkt0 = ptl->am_ep[shmidx].qdir.qrepFifoShort; } return am_ctl_getslot_pkt_inner(shq, pkt0); } PSMI_ALWAYS_INLINE( am_pkt_bulk_t * am_ctl_getslot_long(ptl_t *ptl_gen, uint16_t shmidx, int is_reply)) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; volatile am_ctl_qhdr_t *shq; am_pkt_bulk_t *pkt0; if (!is_reply) { shq = &(ptl->am_ep[shmidx].qdir.qreqH->longbulkq); pkt0 = ptl->am_ep[shmidx].qdir.qreqFifoLong; } else { shq = &(ptl->am_ep[shmidx].qdir.qrepH->longbulkq); pkt0 = ptl->am_ep[shmidx].qdir.qrepFifoLong; } return am_ctl_getslot_bulkpkt_inner(shq, pkt0); } psmi_handlertab_t psmi_allhandlers[] = { {0} , {amsh_conn_handler} , {psmi_am_mq_handler} , {psmi_am_mq_handler_data} , {psmi_am_mq_handler_rtsmatch} , {psmi_am_mq_handler_rtsdone} , {psmi_am_handler} }; PSMI_ALWAYS_INLINE(void advance_head(volatile am_ctl_qshort_cache_t *hdr)) { QMARKFREE(hdr->head); hdr->head++; if (hdr->head == hdr->end) hdr->head = hdr->base; } #define AMSH_ZERO_POLLS_BEFORE_YIELD 64 #define AMSH_POLLS_BEFORE_PSM_POLL 16 /* XXX this can be made faster. Instead of checking the flag of the head, keep * a cached copy of the integer value of the tail and compare it against the * previous one we saw. */ PSMI_ALWAYS_INLINE( psm2_error_t amsh_poll_internal_inner(ptl_t *ptl_gen, int replyonly, int is_internal)) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; psm2_error_t err = PSM2_OK_NO_PROGRESS; /* poll replies */ if (!QISEMPTY(ptl->repH.head->flag)) { do { ips_sync_reads(); process_packet(ptl_gen, (am_pkt_short_t *) ptl->repH.head, 0); advance_head(&ptl->repH); err = PSM2_OK; } while (!QISEMPTY(ptl->repH.head->flag)); } if (!replyonly) { /* Request queue not enable for 2.0, will be re-enabled to support long * replies */ if (!is_internal && ptl->psmi_am_reqq_fifo.first != NULL) { psmi_am_reqq_drain(ptl_gen); err = PSM2_OK; } if (!QISEMPTY(ptl->reqH.head->flag)) { do { ips_sync_reads(); process_packet(ptl_gen, (am_pkt_short_t *) ptl->reqH. 
head, 1); advance_head(&ptl->reqH); err = PSM2_OK; } while (!QISEMPTY(ptl->reqH.head->flag)); } } if (is_internal) { if (err == PSM2_OK) /* some progress, no yields */ ptl->zero_polls = 0; else if (++ptl->zero_polls == AMSH_ZERO_POLLS_BEFORE_YIELD) { /* no progress for AMSH_ZERO_POLLS_BEFORE_YIELD */ sched_yield(); ptl->zero_polls = 0; } if (++ptl->amsh_only_polls == AMSH_POLLS_BEFORE_PSM_POLL) { psmi_poll_internal(ptl->ep, 0); ptl->amsh_only_polls = 0; } } return err; /* if we actually did something */ } /* non-inlined version */ static psm2_error_t amsh_poll_internal(ptl_t *ptl, int replyonly) { return amsh_poll_internal_inner(ptl, replyonly, 1); } #ifdef PSM_PROFILE #define AMSH_POLL_UNTIL(ptl, isreply, cond) \ do { \ PSMI_PROFILE_BLOCK(); \ while (!(cond)) { \ PSMI_PROFILE_REBLOCK( \ amsh_poll_internal(ptl, isreply) == \ PSM2_OK_NO_PROGRESS); \ } \ PSMI_PROFILE_UNBLOCK(); \ } while (0) #else #define AMSH_POLL_UNTIL(ptl, isreply, cond) \ do { \ while (!(cond)) { \ amsh_poll_internal(ptl, isreply); \ } \ } while (0) #endif static psm2_error_t amsh_poll(ptl_t *ptl, int replyonly) { return amsh_poll_internal_inner(ptl, replyonly, 0); } PSMI_ALWAYS_INLINE( void am_send_pkt_short(ptl_t *ptl, uint32_t destidx, uint32_t returnidx, uint32_t bulkidx, uint16_t fmt, uint16_t nargs, uint16_t handleridx, psm2_amarg_t *args, const void *src, uint32_t len, int isreply)) { int i; volatile am_pkt_short_t *pkt; int copy_nargs; AMSH_POLL_UNTIL(ptl, isreply, (pkt = am_ctl_getslot_pkt(ptl, destidx, isreply)) != NULL); /* got a free pkt... fill it in */ pkt->bulkidx = bulkidx; pkt->shmidx = returnidx; pkt->type = fmt; pkt->nargs = nargs; pkt->handleridx = handleridx; /* Limit the number of args copied here to NSHORT_ARGS. Additional args are carried in the bulkpkt. */ copy_nargs = nargs; if (copy_nargs > NSHORT_ARGS) { copy_nargs = NSHORT_ARGS; } for (i = 0; i < copy_nargs; i++) pkt->args[i] = args[i]; if (fmt == AMFMT_SHORT_INLINE) mq_copy_tiny((uint32_t *) &pkt->args[nargs], (uint32_t *) src, len); _HFI_VDBG("pkt=%p fmt=%d bulkidx=%d,flag=%d,nargs=%d," "buf=%p,len=%d,hidx=%d,value=%d\n", pkt, (int)fmt, bulkidx, pkt->flag, pkt->nargs, src, (int)len, (int)handleridx, src != NULL ? *((uint32_t *) src) : 0); QMARKREADY(pkt); } #define amsh_shm_copy_short psmi_mq_mtucpy #define amsh_shm_copy_long psmi_mq_mtucpy PSMI_ALWAYS_INLINE( int psmi_amsh_generic_inner(uint32_t amtype, ptl_t *ptl_gen, psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, const void *src, size_t len, void *dst, int flags)) { #ifdef PSM_DEBUG struct ptl_am *ptl = (struct ptl_am *)ptl_gen; #endif uint16_t type; uint32_t bulkidx; uint16_t hidx = (uint16_t) handler; int destidx = ((am_epaddr_t *) epaddr)->shmidx; int returnidx = ((am_epaddr_t *) epaddr)->return_shmidx; int is_reply = AM_IS_REPLY(amtype); volatile am_pkt_bulk_t *bulkpkt; _HFI_VDBG("%s epaddr=%s, shmidx=%d, type=%d\n", is_reply ? 
"reply" : "request", psmi_epaddr_get_name(epaddr->epid), ((am_epaddr_t *) epaddr)->shmidx, amtype); psmi_assert(epaddr != ptl->epaddr); switch (amtype) { case AMREQUEST_SHORT: case AMREPLY_SHORT: if (len + (nargs << 3) <= (NSHORT_ARGS << 3)) { /* Payload fits in args packet */ type = AMFMT_SHORT_INLINE; bulkidx = len; } else { int i; psmi_assert(len < amsh_qelemsz.qreqFifoLong); psmi_assert(src != NULL || nargs > NSHORT_ARGS); type = AMFMT_SHORT; AMSH_POLL_UNTIL(ptl_gen, is_reply, (bulkpkt = am_ctl_getslot_long(ptl_gen, destidx, is_reply)) != NULL); bulkidx = bulkpkt->idx; bulkpkt->len = len; _HFI_VDBG("bulkpkt %p flag is %d from idx %d\n", bulkpkt, bulkpkt->flag, destidx); for (i = 0; i < nargs - NSHORT_ARGS; i++) { bulkpkt->args[i] = args[i + NSHORT_ARGS]; } amsh_shm_copy_short((void *)bulkpkt->payload, src, (uint32_t) len); QMARKREADY(bulkpkt); } am_send_pkt_short(ptl_gen, destidx, returnidx, bulkidx, type, nargs, hidx, args, src, len, is_reply); break; case AMREQUEST_LONG: case AMREPLY_LONG: { uint32_t bytes_left = len; uint8_t *src_this = (uint8_t *) src; uint8_t *dst_this = (uint8_t *) dst; uint32_t bytes_this; type = AMFMT_LONG; _HFI_VDBG("[long][%s] src=%p,dest=%p,len=%d,hidx=%d\n", is_reply ? "rep" : "req", src, dst, (uint32_t) len, hidx); while (bytes_left) { bytes_this = min(bytes_left, AMLONG_MTU); AMSH_POLL_UNTIL(ptl_gen, is_reply, (bulkpkt = am_ctl_getslot_long(ptl_gen, destidx, is_reply)) != NULL); bytes_left -= bytes_this; if (bytes_left == 0) type = AMFMT_LONG_END; bulkidx = bulkpkt->idx; amsh_shm_copy_long((void *)bulkpkt->payload, src_this, bytes_this); bulkpkt->dest = (uintptr_t) dst; bulkpkt->dest_off = (uint32_t) ((uintptr_t) dst_this - (uintptr_t) dst); bulkpkt->len = bytes_this; QMARKREADY(bulkpkt); am_send_pkt_short(ptl_gen, destidx, returnidx, bulkidx, type, nargs, hidx, args, NULL, 0, is_reply); src_this += bytes_this; dst_this += bytes_this; } break; } default: break; } return 1; } /* A generic version that's not inlined */ int psmi_amsh_generic(uint32_t amtype, ptl_t *ptl, psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, const void *src, size_t len, void *dst, int flags) { return psmi_amsh_generic_inner(amtype, ptl, epaddr, handler, args, nargs, src, len, dst, flags); } int psmi_amsh_short_request(ptl_t *ptl, psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, const void *src, size_t len, int flags) { return psmi_amsh_generic_inner(AMREQUEST_SHORT, ptl, epaddr, handler, args, nargs, src, len, NULL, flags); } int psmi_amsh_long_request(ptl_t *ptl, psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, const void *src, size_t len, void *dest, int flags) { return psmi_amsh_generic_inner(AMREQUEST_LONG, ptl, epaddr, handler, args, nargs, src, len, dest, flags); } void psmi_amsh_short_reply(amsh_am_token_t *tok, psm2_handler_t handler, psm2_amarg_t *args, int nargs, const void *src, size_t len, int flags) { psmi_amsh_generic_inner(AMREPLY_SHORT, tok->ptl, tok->tok.epaddr_incoming, handler, args, nargs, src, len, NULL, flags); return; } void psmi_amsh_long_reply(amsh_am_token_t *tok, psm2_handler_t handler, psm2_amarg_t *args, int nargs, const void *src, size_t len, void *dest, int flags) { psmi_amsh_generic_inner(AMREPLY_LONG, tok->ptl, tok->tok.epaddr_incoming, handler, args, nargs, src, len, dest, flags); return; } void psmi_am_reqq_init(ptl_t *ptl_gen) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; ptl->psmi_am_reqq_fifo.first = NULL; ptl->psmi_am_reqq_fifo.lastp = 
&ptl->psmi_am_reqq_fifo.first; } psm2_error_t psmi_am_reqq_drain(ptl_t *ptl_gen) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; am_reqq_t *reqn = ptl->psmi_am_reqq_fifo.first; am_reqq_t *req; psm2_error_t err = PSM2_OK_NO_PROGRESS; /* We're going to process the entire list, and running the generic handler * below can cause other requests to be enqueued in the queue that we're * processing. */ ptl->psmi_am_reqq_fifo.first = NULL; ptl->psmi_am_reqq_fifo.lastp = &ptl->psmi_am_reqq_fifo.first; while ((req = reqn) != NULL) { err = PSM2_OK; reqn = req->next; _HFI_VDBG ("push of reqq=%p epaddr=%s localreq=%p remotereq=%p\n", req, psmi_epaddr_get_hostname(req->epaddr->epid), (void *)(uintptr_t) req->args[1].u64w0, (void *)(uintptr_t) req->args[0].u64w0); psmi_amsh_generic(req->amtype, req->ptl, req->epaddr, req->handler, req->args, req->nargs, req->src, req->len, req->dest, req->amflags); if (req->flags & AM_FLAG_SRC_TEMP) psmi_free(req->src); psmi_free(req); } return err; } void psmi_am_reqq_add(int amtype, ptl_t *ptl_gen, psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, void *dest, int amflags) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; int i; int flags = 0; am_reqq_t *nreq = (am_reqq_t *) psmi_malloc(ptl->ep, UNDEFINED, sizeof(am_reqq_t)); psmi_assert_always(nreq != NULL); _HFI_VDBG("alloc of reqq=%p, to epaddr=%s, ptr=%p, len=%d, " "localreq=%p, remotereq=%p\n", nreq, psmi_epaddr_get_hostname(epaddr->epid), dest, (int)len, (void *)(uintptr_t) args[1].u64w0, (void *)(uintptr_t) args[0].u64w0); psmi_assert(nargs <= 8); nreq->next = NULL; nreq->amtype = amtype; nreq->ptl = ptl_gen; nreq->epaddr = epaddr; nreq->handler = handler; for (i = 0; i < nargs; i++) nreq->args[i] = args[i]; nreq->nargs = nargs; if (AM_IS_LONG(amtype) && src != NULL && len > 0 && !(amflags & AM_FLAG_SRC_ASYNC)) { abort(); flags |= AM_FLAG_SRC_TEMP; nreq->src = psmi_malloc(ptl->ep, UNDEFINED, len); psmi_assert_always(nreq->src != NULL); /* XXX mem */ amsh_shm_copy_short(nreq->src, src, len); } else nreq->src = src; nreq->len = len; nreq->dest = dest; nreq->amflags = amflags; nreq->flags = flags; nreq->next = NULL; *(ptl->psmi_am_reqq_fifo.lastp) = nreq; ptl->psmi_am_reqq_fifo.lastp = &nreq->next; } static void process_packet(ptl_t *ptl_gen, am_pkt_short_t *pkt, int isreq) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; amsh_am_token_t tok; psmi_handler_fn_t fn; psm2_amarg_t *args = pkt->args; uint16_t shmidx = pkt->shmidx; int nargs = pkt->nargs; tok.tok.epaddr_incoming = ((shmidx != (uint16_t)-1) ? ptl->am_ep[shmidx].epaddr : 0); tok.ptl = ptl_gen; tok.mq = ptl->ep->mq; tok.shmidx = shmidx; uint16_t hidx = (uint16_t) pkt->handleridx; uint32_t bulkidx = pkt->bulkidx; uintptr_t bulkptr; am_pkt_bulk_t *bulkpkt; fn = (psmi_handler_fn_t) psmi_allhandlers[hidx].fn; psmi_assert(fn != NULL); psmi_assert((uintptr_t) pkt > ptl->self_nodeinfo->amsh_shmbase); if (pkt->type == AMFMT_SHORT_INLINE) { _HFI_VDBG ("%s inline flag=%d nargs=%d from_idx=%d pkt=%p hidx=%d\n", isreq ? "request" : "reply", pkt->flag, nargs, shmidx, pkt, hidx); fn(&tok, args, nargs, pkt->length > 0 ? (void *)&args[nargs] : NULL, pkt->length); } else { int isend = 0; switch (pkt->type) { case AMFMT_LONG_END: isend = 1; case AMFMT_LONG: case AMFMT_SHORT: if (isreq) { bulkptr = (uintptr_t) ptl->self_nodeinfo->qdir. qreqFifoLong; bulkptr += bulkidx * amsh_qelemsz.qreqFifoLong; } else { bulkptr = (uintptr_t) ptl->self_nodeinfo->qdir. 
qrepFifoLong; bulkptr += bulkidx * amsh_qelemsz.qrepFifoLong; } break; default: bulkptr = 0; psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unknown/unhandled packet type 0x%x", pkt->type); return; } bulkpkt = (am_pkt_bulk_t *) bulkptr; _HFI_VDBG("ep=%p mq=%p type=%d bulkidx=%d flag=%d/%d nargs=%d " "from_idx=%d pkt=%p/%p hidx=%d\n", ptl->ep, ptl->ep->mq, pkt->type, bulkidx, pkt->flag, bulkpkt->flag, nargs, shmidx, pkt, bulkpkt, hidx); psmi_assert(bulkpkt->flag == QREADY); if (nargs > NSHORT_ARGS || isend == 1) { /* Either there are more args in the bulkpkt, or this is the last packet of a long payload. In either case, copy the args. */ int i; args = alloca((NSHORT_ARGS + NBULK_ARGS) * sizeof(psm2_amarg_t)); for (i = 0; i < NSHORT_ARGS; i++) { args[i] = pkt->args[i]; } for (; i < nargs; i++) { args[i] = bulkpkt->args[i - NSHORT_ARGS]; } } if (pkt->type == AMFMT_SHORT) { fn(&tok, args, nargs, (void *)bulkpkt->payload, bulkpkt->len); QMARKFREE(bulkpkt); } else { amsh_shm_copy_long((void *)(bulkpkt->dest + bulkpkt->dest_off), bulkpkt->payload, bulkpkt->len); /* If this is the last packet, copy args before running the * handler */ if (isend) { void *dest = (void *)bulkpkt->dest; size_t len = (size_t) (bulkpkt->dest_off + bulkpkt->len); QMARKFREE(bulkpkt); fn(&tok, args, nargs, dest, len); } else QMARKFREE(bulkpkt); } } return; } static psm2_error_t amsh_mq_rndv(ptl_t *ptl, psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, psm2_mq_tag_t *tag, const void *buf, uint32_t len) { psm2_amarg_t args[5]; psm2_error_t err = PSM2_OK; args[0].u32w0 = MQ_MSG_LONGRTS; args[0].u32w1 = len; args[1].u32w1 = tag->tag[0]; args[1].u32w0 = tag->tag[1]; args[2].u32w1 = tag->tag[2]; args[3].u64w0 = (uint64_t) (uintptr_t) req; args[4].u64w0 = (uint64_t) (uintptr_t) buf; psmi_assert(req != NULL); req->type = MQE_TYPE_SEND; req->req_data.buf = (void *)buf; req->req_data.buf_len = len; req->req_data.send_msglen = len; req->send_msgoff = 0; #ifdef PSM_CUDA /* If the send buffer is on gpu, we create a cuda IPC * handle and send it as payload in the RTS */ if (req->is_buf_gpu_mem) { CUdeviceptr buf_base_ptr; PSMI_CUDA_CALL(cuMemGetAddressRange, &buf_base_ptr, NULL, (CUdeviceptr)buf); /* Offset in GPU buffer from which we copy data, we have to * send it separetly because this offset is lost * when cuIpcGetMemHandle is called */ req->cuda_ipc_offset = buf - (void*)buf_base_ptr; args[2].u32w0 = (uint32_t)req->cuda_ipc_offset; PSMI_CUDA_CALL(cuIpcGetMemHandle, &req->cuda_ipc_handle, (CUdeviceptr) buf); if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) { psmi_am_reqq_add(AMREQUEST_SHORT, ptl, epaddr, mq_handler_hidx, args, 5, (void*)&req->cuda_ipc_handle, sizeof(CUipcMemHandle), NULL, 0); } else { psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx, args, 5, (void*)&req->cuda_ipc_handle, sizeof(CUipcMemHandle), 0); } req->cuda_ipc_handle_attached = 1; } else #endif if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) { psmi_am_reqq_add(AMREQUEST_SHORT, ptl, epaddr, mq_handler_hidx, args, 5, NULL, 0, NULL, 0); } else { psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx, args, 5, NULL, 0, 0); } mq->stats.tx_num++; mq->stats.tx_shm_num++; mq->stats.tx_rndv_num++; mq->stats.tx_rndv_bytes += len; return err; } PSMI_ALWAYS_INLINE( psm2_error_t amsh_mq_send_inner_eager(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, psm2_amarg_t *args, uint32_t flags_user, uint32_t flags_internal, psm2_mq_tag_t *tag, const void *ubuf, uint32_t len)) { uint32_t bytes_left = len; uint32_t bytes_this = 0; psm2_handler_t handler = 
mq_handler_hidx; args[1].u32w1 = tag->tag[0]; args[1].u32w0 = tag->tag[1]; args[2].u32w1 = tag->tag[2]; args[2].u32w0 = 0; if (!flags_user && len <= AMLONG_MTU) { if (len <= 32) args[0].u32w0 = MQ_MSG_TINY; else args[0].u32w0 = MQ_MSG_SHORT; } else { args[0].u32w0 = MQ_MSG_EAGER; args[0].u32w1 = len; } do { args[2].u32w0 += bytes_this; bytes_this = min(bytes_left, AMLONG_MTU); /* Assume that shared-memory active messages are delivered in order */ if (flags_internal & PSMI_REQ_FLAG_FASTPATH) { psmi_am_reqq_add(AMREQUEST_SHORT, epaddr->ptlctl->ptl, epaddr, handler, args, 3, (void *)ubuf, bytes_this, NULL, 0); } else { psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr, handler, args, 3, ubuf, bytes_this, 0); } ubuf += bytes_this; bytes_left -= bytes_this; handler = mq_handler_data_hidx; } while(bytes_left); /* All eager async sends are always "all done" */ if (req != NULL) { req->state = MQ_STATE_COMPLETE; mq_qq_append(&mq->completed_q, req); } mq->stats.tx_num++; mq->stats.tx_shm_num++; mq->stats.tx_eager_num++; mq->stats.tx_eager_bytes += len; return PSM2_OK; } /* * All shared am mq sends, req can be NULL */ PSMI_ALWAYS_INLINE( psm2_error_t amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, uint32_t flags_user, uint32_t flags_internal, psm2_mq_tag_t *tag, const void *ubuf, uint32_t len)) { psm2_amarg_t args[3]; psm2_error_t err = PSM2_OK; int is_blocking = (req == NULL); #ifdef PSM_CUDA int gpu_mem = 0; int ep_supports_p2p = (1 << ((am_epaddr_t *) epaddr)->gpuid) & gpu_p2p_supported; if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf)) { gpu_mem = 1; /* All sends from a gpu buffer use the rendezvous protocol if p2p is supported */ if (ep_supports_p2p) { goto do_rendezvous; } /* * Use eager messages if P2P is unsupported between endpoints. * Potentially use rendezvous with blocking requests only. */ if (!is_blocking) goto do_eager; } #endif if (flags_user & PSM2_MQ_FLAG_SENDSYNC) goto do_rendezvous; if (len <= mq->shm_thresh_rv) #ifdef PSM_CUDA do_eager: #endif return amsh_mq_send_inner_eager(mq, req, epaddr, args, flags_user, flags_internal, tag, ubuf, len); do_rendezvous: if (is_blocking) { req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); if_pf(req == NULL) return PSM2_NO_MEMORY; req->req_data.send_msglen = len; req->req_data.tag = *tag; /* Since SEND command is blocking, this request is * entirely internal and we will not be exposed to user. * Setting as internal so it will not be added to * mq->completed_q */ req->flags_internal |= (flags_internal | PSMI_REQ_FLAG_IS_INTERNAL); } #ifdef PSM_CUDA void *host_buf = NULL; req->is_buf_gpu_mem = gpu_mem; if (req->is_buf_gpu_mem) { psmi_cuda_set_attr_sync_memops(ubuf); /* Use host buffer for blocking requests if GPU P2P is * unsupported between endpoints. * This will be only used with blocking requests. */ if (!ep_supports_p2p) { host_buf = psmi_malloc(epaddr->ptlctl->ep, UNDEFINED, len); PSMI_CUDA_CALL(cuMemcpyDtoH, host_buf, (CUdeviceptr)ubuf, len); /* Reset is_buf_gpu_mem since host buffer is being used * instead of one from GPU. */ ubuf = host_buf; req->is_buf_gpu_mem = 0; } } #endif err = amsh_mq_rndv(epaddr->ptlctl->ptl, mq, req, epaddr, tag, ubuf, len); if (err == PSM2_OK && is_blocking) { /* wait... 
*/ err = psmi_mq_wait_internal(&req); } #ifdef PSM_CUDA if (err == PSM2_OK && host_buf) psmi_free(host_buf); #endif return err; } static psm2_error_t amsh_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags_user, uint32_t flags_internal, psm2_mq_tag_t *tag, const void *ubuf, uint32_t len, void *context, psm2_mq_req_t *req_o) { psm2_mq_req_t req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); if_pf(req == NULL) return PSM2_NO_MEMORY; req->req_data.send_msglen = len; req->req_data.tag = *tag; req->req_data.context = context; req->flags_user = flags_user; req->flags_internal = flags_internal; _HFI_VDBG("[ishrt][%s->%s][n=0][b=%p][l=%d][t=%08x.%08x.%08x]\n", psmi_epaddr_get_name(epaddr->ptlctl->ep->epid), psmi_epaddr_get_name(epaddr->epid), ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]); amsh_mq_send_inner(mq, req, epaddr, flags_user, flags_internal, tag, ubuf, len); *req_o = req; return PSM2_OK; } static psm2_error_t amsh_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags, psm2_mq_tag_t *tag, const void *ubuf, uint32_t len) { _HFI_VDBG("[shrt][%s->%s][n=0][b=%p][l=%d][t=%08x.%08x.%08x]\n", psmi_epaddr_get_name(epaddr->ptlctl->ep->epid), psmi_epaddr_get_name(epaddr->epid), ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]); amsh_mq_send_inner(mq, NULL, epaddr, flags, PSMI_REQ_FLAG_NORMAL, tag, ubuf, len); return PSM2_OK; } /* kassist-related handling */ int psmi_epaddr_pid(psm2_epaddr_t epaddr) { uint16_t shmidx = ((am_epaddr_t *) epaddr)->shmidx; return ((struct ptl_am *)(epaddr->ptlctl->ptl))->am_ep[shmidx].pid; } #if _HFI_DEBUGGING static const char *psmi_kassist_getmode(int mode) { switch (mode) { case PSMI_KASSIST_OFF: return "kassist off"; case PSMI_KASSIST_CMA_GET: return "cma get"; case PSMI_KASSIST_CMA_PUT: return "cma put"; default: return "unknown"; } } #endif static int psmi_get_kassist_mode() { /* Cuda PSM2 supports only KASSIST_CMA_GET */ int mode = PSMI_KASSIST_CMA_GET; #ifndef PSM_CUDA union psmi_envvar_val env_kassist; if (!psmi_getenv("PSM2_KASSIST_MODE", "PSM Shared memory kernel assist mode " "(cma-put, cma-get, none)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val) PSMI_KASSIST_MODE_DEFAULT_STRING, &env_kassist)) { char *s = env_kassist.e_str; if (strcasecmp(s, "cma-put") == 0) mode = PSMI_KASSIST_CMA_PUT; else if (strcasecmp(s, "cma-get") == 0) mode = PSMI_KASSIST_CMA_GET; else mode = PSMI_KASSIST_OFF; } #endif return mode; } /* Connection handling for shared memory AM. * * arg0 => conn_op, result (PSM error type) * arg1 => epid (always) * arg2 => pid, version. * arg3 => pointer to error for replies. 
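 *
 * For example, a PSMI_AM_CONN_REQ is packed by amsh_ep_connreq_poll()
 * above as:
 *   args[0].u16w0 = PSMI_AM_CONN_REQ, args[0].u16w1 = shmidx (the peer's
 *     slot in our directory), args[0].u32w1 = connect_phase;
 *   args[1].u64w0 = our epid;
 *   args[2].u32w0 = create_extra_ep_data() (pid plus gpu id),
 *     args[2].u32w1 = PSM2_OK;
 *   args[3].u64w0 = pointer to the requester's per-peer error slot.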
*/ static void amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len) { int op = args[0].u16w0; int phase = args[0].u32w1; psm2_epid_t epid = args[1].u64w0; int16_t return_shmidx = args[0].u16w1; psm2_error_t err = (psm2_error_t) args[2].u32w1; psm2_error_t *perr = (psm2_error_t *) (uintptr_t) args[3].u64w0; unsigned int pid; unsigned int gpuid; int force_remap = 0; psm2_epaddr_t epaddr; amsh_am_token_t *tok = (amsh_am_token_t *) toki; uint16_t shmidx = tok->shmidx; int is_valid; struct ptl_am *ptl = (struct ptl_am *)(tok->ptl); ptl_t *ptl_gen = tok->ptl; int cstate; /* We do this because it's an assumption below */ psmi_assert_always(buf == NULL && len == 0); read_extra_ep_data(args[2].u32w0, &pid, &gpuid); _HFI_VDBG("Conn op=%d, phase=%d, epid=%llx, err=%d\n", op, phase, (unsigned long long)epid, err); switch (op) { case PSMI_AM_CONN_REQ: _HFI_VDBG("Connect from %d:%d\n", (int)psm2_epid_nid(epid), (int)psm2_epid_context(epid)); epaddr = psmi_epid_lookup(ptl->ep, epid); if (epaddr && ((am_epaddr_t *) epaddr)->pid != pid) { /* If old pid is unknown consider new pid the correct one */ if (((am_epaddr_t *) epaddr)->pid == AMSH_PID_UNKNOWN) { ((am_epaddr_t *) epaddr)->pid = pid; ((am_epaddr_t *) epaddr)->gpuid = gpuid; } else { psmi_epid_remove(ptl->ep, epid); epaddr = NULL; force_remap = 1; } } if (shmidx == (uint16_t)-1) { /* incoming packet will never be from our shmidx slot 0 thus the other process doesn't know our return info. attach_to will lookup or create the proper shmidx */ if ((err = psmi_shm_map_remote(ptl_gen, epid, &shmidx, force_remap))) { psmi_handle_error(PSMI_EP_NORETURN, err, "Fatal error in " "connecting to shm segment"); } am_update_directory(&ptl->am_ep[shmidx]); tok->shmidx = shmidx; } if (epaddr == NULL) { uintptr_t args_segoff = (uintptr_t) args - ptl->self_nodeinfo->amsh_shmbase; if ((err = amsh_epaddr_add(ptl_gen, epid, shmidx, &epaddr))) /* Unfortunately, no way out of here yet */ psmi_handle_error(PSMI_EP_NORETURN, err, "Fatal error " "in connecting to shm segment"); args = (psm2_amarg_t *) (ptl->self_nodeinfo->amsh_shmbase + args_segoff); ((am_epaddr_t *) epaddr)->pid = pid; ((am_epaddr_t *) epaddr)->gpuid = gpuid; } /* Rewrite args */ ptl->connect_incoming++; args[0].u16w0 = PSMI_AM_CONN_REP; /* and return our shmidx for the connecting process */ args[0].u16w1 = shmidx; args[1].u64w0 = (psm2_epid_t) ptl->epid; args[2].u32w0 = create_extra_ep_data(); args[2].u32w1 = PSM2_OK; ((am_epaddr_t *) epaddr)->cstate_incoming = AMSH_CSTATE_INCOMING_ESTABLISHED; ((am_epaddr_t *) epaddr)->return_shmidx = return_shmidx; tok->tok.epaddr_incoming = epaddr; /* adjust token */ psmi_amsh_short_reply(tok, amsh_conn_handler_hidx, args, narg, NULL, 0, 0); break; case PSMI_AM_CONN_REP: if (ptl->connect_phase != phase) { _HFI_VDBG("Out of phase connect reply\n"); return; } epaddr = ptl->am_ep[shmidx].epaddr; /* check if a race has occurred on shm-file reuse. * if so, don't transition to the next state. * the next call to connreq_poll() will restart the * connection. 
*/ if (ptl->am_ep[shmidx].pid != ((struct am_ctl_nodeinfo *) ptl->am_ep[shmidx].amsh_shmbase)->pid) break; *perr = err; ((am_epaddr_t *) epaddr)->cstate_outgoing = AMSH_CSTATE_OUTGOING_REPLIED; ((am_epaddr_t *) epaddr)->return_shmidx = return_shmidx; ptl->connect_outgoing++; _HFI_VDBG("CCC epaddr=%s connected to ptl=%p\n", psmi_epaddr_get_name(epaddr->epid), ptl); break; case PSMI_AM_DISC_REQ: epaddr = psmi_epid_lookup(ptl->ep, epid); if (!epaddr) { _HFI_VDBG("Dropping disconnect request from an epid that we are not connected to\n"); return; } args[0].u16w0 = PSMI_AM_DISC_REP; args[2].u32w1 = PSM2_OK; ((am_epaddr_t *) epaddr)->cstate_incoming = AMSH_CSTATE_INCOMING_DISC_REQUESTED; ptl->connect_incoming--; /* Before sending the reply, make sure the process * is still connected */ if (ptl->am_ep[shmidx].epid != epaddr->epid) is_valid = 0; else is_valid = 1; if (is_valid) { psmi_amsh_short_reply(tok, amsh_conn_handler_hidx, args, narg, NULL, 0, 0); /** * Only munmap if we have nothing more to * communicate with the other node, i.e. we are * already disconnected with the other node * or have sent a disconnect request. */ cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; if (cstate == AMSH_CSTATE_OUTGOING_DISC_REQUESTED) { err = psmi_do_unmap(ptl->am_ep[shmidx].amsh_shmbase); psmi_epid_remove(epaddr->ptlctl->ep, epaddr->epid); } } break; case PSMI_AM_DISC_REP: if (ptl->connect_phase != phase) { _HFI_VDBG("Out of phase disconnect reply\n"); return; } *perr = err; epaddr = tok->tok.epaddr_incoming; ((am_epaddr_t *) epaddr)->cstate_outgoing = AMSH_CSTATE_OUTGOING_DISC_REPLIED; ptl->connect_outgoing--; break; default: psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unknown/unhandled connect handler op=%d", op); break; } return; } static size_t amsh_sizeof(void) { return sizeof(struct ptl_am); } /* Fill in AM capabilities parameters */ psm2_error_t psmi_amsh_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters) { if (parameters == NULL) { return PSM2_PARAM_ERR; } parameters->max_handlers = PSMI_AM_NUM_HANDLERS; parameters->max_nargs = PSMI_AM_MAX_ARGS; parameters->max_request_short = AMLONG_MTU; parameters->max_reply_short = AMLONG_MTU; return PSM2_OK; } /** * @param ep PSM Endpoint, guaranteed to have initialized epaddr and epid. 
* @param ptl Pointer to caller-allocated space for PTL (fill in) * @param ctl Pointer to caller-allocated space for PTL-control * structure (fill in) */ static psm2_error_t amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; psm2_error_t err = PSM2_OK; /* Preconditions */ psmi_assert_always(ep != NULL); psmi_assert_always(ep->epaddr != NULL); psmi_assert_always(ep->epid != 0); ptl->ep = ep; /* back pointer */ ptl->epid = ep->epid; /* cache epid */ ptl->epaddr = ep->epaddr; /* cache a copy */ ptl->ctl = ctl; ptl->zero_polls = 0; ptl->connect_phase = 0; ptl->connect_incoming = 0; ptl->connect_outgoing = 0; memset(&ptl->amsh_empty_shortpkt, 0, sizeof(ptl->amsh_empty_shortpkt)); memset(&ptl->psmi_am_reqq_fifo, 0, sizeof(ptl->psmi_am_reqq_fifo)); ptl->max_ep_idx = -1; ptl->am_ep_size = AMSH_DIRBLOCK_SIZE; ptl->am_ep = (struct am_ctl_nodeinfo *) psmi_memalign(ptl->ep, PER_PEER_ENDPOINT, 64, ptl->am_ep_size * sizeof(struct am_ctl_nodeinfo)); if (ptl->am_ep == NULL) { err = PSM2_NO_MEMORY; goto fail; } memset(ptl->am_ep, 0, ptl->am_ep_size * sizeof(struct am_ctl_nodeinfo)); if ((err = amsh_init_segment(ptl_gen))) goto fail; ptl->self_nodeinfo->psm_verno = PSMI_VERNO; if (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF) { if (cma_available()) { ptl->self_nodeinfo->amsh_features |= AMSH_HAVE_CMA; psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_CMA; } else { ptl->psmi_kassist_mode = PSMI_KASSIST_OFF; psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST; } } else { psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST; } ptl->self_nodeinfo->pid = getpid(); ptl->self_nodeinfo->epid = ep->epid; ptl->self_nodeinfo->epaddr = ep->epaddr; ips_mb(); ptl->self_nodeinfo->is_init = 1; psmi_am_reqq_init(ptl_gen); memset(ctl, 0, sizeof(*ctl)); /* Fill in the control structure */ ctl->ep = ep; ctl->ptl = ptl_gen; ctl->ep_poll = amsh_poll; ctl->ep_connect = amsh_ep_connect; ctl->ep_disconnect = amsh_ep_disconnect; ctl->mq_send = amsh_mq_send; ctl->mq_isend = amsh_mq_isend; ctl->am_get_parameters = psmi_amsh_am_get_parameters; ctl->am_short_request = psmi_amsh_am_short_request; ctl->am_short_reply = psmi_amsh_am_short_reply; /* No stats in shm (for now...) 
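 * so the epaddr stats hooks below are left NULL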
*/ ctl->epaddr_stats_num = NULL; ctl->epaddr_stats_init = NULL; ctl->epaddr_stats_get = NULL; #ifdef PSM_CUDA union psmi_envvar_val env_memcache_enabled; psmi_getenv("PSM2_CUDA_MEMCACHE_ENABLED", "PSM cuda ipc memhandle cache enabled (default is enabled)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val) 1, &env_memcache_enabled); if (PSMI_IS_CUDA_ENABLED && env_memcache_enabled.e_uint) { union psmi_envvar_val env_memcache_size; psmi_getenv("PSM2_CUDA_MEMCACHE_SIZE", "Size of the cuda ipc memhandle cache ", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val) CUDA_MEMHANDLE_CACHE_SIZE, &env_memcache_size); if ((err = am_cuda_memhandle_cache_init(env_memcache_size.e_uint) != PSM2_OK)) goto fail; } #endif fail: return err; } static psm2_error_t amsh_fini(ptl_t *ptl_gen, int force, uint64_t timeout_ns) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; struct psmi_eptab_iterator itor; psm2_epaddr_t epaddr; psm2_error_t err = PSM2_OK; psm2_error_t err_seg; uint64_t t_start = get_cycles(); int i = 0; /* Close whatever has been left open -- this will be factored out for 2.1 */ if (ptl->connect_outgoing > 0) { int num_disc = 0; int *mask; psm2_error_t *errs; psm2_epaddr_t *epaddr_array; psmi_epid_itor_init(&itor, ptl->ep); while ((epaddr = psmi_epid_itor_next(&itor))) { if (epaddr->ptlctl->ptl != ptl_gen) continue; if (((am_epaddr_t *) epaddr)->cstate_outgoing == AMSH_CSTATE_OUTGOING_ESTABLISHED) num_disc++; } psmi_epid_itor_fini(&itor); mask = (int *)psmi_calloc(ptl->ep, UNDEFINED, num_disc, sizeof(int)); errs = (psm2_error_t *) psmi_calloc(ptl->ep, UNDEFINED, num_disc, sizeof(psm2_error_t)); epaddr_array = (psm2_epaddr_t *) psmi_calloc(ptl->ep, UNDEFINED, num_disc, sizeof(psm2_epaddr_t)); if (errs == NULL || epaddr_array == NULL || mask == NULL) { if (epaddr_array) psmi_free(epaddr_array); if (errs) psmi_free(errs); if (mask) psmi_free(mask); err = PSM2_NO_MEMORY; goto fail; } psmi_epid_itor_init(&itor, ptl->ep); while ((epaddr = psmi_epid_itor_next(&itor))) { if (epaddr->ptlctl->ptl == ptl_gen) { if (((am_epaddr_t *) epaddr)->cstate_outgoing == AMSH_CSTATE_OUTGOING_ESTABLISHED) { mask[i] = 1; epaddr_array[i] = epaddr; i++; } } } psmi_epid_itor_fini(&itor); psmi_assert(i == num_disc && num_disc > 0); err = amsh_ep_disconnect(ptl_gen, force, num_disc, epaddr_array, mask, errs, timeout_ns); psmi_free(mask); psmi_free(errs); psmi_free(epaddr_array); } if (ptl->connect_incoming > 0 || ptl->connect_outgoing > 0) { while (ptl->connect_incoming > 0 || ptl->connect_outgoing > 0) { if (!psmi_cycles_left(t_start, timeout_ns)) { err = PSM2_TIMEOUT; _HFI_VDBG("CCC timed out with from=%d,to=%d\n", ptl->connect_incoming, ptl->connect_outgoing); break; } psmi_poll_internal(ptl->ep, 1); } } else _HFI_VDBG("CCC complete disconnect from=%d,to=%d\n", ptl->connect_incoming, ptl->connect_outgoing); if ((err_seg = psmi_shm_detach(ptl_gen))) { err = err_seg; goto fail; } /* This prevents poll calls between now and the point where the endpoint is * deallocated to reference memory that disappeared */ ptl->repH.head = &ptl->amsh_empty_shortpkt; ptl->reqH.head = &ptl->amsh_empty_shortpkt; if (ptl->am_ep) psmi_free(ptl->am_ep); #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED) am_cuda_memhandle_cache_map_fini(); #endif return PSM2_OK; fail: return err; } static psm2_error_t amsh_setopt(const void *component_obj, int optname, const void *optval, uint64_t optlen) { /* No options for AM PTL at the moment */ return psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown AM ptl option %u.", optname); } 
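/* Mirror of amsh_setopt: the shm PTL defines no options yet, so any
 * optname is rejected with PSM2_PARAM_ERR. */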
static psm2_error_t amsh_getopt(const void *component_obj, int optname, void *optval, uint64_t *optlen) { /* No options for AM PTL at the moment */ return psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown AM ptl option %u.", optname); } /* Only symbol we expose out of here */ struct ptl_ctl_init psmi_ptl_amsh = { amsh_sizeof, amsh_init, amsh_fini, amsh_setopt, amsh_getopt }; opa-psm2-PSM2_11.2.185/ptl_am/cmarw.h000066400000000000000000000052161370564314600167630ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include #include /* * read from remote process pid */ int64_t cma_get(pid_t pid, const void *src, void *dst, int64_t n); /* * write to remote process pid */ int64_t cma_put(const void *src, pid_t pid, void *dst, int64_t n); /* * Test if CMA is available by trying a no-op call. * Returns 1 if CMA is present, 0 if not. */ int cma_available(void); opa-psm2-PSM2_11.2.185/ptl_am/cmarwu.c000066400000000000000000000143331370564314600171430ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include #include #include #include #include #include #include "psm_user.h" #include "cmarw.h" /* An iovec looks like this: * struct iovec { * void *iov_base; // Starting address * size_t iov_len; // Number of bytes to transfer * }; */ #if 0 #define __NR_process_vm_readv 310 #define __NR_process_vm_writev 311 #define process_vm_readv(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \ syscall(__NR_process_vm_readv, \ pid, local_iov, liovcnt, remote_iov, riovcnt, flags) #define process_vm_writev(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \ syscall(__NR_process_vm_writev, \ pid, local_iov, liovcnt, remote_iov, riovcnt, flags) #endif /*CMA syscall wrappers were added in glibc 2.15. For anything older than that, we need to define our own wrappers. Apparently older (and maybe newer?) (2.12 from RHEL6.3 definitely has this bug) glibcs only pass up to 5 arguments via the generic syscall() function. These CMA functions, however, have 6 arguments. So for now, we hack our way around it by generating ASM code for doing a syscall directly. */ #if defined(__GLIBC__) && ((__GLIBC__ == 2) && (__GLIBC_MINOR__ < 15)) #ifdef __x86_64__ #define __NR_process_vm_readv 310 #define __NR_process_vm_writev 311 static inline ssize_t __x86_64_syscall6(int syscall, pid_t pid, const struct iovec *local_iov, unsigned long liovcnt, const struct iovec *remote_iov, unsigned long riovcnt, unsigned long flags) { /*GCC inline ASM is annoying -- can't specify all the x86_64 registers directly, so declare register-specific variables and use them. 
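On x86_64 the kernel expects the syscall number in rax and the six
arguments in rdi, rsi, rdx, r10, r8 and r9; rcx and r11 are clobbered
by the syscall instruction, hence the clobber list below.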
*/ register int64_t rax asm("rax") = syscall; register int64_t rdi asm("rdi") = pid; register int64_t rsi asm("rsi") = (intptr_t) local_iov; register int64_t rdx asm("rdx") = liovcnt; register int64_t r10 asm("r10") = (intptr_t) remote_iov; register int64_t r8 asm("r8") = riovcnt; register int64_t r9 asm("r9") = flags; asm volatile ("syscall\n" : "=a" (rax) : "r"(rax), "r"(rdi), "r"(rsi), "r"(rdx), "r"(r10), "r"(r8), "r"(r9) : "%rcx", "%r11", "cc", "memory"); return rax; } #define process_vm_readv(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \ __x86_64_syscall6(__NR_process_vm_readv, \ pid, local_iov, liovcnt, remote_iov, riovcnt, flags) #define process_vm_writev(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \ __x86_64_syscall6(__NR_process_vm_writev, \ pid, local_iov, liovcnt, remote_iov, riovcnt, flags) #else /* ndef __x86_64__ */ #error "Can't compile CMA support for this architecture." #endif /* __x86_64__ */ #endif /* __GLIBC__ < 2.15 */ int64_t cma_get(pid_t pid, const void *src, void *dst, int64_t n) { int64_t nr, sum; struct iovec local = { .iov_base = dst, .iov_len = n }; struct iovec remote = { .iov_base = (void *)src, .iov_len = n }; nr = sum = 0; while (sum != n) { nr = process_vm_readv(pid, &local, 1, &remote, 1, 0); if (nr == -1) { return -1; } sum += nr; local.iov_base += nr; local.iov_len -= nr; remote.iov_base += nr; remote.iov_len -= nr; } return sum; } int64_t cma_put(const void *src, pid_t pid, void *dst, int64_t n) { int64_t nr, sum; struct iovec local = { .iov_base = (void *)src, .iov_len = n }; struct iovec remote = { .iov_base = dst, .iov_len = n }; nr = sum = 0; while (sum != n) { nr = process_vm_writev(pid, &local, 1, &remote, 1, 0); if (nr == -1) { return -1; } sum += nr; local.iov_base += nr; local.iov_len -= nr; remote.iov_base += nr; remote.iov_len -= nr; } return sum; } /* Test if CMA is available by trying a no-op call. */ int cma_available(void) { /* Make a no-op CMA syscall. If CMA is present, 0 (bytes transferred) * should be returned. If not present, expect -ENOSYS. */ int ret = process_vm_readv(getpid(), NULL, 0, NULL, 0, 0); if (ret == 0) { /* CMA is available! */ return 1; } return 0; } opa-psm2-PSM2_11.2.185/ptl_am/psm_am_internal.h000066400000000000000000000333561370564314600210300ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ #ifndef PSMI_AM_H #define PSMI_AM_H #include "am_config.h" #include "../psm_am_internal.h" #define AMSH_DIRBLOCK_SIZE 128 typedef struct am_epaddr { /* * epaddr must be the first field to have the same address as this * structure */ struct psm2_epaddr epaddr; uint16_t shmidx; uint16_t return_shmidx; uint32_t cstate_outgoing:3; uint32_t cstate_incoming:3; uint32_t pid:22; /* * Device number of GPU used by given EP, only used when CUDA is * enabled. There is no gain from #ifdefing it out, since it does not * use any extra space. */ uint32_t gpuid:4; } am_epaddr_t; /* Up to NSHORT_ARGS are supported via am_pkt_short_t; the remaining arguments are passed using space in am_pkt_bulk_t. One additional argument is added for passing the internal ptl_am handler index. */ #define NSHORT_ARGS 6 #define NBULK_ARGS (PSMI_AM_MAX_ARGS - NSHORT_ARGS + 1) typedef struct amsh_am_token { struct psmi_am_token tok; ptl_t *ptl; /**> What PTL was it received on */ psm2_mq_t mq; /**> What matched queue is this for ? 
*/ uint16_t shmidx; /**> what shmidx sent this */ } amsh_am_token_t; typedef void (*psmi_handler_fn_t) (void *token, psm2_amarg_t *args, int nargs, void *src, size_t len); typedef struct psmi_handlertab { psmi_handler_fn_t fn; } psmi_handlertab_t; #define PSMI_AM_CONN_REQ 1 #define PSMI_AM_CONN_REP 2 #define PSMI_AM_DISC_REQ 3 #define PSMI_AM_DISC_REP 4 #define PSMI_KASSIST_OFF 0x0 #define PSMI_KASSIST_CMA_GET 0x1 #define PSMI_KASSIST_CMA_PUT 0x2 #define PSMI_KASSIST_CMA 0x3 #define PSMI_KASSIST_GET 0x1 #define PSMI_KASSIST_PUT 0x2 #define PSMI_KASSIST_MASK 0x3 int psmi_epaddr_pid(psm2_epaddr_t epaddr); /* * Eventually, we will allow users to register handlers as "don't reply", which * may save on some of the buffering requirements */ #define PSMI_HANDLER_NEEDS_REPLY(handler) 1 #define PSMI_VALIDATE_REPLY(handler) assert(PSMI_HANDLER_NEEDS_REPLY(handler)) int psmi_amsh_poll(ptl_t *ptl, int replyonly); /* Shared memory AM, forward decls */ int psmi_amsh_short_request(ptl_t *ptl, psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, const void *src, size_t len, int flags); void psmi_amsh_short_reply(amsh_am_token_t *tok, psm2_handler_t handler, psm2_amarg_t *args, int nargs, const void *src, size_t len, int flags); int psmi_amsh_long_request(ptl_t *ptl, psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, const void *src, size_t len, void *dest, int flags); void psmi_amsh_long_reply(amsh_am_token_t *tok, psm2_handler_t handler, psm2_amarg_t *args, int nargs, const void *src, size_t len, void *dest, int flags); void psmi_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len); void psmi_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len); void psmi_am_mq_handler_data(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len); void psmi_am_mq_handler_complete(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len); void psmi_am_mq_handler_rtsmatch(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len); void psmi_am_mq_handler_rtsdone(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len); void psmi_am_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len); /* AM over shared memory (forward decls) */ psm2_error_t psmi_amsh_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters); psm2_error_t psmi_amsh_am_short_request(psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, int flags, psm2_am_completion_fn_t completion_fn, void *completion_ctxt); psm2_error_t psmi_amsh_am_short_reply(psm2_am_token_t tok, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, int flags, psm2_am_completion_fn_t completion_fn, void *completion_ctxt); #define amsh_conn_handler_hidx 1 #define mq_handler_hidx 2 #define mq_handler_data_hidx 3 #define mq_handler_rtsmatch_hidx 4 #define mq_handler_rtsdone_hidx 5 #define am_handler_hidx 6 #define AMREQUEST_SHORT 0 #define AMREQUEST_LONG 1 #define AMREPLY_SHORT 2 #define AMREPLY_LONG 3 #define AM_IS_REPLY(x) ((x)&0x2) #define AM_IS_REQUEST(x) (!AM_IS_REPLY(x)) #define AM_IS_LONG(x) ((x)&0x1) #define AM_IS_SHORT(x) (!AM_IS_LONG(x)) #define AM_FLAG_SRC_ASYNC 0x1 #define AM_FLAG_SRC_TEMP 0x2 /* * Request Fifo. 
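 * Requests generated from within a handler (where we must not block or
 * poll) are queued here by psmi_am_reqq_add() and sent later by
 * psmi_am_reqq_drain().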
*/ typedef struct am_reqq { struct am_reqq *next; ptl_t *ptl; psm2_epaddr_t epaddr; int amtype; psm2_handler_t handler; psm2_amarg_t args[8]; int nargs; uint32_t len; void *src; void *dest; int amflags; int flags; } am_reqq_t; struct am_reqq_fifo_t { am_reqq_t *first; am_reqq_t **lastp; }; psm2_error_t psmi_am_reqq_drain(ptl_t *ptl); void psmi_am_reqq_add(int amtype, ptl_t *ptl, psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, void *dest, int flags); /* * Shared memory Active Messages, implementation derived from * Lumetta, Mainwaring, Culler. Multi-Protocol Active Messages on a Cluster of * SMP's. Supercomputing 1997. * * We support multiple endpoints in shared memory, but we only support one * shared memory context with up to AMSH_MAX_LOCAL_PROCS local endpoints. Some * structures are endpoint specific (as denoted * with amsh_ep_) and others are * specific to the single shared memory context * (amsh_ global variables). * * Each endpoint maintains a shared request block and a shared reply block. * Each block is composed of queues for small, medium and large messages. */ #define QFREE 0 #define QUSED 1 #define QREADY 2 #define QREADYMED 3 #define QREADYLONG 4 #define QISEMPTY(flag) (flag < QREADY) #if defined(__x86_64__) || defined(__i386__) # define _QMARK_FLAG_FENCE() asm volatile("" : : : "memory") /* compilerfence */ #else # error No _QMARK_FLAG_FENCE() defined for this platform #endif #define _QMARK_FLAG(pkt_ptr, _flag) \ do { \ _QMARK_FLAG_FENCE(); \ (pkt_ptr)->flag = (_flag); \ } while (0) #define QMARKFREE(pkt_ptr) _QMARK_FLAG(pkt_ptr, QFREE) #define QMARKREADY(pkt_ptr) _QMARK_FLAG(pkt_ptr, QREADY) #define QMARKUSED(pkt_ptr) _QMARK_FLAG(pkt_ptr, QUSED) #define AMFMT_SYSTEM 1 #define AMFMT_SHORT_INLINE 2 #define AMFMT_SHORT 3 #define AMFMT_LONG 4 #define AMFMT_LONG_END 5 #define AMSH_CMASK_NONE 0 #define AMSH_CMASK_PREREQ 1 #define AMSH_CMASK_POSTREQ 2 #define AMSH_CMASK_DONE 3 #define AMSH_CSTATE_OUTGOING_NONE 1 #define AMSH_CSTATE_OUTGOING_REPLIED 2 #define AMSH_CSTATE_OUTGOING_ESTABLISHED 3 #define AMSH_CSTATE_OUTGOING_DISC_REPLIED 4 #define AMSH_CSTATE_OUTGOING_DISC_REQUESTED 5 #define AMSH_CSTATE_INCOMING_NONE 1 #define AMSH_CSTATE_INCOMING_DISC_REQUESTED 4 #define AMSH_CSTATE_INCOMING_ESTABLISHED 5 #define AMSH_PID_UNKNOWN 0 /********************************** * Shared memory packet formats **********************************/ typedef struct am_pkt_short { uint32_t flag; /**> Packet state */ union { uint32_t bulkidx; /**> index in bulk packet queue */ uint32_t length; /**> length when no bulkidx used */ }; uint16_t shmidx; /**> index in shared segment */ uint16_t type; uint16_t nargs; uint16_t handleridx; psm2_amarg_t args[NSHORT_ARGS]; /* AM arguments */ /* We eventually will expose up to 8 arguments, but this isn't implemented * For now. 
>6 args will probably require a medium instead of a short */ } __attribute__ ((aligned(64))) am_pkt_short_t; PSMI_STRICT_SIZE_DECL(am_pkt_short_t, 64); typedef struct am_pkt_bulk { uint32_t flag; uint32_t idx; uintptr_t dest; /* Destination pointer in "longs" */ uint32_t dest_off; /* Destination pointer offset */ uint32_t len; /* Destination length within offset */ psm2_amarg_t args[NBULK_ARGS]; /* Additional "spillover" for >6 args */ uint8_t payload[0]; } am_pkt_bulk_t; /* No strict size decl, used for mediums and longs */ /**************************************************** * Shared memory header and block control structures ***************************************************/ /* Each pkt queue has the same header format, although the queue * consumers don't use the 'head' index in the same manner. */ typedef struct am_ctl_qhdr { uint32_t head; /* Touched only by 1 consumer */ uint8_t _pad0[64 - 4]; pthread_spinlock_t lock; uint32_t tail; /* XXX candidate for fetch-and-incr */ uint32_t elem_cnt; uint32_t elem_sz; uint8_t _pad1[64 - 3 * 4 - sizeof(pthread_spinlock_t)]; } am_ctl_qhdr_t; PSMI_STRICT_SIZE_DECL(am_ctl_qhdr_t, 128); /* Each process has a reply qhdr and a request qhdr */ typedef struct am_ctl_blockhdr { volatile am_ctl_qhdr_t shortq; volatile am_ctl_qhdr_t longbulkq; } am_ctl_blockhdr_t; PSMI_STRICT_SIZE_DECL(am_ctl_blockhdr_t, 128 * 2); /* We cache the "shorts" because that's what we poll on in the critical path. * We take care to always update these pointers whenever the segment is remapped. */ typedef struct am_ctl_qshort_cache { volatile am_pkt_short_t *base; volatile am_pkt_short_t *head; volatile am_pkt_short_t *end; } am_ctl_qshort_cache_t; /****************************************** * Shared segment local directory (global) ****************************************** * * Each process keeps a directory for where request and reply structures are * located at its peers. This directory must be re-initialized every time the * shared segment moves in the VM, and the segment moves every time we remap() * for additional memory. */ struct amsh_qdirectory { am_ctl_blockhdr_t *qreqH; am_pkt_short_t *qreqFifoShort; am_pkt_bulk_t *qreqFifoLong; am_ctl_blockhdr_t *qrepH; am_pkt_short_t *qrepFifoShort; am_pkt_bulk_t *qrepFifoLong; } __attribute__ ((aligned(64))); /****************************************** * Shared fifo element counts and sizes ****************************************** * These values are context-wide, they can only be set early on and can't be * * modified at runtime. All endpoints are expected to use the same values. */ typedef struct amsh_qinfo { int qreqFifoShort; int qreqFifoLong; int qrepFifoShort; int qrepFifoLong; } amsh_qinfo_t; /****************************************** * Per-endpoint structures (ep-local) ****************************************** * Each endpoint keeps its own information as to where it resides in the * directory, and maintains its own cached copies of where the short header * resides in shared memory. * * This structure is carefully arranged to optimize cache locality and * performance. Do not modify without careful and thorough analysis. 
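 * Both structures below are 64-byte aligned so that the short-queue
 * caches polled in the critical path sit on their own cache line.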
*/ struct am_ctl_nodeinfo { uint16_t psm_verno; volatile uint16_t is_init; volatile pid_t pid; psm2_epid_t epid; psm2_epaddr_t epaddr; uintptr_t amsh_shmbase; amsh_qinfo_t amsh_qsizes; uint32_t amsh_features; struct amsh_qdirectory qdir; } __attribute__((aligned(64))); struct ptl_am { psm2_ep_t ep; psm2_epid_t epid; psm2_epaddr_t epaddr; ptl_ctl_t *ctl; int connect_phase; int connect_outgoing; int connect_incoming; int zero_polls; int amsh_only_polls; int max_ep_idx, am_ep_size; int psmi_kassist_mode; char *amsh_keyname; /* These three items carefully picked to fit in one cache line. */ am_ctl_qshort_cache_t reqH; am_ctl_qshort_cache_t repH; struct am_reqq_fifo_t psmi_am_reqq_fifo; am_pkt_short_t amsh_empty_shortpkt; struct am_ctl_nodeinfo *self_nodeinfo; struct am_ctl_nodeinfo *am_ep; } __attribute__((aligned(64))); #endif opa-psm2-PSM2_11.2.185/ptl_am/ptl.c000066400000000000000000000307331370564314600164460ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ #include "psm_user.h" #include "psm_mq_internal.h" #include "psm_am_internal.h" #include "cmarw.h" #ifdef PSM_CUDA #include "am_cuda_memhandle_cache.h" #endif /** * Callback function when a receive request is matched with the * tag obtained from the RTS packet. 
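 * Pulls the payload directly when possible (CUDA IPC if the sender
 * attached a memhandle, or cma_get() when kernel assist is enabled),
 * then issues a CTS; otherwise the CTS asks the sender to push the
 * data itself (see psmi_am_mq_handler_rtsmatch).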
*/ static psm2_error_t ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, amsh_am_token_t *tok) { psm2_amarg_t args[5]; psm2_epaddr_t epaddr = req->rts_peer; struct ptl_am *ptl = (struct ptl_am *)(epaddr->ptlctl->ptl); int cma_succeed = 0; int pid = 0, cuda_ipc_send_completion = 0; PSM2_LOG_MSG("entering."); psmi_assert((tok != NULL && was_posted) || (tok == NULL && !was_posted)); _HFI_VDBG("[shm][rndv][recv] req=%p dest=%p len=%d tok=%p\n", req, req->req_data.buf, req->req_data.recv_msglen, tok); #ifdef PSM_CUDA if (req->cuda_ipc_handle_attached) { CUdeviceptr cuda_ipc_dev_ptr = am_cuda_memhandle_acquire(req->rts_sbuf - req->cuda_ipc_offset, (CUipcMemHandle*)&req->cuda_ipc_handle, req->req_data.recv_msglen, req->rts_peer->epid); cuda_ipc_dev_ptr = cuda_ipc_dev_ptr + req->cuda_ipc_offset; /* cuMemcpy into the receive side buffer * based on its location */ if (req->is_buf_gpu_mem) { PSMI_CUDA_CALL(cuMemcpyDtoD, (CUdeviceptr)req->req_data.buf, cuda_ipc_dev_ptr, req->req_data.recv_msglen); PSMI_CUDA_CALL(cuEventRecord, req->cuda_ipc_event, 0); PSMI_CUDA_CALL(cuEventSynchronize, req->cuda_ipc_event); } else PSMI_CUDA_CALL(cuMemcpyDtoH, req->req_data.buf, cuda_ipc_dev_ptr, req->req_data.recv_msglen); cuda_ipc_send_completion = 1; am_cuda_memhandle_release(cuda_ipc_dev_ptr - req->cuda_ipc_offset); req->cuda_ipc_handle_attached = 0; goto send_cts; } #endif if ((ptl->psmi_kassist_mode & PSMI_KASSIST_GET) && req->req_data.recv_msglen > 0 && (pid = psmi_epaddr_pid(epaddr))) { #ifdef PSM_CUDA /* If the buffer on the send side is on the host, * we alloc a bounce buffer, use kassist and then * do a cuMemcpy if the buffer on the recv side * resides on the GPU */ if (req->is_buf_gpu_mem) { void* cuda_ipc_bounce_buf = psmi_malloc(PSMI_EP_NONE, UNDEFINED, req->req_data.recv_msglen); size_t nbytes = cma_get(pid, (void *)req->rts_sbuf, cuda_ipc_bounce_buf, req->req_data.recv_msglen); psmi_assert_always(nbytes == req->req_data.recv_msglen); PSMI_CUDA_CALL(cuMemcpyHtoD, (CUdeviceptr)req->req_data.buf, cuda_ipc_bounce_buf, req->req_data.recv_msglen); /* Cuda library has recent optimizations where they do * not guarantee synchronus nature for Host to Device * copies for msg sizes less than 64k. The event record * and synchronize calls are to guarentee completion. */ PSMI_CUDA_CALL(cuEventRecord, req->cuda_ipc_event, 0); PSMI_CUDA_CALL(cuEventSynchronize, req->cuda_ipc_event); psmi_free(cuda_ipc_bounce_buf); } else { /* cma can be done in handler context or not. */ size_t nbytes = cma_get(pid, (void *)req->rts_sbuf, req->req_data.buf, req->req_data.recv_msglen); psmi_assert_always(nbytes == req->req_data.recv_msglen); } #else /* cma can be done in handler context or not. */ size_t nbytes = cma_get(pid, (void *)req->rts_sbuf, req->req_data.buf, req->req_data.recv_msglen); if (nbytes == -1) { ptl->psmi_kassist_mode = PSMI_KASSIST_OFF; _HFI_ERROR("Reading from remote process' memory failed. Disabling CMA support\n"); } else { psmi_assert_always(nbytes == req->req_data.recv_msglen); cma_succeed = 1; } psmi_assert_always(nbytes == req->req_data.recv_msglen); #endif } #ifdef PSM_CUDA send_cts: #endif args[0].u64w0 = (uint64_t) (uintptr_t) req->ptl_req_ptr; args[1].u64w0 = (uint64_t) (uintptr_t) req; args[2].u64w0 = (uint64_t) (uintptr_t) req->req_data.buf; args[3].u32w0 = req->req_data.recv_msglen; args[3].u32w1 = tok != NULL ? 
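/* set when replying from handler context */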
1 : 0; args[4].u32w0 = ptl->psmi_kassist_mode; // pass current kassist mode to the peer process if (tok != NULL) { psmi_am_reqq_add(AMREQUEST_SHORT, tok->ptl, tok->tok.epaddr_incoming, mq_handler_rtsmatch_hidx, args, 5, NULL, 0, NULL, 0); } else psmi_amsh_short_request((struct ptl *)ptl, epaddr, mq_handler_rtsmatch_hidx, args, 5, NULL, 0, 0); /* 0-byte completion or we used kassist */ if (pid || cma_succeed || req->req_data.recv_msglen == 0 || cuda_ipc_send_completion == 1) { psmi_mq_handle_rts_complete(req); } PSM2_LOG_MSG("leaving."); return PSM2_OK; } static psm2_error_t ptl_handle_rtsmatch(psm2_mq_req_t req, int was_posted) { /* was_posted == 0 allows us to assume that we're not running this callback * within am handler context (i.e. we can poll) */ psmi_assert(was_posted == 0); return ptl_handle_rtsmatch_request(req, 0, NULL); } void psmi_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len) { amsh_am_token_t *tok = (amsh_am_token_t *) toki; psm2_mq_req_t req; psm2_mq_tag_t tag; int rc; uint32_t opcode = args[0].u32w0; uint32_t msglen = opcode <= MQ_MSG_SHORT ? len : args[0].u32w1; tag.tag[0] = args[1].u32w1; tag.tag[1] = args[1].u32w0; tag.tag[2] = args[2].u32w1; psmi_assert(toki != NULL); _HFI_VDBG("mq=%p opcode=%d, len=%d, msglen=%d\n", tok->mq, opcode, (int)len, msglen); switch (opcode) { case MQ_MSG_TINY: case MQ_MSG_SHORT: case MQ_MSG_EAGER: rc = psmi_mq_handle_envelope(tok->mq, tok->tok.epaddr_incoming, &tag, msglen, 0, buf, (uint32_t) len, 1, opcode, &req); /* for eager matching */ req->ptl_req_ptr = (void *)tok->tok.epaddr_incoming; req->msg_seqnum = 0; /* using seqnum 0 */ break; default:{ void *sreq = (void *)(uintptr_t) args[3].u64w0; uintptr_t sbuf = (uintptr_t) args[4].u64w0; psmi_assert(narg == 5); psmi_assert_always(opcode == MQ_MSG_LONGRTS); rc = psmi_mq_handle_rts(tok->mq, tok->tok.epaddr_incoming, &tag, msglen, NULL, 0, 1, ptl_handle_rtsmatch, &req); req->rts_peer = tok->tok.epaddr_incoming; req->ptl_req_ptr = sreq; req->rts_sbuf = sbuf; #ifdef PSM_CUDA /* Payload in RTS would mean an IPC handle has been * sent. This would also mean the sender has to * send from a GPU buffer */ if (buf && len > 0) { req->cuda_ipc_handle = *((CUipcMemHandle*)buf); req->cuda_ipc_handle_attached = 1; req->cuda_ipc_offset = args[2].u32w0; } #endif if (rc == MQ_RET_MATCH_OK) /* we are in handler context, issue a reply */ ptl_handle_rtsmatch_request(req, 1, tok); /* else will be called later */ break; } } return; } void psmi_am_mq_handler_data(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len) { amsh_am_token_t *tok = (amsh_am_token_t *) toki; psmi_assert(toki != NULL); psm2_epaddr_t epaddr = (psm2_epaddr_t) tok->tok.epaddr_incoming; psm2_mq_req_t req = mq_eager_match(tok->mq, epaddr, 0); /* using seqnum 0 */ psmi_assert_always(req != NULL); psmi_mq_handle_data(tok->mq, req, args[2].u32w0, buf, len); return; } /** * Function to handle CTS on the sender. 
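 * If kassist PUT is active, the payload is written straight into the
 * receiver's buffer with cma_put(); when kassist is off (locally, or
 * on the peer after a CMA failure downgrade) the data is pushed back
 * inline with a long reply.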
*/ void psmi_am_mq_handler_rtsmatch(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len) { amsh_am_token_t *tok = (amsh_am_token_t *) toki; psmi_assert(toki != NULL); ptl_t *ptl = tok->ptl; psm2_mq_req_t sreq = (psm2_mq_req_t) (uintptr_t) args[0].u64w0; #ifdef PSM_CUDA /* If send side req has a cuda ipc handle attached then we can * assume the data has been copied as soon as we get a CTS */ if (sreq->cuda_ipc_handle_attached) { sreq->cuda_ipc_handle_attached = 0; psmi_mq_handle_rts_complete(sreq); return; } #endif void *dest = (void *)(uintptr_t) args[2].u64w0; uint32_t msglen = args[3].u32w0; psm2_amarg_t rarg[1]; _HFI_VDBG("[rndv][send] req=%p dest_req=%p src=%p dest=%p len=%d\n", sreq, (void *)(uintptr_t) args[1].u64w0, sreq->req_data.buf, dest, msglen); if (msglen > 0) { rarg[0].u64w0 = args[1].u64w0; /* rreq */ int kassist_mode = ((struct ptl_am *)ptl)->psmi_kassist_mode; int kassist_mode_peer = args[4].u32w0; // In general, peer process(es) shall have the same kassist mode set, // but due to dynamic CMA failure detection, we must align local and remote state, // and make protocol to adopt to that potential change. if (kassist_mode_peer == PSMI_KASSIST_OFF && (kassist_mode & PSMI_KASSIST_MASK)) { ((struct ptl_am *)ptl)->psmi_kassist_mode = PSMI_KASSIST_OFF; goto no_kassist; } if (kassist_mode & PSMI_KASSIST_PUT) { int pid = psmi_epaddr_pid(tok->tok.epaddr_incoming); size_t nbytes = cma_put(sreq->req_data.buf, pid, dest, msglen); if (nbytes == -1) { _HFI_ERROR("Writing to remote process' memory failed. Disabling CMA support\n"); ((struct ptl_am *)ptl)->psmi_kassist_mode = PSMI_KASSIST_OFF; goto no_kassist; } psmi_assert_always(nbytes == msglen); /* Send response that PUT is complete */ psmi_amsh_short_reply(tok, mq_handler_rtsdone_hidx, rarg, 1, NULL, 0, 0); } else if (!(kassist_mode & PSMI_KASSIST_MASK)) { /* Only transfer if kassist is off, i.e. neither GET nor PUT. */ no_kassist: psmi_amsh_long_reply(tok, mq_handler_rtsdone_hidx, rarg, 1, sreq->req_data.buf, msglen, dest, 0); } } psmi_mq_handle_rts_complete(sreq); } void psmi_am_mq_handler_rtsdone(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len) { psm2_mq_req_t rreq = (psm2_mq_req_t) (uintptr_t) args[0].u64w0; psmi_assert(narg == 1); _HFI_VDBG("[rndv][recv] req=%p dest=%p len=%d\n", rreq, rreq->req_data.buf, rreq->req_data.recv_msglen); psmi_mq_handle_rts_complete(rreq); } void psmi_am_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len) { amsh_am_token_t *tok = (amsh_am_token_t *) toki; struct psm2_ep_am_handle_entry *hentry; psmi_assert(toki != NULL); hentry = psm_am_get_handler_function(tok->mq->ep, (psm2_handler_t) args[0].u32w0); /* Note a guard here for hentry != NULL is not needed because at * initialization, a psmi_assert_always() assure the entry will be * non-NULL. */ /* Invoke handler function. For AM we do not support break functionality */ if (likely(hentry->version == PSM2_AM_HANDLER_V2)) { psm2_am_handler_2_fn_t hfn2 = (psm2_am_handler_2_fn_t)hentry->hfn; hfn2(toki, args + 1, narg - 1, buf, len, hentry->hctx); } else { psm2_am_handler_fn_t hfn1 = (psm2_am_handler_fn_t)hentry->hfn; hfn1(toki, args + 1, narg - 1, buf, len); } return; } opa-psm2-PSM2_11.2.185/ptl_am/ptl_fwd.h000066400000000000000000000046541370564314600173160ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ #ifndef _PTL_FWD_AMSH_H #define _PTL_FWD_AMSH_H /* Symbol in am ptl */ extern struct ptl_ctl_init psmi_ptl_amsh; extern int psmi_shm_mq_rv_thresh; #endif opa-psm2-PSM2_11.2.185/ptl_ips/000077500000000000000000000000001370564314600156735ustar00rootroot00000000000000opa-psm2-PSM2_11.2.185/ptl_ips/Makefile000066400000000000000000000070061370564314600173360ustar00rootroot00000000000000# # This file is provided under a dual BSD/GPLv2 license. When using or # redistributing this file, you may do so under either license. # # GPL LICENSE SUMMARY # # Copyright(c) 2015 Intel Corporation. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # Contact Information: # Intel Corporation, www.intel.com # # BSD LICENSE # # Copyright(c) 2015 Intel Corporation. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. 
# * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # * Neither the name of Intel Corporation nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # Copyright (c) 2003-2014 Intel Corporation. All rights reserved. # OUTDIR = . this_srcdir = $(shell readlink -m .) top_srcdir := $(this_srcdir)/.. INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips ${TARGLIB}-objs := ptl.o ptl_rcvthread.o ips_proto.o ips_recvq.o \ ips_recvhdrq.o ips_proto_recv.o ips_proto_connect.o \ ips_proto_dump.o ips_proto_mq.o \ ips_writehdrq.o ips_proto_expected.o ips_tid.o \ ips_scb.o ips_proto_am.o ips_opp_path_rec.o ips_tidflow.o \ ips_epstate.o ips_crc32.o ips_path_rec.o ips_tidcache.o ${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs}) DEPS := $(${TARGLIB}-objs:.o=.d) .PHONY: all clean IGNORE_DEP_TARGETS = clean all .DEFAULT: ${${TARGLIB}-objs} $(OUTDIR)/%.d: $(this_srcdir)/%.c $(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o) $(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS} $(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -c $< -o $@ clean: @if [ -d $(OUTDIR) ]; then \ cd $(OUTDIR); \ rm -f *.o *.d *.gcda *.gcno; \ cd -; \ fi #ifeq prevents the deps from being included during clean #-include line is required to pull in auto-dependecies during 2nd pass ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),) -include ${DEPS} endif install: @echo "Nothing to do for install." opa-psm2-PSM2_11.2.185/ptl_ips/ips_config.h000066400000000000000000000117541370564314600201740ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2018 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2018 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef PTL_IPS_IPS_CONFIG_H #define PTL_IPS_IPS_CONFIG_H #include "psm_config.h" /* Allocate new epaddrs in chunks of 128 */ #define PTL_EPADDR_ALLOC_CHUNK 128 /* Generate an expected header every 16 packets */ #define PSM_DEFAULT_EXPECTED_HEADER 16 #define DF_OPP_LIBRARY "libopasadb.so.1.0.0" #define DATA_VFABRIC_OFFSET 8 /* Send retransmission */ #define IPS_PROTO_SPIO_RETRY_US_DEFAULT 2 /* in uS */ #define IPS_PROTO_ERRCHK_MS_MIN_DEFAULT 160 /* in millisecs */ #define IPS_PROTO_ERRCHK_MS_MAX_DEFAULT 640 /* in millisecs */ #define IPS_PROTO_ERRCHK_FACTOR_DEFAULT 2 #define PSM_TID_TIMEOUT_DEFAULT "160:640:2" /* update from above params */ /* We have to get an MTU of at least 2K, or else this breaks some assumptions * in the packets that handle tid descriptors */ #define IPS_PROTOEXP_MIN_MTU 2048 #ifdef PSM_FI /* Fault injection, becomes parameters to psmi_faultinj_getspec so * a comma-delimited list of * "spec_name", num, denom * Where num/denom means fault num out of every denom. * The defines set 'denum' and assume that num is set to 1 * * These values are all defaults, each is overridable via * PSM2_FI_ in the environment (and yes, spec_name is in lowercase * *in the environment* just to minimize it appearing in the wild). The format * there is so the same thing except that one can set * a specific seed to the random number generator. */ #define IPS_FAULTINJ_DMALOST 20 /* 1 every 20 dma writev get lost */ #define IPS_FAULTINJ_PIOLOST 100 /* 1 every 100 pio writes get lost */ #define IPS_FAULTINJ_PIOBUSY 10 /* 1 every 10 pio sends get busy */ #define IPS_FAULTINJ_RECVLOST 200 /* 1 every 200 pkts dropped at recv */ #endif /* #ifdef PSM_FI */ /* TID */ /* Max tids a context can support */ #define IPS_TID_MAX_TIDS 2048 /* Max tid-session buffer size */ #define PSM_TIDLIST_BUFSIZE 4096 /* Max tid-session window size */ #define PSM_TID_WINSIZE (4*1024*1024) /* Max number of packets for a single TID flow, fitting tid-session window. 
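 * (with the minimum expected MTU of 2048 bytes, a 4 MiB window is at
 * most 4*1024*1024 / 2048 = 2048 packets)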
* In PSM2 packet integrity is realized by PSN (Packet Sequence Number), * which is kept as 11 bits field (for 9B KDETH), * giving max value 2048 (0 - 2047) */ #define PSM_TID_MAX_PKTS 2048 /* Total number of combined pages from the Tid-pair to be merged */ #define PSM_MAX_NUM_PAGES_IN_TIDPAIR 512 /* rcv thread */ /* All in milliseconds */ #define RCVTHREAD_TO_MIN_FREQ 10 /* min of 10 polls per sec */ #define RCVTHREAD_TO_MAX_FREQ 100 /* max of 100 polls per sec */ #define RCVTHREAD_TO_SHIFT 1 /* ptl.c */ #define PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS 250 /* ips_proto_recv.c */ #define PSM_STRAY_WARN_INTERVAL_DEFAULT_SECS 30 /* * Easy switch to (say) _HFI_INFO if debugging in the expected protocol is * needed */ #define _HFI_EXP _HFI_VDBG #endif /* PTL_IPS_IPS_CONFIG_H */ opa-psm2-PSM2_11.2.185/ptl_ips/ips_crc32.c000066400000000000000000000060721370564314600176330ustar00rootroot00000000000000/* The code in this file was derived from crc32.c in zlib 1.2.3, and modified from its original form to suit our requirements. The zlib license and crc32.c copyright and credits are preserved below. */ /* zlib.h -- interface of the 'zlib' general purpose compression library version 1.2.3, July 18th, 2005 Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. Jean-loup Gailly Mark Adler jloup@gzip.org madler@alumni.caltech.edu The data format used by the zlib library is described by RFCs (Request for Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format). */ /* crc32.c -- compute the CRC-32 of a data stream * Copyright (C) 1995-2005 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * * Thanks to Rodney Brown for his contribution of faster * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing * tables for updating the shift register in one step with three exclusive-ors * instead of four steps with four exclusive-ors. This results in about a * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3. */ #include "psm_user.h" #include "psm2_hal.h" #include "ips_proto.h" #include "ips_proto_internal.h" /* Table of CRCs of all 8-bit messages. */ static uint32_t crc_table[256]; /* Flag: has the table been computed? Initially false. */ static int crc_table_computed; /* Make the table for a fast CRC. 
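 * Entry n holds the CRC of the single byte n, computed bit-by-bit with
 * the reflected CRC-32 polynomial 0xedb88320.  Per the note on
 * ips_crc_calculate() below, callers seed the running CRC with all 1's
 * (0xffffffff) and transmit the one's complement of the final value.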
*/ static void make_crc_table(void) { uint32_t c; int n, k; for (n = 0; n < 256; n++) { c = (uint32_t) n; for (k = 0; k < 8; k++) { if (c & 1) c = 0xedb88320 ^ (c >> 1); else c = c >> 1; } crc_table[n] = c; } crc_table_computed = 1; } /* Update a running CRC with the bytes buf[0..len-1]--the CRC * should be initialized to all 1's, and the transmitted value * is the 1's complement of the final running CRC (see the * crc() routine below)). */ uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc) { uint32_t c = crc; uint32_t n; if (!crc_table_computed) { make_crc_table(); } for (n = 0; n < len; n++) { c = crc_table[(c ^ data[n]) & 0xff] ^ (c >> 8); } return c; } opa-psm2-PSM2_11.2.185/ptl_ips/ips_epstate.c000066400000000000000000000124651370564314600203670ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include "psm_user.h" #include "psm2_hal.h" #include "ips_proto.h" #include "ips_proto_internal.h" #include "ips_epstate.h" /* The indexes are used to map a particular endpoint to a structure at the * receiver. Although we take extra care to validate the identity of endpoints * when packets are received, the communication index is at an offset selected * by the endpoint that allocates the index. This narrows the window of two * jobs communicated with the same set of indexes from getting crosstalk. 
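 * The per-endpoint base offset is drawn from the cycle counter at init
 * time (eps_base_idx), so two jobs that reuse the same table slots will
 * almost certainly disagree on connidx values.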
*/ psm2_error_t ips_epstate_init(struct ips_epstate *eps, const psmi_context_t *context) { memset(eps, 0, sizeof(*eps)); eps->context = context; eps->eps_base_idx = ((ips_epstate_idx)get_cycles()) & (IPS_EPSTATE_CONNIDX_MAX-1); return PSM2_OK; } psm2_error_t ips_epstate_fini(struct ips_epstate *eps) { if (eps->eps_tab) psmi_free(eps->eps_tab); memset(eps, 0, sizeof(*eps)); return PSM2_OK; } /* * Add ipsaddr with epid to the epstate table, return new index to caller in * 'connidx'. */ psm2_error_t ips_epstate_add(struct ips_epstate *eps, struct ips_epaddr *ipsaddr, ips_epstate_idx *connidx_o) { int i, j; ips_epstate_idx connidx; if (++eps->eps_tabsizeused > eps->eps_tabsize) { /* realloc */ struct ips_epstate_entry *newtab; eps->eps_tabsize += PTL_EPADDR_ALLOC_CHUNK; newtab = (struct ips_epstate_entry *) psmi_calloc(eps->context->ep, PER_PEER_ENDPOINT, eps->eps_tabsize, sizeof(struct ips_epstate_entry)); if (newtab == NULL) return PSM2_NO_MEMORY; else if (eps->eps_tab) { /* NOT first alloc */ for (i = 0; i < eps->eps_tabsize - PTL_EPADDR_ALLOC_CHUNK; i++) newtab[i] = eps->eps_tab[i]; /* deep copy */ psmi_free(eps->eps_tab); } eps->eps_tab = newtab; } /* Find the next free hole. We can afford to do this since connect is not * in the critical path */ for (i = 0, j = eps->eps_tab_nextidx; i < eps->eps_tabsize; i++, j++) { if (j == eps->eps_tabsize) j = 0; if (eps->eps_tab[j].ipsaddr == NULL) { eps->eps_tab_nextidx = j + 1; if (eps->eps_tab_nextidx == eps->eps_tabsize) eps->eps_tab_nextidx = 0; break; } } psmi_assert_always(i != eps->eps_tabsize); connidx = (j - eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1); _HFI_VDBG("node %s gets connidx=%d (table idx %d)\n", psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), connidx, j); eps->eps_tab[j].ipsaddr = ipsaddr; if (j >= IPS_EPSTATE_CONNIDX_MAX) { return psmi_handle_error(eps->context->ep, PSM2_TOO_MANY_ENDPOINTS, "Can't connect to more than %d non-local endpoints", IPS_EPSTATE_CONNIDX_MAX); } *connidx_o = connidx; return PSM2_OK; } psm2_error_t ips_epstate_del(struct ips_epstate *eps, ips_epstate_idx connidx) { ips_epstate_idx idx; /* actual table index */ idx = (connidx + eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1); psmi_assert_always(idx < eps->eps_tabsize); _HFI_VDBG("connidx=%d, table_idx=%d\n", connidx, idx); eps->eps_tab[idx].ipsaddr = NULL; /* We may eventually want to release memory, but probably not */ eps->eps_tabsizeused--; return PSM2_OK; } opa-psm2-PSM2_11.2.185/ptl_ips/ips_epstate.h000066400000000000000000000065011370564314600203660ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef _IPS_EPSTATE_H #define _IPS_EPSTATE_H #include "psm_user.h" typedef uint32_t ips_epstate_idx; #define IPS_EPSTATE_CONNIDX_MAX (1<<26) struct ips_epaddr; struct ips_epstate_entry { struct ips_epaddr *ipsaddr; }; struct ips_epstate { const psmi_context_t *context; ips_epstate_idx eps_base_idx; int eps_tabsize; int eps_tabsizeused; int eps_tab_nextidx; struct ips_epstate_entry *eps_tab; }; psm2_error_t ips_epstate_init(struct ips_epstate *eps, const psmi_context_t *contextj); psm2_error_t ips_epstate_fini(struct ips_epstate *eps); psm2_error_t ips_epstate_add(struct ips_epstate *eps, struct ips_epaddr *ipsaddr, ips_epstate_idx *connidx); psm2_error_t ips_epstate_del(struct ips_epstate *eps, ips_epstate_idx connidx); PSMI_INLINE( struct ips_epstate_entry * ips_epstate_lookup(const struct ips_epstate *eps, ips_epstate_idx idx)) { idx = (idx + eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1); if (idx < (ips_epstate_idx)eps->eps_tabsize) return &eps->eps_tab[idx]; else return NULL; } #endif /* _IPS_EPSTATE_H */ opa-psm2-PSM2_11.2.185/ptl_ips/ips_expected_proto.h000066400000000000000000000316751370564314600217570ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ /* * Control and state structure for one instance of the expected protocol. The * protocol depends on some upcalls from internal portions of the receive * protocol (such as opcodes dedicated for expected protocol handling) */ /* * Expected tid operations are carried out over "sessions". One session is a * collection of N tids where N is determined by the expected message window * size (-W option or PSM2_MQ_RNDV_HFI_WINDOW). Since naks can cause * retransmissions, each session has an session index (_desc_idx) and a * generation count (_desc_genc) to be able to identify if retransmitted * packets reference the correct session. * * index and generation count are each 4 bytes encoded in one ptl_arg. They * could be compressed further but we have the header space, so we don't * bother. */ #ifndef __IPS_EXPECTED_PROTO_H__ #define __IPS_EXPECTED_PROTO_H__ 1 #define _desc_idx u32w0 #define _desc_genc u32w1 /* * For debug and/or other reasons, we can log the state of each tid and * optionally associate it to a particular receive descriptor */ #define TIDSTATE_FREE 0 #define TIDSTATE_USED 1 struct ips_tidinfo { uint32_t tid; uint32_t state; struct ips_tid_recv_desc *tidrecvc; }; struct ips_protoexp { const struct ptl *ptl; struct ips_proto *proto; struct psmi_timer_ctrl *timerq; struct ips_tid tidc; struct ips_tf tfc; psm_transfer_type_t ctrl_xfer_type; psm_transfer_type_t tid_xfer_type; struct ips_scbctrl tid_scbc_rv; mpool_t tid_desc_send_pool; mpool_t tid_getreq_pool; mpool_t tid_sreq_pool; /* backptr into proto->ep->mq */ mpool_t tid_rreq_pool; /* backptr into proto->ep->mq */ struct drand48_data tidflow_drand48_data; uint32_t tid_flags; uint32_t tid_send_fragsize; uint32_t tid_page_offset_mask; uint64_t tid_page_mask; uint32_t hdr_pkt_interval; struct ips_tidinfo *tid_info; STAILQ_HEAD(ips_tid_send_pend, /* pending exp. 
sends */ ips_tid_send_desc) pend_sendq; struct psmi_timer timer_send; STAILQ_HEAD(ips_tid_get_pend, ips_tid_get_request) pend_getreqsq; /* pending tid reqs */ struct psmi_timer timer_getreqs; #ifdef PSM_CUDA STAILQ_HEAD(ips_tid_get_cudapend, /* pending cuda transfers */ ips_tid_get_request) cudapend_getreqsq; struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_recv_cfg; struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_small_recv_cfg; mpool_t cuda_hostbuf_pool_recv; mpool_t cuda_hostbuf_pool_small_recv; CUstream cudastream_recv; #endif }; /* * TID member list format used in communication. * Since the compiler does not make sure the bit fields order, * we use mask and shift defined below. typedef struct { uint32_t length:11; // in page unit, max 1024 pages uint32_t reserved:9; // for future usage uint32_t tidctrl:2; // hardware defined tidctrl value uint32_t tid:10; // hardware only support 10bits } ips_tid_session_member; */ #define IPS_TIDINFO_LENGTH_SHIFT 0 #define IPS_TIDINFO_LENGTH_MASK 0x7ff #define IPS_TIDINFO_TIDCTRL_SHIFT 20 #define IPS_TIDINFO_TIDCTRL_MASK 0x3 #define IPS_TIDINFO_TID_SHIFT 22 #define IPS_TIDINFO_TID_MASK 0x3ff #define IPS_TIDINFO_GET_LENGTH(tidinfo) \ (((tidinfo)>>IPS_TIDINFO_LENGTH_SHIFT)&IPS_TIDINFO_LENGTH_MASK) #define IPS_TIDINFO_GET_TIDCTRL(tidinfo) \ (((tidinfo)>>IPS_TIDINFO_TIDCTRL_SHIFT)&IPS_TIDINFO_TIDCTRL_MASK) #define IPS_TIDINFO_GET_TID(tidinfo) \ (((tidinfo)>>IPS_TIDINFO_TID_SHIFT)&IPS_TIDINFO_TID_MASK) typedef struct ips_tid_session_list_tag { uint8_t tsess_unaligned_start; /* unaligned bytes at starting */ uint8_t tsess_unaligned_end; /* unaligned bytes at ending */ uint16_t tsess_tidcount; /* tid number for the session */ uint32_t tsess_tidoffset; /* offset in first tid */ uint32_t tsess_srcoff; /* source offset from beginning */ uint32_t tsess_length; /* session length, including start/end */ uint32_t tsess_list[0]; /* must be last in struct */ } ips_tid_session_list; /* * Send-side expected send descriptors. * * Descriptors are allocated when tid grant requests are received (the 'target' * side of an RDMA get request). Descriptors are added to a pending queue of * expected sends and processed one at a time (scb's are requested and messages * sent until all fragments of the descriptor's length are put on the wire). 
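 * A retransmission check against the session index and generation count
 * is then, as a rough sketch (desc_table and pkt_rdescid are illustrative
 * names; the _desc_idx/_desc_genc accessors are the ones defined above):
 *
 *     ptl_arg_t rdescid = pkt_rdescid;      // descid echoed by the peer
 *     tidrecvc = &desc_table[rdescid._desc_idx];
 *     if (tidrecvc->rdescid._desc_genc != rdescid._desc_genc)
 *         return;   // stale packet from an earlier use of this slot
 *
 * where a generation mismatch identifies a retransmitted packet that
 * references a previous session which happened to reuse the same index.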
* */ #define TIDSENDC_SDMA_VEC_DEFAULT 260 struct ips_tid_send_desc { struct ips_protoexp *protoexp; STAILQ_ENTRY(ips_tid_send_desc) next; /* Filled in at allocation time */ ptl_arg_t sdescid; /* sender descid */ ptl_arg_t rdescid; /* receiver descid */ ips_epaddr_t *ipsaddr; psm2_mq_req_t mqreq; /* tidflow to send tid traffic */ struct ips_flow tidflow; /* Iterated during send progress */ void *userbuf; /* user provided buffer */ void *buffer; uint32_t length; /* total length, including start/end */ uint32_t tidbytes; /* bytes sent over tid so far */ uint32_t remaining_tidbytes; uint32_t offset_in_tid; /* could be more than page */ uint32_t remaining_bytes_in_tid; uint16_t frame_send; uint16_t tid_idx; uint16_t is_complete; uint16_t frag_size; /* bitmap of queued control messages for flow */ uint16_t ctrl_msg_queued; #ifdef PSM_CUDA /* As the size of a cuda_hostbuf is less than or equal to the window size, * there is a guarantee that the maximum number of host bufs we * would need to attach to a tidsendc would be 2 */ struct ips_cuda_hostbuf *cuda_hostbuf[2]; /* Number of hostbufs attached */ uint8_t cuda_num_buf; #endif /* * tid_session_list is 24 bytes, plus 512 tidpair for 2048 bytes, * so the max possible tid window size mq->hfi_base_window_rv is 4M. * However, PSM must fit the tid grant message into a single transfer * unit, either PIO or SDMA, PSM will shrink the window accordingly. */ uint16_t tsess_tidlist_length; union { ips_tid_session_list tid_list; uint8_t filler[PSM_TIDLIST_BUFSIZE+ sizeof(ips_tid_session_list)]; }; }; #define TIDRECVC_STATE_FREE 0 #define TIDRECVC_STATE_BUSY 1 struct ips_expected_recv_stats { uint32_t nSeqErr; uint32_t nGenErr; uint32_t nReXmit; uint32_t nErrChkReceived; }; struct ips_tid_recv_desc { const psmi_context_t *context; struct ips_protoexp *protoexp; ptl_arg_t rdescid; /* receiver descid */ ips_epaddr_t *ipsaddr; struct ips_tid_get_request *getreq; /* scb to send tid grant CTS */ ips_scb_t *grantscb; /* scb to send tid data completion */ ips_scb_t *completescb; /* tidflow to only send ctrl msg ACK and NAK */ struct ips_flow tidflow; /* TF protocol state (recv) */ uint32_t state; uint32_t tidflow_active_gen; uint32_t tidflow_nswap_gen; psmi_seqnum_t tidflow_genseq; #ifdef PSM_CUDA struct ips_cuda_hostbuf *cuda_hostbuf; uint8_t is_ptr_gpu_backed; #endif void *buffer; uint32_t recv_msglen; uint32_t recv_tidbytes; /* exclude start/end trim */ struct ips_expected_recv_stats stats; /* bitmap of queued control messages for flow */ uint16_t ctrl_msg_queued; /* * tid_session_list is 24 bytes, plus 512 tidpair for 2048 bytes, * so the max possible tid window size mq->hfi_base_window_rv is 4M. * However, PSM must fit the tid grant message into a single transfer * unit, either PIO or SDMA, PSM will shrink the window accordingly. */ uint16_t tsess_tidlist_length; union { ips_tid_session_list tid_list; uint8_t filler[PSM_TIDLIST_BUFSIZE+ sizeof(ips_tid_session_list)]; }; }; /* * Get requests, issued by MQ when there's a match on a large message. Unlike * an RDMA get, the initiator identifies the location of the data at the target * using a 'send token' instead of a virtual address. This, of course, assumes * that the target has already registered the token and communicated it to the * initiator beforehand (it actually sends the token as part of the initial * MQ message that contains the MQ tag). * * The operation is semantically a two-sided RDMA get. 
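 * On the initiator side a matched rendezvous receive therefore turns into
 * a call shaped roughly like the following sketch (the callback, its
 * argument, and the local variable names are illustrative; the real
 * prototype for ips_protoexp_tid_get_from_token() appears below):
 *
 *     err = ips_protoexp_tid_get_from_token(protoexp, recvbuf, recvlen,
 *                                           epaddr, rts_sendtoken,
 *                                           IPS_PROTOEXP_TIDGET_WAIT,
 *                                           getreq_done_cb, req);
 *
 * where rts_sendtoken is the token carried in the initial MQ rendezvous
 * message and names the send descriptor registered at the target.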
*/ typedef void (*ips_tid_completion_callback_t) (void *); struct ips_tid_get_request { STAILQ_ENTRY(ips_tid_get_request) tidgr_next; struct ips_protoexp *tidgr_protoexp; psm2_epaddr_t tidgr_epaddr; void *tidgr_lbuf; uint32_t tidgr_length; uint32_t tidgr_rndv_winsz; uint32_t tidgr_sendtoken; ips_tid_completion_callback_t tidgr_callback; void *tidgr_ucontext; uint32_t tidgr_offset; /* offset in bytes */ uint32_t tidgr_bytesdone; uint32_t tidgr_flags; #ifdef PSM_CUDA int cuda_hostbuf_used; uint32_t tidgr_cuda_bytesdone; STAILQ_HEAD(ips_tid_getreq_cuda_hostbuf_pend, /* pending exp. sends */ ips_cuda_hostbuf) pend_cudabuf; #endif }; /* * Descriptor limits, structure contents of struct psmi_rlimit_mpool for * normal, min and large configurations. */ #define TID_SENDSESSIONS_LIMITS { \ .env = "PSM2_TID_SENDSESSIONS_MAX", \ .descr = "Tid max send session descriptors", \ .env_level = PSMI_ENVVAR_LEVEL_HIDDEN, \ .minval = 1, \ .maxval = 1<<30, \ .mode[PSMI_MEMMODE_NORMAL] = { 256, 8192 }, \ .mode[PSMI_MEMMODE_MINIMAL] = { 1, 1 }, \ .mode[PSMI_MEMMODE_LARGE] = { 512, 16384 } \ } /* * Expected send support */ /* * The expsend token is currently always a pointer to a MQ request. It is * echoed on the wire throughout various phases of the expected send protocol * to identify a particular send. */ psm2_error_t MOCKABLE(ips_protoexp_init)(const psmi_context_t *context, const struct ips_proto *proto, uint32_t protoexp_flags, int num_of_send_bufs, int num_of_send_desc, struct ips_protoexp **protoexp_o); MOCK_DCL_EPILOGUE(ips_protoexp_init); psm2_error_t ips_protoexp_fini(struct ips_protoexp *protoexp); void ips_protoexp_handle_tiderr(const struct ips_recvhdrq_event *rcv_ev); void ips_protoexp_handle_data_err(const struct ips_recvhdrq_event *rcv_ev); void ips_protoexp_handle_tf_seqerr(const struct ips_recvhdrq_event *rcv_ev); void ips_protoexp_handle_tf_generr(const struct ips_recvhdrq_event *rcv_ev); int ips_protoexp_data(struct ips_recvhdrq_event *rcv_ev); int ips_protoexp_recv_tid_completion(struct ips_recvhdrq_event *rcv_ev); psm2_error_t ips_protoexp_flow_newgen(struct ips_tid_recv_desc *tidrecvc); PSMI_ALWAYS_INLINE( void ips_protoexp_unaligned_copy(uint8_t *dst, uint8_t *src, uint16_t len)) { while (len) { dst[len-1] = src[len-1]; len--; } } /* * Peer is waiting (blocked) for this request */ #define IPS_PROTOEXP_TIDGET_WAIT 0x1 #define IPS_PROTOEXP_TIDGET_PEERWAIT 0x2 psm2_error_t ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, void *buf, uint32_t length, psm2_epaddr_t epaddr, uint32_t remote_tok, uint32_t flags, ips_tid_completion_callback_t callback, void *context); psm2_error_t ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, ips_epaddr_t *ipsaddr, psm2_mq_req_t req, ptl_arg_t rdescid, uint32_t tidflow_genseq, ips_tid_session_list *tid_list, uint32_t tid_list_size); #endif /* #ifndef __IPS_EXPECTED_PROTO_H__ */ opa-psm2-PSM2_11.2.185/ptl_ips/ips_opp_path_rec.c000066400000000000000000000512301370564314600213560ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include "psm_user.h" #include "psm2_hal.h" #include "ips_proto.h" #include /* SLID and DLID are in network byte order */ static psm2_error_t ips_opp_get_path_rec(ips_path_type_t type, struct ips_proto *proto, uint16_t slid, uint16_t dlid, uint16_t desthfi_type, ips_path_rec_t **ppath_rec) { psm2_error_t err = PSM2_OK; ibta_path_rec_t query, opp_response; #ifdef _HFI_DEBUGGING int opp_response_set = 0; #endif ips_path_rec_t *path_rec; int opp_err; ENTRY elid, *epath = NULL; char eplid[128]; uint64_t timeout_ack_ms; /* Query path record query cache first */ bzero(&query, sizeof(query)); bzero(eplid, sizeof(eplid)); /* Bulk service ID is control service id + 1 */ switch (type) { case IPS_PATH_LOW_PRIORITY: query.service_id = __cpu_to_be64(proto->ep->service_id + DATA_VFABRIC_OFFSET); break; case IPS_PATH_NORMAL_PRIORITY: case IPS_PATH_HIGH_PRIORITY: default: query.service_id = __cpu_to_be64(proto->ep->service_id); } query.slid = slid; query.dlid = dlid; snprintf(eplid, sizeof(eplid), "%s_%x_%x", (type == IPS_PATH_LOW_PRIORITY) ? 
"LOW" : "HIGH", query.slid, query.dlid); elid.key = eplid; hsearch_r(elid, FIND, &epath, &proto->ips_path_rec_hash); if (!epath) { /* Unable to find path record in cache */ elid.key = psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1); path_rec = (ips_path_rec_t *) psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(ips_path_rec_t)); if (!elid.key || !path_rec) { if (elid.key) psmi_free(elid.key); if (path_rec) psmi_free(path_rec); err = PSM2_NO_MEMORY; goto fail; } /* Get path record between local LID and remote */ opp_err = proto->opp_fn.op_path_get_path_by_rec(proto->opp_ctxt, &query, &opp_response); if (opp_err) { psmi_free(path_rec); psmi_free(elid.key); err = PSM2_EPID_PATH_RESOLUTION; goto fail; } #ifdef _HFI_DEBUGGING opp_response_set = 1; #endif /* Create path record */ path_rec->pr_slid = opp_response.slid; path_rec->pr_dlid = opp_response.dlid; path_rec->pr_mtu = min(opa_mtu_enum_to_int(opp_response.mtu & 0x3f), proto->epinfo.ep_mtu); path_rec->pr_pkey = ntohs(opp_response.pkey); path_rec->pr_sl = ntohs(opp_response.qos_class_sl); path_rec->pr_static_ipd = proto->ips_ipd_delay[opp_response.rate & 0x3f]; /* Setup CCA parameters for path */ if (path_rec->pr_sl > PSMI_SL_MAX) { psmi_free(path_rec); psmi_free(elid.key); err = PSM2_INTERNAL_ERR; goto fail; } if (!(proto->ccti_ctrlmap & (1 << path_rec->pr_sl))) { _HFI_CCADBG("No CCA for sl %d, disable CCA\n", path_rec->pr_sl); proto->flags &= ~IPS_PROTO_FLAG_CCA; proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN; } if (!psmi_hal_has_cap(PSM_HAL_CAP_STATIC_RATE_CTRL)) { _HFI_CCADBG("No Static-Rate-Control, disable CCA\n"); proto->flags &= ~IPS_PROTO_FLAG_CCA; proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN; } path_rec->proto = proto; path_rec->pr_ccti = proto->cace[path_rec->pr_sl].ccti_min; path_rec->pr_timer_cca = NULL; /* Determine active IPD for path. 
Is max of static rate and CCT table */ if (!(proto->flags & IPS_PROTO_FLAG_CCA)) { path_rec->pr_active_ipd = 0; path_rec->pr_cca_divisor = 0; } else if ((path_rec->pr_static_ipd) && ((path_rec->pr_static_ipd + 1) > (proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK))) { path_rec->pr_active_ipd = path_rec->pr_static_ipd + 1; path_rec->pr_cca_divisor = 0; /*Static rate has no CCA divisor */ } else { /* Pick it from the CCT table */ path_rec->pr_active_ipd = proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK; path_rec->pr_cca_divisor = proto->cct[path_rec->pr_ccti] >> CCA_DIVISOR_SHIFT; } /* Compute max timeout based on pkt life time for path */ timeout_ack_ms = ((4096UL * (1UL << (opp_response.pkt_life & 0x3f))) / 1000000UL); timeout_ack_ms = ms_2_cycles(IPS_PROTO_ERRCHK_MS_MIN_DEFAULT + timeout_ack_ms); if (proto->epinfo.ep_timeout_ack_max < timeout_ack_ms) proto->epinfo.ep_timeout_ack_max = timeout_ack_ms; /* Add path record into cache */ strcpy(elid.key, eplid); elid.data = (void *)path_rec; hsearch_r(elid, ENTER, &epath, &proto->ips_path_rec_hash); } else /* Path record found in cache */ path_rec = (ips_path_rec_t *) epath->data; #ifdef _HFI_DEBUGGING /* Dump path record stats */ _HFI_PRDBG("Path Record ServiceID: %" PRIx64 " %x -----> %x\n", (uint64_t) __be64_to_cpu(query.service_id), __be16_to_cpu(slid), __be16_to_cpu(dlid)); if (opp_response_set) { _HFI_PRDBG("MTU: %x, %x\n", (opp_response.mtu & 0x3f), path_rec->pr_mtu); _HFI_PRDBG("PKEY: 0x%04x\n", ntohs(opp_response.pkey)); _HFI_PRDBG("SL: 0x%04x\n", ntohs(opp_response.qos_class_sl)); _HFI_PRDBG("Rate: %x, IPD: %x\n", (opp_response.rate & 0x3f), path_rec->pr_static_ipd); } _HFI_PRDBG("Timeout Init.: 0x%" PRIx64 " Max: 0x%" PRIx64 "\n", proto->epinfo.ep_timeout_ack, proto->epinfo.ep_timeout_ack_max); #endif /* Return the IPS path record */ *ppath_rec = path_rec; fail: return err; } static psm2_error_t ips_opp_path_rec(struct ips_proto *proto, uint16_t slid, uint16_t dlid, uint16_t desthfi_type, unsigned long timeout, ips_path_grp_t **ppathgrp) { psm2_error_t err = PSM2_OK; uint16_t pidx, cpath, num_path = (1 << proto->epinfo.ep_lmc); ips_path_type_t path_type = IPS_PATH_NORMAL_PRIORITY; ips_path_rec_t *path; ips_path_grp_t *pathgrp; uint16_t path_slid, path_dlid; ENTRY elid, *epath = NULL; char eplid[128]; /* * High Priority Path * ------------------ * * Uses the "base" Service ID. For now there exists only 1 high priority * path between nodes even for non zero LMC fabrics. * * Normal/Low Priority Paths * ------------------------- * * Currently these paths are the same i.e. they are queried for the same * Service ID/vFabric which is the Base Service ID for High Priority + 1. * * Use case Scenarios * ------------------ * * Since with vFabrics we have the capability to define different QoS * parameters per vFabric it is envisioned that the IPS_PATH_HIGH_PRIORITY is * setup in a separate vFabric for high priority traffic. The NORMAL paths * are setup in a separate vFabric optimized for high bandwidth. This allows * us to potentially have control traffic (RTS, CTS etc.) not be bottlenecked * by bulk transfer data. All control messages (ACKs,NAKs, TID_GRANT etc.) * also use the high priority control vFabric. * * NOTE: In order to distinguish between the different vFabrics the user * specifies the service ID to use via mpirun (or environment variable). * This is the service ID for the high priority control traffic. The bulk * data vFabric is identified by service ID + 1. 
So for each MPI application * one should specify two service IDs for the high priority and bulk data. * Both these service IDs can be placed in the same vFabric which can be * configured for high priority or bandwidth traffic giving us the default * behavior upto Infinhfi 2.5 release. * * NOTE: All of the above would have really helped if the S20 silicon could * correctly support IBTA QoS features. Due to S20 design we can only have * high priority VLarb table (low priority VLarb table results in round * robin arbitration ignoring the weights!). But if this is fixed in a * subsequent chip respin then this may potentially help our scalability * on large fabrics. * * Mesh/Torus and DOR routed networks * ---------------------------------- * * In a mesh/torus fabric we always have a non zero LMC (at least 1 can be * more). We would like to take advantage of dispersive routing on these * fabrics as well to obtain better "worst case/congested" bandwidth. For * these networks currently the base LIDs are used for UPDN routing which * is suboptimal on these networks. Higher order LIDs (+1 .. +N) use DOR * routing (Dimension Ordered Routing) to avoid deadlocks and provide * higher performance. If a fabric is disrupted then only the base UPDN * routing is available. PSM should continue to operate in this environment * albeit with degraded performance. In disrupted fabric the OPP path * record queries may fail for some DOR routed LIDs i.e. no path exists * PSM should hence ignore path record failures as they indicate a disrupted * fabric and only use valid paths that are returned from the replica. This * will degenerate to only using the UPDN paths on disrupted fabrics and DOR * routes only for fully configured fabrics. Note: For a clean fabric the * base LIDs that are configured for UPDN route will not exist in the replica * as DOR routes are preferred. Hence we will only dispersively route across * the DOR routes only using the UPDN route for disrupted fabrics. * * AS LONG AS ONE PATH EXISTS (for each of the priorities) COMMUNICATION CAN * TAKE PLACE. */ /* Check if this path grp is already in hash table */ snprintf(eplid, sizeof(eplid), "%x_%x", slid, dlid); elid.key = eplid; hsearch_r(elid, FIND, &epath, &proto->ips_path_grp_hash); if (epath) { /* Find path group in cache */ *ppathgrp = (ips_path_grp_t *) epath->data; return err; } /* If base lids are only used then reset num_path to 1 */ if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE) num_path = 1; /* Allocate a new pathgroup */ elid.key = psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1); pathgrp = (ips_path_grp_t *) psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(ips_path_grp_t) + num_path * IPS_PATH_MAX_PRIORITY * sizeof(ips_path_rec_t *)); if (!elid.key || !pathgrp) { if (elid.key) psmi_free(elid.key); if (pathgrp) psmi_free(pathgrp); err = PSM2_NO_MEMORY; goto fail; } /* * dlid is the peer base lid. * slid is the base lid for the local end point. * Store here in network byte order. */ pathgrp->pg_base_dlid = dlid; pathgrp->pg_base_slid = slid; pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] = pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] = pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] = 0; /* For now there is always only one high priority path between nodes. 
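 * With a non zero LMC the candidate lid pairs are simply the base lids
 * plus a common offset, matching the loop below (example values are
 * illustrative):
 *
 *     num_path  = 1 << proto->epinfo.ep_lmc;   // LMC of 2 gives 4 paths
 *     path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
 *     path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);
 *
 * e.g. base lids 0x10 -> 0x20 with LMC 2 yield the pairs 0x10->0x20,
 * 0x11->0x21, 0x12->0x22 and 0x13->0x23.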
*/ for (pidx = 0, cpath = 0; pidx < num_path && cpath == 0; pidx++) { path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx); path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx); err = ips_opp_get_path_rec(IPS_PATH_HIGH_PRIORITY, proto, path_slid, path_dlid, desthfi_type, &path); if (err == PSM2_OK) { /* Valid high priority path found */ /* Resolved high priority path successfully */ pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY]++; pathgrp->pg_path[cpath][IPS_PATH_HIGH_PRIORITY] = path; /* Increment current path index */ cpath++; } PSM2_LOG_MSG("path %p slid %hu dlid %hu\n", path, __be16_to_cpu(path->pr_slid), __be16_to_cpu(path->pr_dlid)); } /* Make sure we have atleast 1 high priority path */ if (pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] == 0) { psmi_free(elid.key); psmi_free(pathgrp); err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION, "OFED Plus path lookup failed. Unable to resolve high priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %" PRIx64 " defined?", ntohs(slid), ntohs(dlid), (uint64_t) proto->ep->service_id); goto fail; } /* Once we have the high-priority path, set the partition key */ if (psmi_hal_set_pkey(proto->ep->context.psm_hw_ctxt, (uint16_t) pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY]->pr_pkey) != 0) { err = psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE, "Couldn't set device pkey 0x%x: %s", (int)pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY]->pr_pkey, strerror(errno)); psmi_free(elid.key); psmi_free(pathgrp); goto fail; } /* Next setup the bulk paths. If the subnet administrator has misconfigured * or rather not configured two separate service IDs we place the bulk * paths in the same vFabric as the control paths. */ path_type = IPS_PATH_NORMAL_PRIORITY; for (pidx = 0, cpath = 0; pidx < num_path; pidx++) { path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx); path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx); retry_normal_path_res: err = ips_opp_get_path_rec(path_type, proto, path_slid, path_dlid, desthfi_type, &path); if (err != PSM2_OK) { if (path_type == IPS_PATH_NORMAL_PRIORITY) { /* Subnet may only be configured for one service ID/vFabric. Default * to using the control vFabric/service ID for bulk data as well. */ path_type = IPS_PATH_HIGH_PRIORITY; goto retry_normal_path_res; } /* Unable to resolve path for . This is possible * for disrupted fabrics using DOR routing so continue to acquire paths */ err = PSM2_OK; continue; } /* Valid path. */ pathgrp->pg_path[cpath][IPS_PATH_NORMAL_PRIORITY] = path; pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY]++; cpath++; } /* Make sure we have at least have a single bulk data transfer path */ if (pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] == 0) { psmi_free(elid.key); psmi_free(pathgrp); err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION, "OFED Plus path lookup failed. Unable to resolve normal priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %" PRIx64 " defined?", ntohs(slid), ntohs(dlid), (uint64_t) proto->ep->service_id); goto fail; } path_type = IPS_PATH_LOW_PRIORITY; for (pidx = 0, cpath = 0; pidx < num_path; pidx++) { path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx); path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx); retry_low_path_res: err = ips_opp_get_path_rec(path_type, proto, path_slid, path_dlid, desthfi_type, &path); if (err != PSM2_OK) { if (path_type == IPS_PATH_LOW_PRIORITY) { /* Subnet may only be configured for one service ID/vFabric. Default * to using the control vFabric/service ID for bulk data as well. 
*/ path_type = IPS_PATH_HIGH_PRIORITY; goto retry_low_path_res; } /* Unable to resolve path for . This is possible * for disrupted fabrics using DOR routing so continue to acquire paths */ err = PSM2_OK; continue; } /* Valid path. */ pathgrp->pg_path[cpath][IPS_PATH_LOW_PRIORITY] = path; pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY]++; cpath++; } /* Make sure we have at least have a single bulk data transfer path */ if (pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] == 0) { psmi_free(elid.key); psmi_free(pathgrp); err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION, "OFED Plus path lookup failed. Unable to resolve low priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %" PRIx64 " defined?", ntohs(slid), ntohs(dlid), (uint64_t) proto->ep->service_id); goto fail; } if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) { pathgrp->pg_next_path[IPS_PATH_NORMAL_PRIORITY] = proto->epinfo.ep_context % pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY]; pathgrp->pg_next_path[IPS_PATH_LOW_PRIORITY] = proto->epinfo.ep_context % pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY]; } /* Add path group into cache */ strcpy(elid.key, eplid); elid.data = (void *)pathgrp; hsearch_r(elid, ENTER, &epath, &proto->ips_path_grp_hash); *ppathgrp = pathgrp; fail: if (err != PSM2_OK) _HFI_PRDBG ("Unable to get path record for LID 0x%x <---> DLID 0x%x.\n", slid, dlid); return err; } static psm2_error_t ips_opp_fini(struct ips_proto *proto) { psm2_error_t err = PSM2_OK; if (proto->opp_lib) dlclose(proto->opp_lib); return err; } psm2_error_t ips_opp_init(struct ips_proto *proto) { psm2_error_t err = PSM2_OK; char hfiName[32]; proto->opp_lib = dlopen(DF_OPP_LIBRARY, RTLD_NOW); if (!proto->opp_lib) { char *err = dlerror(); _HFI_ERROR ("Unable to open OFED Plus Plus library %s. Error: %s\n", DF_OPP_LIBRARY, err ? err : "no dlerror()"); goto fail; } /* Resolve symbols that we require within opp library */ proto->opp_fn.op_path_find_hca = dlsym(proto->opp_lib, "op_path_find_hfi"); proto->opp_fn.op_path_open = dlsym(proto->opp_lib, "op_path_open"); proto->opp_fn.op_path_close = dlsym(proto->opp_lib, "op_path_close"); proto->opp_fn.op_path_get_path_by_rec = dlsym(proto->opp_lib, "op_path_get_path_by_rec"); /* If we can't resovle any symbol then fail to load opp module */ if (!proto->opp_fn.op_path_find_hca || !proto->opp_fn.op_path_open || !proto->opp_fn.op_path_close || !proto->opp_fn.op_path_get_path_by_rec) { _HFI_ERROR ("Unable to resolve symbols in OPP library. Unloading.\n"); goto fail; } /* If PSM2_IDENTIFY is set display the OPP library location being used. */ if (getenv("PSM2_IDENTIFY")) { Dl_info info_opp; printf ("PSM2 path record queries using OFED Plus Plus (%s) from %s\n", DF_OPP_LIBRARY, dladdr(proto->opp_fn.op_path_open, &info_opp) ? info_opp. dli_fname : "Unknown/unsupported version of OPP library found!"); } /* Obtain handle to hfi (requires verbs on node) */ snprintf(hfiName, sizeof(hfiName), "%s_%d", psmi_hal_get_hfi_name(), psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt)); proto->hndl = proto->opp_fn.op_path_find_hca(hfiName, &proto->device); if (!proto->hndl) { _HFI_ERROR ("OPP: Unable to find HFI %s. Disabling OPP interface for path record queries.\n", hfiName); goto fail; } /* Get OPP context */ proto->opp_ctxt = proto->opp_fn.op_path_open(proto->device, 1); if (!proto->opp_ctxt) { _HFI_ERROR ("OPP: Unable to obtain OPP context. Disabling OPP interface for path record queries.\n"); goto fail; } /* Setup default errorcheck timeout. OPP may change it later. 
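 * The per-path override computed in ips_opp_get_path_rec() above follows
 * the IBTA convention PacketLifeTime = 4.096 usec * 2^pkt_life; as a
 * worked example (the pkt_life value is illustrative), pkt_life = 16
 * gives
 *
 *     4096UL * (1UL << 16) / 1000000UL  ==  268 ms
 *
 * which is added to IPS_PROTO_ERRCHK_MS_MIN_DEFAULT and, if larger than
 * the current maximum, replaces ep_timeout_ack_max.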
*/ proto->epinfo.ep_timeout_ack = ms_2_cycles(IPS_PROTO_ERRCHK_MS_MIN_DEFAULT); proto->epinfo.ep_timeout_ack_max = ms_2_cycles(IPS_PROTO_ERRCHK_MS_MIN_DEFAULT); proto->epinfo.ep_timeout_ack_factor = IPS_PROTO_ERRCHK_FACTOR_DEFAULT; /* OPP initialized successfully */ proto->ibta.get_path_rec = ips_opp_path_rec; proto->ibta.fini = ips_opp_fini; proto->flags |= IPS_PROTO_FLAG_QUERY_PATH_REC; return err; fail: _HFI_ERROR("Make sure SM is running...\n"); _HFI_ERROR("Make sure service ibacm is running...\n"); _HFI_ERROR("to start ibacm: service ibacm start\n"); _HFI_ERROR("or enable it at boot time: opaconfig -E ibacm\n\n"); err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION, "Unable to initialize OFED Plus library successfully.\n"); if (proto->opp_lib) dlclose(proto->opp_lib); return err; } opa-psm2-PSM2_11.2.185/ptl_ips/ips_path_rec.c000066400000000000000000000617261370564314600205130ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include #include #include #include "psm_user.h" #include "psm2_hal.h" #include "ips_proto.h" /* * These are the default values used in parsing the environment * variable PSM2_PATH_NO_LMC_RANGE, which can be used to exclude * a range of message sizes from the LMC LID assignments used to * implement dispersive routing. * * This value is 2^32 - 1. 
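 * As a usage sketch (the sizes are illustrative), pinning every message
 * between 64 KiB and 1 MiB to the base lids, i.e. excluding that range
 * from LMC dispersion, would look like:
 *
 *     PSM2_PATH_NO_LMC_RANGE=65536:1048576
 *
 * while the default 4294967295:4294967295 leaves dispersive routing
 * enabled for every message size.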
*/ #define DEF_LIMITS_STRING "4294967295:4294967295" #define DEF_LIMITS_VALUE 4294967295 static void ips_gen_ipd_table(struct ips_proto *proto) { uint8_t delay = 0, step = 1; /* Based on our current link rate setup the IPD table */ memset(proto->ips_ipd_delay, 0xFF, sizeof(proto->ips_ipd_delay)); /* * Based on the starting rate of the link, we let the code to * fall through to next rate without 'break' in the code. The * decrement is doubled at each rate level... */ switch (proto->epinfo.ep_link_rate) { case IBV_RATE_300_GBPS: proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay; delay += step; step *= 2; case IBV_RATE_200_GBPS: proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay; delay += step; step *= 2; case IBV_RATE_168_GBPS: proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay; delay += step; step *= 2; case IBV_RATE_120_GBPS: proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay; case IBV_RATE_112_GBPS: proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay; case IBV_RATE_100_GBPS: proto->ips_ipd_delay[IBV_RATE_100_GBPS] = delay; delay += step; step *= 2; case IBV_RATE_80_GBPS: proto->ips_ipd_delay[IBV_RATE_80_GBPS] = delay; case IBV_RATE_60_GBPS: proto->ips_ipd_delay[IBV_RATE_60_GBPS] = delay; delay += step; step *= 2; case IBV_RATE_40_GBPS: proto->ips_ipd_delay[IBV_RATE_40_GBPS] = delay; case IBV_RATE_30_GBPS: proto->ips_ipd_delay[IBV_RATE_30_GBPS] = delay; delay += step; step *= 2; case IBV_RATE_25_GBPS: proto->ips_ipd_delay[IBV_RATE_25_GBPS] = delay; case IBV_RATE_20_GBPS: proto->ips_ipd_delay[IBV_RATE_20_GBPS] = delay; delay += step; step *= 2; case IBV_RATE_10_GBPS: proto->ips_ipd_delay[IBV_RATE_10_GBPS] = delay; case IBV_RATE_5_GBPS: proto->ips_ipd_delay[IBV_RATE_5_GBPS] = delay; default: break; } } static psm2_error_t ips_gen_cct_table(struct ips_proto *proto) { psm2_error_t err = PSM2_OK; uint32_t cca_divisor, ipdidx, ipdval = 1; uint16_t *cct_table; /* The CCT table is static currently. If it's already created then return */ if (proto->cct) goto fail; /* Allocate the CCT table */ cct_table = psmi_calloc(proto->ep, UNDEFINED, proto->ccti_size, sizeof(uint16_t)); if (!cct_table) { err = PSM2_NO_MEMORY; goto fail; } if (proto->ccti_size) { /* The first table entry is always 0 i.e. no IPD delay */ cct_table[0] = 0; } /* Generate the remaining CCT table entries */ for (ipdidx = 1; ipdidx < proto->ccti_size; ipdidx += 4, ipdval++) for (cca_divisor = 0; cca_divisor < 4; cca_divisor++) { if ((ipdidx + cca_divisor) == proto->ccti_size) break; cct_table[ipdidx + cca_divisor] = (((cca_divisor ^ 0x3) << CCA_DIVISOR_SHIFT) | (ipdval & 0x3FFF)); _HFI_CCADBG("CCT[%d] = %x. Divisor: %x, IPD: %x\n", ipdidx + cca_divisor, cct_table[ipdidx + cca_divisor], (cct_table[ipdidx + cca_divisor] >> CCA_DIVISOR_SHIFT), cct_table[ipdidx + cca_divisor] & CCA_IPD_MASK); } /* On link up/down CCT is re-generated. 
If CCT table is previously created * free it */ if (proto->cct) { psmi_free(proto->cct); proto->cct = NULL; } /* Update to the new CCT table */ proto->cct = cct_table; fail: return err; } static opa_rate ips_default_hfi_rate(uint16_t hfi_type) { opa_rate rate; switch (hfi_type) { case PSMI_HFI_TYPE_OPA1: rate = IBV_RATE_100_GBPS; break; case PSMI_HFI_TYPE_OPA2: rate = IBV_RATE_120_GBPS; break; default: rate = IBV_RATE_MAX; } return rate; } static opa_rate ips_rate_to_enum(int link_rate) { opa_rate rate; switch (link_rate) { case 300: rate = IBV_RATE_300_GBPS; break; case 200: rate = IBV_RATE_200_GBPS; break; case 100: rate = IBV_RATE_100_GBPS; break; case 25: rate = IBV_RATE_25_GBPS; break; case 168: rate = IBV_RATE_168_GBPS; break; case 112: rate = IBV_RATE_112_GBPS; break; case 56: rate = IBV_RATE_56_GBPS; break; case 14: rate = IBV_RATE_14_GBPS; break; case 120: rate = IBV_RATE_120_GBPS; break; case 80: rate = IBV_RATE_80_GBPS; break; case 60: rate = IBV_RATE_60_GBPS; break; case 40: rate = IBV_RATE_40_GBPS; break; case 30: rate = IBV_RATE_30_GBPS; break; case 20: rate = IBV_RATE_20_GBPS; break; case 10: rate = IBV_RATE_10_GBPS; break; case 5: rate = IBV_RATE_5_GBPS; break; default: rate = IBV_RATE_MAX; } return rate; } static psm2_error_t ips_none_get_path_rec(struct ips_proto *proto, uint16_t slid, uint16_t dlid, uint16_t desthfi_type, unsigned long timeout, ips_path_rec_t **ppath_rec) { psm2_error_t err = PSM2_OK; ips_path_rec_t *path_rec; ENTRY elid, *epath = NULL; char eplid[128]; /* Query the path record cache */ snprintf(eplid, sizeof(eplid), "%x_%x", slid, dlid); elid.key = eplid; hsearch_r(elid, FIND, &epath, &proto->ips_path_rec_hash); if (!epath) { elid.key = psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1); path_rec = (ips_path_rec_t *) psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(ips_path_rec_t)); if (!elid.key || !path_rec) { if (elid.key) psmi_free(elid.key); if (path_rec) psmi_free(path_rec); return PSM2_NO_MEMORY; } /* Create path record */ path_rec->pr_slid = slid; path_rec->pr_dlid = dlid; path_rec->pr_mtu = proto->epinfo.ep_mtu; path_rec->pr_pkey = proto->epinfo.ep_pkey; path_rec->pr_sl = proto->epinfo.ep_sl; /* Determine the IPD based on our local link rate and default link rate for * remote hfi type. */ path_rec->pr_static_ipd = proto->ips_ipd_delay[ips_default_hfi_rate(desthfi_type)]; _HFI_CCADBG("pr_static_ipd = %d\n", (int) path_rec->pr_static_ipd); /* Setup CCA parameters for path */ if (path_rec->pr_sl > PSMI_SL_MAX) { psmi_free(elid.key); psmi_free(path_rec); return PSM2_INTERNAL_ERR; } if (!(proto->ccti_ctrlmap & (1 << path_rec->pr_sl))) { _HFI_CCADBG("No CCA for sl %d, disable CCA\n", path_rec->pr_sl); proto->flags &= ~IPS_PROTO_FLAG_CCA; proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN; } if (!psmi_hal_has_cap(PSM_HAL_CAP_STATIC_RATE_CTRL)) { _HFI_CCADBG("No Static-Rate-Control, disable CCA\n"); proto->flags &= ~IPS_PROTO_FLAG_CCA; proto->flags &= ~IPS_PROTO_FLAG_CCA_PRESCAN; } path_rec->proto = proto; path_rec->pr_ccti = proto->cace[path_rec->pr_sl].ccti_min; path_rec->pr_timer_cca = NULL; /* Determine active IPD for path. 
Is max of static rate and CCT table */ if (!(proto->flags & IPS_PROTO_FLAG_CCA)) { _HFI_CCADBG("No IPS_PROTO_FLAG_CCA\n"); path_rec->pr_active_ipd = 0; path_rec->pr_cca_divisor = 0; _HFI_CCADBG("pr_active_ipd = %d\n", (int) path_rec->pr_active_ipd); _HFI_CCADBG("pr_cca_divisor = %d\n", (int) path_rec->pr_cca_divisor); } else if ((path_rec->pr_static_ipd) && ((path_rec->pr_static_ipd + 1) > (proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK))) { _HFI_CCADBG("IPS_PROTO_FLAG_CCA set, Setting pr_active_ipd.\n"); path_rec->pr_active_ipd = path_rec->pr_static_ipd + 1; path_rec->pr_cca_divisor = 0; _HFI_CCADBG("pr_active_ipd = %d\n", (int) path_rec->pr_active_ipd); _HFI_CCADBG("pr_cca_divisor = %d\n", (int) path_rec->pr_cca_divisor); } else { /* Pick it from the CCT table */ _HFI_CCADBG("Picking up active IPD from CCT table, index %d, value 0x%x\n", (int) path_rec->pr_ccti, (int) proto->cct[path_rec->pr_ccti]); path_rec->pr_active_ipd = proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK; path_rec->pr_cca_divisor = proto->cct[path_rec->pr_ccti] >> CCA_DIVISOR_SHIFT; _HFI_CCADBG("pr_active_ipd = %d\n", (int) path_rec->pr_active_ipd); _HFI_CCADBG("pr_cca_divisor = %d\n", (int) path_rec->pr_cca_divisor); } /* Add path record into cache */ strcpy(elid.key, eplid); elid.data = (void *)path_rec; hsearch_r(elid, ENTER, &epath, &proto->ips_path_rec_hash); } else path_rec = (ips_path_rec_t *) epath->data; /* Return IPS path record */ *ppath_rec = path_rec; return err; } static psm2_error_t ips_none_path_rec(struct ips_proto *proto, uint16_t slid, uint16_t dlid, uint16_t desthfi_type, unsigned long timeout, ips_path_grp_t **ppathgrp) { psm2_error_t err = PSM2_OK; uint16_t pidx, num_path = (1 << proto->epinfo.ep_lmc); uint16_t path_slid, path_dlid; ips_path_rec_t *path; ips_path_grp_t *pathgrp; ENTRY elid, *epath = NULL; char eplid[128]; /* For the "none" path record resolution all paths are assumed to be * of equal priority however since we want to isolate all control * traffic (acks, naks) to a separate path for non zero LMC subnets * the "first path" between a pair of endpoints is always the "higher" * priority paths. The rest of the paths are the normal (and low * priority) paths. */ /* Query the path record cache */ snprintf(eplid, sizeof(eplid), "%x_%x", slid, dlid); elid.key = eplid; hsearch_r(elid, FIND, &epath, &proto->ips_path_grp_hash); if (epath) { /* Find path group in cache */ *ppathgrp = (ips_path_grp_t *) epath->data; return err; } /* If base lids are only used then reset num_path to 1 */ if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE) num_path = 1; /* Allocate a new pathgroup */ elid.key = psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1); pathgrp = (ips_path_grp_t *) psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(ips_path_grp_t) + num_path * IPS_PATH_MAX_PRIORITY * sizeof(ips_path_rec_t *)); if (!elid.key || !pathgrp) { if (elid.key) psmi_free(elid.key); if (pathgrp) psmi_free(pathgrp); err = PSM2_NO_MEMORY; goto fail; } /* * dlid is the peer base lid. * slid is the base lid for the local end point. * Store in network byte order. */ pathgrp->pg_base_dlid = dlid; pathgrp->pg_base_slid = slid; if (num_path > 1) { /* One control path and (num_path - 1) norm and low priority paths */ pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] = 1; pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] = num_path - 1; pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] = num_path - 1; } else { /* LMC of 0. 
Use the same path for all priorities */ pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] = 1; pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] = 1; pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] = 1; } /* For "none" path record we just setup 2^lmc paths. To get better load * balance */ for (pidx = 0; pidx < num_path; pidx++) { path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx); path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx); err = ips_none_get_path_rec(proto, path_slid, path_dlid, desthfi_type, timeout, &path); if (err != PSM2_OK) { psmi_free(elid.key); psmi_free(pathgrp); goto fail; } if (num_path > 1) { if (pidx == 0) { /* First path is always the high priority path */ pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY] = path; } else { pathgrp->pg_path[pidx - 1][IPS_PATH_NORMAL_PRIORITY] = path; pathgrp->pg_path[pidx - 1][IPS_PATH_LOW_PRIORITY] = path; } } else { pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY] = path; pathgrp->pg_path[0][IPS_PATH_NORMAL_PRIORITY] = path; pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY] = path; } PSM2_LOG_MSG("path %p slid %hu dlid %hu \n", path, __be16_to_cpu(path->pr_slid), __be16_to_cpu(path->pr_dlid)); } if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) { pathgrp->pg_next_path[IPS_PATH_NORMAL_PRIORITY] = proto->epinfo.ep_context % pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY]; pathgrp->pg_next_path[IPS_PATH_LOW_PRIORITY] = proto->epinfo.ep_context % pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY]; } /* Add path record into cache */ strcpy(elid.key, eplid); elid.data = (void *)pathgrp; hsearch_r(elid, ENTER, &epath, &proto->ips_path_grp_hash); *ppathgrp = pathgrp; fail: if (err != PSM2_OK) _HFI_PRDBG ("Unable to get path record for LID %x <---> DLID %x.\n", slid, dlid); return err; } static psm2_error_t ips_none_path_rec_init(struct ips_proto *proto) { psm2_error_t err = PSM2_OK; /* Obtain the SL and PKEY to use from the environment (HFI_SL & PSM_KEY) */ proto->epinfo.ep_sl = proto->ep->out_sl; proto->epinfo.ep_pkey = (uint16_t) proto->ep->network_pkey; /* * Parse the err_chk settings from the environment. 
* <min>:<max>:<factor> */ { union psmi_envvar_val env_to; char *errchk_to = PSM_TID_TIMEOUT_DEFAULT; int tvals[3] = { IPS_PROTO_ERRCHK_MS_MIN_DEFAULT, IPS_PROTO_ERRCHK_MS_MAX_DEFAULT, IPS_PROTO_ERRCHK_FACTOR_DEFAULT }; if (!psmi_getenv("PSM2_ERRCHK_TIMEOUT", "Errchk timeouts in mS ", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)errchk_to, &env_to)) { /* Not using default values, parse what we can */ errchk_to = env_to.e_str; psmi_parse_str_tuples(errchk_to, 3, tvals); /* Adjust for max smaller than min, things would break */ if (tvals[1] < tvals[0]) tvals[1] = tvals[0]; } proto->epinfo.ep_timeout_ack = ms_2_cycles(tvals[0]); proto->epinfo.ep_timeout_ack_max = ms_2_cycles(tvals[1]); proto->epinfo.ep_timeout_ack_factor = tvals[2]; } proto->ibta.get_path_rec = ips_none_path_rec; proto->ibta.fini = NULL; /* With no path record queries, set the pkey manually */ if (psmi_hal_set_pkey(proto->ep->context.psm_hw_ctxt, (uint16_t) proto->ep->network_pkey) != 0) { err = psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE, "Couldn't set device pkey 0x%x: %s", (int)proto->ep->network_pkey, strerror(errno)); } return err; } /* (Re)load the SL2SC table */ psm2_error_t ips_ibta_init_sl2sc_table(struct ips_proto *proto) { int ret, i; /* Get SL2SC table for unit, port */ for (i = 0; i < PSMI_N_SCS; i++) { if ((ret = psmi_hal_get_port_sl2sc(psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt), psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt), (uint8_t) i)) < 0) { /* Unable to get SL2SC. Set it to default */ ret = PSMI_SC_DEFAULT; } proto->sl2sc[i] = (uint16_t) ret; } psmi_hal_get_sc2vl_map(proto); return PSM2_OK; } /* On link up/down we need to update some state */ psm2_error_t ips_ibta_link_updown_event(struct ips_proto *proto) { psm2_error_t err = PSM2_OK; int ret; /* Get base lid, lmc and rate as these may have changed if the link bounced */ proto->epinfo.ep_base_lid = __cpu_to_be16((uint16_t) psm2_epid_nid(proto->ep->context.epid)); if ((ret = psmi_hal_get_port_lmc(psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt), psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt))) < 0) { err = psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE, "Could not obtain LMC for unit %u:%u. Error: %s", psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt), psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt), strerror(errno)); goto fail; } proto->epinfo.ep_lmc = min(ret, IPS_MAX_PATH_LMC); if ((ret = psmi_hal_get_port_rate(psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt), psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt))) < 0) { err = psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE, "Could not obtain link rate for unit %u:%u. Error: %s", psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt), psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt), strerror(errno)); goto fail; } proto->epinfo.ep_link_rate = ips_rate_to_enum(ret); /* Load the SL2SC2VL table */ ips_ibta_init_sl2sc_table(proto); /* Regenerate new IPD table for the updated link rate. */ ips_gen_ipd_table(proto); /* Generate the CCT table. */ err = ips_gen_cct_table(proto); fail: return err; } psm2_error_t MOCKABLE(ips_ibta_init)(struct ips_proto *proto) { psm2_error_t err = PSM2_OK; union psmi_envvar_val psm_path_policy; union psmi_envvar_val disable_cca; union psmi_envvar_val cca_prescan; union psmi_envvar_val path_disable_lmc_interval; /* Get the path selection policy */ psmi_getenv("PSM2_PATH_SELECTION", "Policy to use if multiple paths are available between endpoints. Options are adaptive, static_src, static_dest, static_base. 
Default is adaptive.", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)"adaptive", &psm_path_policy); if (!strcasecmp((const char *)psm_path_policy.e_str, "adaptive")) proto->flags |= IPS_PROTO_FLAG_PPOLICY_ADAPTIVE; else if (!strcasecmp((const char *)psm_path_policy.e_str, "static_src")) proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_SRC; else if (!strcasecmp ((const char *)psm_path_policy.e_str, "static_dest")) proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_DST; else if (!strcasecmp ((const char *)psm_path_policy.e_str, "static_base")) proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_BASE; if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) _HFI_PRDBG("Using adaptive path selection.\n"); if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC) _HFI_PRDBG("Static path selection: Src Context\n"); if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST) _HFI_PRDBG("Static path selection: Dest Context\n"); if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE) _HFI_PRDBG("Static path selection: Base LID\n"); psmi_getenv("PSM2_DISABLE_CCA", "Disable use of Congestion Control Architecture (CCA) [enabled] ", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)0, &disable_cca); if (disable_cca.e_uint) _HFI_CCADBG("CCA is disabled for congestion control.\n"); else { int i; char ccabuf[256]; uint8_t *p; /* Start out by turning on both styles of congestion control. * Later, we will eliminate the correct one. */ proto->flags |= IPS_PROTO_FLAG_CCA | IPS_PROTO_FLAG_CC_REPL_BECN; /* * If user set any environment variable, use self CCA. */ if (getenv("PSM2_CCTI_INCREMENT") || getenv("PSM2_CCTI_TIMER") || getenv("PSM2_CCTI_TABLE_SIZE")) { goto disablecca; } psmi_getenv("PSM2_CCA_PRESCAN", "Enable Congestion Control Prescanning (disabled by default) ", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)0, &cca_prescan); if (cca_prescan.e_uint) proto->flags |= IPS_PROTO_FLAG_CCA_PRESCAN; /* * Check qib driver CCA setting, and try to use it if available. * Fall to self CCA setting if errors. */ i = psmi_hal_get_cc_settings_bin( psmi_hal_get_unit_id(proto->ep->context.psm_hw_ctxt), psmi_hal_get_port_num(proto->ep->context.psm_hw_ctxt), ccabuf, sizeof(ccabuf)); if (i <= 0) { goto disablecca; } p = (uint8_t *) ccabuf; memcpy(&proto->ccti_ctrlmap, p, 4); p += 4; memcpy(&proto->ccti_portctrl, p, 2); p += 2; for (i = 0; i < 32; i++) { proto->cace[i].ccti_increase = *p; p++; /* skip reserved u8 */ p++; memcpy(&proto->cace[i].ccti_timer_cycles, p, 2); p += 2; proto->cace[i].ccti_timer_cycles = us_2_cycles(proto->cace[i].ccti_timer_cycles); proto->cace[i].ccti_threshold = *p; p++; proto->cace[i].ccti_min = *p; p++; } i = psmi_hal_get_cc_table_bin(psmi_hal_get_unit_id(proto->ep->context. psm_hw_ctxt), psmi_hal_get_port_num(proto->ep->context. psm_hw_ctxt), &proto->cct); if (i < 0) { err = PSM2_NO_MEMORY; goto fail; } else if (i == 0) { goto disablecca; } proto->ccti_limit = i; proto->ccti_size = proto->ccti_limit + 1; _HFI_CCADBG("ccti_limit = %d\n", (int) proto->ccti_limit); for (i = 0; i < proto->ccti_limit; i++) _HFI_CCADBG("cct[%d] = 0x%04x\n", i, (int) proto->cct[i]); /* Note, here, we are leaving CC style(s): (IPS_PROTO_FLAG_CCA | IPS_PROTO_FLAG_CCA_PRESCAN) */ proto->flags &= ~IPS_PROTO_FLAG_CC_REPL_BECN; goto finishcca; /* * Disable CCA. 
*/ disablecca:
/* Note, here, we are leaving CC style: IPS_PROTO_FLAG_CC_REPL_BECN */
proto->flags &= ~(IPS_PROTO_FLAG_CCA | IPS_PROTO_FLAG_CCA_PRESCAN); }
finishcca:
/* Initialize path record/group hash table */
{ uint32_t lmc_disable_low, lmc_disable_high; int sscanf_ret;
/* The default disable_low and disable_high values * are 2^32 - 1, the maximum allowable message size. * So by default all messages should be smaller than the * lower limit, and so will not have LMC dispersive * routing disabled. * * In addition, these limits are applied only to SDMA * and PIO messages, NOT TID messages, so the default is * bigger than any PIO size. */
psmi_getenv("PSM2_PATH_NO_LMC_RANGE", "Disable LMC route dispersion within this range, " "low_value:high_value\n", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)DEF_LIMITS_STRING, &path_disable_lmc_interval);
sscanf_ret = sscanf(path_disable_lmc_interval.e_str, "%u:%u", &lmc_disable_low, &lmc_disable_high);
/* * It's "invalid" for the low end of the range to be * larger than the high end of the range, so revert * to the "maximum message size" (2^32 - 1). */
if ((sscanf_ret != 2) || (lmc_disable_low > lmc_disable_high)) { lmc_disable_low = lmc_disable_high = DEF_LIMITS_VALUE; }
PSM2_LOG_MSG("PSM2_PATH_NO_LMC_RANGE: " "lmc_disable_low %u lmc_disable_high %u\n", lmc_disable_low, lmc_disable_high);
/* * These specify the range of message sizes, in bytes, for * which LMC dynamic LID assignment is disabled. */
proto->ips_lmc_disable_low = lmc_disable_low;
proto->ips_lmc_disable_high = lmc_disable_high; }
hcreate_r(DF_PATH_REC_HASH_SIZE, &proto->ips_path_rec_hash);
hcreate_r(DF_PATH_GRP_HASH_SIZE, &proto->ips_path_grp_hash);
/* On startup treat it as a link up/down event to set up state. */
if ((err = ips_ibta_link_updown_event(proto)) != PSM2_OK) goto fail;
/* Setup the appropriate query interface for the endpoint */
switch (proto->ep->path_res_type) {
case PSM2_PATH_RES_OPP: err = ips_opp_init(proto); if (err != PSM2_OK) _HFI_ERROR ("Unable to use OFED Plus Plus for path record queries.\n"); break;
case PSM2_PATH_RES_UMAD: _HFI_ERROR ("Path record queries using UMAD are not supported in PSM version %d.%dx\n", PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR); err = PSM2_EPID_PATH_RESOLUTION; break;
case PSM2_PATH_RES_NONE: default: err = ips_none_path_rec_init(proto); }
fail: return err; }
MOCK_DEF_EPILOGUE(ips_ibta_init);
psm2_error_t ips_ibta_fini(struct ips_proto *proto) { psm2_error_t err = PSM2_OK;
if (proto->ibta.fini) err = proto->ibta.fini(proto);
/* Destroy the path record/group hash */
hdestroy_r(&proto->ips_path_rec_hash);
hdestroy_r(&proto->ips_path_grp_hash);
return err; }
opa-psm2-PSM2_11.2.185/ptl_ips/ips_path_rec.h000066400000000000000000000131121370564314600205020ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/* Copyright (c) 2009-2014 Intel Corporation. All rights reserved. */
#ifndef _IPS_PATH_REC_H_
#define _IPS_PATH_REC_H_
#include <search.h> /* for hsearch_r-style hash tables used by the path record/group lookups */
/* Default size of path record hash table */
#define DF_PATH_REC_HASH_SIZE 2047
/* Default size of path group hash table */
#define DF_PATH_GRP_HASH_SIZE 255
/* Default size of CCT table. Must be multiple of 64 */
#define DF_CCT_TABLE_SIZE 128
/* CCT max IPD delay. */
#define DF_CCT_MAX_IPD_DELAY_US 21
/* CCA divisor shift */
#define CCA_DIVISOR_SHIFT 14
/* CCA ipd mask */
#define CCA_IPD_MASK 0x3FFF
/* A lot of these are IBTA specific defines that are available in other header * files. To minimize dependencies with the PSM build process they are listed * here. Most of this is used to implement IBTA compliance features with PSM * like path record query etc. */
enum opa_mtu { IBTA_MTU_256 = 1, IBTA_MTU_512 = 2, IBTA_MTU_1024 = 3, IBTA_MTU_2048 = 4, IBTA_MTU_4096 = 5, OPA_MTU_8192 = 6, OPA_MTU_10240 = 7, IBTA_MTU_MIN = IBTA_MTU_256, OPA_MTU_MIN = IBTA_MTU_256, OPA_MTU_MAX = OPA_MTU_10240, };
typedef enum { IBV_RATE_MAX = 0, IBV_RATE_2_5_GBPS = 2, IBV_RATE_5_GBPS = 5, IBV_RATE_10_GBPS = 3, IBV_RATE_20_GBPS = 6, IBV_RATE_30_GBPS = 4, IBV_RATE_40_GBPS = 7, IBV_RATE_60_GBPS = 8, IBV_RATE_80_GBPS = 9, IBV_RATE_120_GBPS = 10, IBV_RATE_14_GBPS = 11, IBV_RATE_56_GBPS = 12, IBV_RATE_112_GBPS = 13, IBV_RATE_168_GBPS = 14, IBV_RATE_25_GBPS = 15, IBV_RATE_100_GBPS = 16, IBV_RATE_200_GBPS = 17, IBV_RATE_300_GBPS = 18 } opa_rate;
static inline int opa_mtu_enum_to_int(enum opa_mtu mtu) { switch (mtu) { case IBTA_MTU_256: return 256; case IBTA_MTU_512: return 512; case IBTA_MTU_1024: return 1024; case IBTA_MTU_2048: return 2048; case IBTA_MTU_4096: return 4096; case OPA_MTU_8192: return 8192; case OPA_MTU_10240: return 10240; default: return -1; } }
/* This is the same as ib_path_rec from ib_types.h. Listed here to be self * contained to minimize dependencies during build etc. 
*/ typedef struct _ibta_path_rec { uint64_t service_id; /* net order */ uint8_t dgid[16]; uint8_t sgid[16]; uint16_t dlid; /* net order */ uint16_t slid; /* net order */ uint32_t hop_flow_raw; /* net order */ uint8_t tclass; uint8_t num_path; uint16_t pkey; /* net order */ uint16_t qos_class_sl; /* net order */ uint8_t mtu; /* IBTA encoded */ uint8_t rate; /* IBTA encoded */ uint8_t pkt_life; /* IBTA encoded */ uint8_t preference; uint8_t resv2[6]; } ibta_path_rec_t; /* * PSM IPS path record components for endpoint. * * For Torus/non-zero LMC fabrics, pr_slid and pr_dlid may be different from * the "base lid" values for this connection. */ struct ips_proto; typedef struct ips_path_rec { uint16_t pr_slid; uint16_t pr_dlid; uint16_t pr_mtu; /* < Path's MTU */ uint16_t pr_pkey; uint16_t pr_static_ipd; /* Static rate IPD from path record */ uint8_t pr_sl; /* IBTA CCA parameters per path */ uint8_t pr_cca_divisor; /* CCA divisor [14:15] in CCT entry */ uint16_t pr_active_ipd; /* The current active IPD. max(static,cct) */ uint16_t pr_ccti; /* CCA table index */ /* Congestion timer for epr_ccti increment. */ psmi_timer *pr_timer_cca; struct ips_proto *proto; /* for global info */ } ips_path_rec_t; psm2_error_t ips_opp_init(struct ips_proto *proto); #endif opa-psm2-PSM2_11.2.185/ptl_ips/ips_proto.c000066400000000000000000002265571370564314600200760ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/
/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
/* * IPS - Interconnect Protocol Stack. */
#include <sys/types.h>
#include <sys/uio.h> /* writev */
#include "psm_user.h"
#include "psm2_hal.h"
#include "ips_proto.h"
#include "ips_proto_internal.h"
#include "ips_proto_help.h"
#include "psmi_wrappers.h"
#include "psm_mq_internal.h"
#ifdef PSM_CUDA
#include "psm_gdrcpy.h"
#endif
/* * Control message types have their own flag to determine whether a message of * that type is queued or not. These flags are kept in a state bitfield. */
#define CTRL_MSG_ACK_QUEUED 0x0001
#define CTRL_MSG_NAK_QUEUED 0x0002
#define CTRL_MSG_BECN_QUEUED 0x0004
#define CTRL_MSG_ERR_CHK_QUEUED 0x0008
#define CTRL_MSG_ERR_CHK_GEN_QUEUED 0x0010
#define CTRL_MSG_CONNECT_REQUEST_QUEUED 0x0020
#define CTRL_MSG_CONNECT_REPLY_QUEUED 0x0040
#define CTRL_MSG_DISCONNECT_REQUEST_QUEUED 0x0080
#define CTRL_MSG_DISCONNECT_REPLY_QUEUED 0x0100
#ifdef PSM_CUDA
uint32_t gpudirect_send_threshold;
uint32_t gpudirect_recv_threshold;
#endif
static void ctrlq_init(struct ips_ctrlq *ctrlq, struct ips_proto *proto);
static psm2_error_t proto_sdma_init(struct ips_proto *proto, const psmi_context_t *context);
#ifdef PSM_CUDA
void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj) { struct ips_cuda_hostbuf *icb; struct ips_cuda_hostbuf_mpool_cb_context *ctxt = (struct ips_cuda_hostbuf_mpool_cb_context *) context;
icb = (struct ips_cuda_hostbuf *)obj;
if (is_alloc) { PSMI_CUDA_CALL(cuMemHostAlloc, (void **) &icb->host_buf, ctxt->bufsz, CU_MEMHOSTALLOC_PORTABLE); PSMI_CUDA_CALL(cuEventCreate, &icb->copy_status, CU_EVENT_DEFAULT); }
else { if (icb->host_buf) { PSMI_CUDA_CALL(cuMemFreeHost, icb->host_buf); PSMI_CUDA_CALL(cuEventDestroy, icb->copy_status); } }
return; }
#endif
static uint16_t ips_proto_compute_mtu_code(int mtu) {
static const struct MapMTUToMtuCode { int mtu; uint16_t mtu_code; } mtumap[] = { { 256, IBTA_MTU_256 }, { 512, IBTA_MTU_512 }, { 1024, IBTA_MTU_1024}, { 2048, IBTA_MTU_2048}, { 4096, IBTA_MTU_4096}, { 8192, OPA_MTU_8192 }, {10240, OPA_MTU_10240}, };
int i;
for (i=0;i < sizeof(mtumap)/sizeof(mtumap[0]);i++) if (mtu == mtumap[i].mtu) return mtumap[i].mtu_code;
return 0; }
psm2_error_t ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, int num_of_send_bufs, int num_of_send_desc, uint32_t imm_size, const struct psmi_timer_ctrl *timerq, const struct ips_epstate *epstate, void *spioc, struct ips_proto *proto) {
uint32_t protoexp_flags, cksum_sz;
union psmi_envvar_val env_tid, env_cksum, env_mtu;
psm2_error_t err = PSM2_OK;
/* * Checksum packets within PSM. Default is off. * This is heavy weight and done in software so not recommended for * production runs. 
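 * When enabled, a PSM_CRC_SIZE_IN_BYTES checksum is carried at the tail of
 * every packet, so the effective MTU and PIO sizes computed below are
 * reduced by that amount (see cksum_sz).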
*/
psmi_getenv("PSM2_CHECKSUM", "Enable checksum of messages (0 disables checksum)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)0, &env_cksum);
memset(proto, 0, sizeof(struct ips_proto));
proto->ptl = (ptl_t *) ptl;
proto->ep = context->ep; /* cached */
proto->mq = context->ep->mq; /* cached */
proto->pend_sends.proto = proto;
psmi_timer_entry_init(&proto->pend_sends.timer, ips_proto_timer_pendq_callback, &proto->pend_sends);
STAILQ_INIT(&proto->pend_sends.pendq);
proto->epstate = (struct ips_epstate *)epstate;
proto->timerq = (struct psmi_timer_ctrl *)timerq;
proto->spioc = spioc;
proto->epinfo.ep_baseqp = psmi_hal_get_bthqp(context->psm_hw_ctxt);
proto->epinfo.ep_context = psmi_hal_get_context(context->psm_hw_ctxt); /* "real" context */
proto->epinfo.ep_subcontext = psmi_hal_get_subctxt(context->psm_hw_ctxt);
proto->epinfo.ep_hfi_type = psmi_hal_get_hfi_type(context->psm_hw_ctxt);
proto->epinfo.ep_jkey = psmi_hal_get_jkey(context->psm_hw_ctxt);
/* If checksums are enabled we insert a checksum at the end of the packet */
cksum_sz = env_cksum.e_uint ? PSM_CRC_SIZE_IN_BYTES : 0;
proto->epinfo.ep_mtu = context->ep->mtu;
/* Subtract the checksum size from the MTU */
proto->epinfo.ep_mtu -= cksum_sz;
/* See if the user specifies a lower MTU to use */
if (!psmi_getenv ("PSM2_MTU", "MTU specified by user: [1-7,256-8192,10240]", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)-1, &env_mtu)) {
if (env_mtu.e_int != 256 && env_mtu.e_int != 512 && env_mtu.e_int != 1024 && env_mtu.e_int != 2048 && env_mtu.e_int != 4096 && env_mtu.e_int != 8192 && env_mtu.e_int != 10240) {
if (env_mtu.e_int < OPA_MTU_MIN || env_mtu.e_int > OPA_MTU_MAX) env_mtu.e_int = OPA_MTU_8192;
env_mtu.e_int = opa_mtu_enum_to_int((enum opa_mtu)env_mtu.e_int); }
if (proto->epinfo.ep_mtu > env_mtu.e_int) proto->epinfo.ep_mtu = env_mtu.e_int; }
proto->epinfo.ep_mtu_code = ips_proto_compute_mtu_code(proto->epinfo.ep_mtu);
/* * The PIO size should not include the ICRC because it is * stripped by HW before delivery to the receive buffer. * We use a minimum of 2 PIO buffers so that PSM has * turn-around time to do the PIO transfer. Each credit is a * block of 64 bytes. Also the PIO buffer size must not be * bigger than the MTU. */
proto->epinfo.ep_piosize = psmi_hal_get_pio_size(context->psm_hw_ctxt) - cksum_sz;
proto->epinfo.ep_piosize = min(proto->epinfo.ep_piosize, proto->epinfo.ep_mtu);
/* Keep PIO as multiple of cache line size */
if (proto->epinfo.ep_piosize > PSM_CACHE_LINE_BYTES) proto->epinfo.ep_piosize &= ~(PSM_CACHE_LINE_BYTES - 1);
/* Save back to hfi level. 
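 * The checksum-adjusted MTU and PIO sizes are pushed back through the HAL
 * so that lower layers size their transfers consistently with this proto.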
*/ psmi_hal_set_effective_mtu(proto->epinfo.ep_mtu, proto->ep->context.psm_hw_ctxt); psmi_hal_set_pio_size(proto->epinfo.ep_piosize, proto->ep->context.psm_hw_ctxt); /* sdma queue size */ proto->sdma_queue_size = psmi_hal_get_sdma_ring_size(context->psm_hw_ctxt); /* don't use the last slot */ if (proto->sdma_queue_size > 8) { /* configure sdma_avail_counter */ union psmi_envvar_val env_sdma_avail; int tmp_queue_size = 8; psmi_getenv("PSM2_MAX_PENDING_SDMA_REQS", "PSM maximum pending SDMA requests", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val) tmp_queue_size, &env_sdma_avail); if ((env_sdma_avail.e_int < 8) || (env_sdma_avail.e_int > (proto->sdma_queue_size - 1))) proto->sdma_avail_counter = 8; else proto->sdma_avail_counter = env_sdma_avail.e_int; } else { err = PSM2_PARAM_ERR; goto fail; } proto->sdma_fill_index = 0; proto->sdma_done_index = 0; proto->sdma_scb_queue = (struct ips_scb **) psmi_calloc(proto->ep, UNDEFINED, proto->sdma_queue_size, sizeof(struct ips_scb *)); if (proto->sdma_scb_queue == NULL) { err = PSM2_NO_MEMORY; goto fail; } proto->timeout_send = us_2_cycles(IPS_PROTO_SPIO_RETRY_US_DEFAULT); proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = ~0U; proto->t_init = get_cycles(); proto->t_fini = 0; proto->flags = env_cksum.e_uint ? IPS_PROTO_FLAG_CKSUM : 0; proto->runid_key = getpid(); proto->num_connected_outgoing = 0; proto->num_connected_incoming = 0; proto->num_disconnect_requests = 0; proto->stray_warn_interval = (uint64_t) -1; proto->done_warning = 0; proto->done_once = 0; proto->num_bogus_warnings = 0; proto->psmi_logevent_tid_send_reqs.interval_secs = 15; proto->psmi_logevent_tid_send_reqs.next_warning = 0; proto->psmi_logevent_tid_send_reqs.count = 0; /* Initialize IBTA related stuff (path record, SL2VL, CCA etc.) */ if ((err = ips_ibta_init(proto))) goto fail; { /* User asks for HFI loopback? */ union psmi_envvar_val env_loopback; psmi_getenv("PSM2_HFI_LOOPBACK", "PSM uses HFI loopback (default is disabled i.e. 0)", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)0, /* Disabled by default */ &env_loopback); if (env_loopback.e_uint) proto->flags |= IPS_PROTO_FLAG_LOOPBACK; } /* Update JKey if necessary */ if (getenv("PSM2_SELINUX")) proto->epinfo.ep_jkey = psmi_hal_get_jkey(context->psm_hw_ctxt); { /* Disable coalesced ACKs? */ union psmi_envvar_val env_coalesce_acks; psmi_getenv("PSM2_COALESCE_ACKS", "Coalesce ACKs on the wire (default is enabled i.e. 1)", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)1, /* Enabled by default */ &env_coalesce_acks); if (env_coalesce_acks.e_uint) proto->flags |= IPS_PROTO_FLAG_COALESCE_ACKS; } { /* Number of credits per flow */ union psmi_envvar_val env_flow_credits; int df_flow_credits = min(PSM2_FLOW_CREDITS, num_of_send_desc); psmi_getenv("PSM2_FLOW_CREDITS", "Number of unacked packets (credits) per flow (default is 64)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)df_flow_credits, &env_flow_credits); proto->flow_credits = env_flow_credits.e_uint; } /* * Pre-calculate the PSN mask to support 24 or 31 bits PSN. */ if (psmi_hal_has_cap(PSM_HAL_CAP_EXTENDED_PSN)) { proto->psn_mask = 0x7FFFFFFF; } else { proto->psn_mask = 0xFFFFFF; } /* * Initialize SDMA, otherwise, turn on all PIO. 
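 * proto_sdma_init() below parses PSM2_SDMA and chooses the pio-to-sdma
 * eager switchover thresholds; without SDMA support we force
 * IPS_PROTO_FLAG_SPIO and leave the iovec thresholds at ~0U so send dma
 * is never selected.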
*/ if (psmi_hal_has_cap(PSM_HAL_CAP_SDMA)) { if ((err = proto_sdma_init(proto, context))) goto fail; } else { proto->flags |= IPS_PROTO_FLAG_SPIO; proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = ~0U; } /* * Setup the protocol wide short message ep flow. */ if (proto->flags & IPS_PROTO_FLAG_SDMA) { proto->msgflowid = EP_FLOW_GO_BACK_N_DMA; } else { proto->msgflowid = EP_FLOW_GO_BACK_N_PIO; } /* * Clone sendreq mpool configuration for pend sends config */ { uint32_t chunks, maxsz; psmi_assert_always(proto->ep->mq->sreq_pool != NULL); psmi_mpool_get_obj_info(proto->ep->mq->sreq_pool, &chunks, &maxsz); proto->pend_sends_pool = psmi_mpool_create(sizeof(struct ips_pend_sreq), chunks, maxsz, 0, DESCRIPTORS, NULL, NULL); if (proto->pend_sends_pool == NULL) { err = PSM2_NO_MEMORY; goto fail; } } /* * Create a pool of CCA timers for path_rec. The timers should not * exceed the scb number num_of_send_desc(default 4K). */ { uint32_t chunks, maxsz; chunks = 256; maxsz = num_of_send_desc; proto->timer_pool = psmi_mpool_create(sizeof(struct psmi_timer), chunks, maxsz, 0, DESCRIPTORS, NULL, NULL); if (proto->timer_pool == NULL) { err = PSM2_NO_MEMORY; goto fail; } } /* * Register ips protocol statistics * * We put a (*) in the output to denote stats that may cause a drop in * performance. * * We put a (**) in the output of those stats that "should never happen" */ { uint64_t *pio_stall_cnt = NULL; psmi_hal_get_pio_stall_cnt(context->psm_hw_ctxt,&pio_stall_cnt); struct psmi_stats_entry entries[] = { PSMI_STATS_DECLU64("pio busy count", &proto->stats.pio_busy_cnt), /* Throttling by kernel */ PSMI_STATS_DECLU64("writev busy cnt", &proto->stats.writev_busy_cnt), /* When local dma completion is in the way... */ PSMI_STATS_DECLU64("writev compl. eagain", &proto->stats.writev_compl_eagain), /* When remote completion happens before local completion */ PSMI_STATS_DECLU64("writev compl. 
delay (*)", &proto->stats.writev_compl_delay), PSMI_STATS_DECLU64("scb unavail eager count", &proto->stats.scb_egr_unavail_cnt), PSMI_STATS_DECLU64("scb unavail exp count", &proto->stats.scb_exp_unavail_cnt), PSMI_STATS_DECLU64("rcvhdr overflows", /* Normal egr/hdr ovflw */ &proto->stats.hdr_overflow), PSMI_STATS_DECLU64("rcveager overflows", &proto->stats.egr_overflow), PSMI_STATS_DECLU64("lid zero errs (**)", /* shouldn't happen */ &proto->stats.lid_zero_errs), PSMI_STATS_DECLU64("unknown packets (**)", /* shouldn't happen */ &proto->stats.unknown_packets), PSMI_STATS_DECLU64("stray packets (*)", &proto->stats.stray_packets), PSMI_STATS_DECLU64("pio stalls (*)", /* shouldn't happen too often */ pio_stall_cnt), PSMI_STATS_DECLU64("ICRC error (*)", &proto->error_stats.num_icrc_err), PSMI_STATS_DECLU64("ECC error ", &proto->error_stats.num_ecc_err), PSMI_STATS_DECLU64("Len error", &proto->error_stats.num_len_err), PSMI_STATS_DECLU64("TID error ", &proto->error_stats.num_tid_err), PSMI_STATS_DECLU64("DC error ", &proto->error_stats.num_dc_err), PSMI_STATS_DECLU64("DCUNC error ", &proto->error_stats.num_dcunc_err), PSMI_STATS_DECLU64("KHDRLEN error ", &proto->error_stats.num_khdrlen_err), }; err = psmi_stats_register_type ("OPA low-level protocol stats", PSMI_STATSTYPE_IPSPROTO, entries, PSMI_STATS_HOWMANY(entries), NULL); if (err != PSM2_OK) goto fail; } /* * Control Queue and messaging */ ctrlq_init(&proto->ctrlq, proto); /* * Receive-side handling */ if ((err = ips_proto_recv_init(proto))) goto fail; /* If progress thread is enabled, set the proto flag */ { if (psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD)) proto->flags |= IPS_PROTO_FLAG_RCVTHREAD; } /* * Eager buffers. We don't care to receive a callback when eager buffers * are newly released since we actively poll for new bufs. */ { /* configure PSM bounce buffer size */ union psmi_envvar_val env_bbs; psmi_getenv("PSM2_BOUNCE_SZ", "PSM bounce buffer size (default is 8192B)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)8192, &env_bbs); proto->scb_bufsize = env_bbs.e_uint; } if ((err = ips_scbctrl_init(context, num_of_send_desc, num_of_send_bufs, imm_size, proto->scb_bufsize, NULL, NULL, &proto->scbc_egr))) goto fail; /* * Expected protocol handling. * If we enable tid-based expected rendezvous, the expected protocol code * handles its own rv scb buffers. If not, we have to enable eager-based * rendezvous and we allocate scb buffers for it. */ psmi_getenv("PSM2_TID", "Tid proto flags (0 disables protocol)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)IPS_PROTOEXP_FLAGS_DEFAULT, &env_tid); protoexp_flags = env_tid.e_uint; if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) { #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED) { PSMI_CUDA_CALL(cuStreamCreate, &proto->cudastream_send, CU_STREAM_NON_BLOCKING); } #endif proto->scbc_rv = NULL; if ((err = ips_protoexp_init(context, proto, protoexp_flags, num_of_send_bufs, num_of_send_desc, &proto->protoexp))) goto fail; } else { proto->protoexp = NULL; proto->scbc_rv = (struct ips_scbctrl *) psmi_calloc(proto->ep, DESCRIPTORS, 1, sizeof(struct ips_scbctrl)); if (proto->scbc_rv == NULL) { err = PSM2_NO_MEMORY; goto fail; } /* * Rendezvous buffers. We want to get a callback for rendezvous bufs * since we asynchronously try to make progress on these sends and only * schedule them on the timerq if there are pending sends and available * bufs. 
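 * ips_proto_rv_scbavail_callback (registered just below) fires when an
 * scb is freed, so pending rendezvous sends can be rescheduled without
 * polling for buffers.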
*/
if ((err = ips_scbctrl_init(context, num_of_send_desc, 0 /* no bufs */ , 0, 0 /* bufsize==0 */ , ips_proto_rv_scbavail_callback, proto, proto->scbc_rv))) goto fail; }
/* * Parse the tid error settings from the environment. * <warn_interval_secs>:<max_tid_errors> */
{ int tvals[2]; char *tid_err; union psmi_envvar_val env_tiderr;
tid_err = "-1:0"; /* no tiderr warnings, never exits */
tvals[0] = -1;
tvals[1] = 0;
if (!psmi_getenv("PSM2_TID_ERROR", "Tid error control <warn_secs:max_errors>", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)tid_err, &env_tiderr)) { /* not using default values */ tid_err = env_tiderr.e_str; psmi_parse_str_tuples(tid_err, 2, tvals); }
if (tvals[0] >= 0) proto->tiderr_warn_interval = sec_2_cycles(tvals[0]);
else proto->tiderr_warn_interval = UINT64_MAX;
proto->tiderr_max = tvals[1];
_HFI_PRDBG("Tid error control: warning every %d secs%s, " "fatal error after %d tid errors%s\n", tvals[0], (tvals[0] < 0) ? " (no warnings)" : "", tvals[1], (tvals[1] == 0) ? " (never fatal)" : ""); }
/* Active Message interface. AM requests compete with MQ for eager * buffers, since requests establish the amount of buffering in the * network (maximum number of requests in flight). The AM init function * does not allow the number of send buffers to be set separately from * the number of send descriptors, because otherwise it would have to * impose extremely arcane constraints on the relative amounts to avoid * a deadlock scenario. Thus, it handles it internally. The constraint * is: In a node pair, the number of reply send buffers on at least one * of the nodes must be at least double the number (optimal: double + 1) * of send descriptors on the other node. */
if ((err = ips_proto_am_init(proto, min(num_of_send_bufs, num_of_send_desc), imm_size, &proto->proto_am))) goto fail;
#if 0
if (!host_pid) { char ipbuf[INET_ADDRSTRLEN], *p;
host_pid = (uint32_t) getpid();
host_ipv4addr = psmi_get_ipv4addr(); /* already big-endian */
if (host_ipv4addr == 0) { _HFI_DBG("Unable to obtain local IP address, " "not fatal but some features may be disabled\n"); }
else if (host_ipv4addr == __cpu_to_be32(0x7f000001)) { _HFI_INFO("Localhost IP address is set to the " "loopback address 127.0.0.1, " "not fatal but some features may be disabled\n"); }
else { p = (char *)inet_ntop(AF_INET, (const void *)&host_ipv4addr, ipbuf, sizeof(ipbuf)); _HFI_PRDBG("Ethernet Host IP=%s and PID=%d\n", p, host_pid); }
/* Store in big endian for use in ERR_CHK */
host_pid = __cpu_to_be32(host_pid); }
#endif
#ifdef PSM_CUDA
union psmi_envvar_val env_gpudirect_rdma;
psmi_getenv("PSM2_GPUDIRECT", "Use GPUDirect RDMA support to allow the HFI to directly read" " from the GPU for SDMA. Requires driver support. (default is" " disabled i.e. 0)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)0, /* Disabled by default */ &env_gpudirect_rdma);
/* The following cases need to be handled: * 1) GPU DIRECT is turned off but GDR COPY is turned on by the user or * by default - Turn off GDR COPY * 2) GPU DIRECT is on but GDR COPY is turned off by the user - Leave * this config as it is. 
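 * Case 1 is handled immediately below by clearing is_gdr_copy_enabled;
 * case 2 needs no action since the user's explicit GDR COPY choice wins.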
*/ if (!env_gpudirect_rdma.e_uint) is_gdr_copy_enabled = 0; /* Default Send threshold for Gpu-direct set to 30000 */ union psmi_envvar_val env_gpudirect_send_thresh; psmi_getenv("PSM2_GPUDIRECT_SEND_THRESH", "GPUDirect feature on send side will be switched off if threshold value is exceeded.", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)30000, &env_gpudirect_send_thresh); gpudirect_send_threshold = env_gpudirect_send_thresh.e_uint; union psmi_envvar_val env_gpudirect_recv_thresh; psmi_getenv("PSM2_GPUDIRECT_RECV_THRESH", "GPUDirect feature on receive side will be switched off if threshold value is exceeded.", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)UINT_MAX, &env_gpudirect_recv_thresh); gpudirect_recv_threshold = env_gpudirect_recv_thresh.e_uint; if (env_gpudirect_rdma.e_uint && device_support_gpudirect) { if (PSMI_IS_CUDA_DISABLED || /* All pio, No SDMA*/ (proto->flags & IPS_PROTO_FLAG_SPIO) || !(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) || PSMI_IS_DRIVER_GPUDIRECT_DISABLED) err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Requires hfi1 driver with GPU-Direct feature enabled.\n"); proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND; proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV; } else { /* The following environment variables are here for internal * experimentation and will not be documented for any customers. */ /* Use GPUDirect RDMA for SDMA send? */ union psmi_envvar_val env_gpudirect_rdma_send; psmi_getenv("PSM2_GPUDIRECT_RDMA_SEND", "Use GPUDirect RDMA support to allow the HFI to directly" " read from the GPU for SDMA. Requires driver" " support.(default is disabled i.e. 0)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)0, /* Disabled by default */ &env_gpudirect_rdma_send); if (env_gpudirect_rdma_send.e_uint && device_support_gpudirect) { if (PSMI_IS_CUDA_DISABLED || /* All pio, No SDMA*/ (proto->flags & IPS_PROTO_FLAG_SPIO)) err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to start run as PSM would require cuda, sdma" "and TID support\n"); proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND; } /* Use GPUDirect RDMA for recv? */ union psmi_envvar_val env_gpudirect_rdma_recv; psmi_getenv("PSM2_GPUDIRECT_RDMA_RECV", "Use GPUDirect RDMA support to allow the HFI to directly" " write into GPU. Requires driver support.(default is" " disabled i.e. 0)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)0, /* Disabled by default */ &env_gpudirect_rdma_recv); if (env_gpudirect_rdma_recv.e_uint && device_support_gpudirect) { if (PSMI_IS_CUDA_DISABLED || !(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to start run as PSM would require cuda," " sdma and TID support\n"); proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV; } } if (PSMI_IS_CUDA_ENABLED && (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) { struct psmi_rlimit_mpool rlim = CUDA_HOSTBUFFER_LIMITS; uint32_t maxsz, chunksz, max_elements; if ((err = psmi_parse_mpool_env(proto->mq, 1, &rlim, &maxsz, &chunksz))) goto fail; /* the maxsz is the amount in MB, not the number of entries, * since the element size depends on the window size */ max_elements = (maxsz*1024*1024) / proto->mq->hfi_base_window_rv; /* mpool requires max_elements to be power of 2. round down. 
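 * The expression 1 << (31 - __builtin_clz(x)) keeps only the highest set
 * bit; e.g. x = 1000 has clz = 22, giving 1 << 9 = 512.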
*/ max_elements = 1 << (31 - __builtin_clz(max_elements)); proto->cuda_hostbuf_send_cfg.bufsz = proto->mq->hfi_base_window_rv; proto->cuda_hostbuf_pool_send = psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), chunksz, max_elements, 0, UNDEFINED, NULL, NULL, psmi_cuda_hostbuf_alloc_func, (void *) &proto->cuda_hostbuf_send_cfg); if (proto->cuda_hostbuf_pool_send == NULL) { err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, "Couldn't allocate CUDA host send buffer pool"); goto fail; } /* use the same number of elements for the small pool */ proto->cuda_hostbuf_small_send_cfg.bufsz = CUDA_SMALLHOSTBUF_SZ; proto->cuda_hostbuf_pool_small_send = psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), chunksz, max_elements, 0, UNDEFINED, NULL, NULL, psmi_cuda_hostbuf_alloc_func, (void *) &proto->cuda_hostbuf_small_send_cfg); if (proto->cuda_hostbuf_pool_small_send == NULL) { err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, "Couldn't allocate CUDA host small send buffer pool"); goto fail; } /* Configure the amount of prefetching */ union psmi_envvar_val env_prefetch_limit; psmi_getenv("PSM2_CUDA_PREFETCH_LIMIT", "How many TID windows to prefetch at RTS time(default is 2)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)CUDA_WINDOW_PREFETCH_DEFAULT, &env_prefetch_limit); proto->cuda_prefetch_limit = env_prefetch_limit.e_uint; } #endif fail: return err; } psm2_error_t ips_proto_fini(struct ips_proto *proto, int force, uint64_t timeout_in) { struct psmi_eptab_iterator itor; uint64_t t_start; uint64_t t_grace_start, t_grace_time, t_grace_interval; psm2_epaddr_t epaddr; psm2_error_t err = PSM2_OK; int i; union psmi_envvar_val grace_intval; /* Poll one more time to attempt to synchronize with the peer ep's. */ ips_ptl_poll(proto->ptl, 0); psmi_getenv("PSM2_CLOSE_GRACE_PERIOD", "Additional grace period in seconds for closing end-point.", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)0, &grace_intval); if (getenv("PSM2_CLOSE_GRACE_PERIOD")) { t_grace_time = grace_intval.e_uint * SEC_ULL; } else if (timeout_in > 0) { /* default to half of the close time-out */ t_grace_time = timeout_in / 2; } else { /* propagate the infinite time-out case */ t_grace_time = 0; } if (t_grace_time > 0 && t_grace_time < PSMI_MIN_EP_CLOSE_TIMEOUT) t_grace_time = PSMI_MIN_EP_CLOSE_TIMEOUT; /* At close we will busy wait for the grace interval to see if any * receive progress is made. If progress is made we will wait for * another grace interval, until either no progress is made or the * entire grace period has passed. If the grace interval is too low * we may miss traffic and exit too early. If the grace interval is * too large the additional time spent while closing the program * will become visible to the user. */ psmi_getenv("PSM2_CLOSE_GRACE_INTERVAL", "Grace interval in seconds for closing end-point.", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)0, &grace_intval); if (getenv("PSM2_CLOSE_GRACE_INTERVAL")) { t_grace_interval = grace_intval.e_uint * SEC_ULL; } else { /* A heuristic is used to scale up the timeout linearly with * the number of endpoints, and we allow one second per 1000 * endpoints. 
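 * For example, 4000 connected endpoints yield a 4 second interval, which
 * is then clamped just below to the range
 * [PSMI_MIN_EP_CLOSE_GRACE_INTERVAL, PSMI_MAX_EP_CLOSE_GRACE_INTERVAL].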
*/ t_grace_interval = (proto->ep->connections * SEC_ULL) / 1000; } if (t_grace_interval < PSMI_MIN_EP_CLOSE_GRACE_INTERVAL) t_grace_interval = PSMI_MIN_EP_CLOSE_GRACE_INTERVAL; if (t_grace_interval > PSMI_MAX_EP_CLOSE_GRACE_INTERVAL) t_grace_interval = PSMI_MAX_EP_CLOSE_GRACE_INTERVAL; PSMI_LOCK_ASSERT(proto->mq->progress_lock); t_start = proto->t_fini = get_cycles(); /* Close whatever has been left open */ if (proto->num_connected_outgoing > 0) { int num_disc = 0; int *mask; psm2_error_t *errs; psm2_epaddr_t *epaddr_array; psmi_epid_itor_init(&itor, proto->ep); while ((epaddr = psmi_epid_itor_next(&itor))) { if (epaddr->ptlctl->ptl == proto->ptl) num_disc++; } psmi_epid_itor_fini(&itor); mask = (int *)psmi_calloc(proto->ep, UNDEFINED, num_disc, sizeof(int)); errs = (psm2_error_t *) psmi_calloc(proto->ep, UNDEFINED, num_disc, sizeof(psm2_error_t)); epaddr_array = (psm2_epaddr_t *) psmi_calloc(proto->ep, UNDEFINED, num_disc, sizeof(psm2_epaddr_t)); if (errs == NULL || epaddr_array == NULL || mask == NULL) { if (epaddr_array) psmi_free(epaddr_array); if (errs) psmi_free(errs); if (mask) psmi_free(mask); err = PSM2_NO_MEMORY; goto fail; } psmi_epid_itor_init(&itor, proto->ep); i = 0; while ((epaddr = psmi_epid_itor_next(&itor))) { /* * if cstate_outgoing is CSTATE_NONE, then we know it * is an uni-directional connect, in that the peer * sent a connect request to us, but we never sent one * out to the peer epid. Ignore handling those in * ips_proto_disconnect() as we will do the right thing * when a disconnect request for the epaddr comes in from the peer. */ if (epaddr->ptlctl->ptl == proto->ptl && ((ips_epaddr_t *) epaddr)->cstate_outgoing != CSTATE_NONE) { mask[i] = 1; epaddr_array[i] = epaddr; i++; IPS_MCTXT_REMOVE((ips_epaddr_t *) epaddr); } } psmi_epid_itor_fini(&itor); err = ips_proto_disconnect(proto, force, num_disc, epaddr_array, mask, errs, timeout_in); psmi_free(mask); psmi_free(errs); psmi_free(epaddr_array); } t_grace_start = get_cycles(); while (psmi_cycles_left(t_grace_start, t_grace_time)) { uint64_t t_grace_interval_start = get_cycles(); int num_disconnect_requests = proto->num_disconnect_requests; PSMI_BLOCKUNTIL( proto->ep, err, proto->num_connected_incoming == 0 || (!psmi_cycles_left(t_start, timeout_in) && (!psmi_cycles_left(t_grace_interval_start, t_grace_interval) || !psmi_cycles_left(t_grace_start, t_grace_time)))); if (num_disconnect_requests == proto->num_disconnect_requests) { /* nothing happened in this grace interval so break out early */ break; } } #if _HFI_DEBUGGING if (_HFI_PRDBG_ON) { uint64_t t_grace_finish = get_cycles(); _HFI_PRDBG_ALWAYS( "Closing endpoint disconnect left to=%d,from=%d after %d millisec of grace (out of %d)\n", proto->num_connected_outgoing, proto->num_connected_incoming, (int)(cycles_to_nanosecs(t_grace_finish - t_grace_start) / MSEC_ULL), (int)(t_grace_time / MSEC_ULL)); } #endif #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED) { PSMI_CUDA_CALL(cuStreamDestroy, proto->cudastream_send); } #endif if ((err = ips_ibta_fini(proto))) goto fail; if ((err = ips_proto_am_fini(&proto->proto_am))) goto fail; if ((err = ips_scbctrl_fini(&proto->scbc_egr))) goto fail; ips_proto_recv_fini(proto); if (proto->protoexp) { if ((err = ips_protoexp_fini(proto->protoexp))) goto fail; } else { ips_scbctrl_fini(proto->scbc_rv); psmi_free(proto->scbc_rv); } psmi_mpool_destroy(proto->pend_sends_pool); psmi_mpool_destroy(proto->timer_pool); psmi_free(proto->sdma_scb_queue); fail: proto->t_fini = proto->t_init = 0; return err; } static psm2_error_t proto_sdma_init(struct 
ips_proto *proto, const psmi_context_t *context) { union psmi_envvar_val env_sdma, env_hfiegr; psm2_error_t err = PSM2_OK; /* * Only initialize if RUNTIME_SDMA is enabled. */ psmi_assert_always(psmi_hal_has_cap(PSM_HAL_CAP_SDMA)); psmi_getenv("PSM2_SDMA", "hfi send dma flags (0 disables send dma, 2 disables send pio, " "1 for both sdma/spio, default 1)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)1, &env_sdma); if (env_sdma.e_uint == 0) proto->flags |= IPS_PROTO_FLAG_SPIO; else if (env_sdma.e_uint == 2) proto->flags |= IPS_PROTO_FLAG_SDMA; if (!(proto->flags & (IPS_PROTO_FLAG_SDMA | IPS_PROTO_FLAG_SPIO))) { /* use both spio and sdma */ if(psmi_cpu_model == CPUID_MODEL_PHI_GEN2 || psmi_cpu_model == CPUID_MODEL_PHI_GEN2M) { proto->iovec_thresh_eager = MQ_HFI_THRESH_EGR_SDMA_SQ_PHI2; proto->iovec_thresh_eager_blocking = MQ_HFI_THRESH_EGR_SDMA_PHI2; } else { proto->iovec_thresh_eager = MQ_HFI_THRESH_EGR_SDMA_SQ_XEON; proto->iovec_thresh_eager_blocking = MQ_HFI_THRESH_EGR_SDMA_XEON; } if (!psmi_getenv("PSM2_MQ_EAGER_SDMA_SZ", "hfi pio-to-sdma eager switchover", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val) proto->iovec_thresh_eager, &env_hfiegr)) { proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = env_hfiegr.e_uint; } } else if (proto->flags & IPS_PROTO_FLAG_SDMA) { /* all sdma */ proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = 0; } else if (proto->flags & IPS_PROTO_FLAG_SPIO) { /* all spio */ proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = ~0U; } return err; } static void ctrlq_init(struct ips_ctrlq *ctrlq, struct ips_proto *proto) { /* clear the ctrl send queue */ memset(ctrlq, 0, sizeof(*ctrlq)); proto->message_type_to_index[OPCODE_ACK] = CTRL_MSG_ACK_QUEUED; proto->message_type_to_index[OPCODE_NAK] = CTRL_MSG_NAK_QUEUED; proto->message_type_to_index[OPCODE_BECN] = CTRL_MSG_BECN_QUEUED; proto->message_type_to_index[OPCODE_ERR_CHK] = CTRL_MSG_ERR_CHK_QUEUED; proto->message_type_to_index[OPCODE_ERR_CHK_GEN] = CTRL_MSG_ERR_CHK_GEN_QUEUED; proto->message_type_to_index[OPCODE_CONNECT_REQUEST] = CTRL_MSG_CONNECT_REQUEST_QUEUED; proto->message_type_to_index[OPCODE_CONNECT_REPLY] = CTRL_MSG_CONNECT_REPLY_QUEUED; proto->message_type_to_index[OPCODE_DISCONNECT_REQUEST] = CTRL_MSG_DISCONNECT_REQUEST_QUEUED; proto->message_type_to_index[OPCODE_DISCONNECT_REPLY] = CTRL_MSG_DISCONNECT_REPLY_QUEUED; ctrlq->ctrlq_head = ctrlq->ctrlq_tail = 0; ctrlq->ctrlq_overflow = 0; ctrlq->ctrlq_proto = proto; /* * We never enqueue ctrl messages with real payload. If we do, * the queue 'elem_payload' size needs to be big enough. * Note: enqueue nak/ack is very important for performance. 
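 * Only ACK, NAK and BECN are marked queueable below; any other control
 * message type that hits PSM2_EP_NO_RESOURCES is reported back to its
 * caller for retry rather than parked in this queue.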
*/
proto->ctrl_msg_queue_enqueue = CTRL_MSG_ACK_QUEUED | CTRL_MSG_NAK_QUEUED | CTRL_MSG_BECN_QUEUED;
psmi_timer_entry_init(&ctrlq->ctrlq_timer, ips_proto_timer_ctrlq_callback, ctrlq);
return; }
static __inline__ void _build_ctrl_message(struct ips_proto *proto, struct ips_flow *flow, uint8_t message_type, ips_scb_t *ctrlscb, uint32_t paylen) {
uint32_t tot_paywords = (sizeof(struct ips_message_header) + HFI_CRC_SIZE_IN_BYTES + paylen) >> BYTE2DWORD_SHIFT;
uint32_t slid, dlid;
ips_epaddr_t *ipsaddr = flow->ipsaddr;
struct ips_message_header *p_hdr = &ctrlscb->ips_lrh;
ips_path_rec_t *ctrl_path = ipsaddr->pathgrp->pg_path[ipsaddr->hpp_index][IPS_PATH_HIGH_PRIORITY];
if ((proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) && (++ipsaddr->hpp_index >= ipsaddr->pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY])) ipsaddr->hpp_index = 0;
/* * If the size of the transfer is NOT within the "exclusion range", * then use the "dispersive routing" slid/dlid. Otherwise * use the base LIDs. * * This is a control message, so it should never be a TID transfer. */
slid = ctrl_path->pr_slid;
dlid = ctrl_path->pr_dlid;
if (ctrlscb->scb_flags & IPS_SEND_FLAG_NO_LMC) { slid = ipsaddr->pathgrp->pg_base_slid; dlid = ipsaddr->pathgrp->pg_base_dlid; }
/* Control messages go over the control path. */
p_hdr->lrh[0] = __cpu_to_be16(HFI_LRH_BTH | ((ctrl_path->pr_sl & HFI_LRH_SL_MASK) << HFI_LRH_SL_SHIFT) | ((proto->sl2sc[ctrl_path->pr_sl] & HFI_LRH_SC_MASK) << HFI_LRH_SC_SHIFT));
p_hdr->lrh[1] = dlid;
p_hdr->lrh[2] = __cpu_to_be16(tot_paywords & HFI_LRH_PKTLEN_MASK);
p_hdr->lrh[3] = slid;
p_hdr->bth[0] = __cpu_to_be32(ctrl_path->pr_pkey | (message_type << HFI_BTH_OPCODE_SHIFT));
/* If flow is congested then generate a BECN for path. */
if_pf(flow->flags & IPS_FLOW_FLAG_GEN_BECN) { p_hdr->bth[1] = __cpu_to_be32(ipsaddr->context | ipsaddr->subcontext << HFI_BTH_SUBCTXT_SHIFT | flow->flowid << HFI_BTH_FLOWID_SHIFT | proto->epinfo.ep_baseqp << HFI_BTH_QP_SHIFT | 1 << HFI_BTH_BECN_SHIFT); flow->flags &= ~IPS_FLOW_FLAG_GEN_BECN; }
else { p_hdr->bth[1] = __cpu_to_be32(ipsaddr->context | ipsaddr->subcontext << HFI_BTH_SUBCTXT_SHIFT | flow->flowid << HFI_BTH_FLOWID_SHIFT | proto->epinfo.ep_baseqp << HFI_BTH_QP_SHIFT); }
/* p_hdr->bth[2] already set by caller, or don't care */
/* p_hdr->ack_seq_num already set by caller, or don't care */
p_hdr->connidx = ipsaddr->connidx_outgoing;
p_hdr->flags = 0;
p_hdr->khdr.kdeth0 = __cpu_to_le32( (ctrlscb->scb_flags & IPS_SEND_FLAG_INTR) | (IPS_PROTO_VERSION << HFI_KHDR_KVER_SHIFT));
p_hdr->khdr.kdeth1 = __cpu_to_le32(proto->epinfo.ep_jkey);
return; }
psm2_error_t ips_proto_timer_ctrlq_callback(struct psmi_timer *timer, uint64_t t_cyc_expire) {
struct ips_ctrlq *ctrlq = (struct ips_ctrlq *)timer->context;
struct ips_proto *proto = ctrlq->ctrlq_proto;
struct ips_ctrlq_elem *cqe;
uint32_t have_cksum = proto->flags & IPS_PROTO_FLAG_CKSUM;
psm2_error_t err;
/* service ctrl send queue first */
while (ctrlq->ctrlq_cqe[ctrlq->ctrlq_tail].msg_queue_mask) { cqe = &ctrlq->ctrlq_cqe[ctrlq->ctrlq_tail];
/* When PSM_PERF is enabled, the following line causes the PMU to start a stop watch to measure instruction cycles of the TX speedpath of PSM. The stop watch is stopped below. 
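 * (GENERIC_PERF_BEGIN/GENERIC_PERF_END compile to no-ops unless PSM_PERF
 * is defined, so the speedpath is unaffected in normal builds.)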
*/ GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR); if (cqe->msg_scb.flow->transfer == PSM_TRANSFER_PIO) { err = psmi_hal_spio_transfer_frame(proto, cqe->msg_scb.flow, &cqe->msg_scb.pbc, cqe->msg_scb.cksum, 0, PSMI_TRUE, have_cksum, cqe->msg_scb.cksum[0], proto->ep->context.psm_hw_ctxt #ifdef PSM_CUDA , 0 #endif ); } else { err = ips_dma_transfer_frame(proto, cqe->msg_scb.flow, &cqe->msg_scb, cqe->msg_scb.cksum, 0, have_cksum, cqe->msg_scb.cksum[0]); } /* When PSM_PERF is enabled, the following line causes the PMU to stop a stop watch to measure instruction cycles of the TX speedpath of PSM. The stop watch was started above. */ GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR); if (err == PSM2_OK) { PSM2_LOG_PKT_STRM(PSM2_LOG_TX,&cqe->msg_scb.ips_lrh,"PKT_STRM: err: %d", err); ips_proto_epaddr_stats_set(proto, cqe->message_type); *cqe->msg_queue_mask &= ~message_type2index(proto, cqe->message_type); cqe->msg_queue_mask = NULL; ctrlq->ctrlq_tail = (ctrlq->ctrlq_tail + 1) % CTRL_MSG_QEUEUE_SIZE; } else { psmi_assert(err == PSM2_EP_NO_RESOURCES); if (proto->flags & IPS_PROTO_FLAG_SDMA) proto->stats.writev_busy_cnt++; else proto->stats.pio_busy_cnt++; /* re-request a timer expiration */ psmi_timer_request(proto->timerq, &ctrlq->ctrlq_timer, PSMI_TIMER_PRIO_0); return PSM2_OK; } } return PSM2_OK; } /* Update cqe struct which is a single element from pending control message queue */ PSMI_ALWAYS_INLINE( void ips_proto_update_cqe(struct ips_ctrlq_elem *cqe, uint16_t *msg_queue_mask, struct ips_flow *flow, ips_scb_t *ctrlscb, uint8_t message_type)){ cqe->message_type = message_type; cqe->msg_queue_mask = msg_queue_mask; psmi_mq_mtucpy(&cqe->msg_scb.ips_lrh, &ctrlscb->ips_lrh, sizeof(ctrlscb->ips_lrh)); cqe->msg_scb.flow = flow; cqe->msg_scb.cksum[0] = ctrlscb->cksum[0]; } psm2_error_t ips_proto_send_ctrl_message(struct ips_flow *flow, uint8_t message_type, uint16_t *msg_queue_mask, ips_scb_t *ctrlscb, void *payload, uint32_t paylen) { psm2_error_t err = PSM2_EP_NO_RESOURCES; ips_epaddr_t *ipsaddr = flow->ipsaddr; struct ips_proto *proto = ((psm2_epaddr_t) ipsaddr)->proto; struct ips_ctrlq *ctrlq = &proto->ctrlq; struct ips_ctrlq_elem *cqe = ctrlq->ctrlq_cqe; uint32_t have_cksum; psmi_assert(message_type >= OPCODE_ACK && message_type <= OPCODE_DISCONNECT_REPLY); psmi_assert((paylen & 0x3) == 0); /* require 4-byte multiple */ psmi_assert(flow->frag_size >= (paylen + PSM_CRC_SIZE_IN_BYTES)); /* Drain queue if non-empty */ if (cqe[ctrlq->ctrlq_tail].msg_queue_mask) ips_proto_timer_ctrlq_callback(&ctrlq->ctrlq_timer, 0ULL); /* finish setup control message header */ ips_set_LMC_LID_choice(proto, ctrlscb, paylen); _build_ctrl_message(proto, flow, message_type, ctrlscb, paylen); /* If enabled checksum control message */ have_cksum = proto->flags & IPS_PROTO_FLAG_CKSUM; if (have_cksum) { ctrlscb->ips_lrh.flags |= IPS_SEND_FLAG_PKTCKSUM; ips_do_cksum(proto, &ctrlscb->ips_lrh, payload, paylen, ctrlscb->cksum); } /* * for ACK/NAK/BECN, we use the fast flow to send over, otherwise, * we use the original flow */ if (message_type == OPCODE_ACK || message_type == OPCODE_NAK || message_type == OPCODE_BECN) { psmi_assert(proto->msgflowid < EP_FLOW_LAST); flow = &ipsaddr->flows[proto->msgflowid]; } switch (flow->transfer) { case PSM_TRANSFER_PIO: /* When PSM_PERF is enabled, the following line causes the PMU to start a stop watch to measure instruction cycles of the TX speedpath of PSM. The stop watch is stopped below. 
*/ GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR); err = psmi_hal_spio_transfer_frame(proto, flow, &ctrlscb->pbc, payload, paylen, PSMI_TRUE, have_cksum, ctrlscb->cksum[0], proto->ep->context.psm_hw_ctxt #ifdef PSM_CUDA , 0 #endif ); /* When PSM_PERF is enabled, the following line causes the PMU to stop a stop watch to measure instruction cycles of the TX speedpath of PSM. The stop watch was started above. */ GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR); break; case PSM_TRANSFER_DMA: /* When PSM_PERF is enabled, the following line causes the PMU to start a stop watch to measure instruction cycles of the TX speedpath of PSM. The stop watch is stopped below. */ GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR); err = ips_dma_transfer_frame(proto, flow, ctrlscb, payload, paylen, have_cksum, ctrlscb->cksum[0]); /* When PSM_PERF is enabled, the following line causes the PMU to stop a stop watch to measure instruction cycles of the TX speedpath of PSM. The stop watch was started above. */ GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR); break; default: err = PSM2_INTERNAL_ERR; break; } if (err == PSM2_OK) { PSM2_LOG_PKT_STRM(PSM2_LOG_TX,&ctrlscb->ips_lrh,"PKT_STRM: err: %d", err); ips_proto_epaddr_stats_set(proto, message_type); } _HFI_VDBG("transfer_frame of opcode=0x%x,remote_lid=%d," "src=%p,len=%d returns %d\n", (int)_get_proto_hfi_opcode(&ctrlscb->ips_lrh), __be16_to_cpu(ctrlscb->ips_lrh.lrh[1]), payload, paylen, err); if (err != PSM2_EP_NO_RESOURCES) return err; if (proto->flags & IPS_PROTO_FLAG_SDMA) proto->stats.writev_busy_cnt++; else proto->stats.pio_busy_cnt++; if (proto->ctrl_msg_queue_enqueue & proto-> message_type_to_index[message_type]) { /* We only queue control msg without payload */ psmi_assert(paylen == 0); if ((*msg_queue_mask) & proto-> message_type_to_index[message_type]) { if (message_type == OPCODE_ACK) { /* Pending queue should contain latest ACK type message, * overwrite the previous one. */ ips_proto_update_cqe(&cqe[flow->ack_index], msg_queue_mask, flow, ctrlscb, message_type); } err = PSM2_OK; } else if (cqe[ctrlq->ctrlq_head].msg_queue_mask == NULL) { /* entry is free */ if (message_type == OPCODE_ACK) { /* Track the index of last ACK type message in queue*/ flow->ack_index = ctrlq->ctrlq_head; } *msg_queue_mask |= message_type2index(proto, message_type); ips_proto_update_cqe(&cqe[ctrlq->ctrlq_head], msg_queue_mask, flow, ctrlscb, message_type); ctrlq->ctrlq_head = (ctrlq->ctrlq_head + 1) % CTRL_MSG_QEUEUE_SIZE; /* _HFI_INFO("requesting ctrlq timer for msgtype=%d!\n", message_type); */ psmi_timer_request(proto->timerq, &ctrlq->ctrlq_timer, PSMI_TIMER_PRIO_0); err = PSM2_OK; } else { proto->ctrl_msg_queue_overflow++; } } return err; } void MOCKABLE(ips_proto_flow_enqueue)(struct ips_flow *flow, ips_scb_t *scb) { ips_epaddr_t *ipsaddr = flow->ipsaddr; struct ips_proto *proto = ((psm2_epaddr_t) ipsaddr)->proto; ips_scb_prepare_flow_inner(proto, ipsaddr, flow, scb); if ((proto->flags & IPS_PROTO_FLAG_CKSUM) && (scb->tidctrl == 0) && (scb->nfrag == 1)) { scb->ips_lrh.flags |= IPS_SEND_FLAG_PKTCKSUM; ips_do_cksum(proto, &scb->ips_lrh, ips_scb_buffer(scb), scb->payload_size, &scb->cksum[0]); } /* If this is the first scb on flow, pull in both timers. */ if (flow->timer_ack == NULL) { psmi_assert(flow->timer_send == NULL); flow->timer_ack = scb->timer_ack; flow->timer_send = scb->timer_send; } psmi_assert(flow->timer_ack != NULL); psmi_assert(flow->timer_send != NULL); /* Every flow has a pending head that points into the unacked queue. 
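 * scb_pend is an SLIST view into the scb_unacked STAILQ: the pending head
 * advances as packets hit the wire, while unacked entries are reclaimed
 * only once the peer ACKs them.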
* If sends are already pending, process those first */
if (SLIST_EMPTY(&flow->scb_pend)) { PSM2_LOG_PKT_STRM(PSM2_LOG_PEND,&scb->ips_lrh,"PKT_STRM: pkt in pend list"); SLIST_FIRST(&flow->scb_pend) = scb; }
/* Insert scb into flow's unacked queue */
STAILQ_INSERT_TAIL(&flow->scb_unacked, scb, nextq);
#ifdef PSM_DEBUG
/* update scb counters in flow. */
flow->scb_num_pending++;
flow->scb_num_unacked++;
#endif
}
MOCK_DEF_EPILOGUE(ips_proto_flow_enqueue);
/* * This function attempts to flush the current list of pending * packets through PIO. * * Recoverable errors: * PSM2_OK: Packet triggered through PIO. * PSM2_EP_NO_RESOURCES: No PIO bufs available or cable pulled. * * Unrecoverable errors: * PSM2_EP_NO_NETWORK: No network, no lid, ... * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc. */
psm2_error_t ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed) {
struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto;
struct ips_scb_pendlist *scb_pend = &flow->scb_pend;
int num_sent = 0;
uint64_t t_cyc;
ips_scb_t *scb;
psm2_error_t err = PSM2_OK;
psmi_assert(!SLIST_EMPTY(scb_pend));
/* Out of credits - waiting for ACKs/NAKs to replenish credits, or the flow is congested */
if_pf((flow->credits <= 0) || (flow->flags & IPS_FLOW_FLAG_CONGESTED)) { if (nflushed) *nflushed = 0; return PSM2_EP_NO_RESOURCES; }
while (!SLIST_EMPTY(scb_pend) && flow->credits > 0) { scb = SLIST_FIRST(scb_pend); psmi_assert(scb->nfrag == 1);
/* When PSM_PERF is enabled, the following line causes the PMU to start a stop watch to measure instruction cycles of the TX speedpath of PSM. The stop watch is stopped below. */
GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR);
if ((err = psmi_hal_spio_transfer_frame(proto, flow, &scb->pbc, ips_scb_buffer(scb), scb->payload_size, PSMI_FALSE, scb->ips_lrh.flags & IPS_SEND_FLAG_PKTCKSUM, scb->cksum[0], proto->ep->context.psm_hw_ctxt
#ifdef PSM_CUDA
, IS_TRANSFER_BUF_GPU_MEM(scb)
#endif
)) == PSM2_OK) {
/* When PSM_PERF is enabled, the following line causes the PMU to stop a stop watch to measure instruction cycles of the TX speedpath of PSM. The stop watch was started above. */
GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR);
t_cyc = get_cycles();
scb->scb_flags &= ~IPS_SEND_FLAG_PENDING;
scb->ack_timeout = proto->epinfo.ep_timeout_ack;
scb->abs_timeout = proto->epinfo.ep_timeout_ack + t_cyc;
psmi_timer_request(proto->timerq, flow->timer_ack, scb->abs_timeout);
num_sent++;
flow->credits--;
SLIST_REMOVE_HEAD(scb_pend, next);
#ifdef PSM_DEBUG
flow->scb_num_pending--;
#endif
PSM2_LOG_PKT_STRM(PSM2_LOG_TX,&scb->ips_lrh,"PKT_STRM: err: %d", err); }
else {
/* When PSM_PERF is enabled, the following line causes the PMU to stop a stop watch to measure instruction cycles of the TX speedpath of PSM. The stop watch was started above. */
GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR);
break; } }
/* If out of flow credits re-schedule send timer */
if (!SLIST_EMPTY(scb_pend)) { proto->stats.pio_busy_cnt++; psmi_timer_request(proto->timerq, flow->timer_send, get_cycles() + proto->timeout_send); }
if (nflushed != NULL) *nflushed = num_sent;
return err; }
/* * Flush all packets currently marked as pending */
static psm2_error_t scb_dma_send(struct ips_proto *proto, struct ips_flow *flow, struct ips_scb_pendlist *slist, int *num_sent);
/* * Flush all packets queued up on a flow via send DMA. * * Recoverable errors: * PSM2_OK: Able to flush entire pending queue for DMA. * PSM2_OK_NO_PROGRESS: Flushed at least 1 but not all pending packets for DMA. 
* PSM2_EP_NO_RESOURCES: No scb's available to handle unaligned packets * or writev returned a recoverable error (no mem for * descriptors, dma interrupted or no space left in dma * queue). * * Unrecoverable errors: * PSM2_EP_DEVICE_FAILURE: Unexpected error calling writev(), chip failure, * rxe/txe parity error. * PSM2_EP_NO_NETWORK: No network, no lid, ... */
psm2_error_t ips_proto_flow_flush_dma(struct ips_flow *flow, int *nflushed) {
struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto;
struct ips_scb_pendlist *scb_pend = &flow->scb_pend;
ips_scb_t *scb = NULL;
psm2_error_t err = PSM2_OK;
int nsent = 0;
psmi_assert(!SLIST_EMPTY(scb_pend));
/* Out of credits - waiting for ACKs/NAKs to replenish credits, or the flow is congested */
if_pf((flow->credits <= 0) || (flow->flags & IPS_FLOW_FLAG_CONGESTED)) { if (nflushed) *nflushed = 0; return PSM2_EP_NO_RESOURCES; }
err = scb_dma_send(proto, flow, scb_pend, &nsent);
if (err != PSM2_OK && err != PSM2_EP_NO_RESOURCES && err != PSM2_OK_NO_PROGRESS) goto fail;
if (nsent > 0) { uint64_t t_cyc = get_cycles(); int i = 0;
/* * The in-flight counter proto->iovec_cntr_next_inflight should not drift * too far from the completion counter proto->iovec_cntr_last_completed, * because the number of scbs is very small compared to the range of a * uint32_t counter. */
#ifdef PSM_DEBUG
flow->scb_num_pending -= nsent;
#endif
SLIST_FOREACH(scb, scb_pend, next) { if (++i > nsent) break;
PSM2_LOG_PKT_STRM(PSM2_LOG_TX,&scb->ips_lrh,"PKT_STRM: (dma)");
scb->scb_flags &= ~IPS_SEND_FLAG_PENDING;
scb->ack_timeout = scb->nfrag * proto->epinfo.ep_timeout_ack;
scb->abs_timeout = scb->nfrag * proto->epinfo.ep_timeout_ack + t_cyc;
psmi_assert(proto->sdma_scb_queue [proto->sdma_fill_index] == NULL);
proto->sdma_scb_queue[proto->sdma_fill_index] = scb;
scb->dma_complete = 0;
proto->sdma_avail_counter--;
proto->sdma_fill_index++;
if (proto->sdma_fill_index == proto->sdma_queue_size) proto->sdma_fill_index = 0;
/* Flow credits can temporarily go negative for * packet-tracking purposes, because sdma chunk * processing cannot send exactly as many packets * as there are credits. */
flow->credits -= scb->nfrag; }
SLIST_FIRST(scb_pend) = scb; }
if (SLIST_FIRST(scb_pend) != NULL) { psmi_assert(flow->scb_num_pending > 0);
switch (flow->protocol) {
case PSM_PROTOCOL_TIDFLOW:
/* For Tidflow we can cancel the ack timer if we have flow credits * available and schedule the send timer. If we are out of flow * credits then the ack timer is scheduled as we are waiting for * an ACK to reclaim credits. This is required since multiple * tidflows may be active concurrently. */
if (flow->credits > 0) {
/* Cancel ack timer and reschedule send timer. Increment * writev_busy_cnt as this really is DMA buffer exhaustion. */
psmi_timer_cancel(proto->timerq, flow->timer_ack);
psmi_timer_request(proto->timerq, flow->timer_send, get_cycles() + (proto->timeout_send << 1));
proto->stats.writev_busy_cnt++; }
else {
/* Re-instate ACK timer to reap flow credits */
psmi_timer_request(proto->timerq, flow->timer_ack, get_cycles() + (proto->epinfo.ep_timeout_ack >> 2)); }
break;
case PSM_PROTOCOL_GO_BACK_N:
default:
if (flow->credits > 0) {
/* Schedule send timer and increment writev_busy_cnt */
psmi_timer_request(proto->timerq, flow->timer_send, get_cycles() + (proto->timeout_send << 1));
proto->stats.writev_busy_cnt++; }
else {
/* Schedule ACK timer to reap flow credits */
psmi_timer_request(proto->timerq, flow->timer_ack, get_cycles() + (proto->epinfo.
ep_timeout_ack >> 2)); } break; } } else { /* Schedule ack timer */ psmi_timer_cancel(proto->timerq, flow->timer_send); psmi_timer_request(proto->timerq, flow->timer_ack, get_cycles() + proto->epinfo.ep_timeout_ack); } /* We overwrite error with its new meaning for flushing packets */ if (nsent > 0) if (scb) err = PSM2_OK_NO_PROGRESS; /* partial flush */ else err = PSM2_OK; /* complete flush */ else err = PSM2_EP_NO_RESOURCES; /* no flush at all */ fail: if (nflushed) *nflushed = nsent; return err; } #ifdef PSM_FI /* * Fault injection in dma sends. Since DMA through writev() is all-or-nothing, * we don't inject faults on a packet-per-packet basis since the code gets * quite complex. Instead, each call to flush_dma or transfer_frame is treated * as an "event" and faults are generated according to the IPS_FAULTINJ_DMASEND * setting. * * The effect is as if the event was successful but dropped on the wire * somewhere. */ PSMI_ALWAYS_INLINE(int dma_do_fault()) { if_pf(PSMI_FAULTINJ_ENABLED()) { PSMI_FAULTINJ_STATIC_DECL(fi, "dmalost", 1, IPS_FAULTINJ_DMALOST); return psmi_faultinj_is_fault(fi); } else return 0; } #endif /* #ifdef PSM_FI */ /* * The driver defines the following sdma completion error codes, returned * as negative values: * #define SDMA_TXREQ_S_OK 0 * #define SDMA_TXREQ_S_SENDERROR 1 * #define SDMA_TXREQ_S_ABORTED 2 * #define SDMA_TXREQ_S_SHUTDOWN 3 * * When the hfi is in freeze mode, the driver completes all the pending * sdma requests as aborted. Since PSM needs to recover from hfi * freeze mode, this routine ignores aborted errors. */ psm2_error_t ips_proto_dma_completion_update(struct ips_proto *proto) { ips_scb_t *scb; while (proto->sdma_done_index != proto->sdma_fill_index) { psmi_hal_sdma_ring_slot_status status; uint32_t errorCode; int rc = psmi_hal_get_sdma_ring_slot_status(proto->sdma_done_index, &status, &errorCode, proto->ep->context.psm_hw_ctxt); psmi_rmb(); if (rc < 0) return PSM2_INTERNAL_ERR; if (status == PSM_HAL_SDMA_RING_QUEUED) return PSM2_OK; /* Mark the sdma request as complete */ scb = proto->sdma_scb_queue[proto->sdma_done_index]; if (scb) { psmi_assert(status == PSM_HAL_SDMA_RING_COMPLETE); scb->dma_complete = 1; proto->sdma_scb_queue[proto->sdma_done_index] = NULL; } if (status == PSM_HAL_SDMA_RING_ERROR && (int)errorCode != -2) { psm2_error_t err = psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE, "SDMA completion error: %d (fd=%d, index=%d)", 0 - ((int32_t)errorCode), psmi_hal_get_fd(proto->ep->context. psm_hw_ctxt), proto->sdma_done_index); return err; } proto->sdma_avail_counter++; proto->sdma_done_index++; if (proto->sdma_done_index == proto->sdma_queue_size) proto->sdma_done_index = 0; } return PSM2_OK; } /* Handles ENOMEM on a DMA completion. */ static inline psm2_error_t handle_ENOMEM_on_DMA_completion(struct ips_proto *proto) { psm2_error_t err; time_t now = time(NULL); if (proto->protoexp && proto->protoexp->tidc.tid_cachemap.payload.nidle) { uint64_t lengthEvicted = ips_tidcache_evict(&proto->protoexp->tidc, -1); if (!proto->writevFailTime) proto->writevFailTime = now; if (lengthEvicted) return PSM2_OK; /* signals a retry of the writev command.
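 * Callers treat PSM2_OK here as "retry the writev"; see the retry
 * labels in ips_dma_transfer_frame() and scb_dma_send().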
*/ else { #ifdef PSM_CUDA if (PSMI_IS_GDR_COPY_ENABLED && gdr_cache_evict()) { return PSM2_OK; } else #endif return PSM2_EP_NO_RESOURCES; /* should signal a return of no progress, and retry later */ } } #ifdef PSM_CUDA else if (PSMI_IS_GDR_COPY_ENABLED) { uint64_t lengthEvicted = gdr_cache_evict(); if (!proto->writevFailTime) proto->writevFailTime = now; if (lengthEvicted) return PSM2_OK; else return PSM2_EP_NO_RESOURCES; } #endif else if (!proto->writevFailTime) { proto->writevFailTime = now; return PSM2_EP_NO_RESOURCES; /* should signal a return of no progress, and retry later */ } else { static const double thirtySeconds = 30.0; if (difftime(now, proto->writevFailTime) > thirtySeconds) { err = psmi_handle_error( proto->ep, PSM2_EP_DEVICE_FAILURE, "SDMA completion error: out of " "memory (fd=%d, index=%d)", psmi_hal_get_fd(proto->ep->context.psm_hw_ctxt), proto->sdma_done_index); return err; } return PSM2_EP_NO_RESOURCES; /* should signal a return of no progress, and retry later */ } } /* ips_dma_transfer_frame is used only for control messages; it is * not enabled by default and is not tested by QA. Expected send * dma goes through scb_dma_send(). */ psm2_error_t ips_dma_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, ips_scb_t *scb, void *payload, uint32_t paylen, uint32_t have_cksum, uint32_t cksum) { ssize_t ret; psm2_error_t err; struct psm_hal_sdma_req_info *sdmahdr; uint16_t iovcnt; struct iovec iovec[2]; #ifdef PSM_FI /* See comments above for fault injection */ if_pf(dma_do_fault()) return PSM2_OK; #endif /* #ifdef PSM_FI */ /* * Check if there is an sdma queue slot. */ if (proto->sdma_avail_counter == 0) { err = ips_proto_dma_completion_update(proto); if (err) return err; if (proto->sdma_avail_counter == 0) { return PSM2_EP_NO_RESOURCES; } } /* * If we have a checksum, append it to the end of the payload. We make sure * there is enough space in the payload for the 8-byte checksum. * For control messages, the payload is an internal PSM buffer, not a user buffer. */ if (have_cksum) { uint32_t *ckptr = (uint32_t *) ((char *)payload + paylen); *ckptr = cksum; ckptr++; *ckptr = cksum; paylen += PSM_CRC_SIZE_IN_BYTES; } /* * Setup PBC. */ psmi_hal_set_pbc(proto, flow, PSMI_TRUE, &scb->pbc, HFI_MESSAGE_HDR_SIZE, paylen); /* * Setup SDMA header and io vector. */ size_t extra_bytes; sdmahdr = psmi_get_sdma_req_info(scb, &extra_bytes); sdmahdr->npkts = 1; sdmahdr->fragsize = flow->frag_size; sdmahdr->comp_idx = proto->sdma_fill_index; psmi_assert(psmi_hal_dma_slot_available(proto->sdma_fill_index, proto->ep->context.psm_hw_ctxt)); iovcnt = 1; iovec[0].iov_base = sdmahdr; iovec[0].iov_len = psmi_hal_get_sdma_req_size(proto->ep->context.psm_hw_ctxt) + extra_bytes; if (paylen > 0) { iovcnt++; iovec[1].iov_base = payload; iovec[1].iov_len = paylen; } #ifdef PSM_CUDA if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) { sdmahdr->ctrl = 2 | (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); } else #endif { sdmahdr->ctrl = 1 | (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); } /* * Write into the driver to do the SDMA work. */ retry: ret = psmi_hal_writev(iovec, iovcnt, &proto->epinfo, proto->ep->context.psm_hw_ctxt); if (ret > 0) { proto->writevFailTime = 0; psmi_assert_always(ret == 1); proto->sdma_avail_counter--; proto->sdma_fill_index++; if (proto->sdma_fill_index == proto->sdma_queue_size) proto->sdma_fill_index = 0; /* * Wait for completion of this control message if * stack buffer payload is used.
This should not be * a performance issue because sdma control message * is not a performance code path. */ if (iovcnt > 1) { /* Setup scb ready for completion. */ psmi_assert(proto->sdma_scb_queue [sdmahdr->comp_idx] == NULL); proto->sdma_scb_queue[sdmahdr->comp_idx] = scb; scb->dma_complete = 0; /* Wait for completion */ err = ips_proto_dma_wait_until(proto, scb); } else err = PSM2_OK; } else { /* * ret == 0: Driver did not queue packet. Try later. * ENOMEM: No kernel memory to queue request, try later? * * ECOMM: Link may have gone down * EINTR: Got interrupt while in writev */ if (errno == ENOMEM) { err = handle_ENOMEM_on_DMA_completion(proto); if (err == PSM2_OK) goto retry; } else if (ret == 0 || errno == ECOMM || errno == EINTR) { err = psmi_context_check_status( (const psmi_context_t *)&proto->ep->context); /* * During a link bounce the err returned from * psmi_context_check_status is PSM2_EP_NO_NETWORK. In this case * the error code which we need to return to the calling flush * function(ips_proto_flow_flush_dma) is PSM2_EP_NO_RESOURCES to * signal it to restart the timers to flush the packets. * Not doing so would leave the packet on the unacked and * pending q without the sdma descriptors ever being updated. */ if (err == PSM2_OK || err == PSM2_EP_NO_NETWORK) err = PSM2_EP_NO_RESOURCES; } else err = psmi_handle_error(proto->ep, PSM2_EP_DEVICE_FAILURE, "Unhandled error in writev(): " "%s (fd=%d,iovec=%p,len=%d)", strerror(errno), psmi_hal_get_fd(proto->ep->context.psm_hw_ctxt), &iovec, 1); } return err; } /* * Caller still expects num_sent to always be correctly set in case of an * error. * * Recoverable errors: * PSM2_OK: At least one packet was successfully queued up for DMA. * PSM2_EP_NO_RESOURCES: No scb's available to handle unaligned packets * or writev returned a recoverable error (no mem for * descriptors, dma interrupted or no space left in dma * queue). * PSM2_OK_NO_PROGRESS: Cable pulled. * * Unrecoverable errors: * PSM2_EP_DEVICE_FAILURE: Error calling hfi_sdma_inflight() or unexpected * error in calling writev(), or chip failure, rxe/txe * parity error. * PSM2_EP_NO_NETWORK: No network, no lid, ... */ static psm2_error_t scb_dma_send(struct ips_proto *proto, struct ips_flow *flow, struct ips_scb_pendlist *slist, int *num_sent) { psm2_error_t err = PSM2_OK; struct psm_hal_sdma_req_info *sdmahdr; struct ips_scb *scb; struct iovec *iovec; uint16_t iovcnt; unsigned int vec_idx = 0; unsigned int scb_idx = 0, scb_sent = 0; unsigned int num = 0, max_elem; uint32_t have_cksum; uint32_t fillidx; int16_t credits; ssize_t ret; #ifdef PSM_FI /* See comments above for fault injection */ if_pf(dma_do_fault()) goto fail; #endif /* #ifdef PSM_FI */ /* Check how many SCBs to send based on flow credits */ credits = flow->credits; psmi_assert(SLIST_FIRST(slist) != NULL); SLIST_FOREACH(scb, slist, next) { num++; credits -= scb->nfrag; if (credits <= 0) break; } if (proto->sdma_avail_counter < num) { /* if there is not enough sdma slot, * update and use what we have. 
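 * Reaping completions below can free up slots; if the counter is
 * still zero afterwards we give up with PSM2_EP_NO_RESOURCES.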
*/ err = ips_proto_dma_completion_update(proto); if (err) goto fail; if (proto->sdma_avail_counter == 0) { err = PSM2_EP_NO_RESOURCES; goto fail; } if (proto->sdma_avail_counter < num) num = proto->sdma_avail_counter; } /* header, payload, checksum, tidarray */ max_elem = 4 * num; iovec = alloca(sizeof(struct iovec) * max_elem); fillidx = proto->sdma_fill_index; SLIST_FOREACH(scb, slist, next) { /* Can't exceed posix max writev count */ if (vec_idx + (int)!!(scb->payload_size > 0) >= UIO_MAXIOV) break; psmi_assert(vec_idx < max_elem); psmi_assert_always(((scb->payload_size & 0x3) == 0) || psmi_hal_has_cap(PSM_HAL_CAP_NON_DW_MULTIPLE_MSG_SIZE)); /* Checksum all eager packets */ have_cksum = scb->ips_lrh.flags & IPS_SEND_FLAG_PKTCKSUM; /* * Setup PBC. */ psmi_hal_set_pbc( proto, flow, PSMI_FALSE, &scb->pbc, HFI_MESSAGE_HDR_SIZE, scb->payload_size + (have_cksum ? PSM_CRC_SIZE_IN_BYTES : 0)); psmi_assert(psmi_hal_dma_slot_available(fillidx, proto->ep->context. psm_hw_ctxt)); size_t extra_bytes; sdmahdr = psmi_get_sdma_req_info(scb, &extra_bytes); sdmahdr->npkts = scb->nfrag > 1 ? scb->nfrag_remaining : scb->nfrag; sdmahdr->fragsize = scb->frag_size ? scb->frag_size : flow->frag_size; sdmahdr->comp_idx = fillidx; fillidx++; if (fillidx == proto->sdma_queue_size) fillidx = 0; /* * Setup io vector. */ iovec[vec_idx].iov_base = sdmahdr; iovec[vec_idx].iov_len = psmi_hal_get_sdma_req_size(proto->ep->context. psm_hw_ctxt) + extra_bytes; vec_idx++; iovcnt = 1; _HFI_VDBG("hdr=%p,%d\n", iovec[vec_idx - 1].iov_base, (int)iovec[vec_idx - 1].iov_len); if (scb->payload_size > 0) { /* * OPA1 supports byte-aligned payload. If it is * single packet per scb, use payload_size, else * multi-packets per scb, use remaining chunk_size. * payload_size is the remaining chunk first packet * length. */ iovec[vec_idx].iov_base = ips_scb_buffer(scb); iovec[vec_idx].iov_len = scb->nfrag > 1 ? scb->chunk_size_remaining : scb->payload_size; vec_idx++; iovcnt++; #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED && IS_TRANSFER_BUF_GPU_MEM(scb)) { /* without this attr, CUDA memory accesses * do not synchronize with gpudirect-rdma accesses. * We set this field only if the currently loaded driver * supports this field. If not, we have other problems * where we have a non gpu-direct enabled driver loaded * and PSM2 is trying to use GPU features. */ if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) sdmahdr->flags = PSM_HAL_BUF_GPU_MEM; else sdmahdr->flags = 0; } else if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) sdmahdr->flags = 0; #endif _HFI_VDBG("seqno=%d hdr=%p,%d payload=%p,%d\n", scb->seq_num.psn_num, iovec[vec_idx - 2].iov_base, (int)iovec[vec_idx - 2].iov_len, iovec[vec_idx - 1].iov_base, (int)iovec[vec_idx - 1].iov_len); } /* If checksum then update checksum */ if (have_cksum) { scb->cksum[1] = scb->cksum[0]; iovec[vec_idx].iov_base = scb->cksum; iovec[vec_idx].iov_len = PSM_CRC_SIZE_IN_BYTES; vec_idx++; iovcnt++; _HFI_VDBG("chsum=%p,%d\n", iovec[vec_idx - 1].iov_base, (int)iovec[vec_idx - 1].iov_len); } /* * If it is TID receive, attached tid info. */ if (scb->tidctrl) { iovec[vec_idx].iov_base = scb->tsess; iovec[vec_idx].iov_len = scb->tsess_length; vec_idx++; iovcnt++; #ifdef PSM_CUDA /* * The driver knows to check for "flags" field in * sdma_req_info only if ctrl=2. 
*/ if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) { sdmahdr->ctrl = 2 | (PSM_HAL_EXP << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); } else #endif { sdmahdr->ctrl = 1 | (PSM_HAL_EXP << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); } _HFI_VDBG("tid-info=%p,%d\n", iovec[vec_idx - 1].iov_base, (int)iovec[vec_idx - 1].iov_len); } else { #ifdef PSM_CUDA if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) { sdmahdr->ctrl = 2 | (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); } else #endif { sdmahdr->ctrl = 1 | (PSM_HAL_EGR << PSM_HAL_SDMA_REQ_OPCODE_SHIFT) | (iovcnt << PSM_HAL_SDMA_REQ_IOVCNT_SHIFT); } } /* Can bound the number to send by 'num' */ if (++scb_idx == num) break; } psmi_assert(vec_idx > 0); retry: ret = psmi_hal_writev(iovec, vec_idx, &proto->epinfo, proto->ep->context.psm_hw_ctxt); if (ret > 0) { proto->writevFailTime = 0; /* No need for the inflight system call, we can infer its value * from * writev's return value */ scb_sent += ret; } else { /* * ret == 0: Driver did not queue packet. Try later. * ENOMEM: No kernel memory to queue request, try later? * ECOMM: Link may have gone down * EINTR: Got interrupt while in writev */ if (errno == ENOMEM) { err = handle_ENOMEM_on_DMA_completion(proto); if (err == PSM2_OK) goto retry; } else if (ret == 0 || errno == ECOMM || errno == EINTR) { err = psmi_context_check_status( (const psmi_context_t *)&proto->ep->context); /* * During a link bounce the err returned from * psmi_context_check_status is PSM2_EP_NO_NETWORK. In this case * the error code which we need to return to the calling flush * function(ips_proto_flow_flush_dma) is PSM2_EP_NO_RESOURCES to * signal the caller to restart the timers to flush the packets. * Not doing so would leave the packet on the unacked and * pending q without the sdma descriptors ever being updated. */ if (err == PSM2_OK || err == PSM2_EP_NO_NETWORK) err = PSM2_EP_NO_RESOURCES; } else { err = psmi_handle_error( proto->ep, PSM2_EP_DEVICE_FAILURE, "Unexpected error in writev(): %s (errno=%d) " "(fd=%d,iovec=%p,len=%d)", strerror(errno), errno, psmi_hal_get_fd(proto->ep->context.psm_hw_ctxt), iovec, vec_idx); goto fail; } } fail: *num_sent = scb_sent; psmi_assert(*num_sent <= num && *num_sent >= 0); return err; } /* * Because we only lazily reap send dma completions, it's possible that we * receive a packet's remote acknowledgement before seeing that packet's local * completion. As part of processing ack packets and releasing scbs, we issue * a wait for the local completion if the scb is marked as having been sent via * send dma. */ psm2_error_t ips_proto_dma_wait_until(struct ips_proto *proto, ips_scb_t *scb) { psm2_error_t err = PSM2_OK; int spin_cnt = 0; int did_yield = 0; PSMI_PROFILE_BLOCK(); do { if (spin_cnt++ == proto->ep->yield_spin_cnt) { /* Have to yield holding the PSM lock, mostly because we don't * support another thread changing internal state at this point in * the code.
*/ did_yield = 1; spin_cnt = 0; sched_yield(); } err = ips_proto_dma_completion_update(proto); if (err) return err; } while (scb->dma_complete == 0); if (did_yield) proto->stats.writev_compl_delay++; PSMI_PROFILE_UNBLOCK(); return err; } psm2_error_t ips_proto_timer_ack_callback(struct psmi_timer *current_timer, uint64_t current) { struct ips_flow *flow = ((ips_scb_t *)current_timer->context)->flow; struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto; uint64_t t_cyc_next = get_cycles(); psmi_seqnum_t err_chk_seq; ips_scb_t *scb, ctrlscb; uint8_t message_type; if (STAILQ_EMPTY(&flow->scb_unacked)) return PSM2_OK; scb = STAILQ_FIRST(&flow->scb_unacked); if (current >= scb->abs_timeout) { int done_local = 0; /* We have to ensure that the send is at least locally complete before * sending an error check or else earlier data can get to the * destination *after* we pio or dma this err_chk. */ if (flow->transfer == PSM_TRANSFER_DMA) { /* error is caught inside this routine */ ips_proto_dma_completion_update(proto); if (scb->dma_complete) done_local = 1; else proto->stats.writev_compl_eagain++; } else done_local = 1; /* Always done for PIO flows */ scb->ack_timeout = min(scb->ack_timeout * proto->epinfo.ep_timeout_ack_factor, proto->epinfo.ep_timeout_ack_max); scb->abs_timeout = t_cyc_next + scb->ack_timeout; if (done_local) { _HFI_VDBG ("sending err_chk flow=%d with first=%d,last=%d\n", flow->flowid, STAILQ_FIRST(&flow->scb_unacked)->seq_num.psn_num, STAILQ_LAST(&flow->scb_unacked, ips_scb, nextq)->seq_num.psn_num); ctrlscb.scb_flags = 0; if (proto->flags & IPS_PROTO_FLAG_RCVTHREAD) ctrlscb.scb_flags |= IPS_SEND_FLAG_INTR; err_chk_seq = (SLIST_EMPTY(&flow->scb_pend)) ? flow->xmit_seq_num : SLIST_FIRST(&flow->scb_pend)->seq_num; if (flow->protocol == PSM_PROTOCOL_TIDFLOW) { message_type = OPCODE_ERR_CHK_GEN; err_chk_seq.psn_seq -= 1; /* Receive descriptor index */ ctrlscb.ips_lrh.data[0].u64 = scb->tidsendc->rdescid.u64; /* Send descriptor index */ ctrlscb.ips_lrh.data[1].u64 = scb->tidsendc->sdescid.u64; } else { PSM2_LOG_MSG("sending ERR_CHK message"); message_type = OPCODE_ERR_CHK; err_chk_seq.psn_num = (err_chk_seq.psn_num - 1) & proto->psn_mask; } ctrlscb.ips_lrh.bth[2] = __cpu_to_be32(err_chk_seq.psn_num); ips_proto_send_ctrl_message(flow, message_type, &flow->ipsaddr->ctrl_msg_queued, &ctrlscb, ctrlscb.cksum, 0); } t_cyc_next = get_cycles() + scb->ack_timeout; } else t_cyc_next += (scb->abs_timeout - current); psmi_timer_request(proto->timerq, current_timer, t_cyc_next); return PSM2_OK; } psm2_error_t ips_proto_timer_send_callback(struct psmi_timer *current_timer, uint64_t current) { struct ips_flow *flow = ((ips_scb_t *)current_timer->context)->flow; struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto; /* If flow is marked as congested adjust injection rate - see process nak * when a congestion NAK is received. */ if_pf(flow->flags & IPS_FLOW_FLAG_CONGESTED) { /* Clear congestion flag and decrease injection rate */ flow->flags &= ~IPS_FLOW_FLAG_CONGESTED; if ((flow->path->pr_ccti + proto->cace[flow->path->pr_sl].ccti_increase) <= proto->ccti_limit) ips_cca_adjust_rate(flow->path, proto->cace[flow->path->pr_sl]. 
ccti_increase); } if (!SLIST_EMPTY(&flow->scb_pend)) flow->flush(flow, NULL); return PSM2_OK; } psm2_error_t ips_cca_adjust_rate(ips_path_rec_t *path_rec, int cct_increment) { struct ips_proto *proto = path_rec->proto; /* Increment/decrement ccti for path */ psmi_assert_always(path_rec->pr_ccti >= proto->cace[path_rec->pr_sl].ccti_min); path_rec->pr_ccti += cct_increment; /* Determine new active IPD. */ #if _HFI_DEBUGGING uint16_t prev_ipd = 0; uint16_t prev_divisor = 0; if (_HFI_CCADBG_ON) { prev_ipd = path_rec->pr_active_ipd; prev_divisor = path_rec->pr_cca_divisor; } #endif if ((path_rec->pr_static_ipd) && ((path_rec->pr_static_ipd + 1) > (proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK))) { path_rec->pr_active_ipd = path_rec->pr_static_ipd + 1; path_rec->pr_cca_divisor = 0; } else { path_rec->pr_active_ipd = proto->cct[path_rec->pr_ccti] & CCA_IPD_MASK; path_rec->pr_cca_divisor = proto->cct[path_rec->pr_ccti] >> CCA_DIVISOR_SHIFT; } #if _HFI_DEBUGGING if (_HFI_CCADBG_ON) { _HFI_CCADBG_ALWAYS("CCA: %s injection rate to <%x.%x> from <%x.%x>\n", (cct_increment > 0) ? "Decreasing" : "Increasing", path_rec->pr_cca_divisor, path_rec->pr_active_ipd, prev_divisor, prev_ipd); } #endif /* Reschedule CCA timer if this path is still marked as congested */ if (path_rec->pr_ccti > proto->cace[path_rec->pr_sl].ccti_min) { if (path_rec->pr_timer_cca == NULL) { path_rec->pr_timer_cca = (struct psmi_timer *)psmi_mpool_get(proto-> timer_pool); psmi_assert(path_rec->pr_timer_cca != NULL); psmi_timer_entry_init(path_rec->pr_timer_cca, ips_cca_timer_callback, path_rec); } psmi_timer_request(proto->timerq, path_rec->pr_timer_cca, get_cycles() + proto->cace[path_rec->pr_sl]. ccti_timer_cycles); } else if (path_rec->pr_timer_cca) { psmi_mpool_put(path_rec->pr_timer_cca); path_rec->pr_timer_cca = NULL; } return PSM2_OK; } psm2_error_t ips_cca_timer_callback(struct psmi_timer *current_timer, uint64_t current) { ips_path_rec_t *path_rec = (ips_path_rec_t *) current_timer->context; /* Increase injection rate for flow. Decrement CCTI */ if (path_rec->pr_ccti > path_rec->proto->cace[path_rec->pr_sl].ccti_min) return ips_cca_adjust_rate(path_rec, -1); psmi_mpool_put(path_rec->pr_timer_cca); path_rec->pr_timer_cca = NULL; return PSM2_OK; } opa-psm2-PSM2_11.2.185/ptl_ips/ips_proto.h000066400000000000000000000572331370564314600200740ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ #ifndef _IPS_PROTO_H #define _IPS_PROTO_H #include "ips_config.h" #include "psm_user.h" #include "ips_tid.h" #include "ips_recvhdrq.h" #include "ips_epstate.h" #include "ips_proto_am.h" #include "ips_tidflow.h" #include "ips_path_rec.h" typedef enum ips_path_type { IPS_PATH_LOW_PRIORITY, IPS_PATH_NORMAL_PRIORITY, IPS_PATH_HIGH_PRIORITY, IPS_PATH_MAX_PRIORITY } ips_path_type_t; /* * Local Endpoint info. * * Contains information necessary for composing packets for the local endpoint */ struct ips_epinfo { uint16_t ep_base_lid; uint8_t ep_baseqp; uint8_t ep_lmc; opa_rate ep_link_rate; uint16_t ep_context; uint16_t ep_subcontext; uint16_t ep_hfi_type; uint16_t ep_sl; /* HFI_SL only when path record not used */ uint16_t ep_mtu; uint16_t ep_mtu_code; uint16_t ep_piosize; uint16_t ep_pkey; /* PSM2_PKEY only when path record not used */ uint16_t ep_jkey; uint64_t ep_timeout_ack; /* PSM2_ERRCHK_TIMEOUT if no path record */ uint64_t ep_timeout_ack_max; uint32_t ep_timeout_ack_factor; }; /* * This contains a path record table that enumerates the paths available * between the local node and a remote node associated with an endpoint. * It also maintains a state value for each message priority that indicates * which path should be assigned to the next message of that priority. * * For LMC/Torus, keep a list of base and max dlid. Used for pkt verification * * pg_base_dlid and pg_base_slid are in network byte order. */ #define IPS_MAX_PATH_LMC 3 typedef struct ips_path_grp { uint16_t pg_base_dlid; uint16_t pg_base_slid; uint8_t pg_num_paths[IPS_PATH_MAX_PRIORITY]; uint8_t pg_next_path[IPS_PATH_MAX_PRIORITY]; ips_path_rec_t *pg_path[0][IPS_PATH_MAX_PRIORITY]; } ips_path_grp_t; /* * Start and finish routines for constructing an ips_proto. */ struct ips_proto; psm2_error_t ips_proto_init(const psmi_context_t *context, const struct ptl *ptl, int num_of_send_bufs, int num_of_send_desc, uint32_t imm_size, const struct psmi_timer_ctrl *timerq, /* PTL's timerq */ const struct ips_epstate *epstate, /* PTL's epstate */ void *spioc, /* PTL's opaque spio control */ struct ips_proto *proto); /* output protocol */ psm2_error_t ips_proto_fini(struct ips_proto *proto, int force, uint64_t timeout); /* * Control message structures * * ips low-level control messages to ensure reliability of eager packets.
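 * (an illustrative ring-index sketch follows)
 */

/*
 * Hedged sketch, not PSM2 API: why CTRL_MSG_QEUEUE_SIZE must be a power
 * of two.  With a power-of-two ring, wrapping an ever-increasing counter
 * to a slot index reduces to a bitwise AND.  All demo_* names below are
 * hypothetical and exist only for illustration.
 */
#define DEMO_CTRL_RING_SIZE 64	/* must be a power of two */

static inline uint32_t demo_ctrl_ring_slot(uint32_t counter)
{
	/* equivalent to (counter % DEMO_CTRL_RING_SIZE), but cheaper */
	return counter & (DEMO_CTRL_RING_SIZE - 1);
}

static inline int demo_ctrl_ring_empty(uint32_t head, uint32_t tail)
{
	return head == tail;	/* head/tail are monotonically increasing */
}

/* The real control-queue definitions follow.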
*/ #define CTRL_MSG_QEUEUE_SIZE 64 /* power of two */ struct ips_ctrlq_elem { uint8_t message_type; uint16_t *msg_queue_mask; ips_scb_t msg_scb; }; struct ips_ctrlq { /* Queued control messages, queued when pio is busy */ struct ips_proto *ctrlq_proto; uint32_t ctrlq_head; uint32_t ctrlq_tail; uint32_t ctrlq_overflow; struct ips_ctrlq_elem ctrlq_cqe[CTRL_MSG_QEUEUE_SIZE] PSMI_CACHEALIGN; struct psmi_timer ctrlq_timer; /* when in timerq */ }; /* Connect/disconnect, as implemented by ips */ /* * Connections are not pairwise but we keep a single 'epaddr' for messages-from * and messages-to a remote 'epaddr'. State transitions for connecting TO and * FROM 'epaddrs' are the following: * Connect TO (Connect OUTGOING): * NONE -> WAITING -> ESTABLISHED -> WAITING_DISC -> DISCONNECTED -> NONE * * Connect FROM (we receive a connect request - Connect INCOMING) * NONE -> ESTABLISHED -> NONE */ #define CSTATE_ESTABLISHED 1 #define CSTATE_NONE 2 #define CSTATE_OUTGOING_DISCONNECTED 3 #define CSTATE_OUTGOING_WAITING 4 #define CSTATE_OUTGOING_WAITING_DISC 5 psm2_error_t ips_proto_connect(struct ips_proto *proto, int numep, const psm2_epid_t *array_of_epid, const int *array_of_epid_mask, psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr, uint64_t timeout_in); psm2_error_t ips_proto_disconnect(struct ips_proto *proto, int force, int numep, psm2_epaddr_t array_of_epaddr[], const int array_of_epaddr_mask[], psm2_error_t array_of_errors[], uint64_t timeout_in); int ips_proto_isconnected(struct ips_epaddr *ipsaddr); /* * Pending operation structures */ struct ips_pend_sreq { STAILQ_ENTRY(ips_pend_sreq) next; psm2_mq_req_t req; uint32_t type; }; #define IPS_PENDSEND_EAGER_DATA 1 #define IPS_PENDSEND_EAGER_REQ 2 #define IPS_PENDSEND_EXP_TIDS 3 #define IPS_PENDSEND_EXP_SENDS 4 STAILQ_HEAD(ips_pendsendq, ips_pend_sreq); struct ips_pend_sends { struct ips_proto *proto; /* back ptr */ struct psmi_timer timer; struct ips_pendsendq pendq; }; /* * One instance of the protocol */ struct ips_protoexp; struct ips_proto_stats { uint64_t pio_busy_cnt; uint64_t writev_busy_cnt; uint64_t writev_compl_eagain; uint64_t writev_compl_delay; uint64_t scb_egr_unavail_cnt; uint64_t scb_exp_unavail_cnt; uint64_t hdr_overflow; uint64_t egr_overflow; uint64_t lid_zero_errs; uint64_t unknown_packets; uint64_t stray_packets; }; struct ips_proto_error_stats { uint64_t num_icrc_err; uint64_t num_ecc_err; uint64_t num_len_err; uint64_t num_tid_err; uint64_t num_dc_err; uint64_t num_dcunc_err; uint64_t num_khdrlen_err; }; /* * Updates to these stats must be reflected in ips_ptl_epaddr_stats_init */ struct ips_proto_epaddr_stats { uint64_t err_chk_send; uint64_t err_chk_recv; uint64_t nak_send; uint64_t nak_recv; uint64_t connect_req; uint64_t disconnect_req; uint64_t tids_grant_send; uint64_t tids_grant_recv; uint64_t send_rexmit; uint64_t congestion_pkts; /* IB CCA FECN packets */ }; /* OPP support structure. 
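 * (a hedged sketch of how such a table is typically populated follows)
 */

#include <dlfcn.h>	/* for the illustrative loader below only; link with -ldl */

/*
 * Hedged sketch, not the actual PSM2 loading code: a function-pointer
 * table like opp_api is normally filled in at runtime with dlopen() and
 * dlsym().  The library name and symbol names here (demo_*) are
 * hypothetical, purely to show the pattern.
 */
struct demo_opp_api {
	void *(*path_open)(void *device, int port_num);
	void (*path_close)(void *context);
};

static inline int demo_opp_load(struct demo_opp_api *api, void **lib_out)
{
	void *lib = dlopen("libdemo_opp.so", RTLD_NOW);	/* hypothetical name */
	if (lib == NULL)
		return -1;
	api->path_open = (void *(*)(void *, int))dlsym(lib, "demo_path_open");
	api->path_close = (void (*)(void *))dlsym(lib, "demo_path_close");
	if (api->path_open == NULL || api->path_close == NULL) {
		dlclose(lib);
		return -1;
	}
	*lib_out = lib;	/* caller dlclose()s the handle when done */
	return 0;
}

/* The real OPP dispatch table follows.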
*/ struct opp_api { void *(*op_path_find_hca) (const char *name, void **device); void *(*op_path_open) (void *device, int port_num); void (*op_path_close) (void *context); int (*op_path_get_path_by_rec) (void *context, ibta_path_rec_t *query, ibta_path_rec_t *response); }; struct ips_ibta_compliance_fn { psm2_error_t(*get_path_rec) (struct ips_proto *proto, uint16_t slid, uint16_t dlid, uint16_t desthfi_type, unsigned long timeout, ips_path_grp_t **ppathgrp); psm2_error_t(*fini) (struct ips_proto *proto); }; /* please don't change the flow id order */ typedef enum ips_epaddr_flow { EP_FLOW_GO_BACK_N_PIO, EP_FLOW_GO_BACK_N_DMA, EP_FLOW_TIDFLOW, /* Can either pio or dma for tidflow */ EP_FLOW_LAST /* Keep this the last endpoint flow */ } ips_epaddr_flow_t; typedef enum psm_transfer_type { PSM_TRANSFER_PIO, PSM_TRANSFER_DMA, PSM_TRANSFER_LAST /* Keep this the last transfer type */ } psm_transfer_type_t; typedef enum psm_protocol_type { PSM_PROTOCOL_GO_BACK_N, PSM_PROTOCOL_TIDFLOW, PSM_PROTOCOL_LAST /* Keep this the last protocol type */ } psm_protocol_type_t; struct ips_proto { struct ptl *ptl; /* cached */ psm2_ep_t ep; /* cached, for errors */ psm2_mq_t mq; /* cached, for mq handling */ /* Pending sends */ struct ips_pend_sends pend_sends; struct ips_epstate *epstate; struct psmi_timer_ctrl *timerq; struct ips_protoexp *protoexp; struct ips_scbctrl *scbc_rv; struct ips_spio *spioc; struct ips_scbctrl scbc_egr; struct ips_epinfo epinfo; ips_scb_t **sdma_scb_queue; uint16_t sdma_queue_size; uint16_t sdma_fill_index; uint16_t sdma_done_index; uint16_t sdma_avail_counter; uint64_t timeout_send; uint32_t flags; /* < if IPS_PROTO_FLAG_SDMA is NOT set, SPIO flow will be initialized * < if IPS_PROTO_FLAG_SPIO is NOT set, SDMA flow will be initialized * < so both flows (SDMA and PIO) will be initialized if both of the * < IPS_PROTO_FLAG_S{DMA,PIO} are CLEARED */ uint32_t iovec_thresh_eager; uint32_t iovec_thresh_eager_blocking; uint32_t psn_mask; uint32_t scb_bufsize; uint16_t flow_credits; mpool_t pend_sends_pool; mpool_t timer_pool; struct ips_ibta_compliance_fn ibta; struct ips_proto_stats stats; struct ips_proto_error_stats error_stats; struct ips_proto_epaddr_stats epaddr_stats; struct ips_proto_am proto_am; struct ips_ctrlq ctrlq; /* pure sdma mode, use dma flow, otherwise, use pio flow */ ips_epaddr_flow_t msgflowid; /* Handling tid errors */ uint32_t tiderr_cnt; uint32_t tiderr_max; uint64_t tiderr_tnext; uint64_t tiderr_warn_interval; uint64_t t_init; uint64_t t_fini; uint32_t runid_key; int num_connected_outgoing; int num_connected_incoming; int num_disconnect_requests; /* misc state variables. */ /* Smallest interval in cycles between which we warn about stray * messages This is a per-endpoint quantity, overridable with * PSM_STRAY_WARN_INTERVAL We use the same interval to send the "die" * message. 
*/ uint64_t stray_warn_interval; int done_warning; int done_once; int num_bogus_warnings; struct { uint32_t interval_secs; uint64_t next_warning; uint64_t count; } psmi_logevent_tid_send_reqs; /* SL2SC and SC2VL table for protocol */ uint16_t sl2sc[32]; /* CCA per port */ uint16_t *cct; /* cct table */ uint16_t ccti_size; /* ccti table size */ uint16_t ccti_limit; /* should be <= size-1 */ uint16_t ccti_portctrl; /* QP or SL CC */ uint32_t ccti_ctrlmap; /* map for valid sl */ struct cace { /* CACongestionEntry */ uint8_t ccti_increase; /* steps to increase */ /* uint16_t ccti_timer;*/ /* CCTI Timer in units of 1.024 usec */ uint64_t ccti_timer_cycles; /* converted from us_2_cycles() */ uint8_t ccti_threshold; /* threshold to make log */ uint8_t ccti_min; /* min value for ccti */ } cace[32]; /* 32 service levels */ /* Path record support */ uint8_t ips_ipd_delay[IBV_RATE_300_GBPS + 1]; /* * Disable the LMC based dispersive routing for all message * sizes in bytes between ips_lmc_disable_low and ips_lmc_disable_high, * inclusive. */ uint32_t ips_lmc_disable_low; uint32_t ips_lmc_disable_high; struct hsearch_data ips_path_rec_hash; struct hsearch_data ips_path_grp_hash; void *opp_lib; void *hndl; void *device; void *opp_ctxt; struct opp_api opp_fn; #ifdef PSM_CUDA struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_send_cfg; struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_small_send_cfg; mpool_t cuda_hostbuf_pool_send; mpool_t cuda_hostbuf_pool_small_send; CUstream cudastream_send; unsigned cuda_prefetch_limit; #endif /* * Control message queue for pending messages. * * Control messages are queued as pending when no PIO is available for sending * the message. They are composed on the fly and do not need buffering. * * Variables here are write once (at init) and read afterwards (except the msg * queue overflow counters). */ uint32_t ctrl_msg_queue_overflow; uint32_t ctrl_msg_queue_enqueue; uint32_t message_type_to_index[256]; #define message_type2index(proto, msg_type) (proto->message_type_to_index[(msg_type)]) time_t writevFailTime; }; static inline int ips_proto_is_disabled_pio(struct ips_proto *proto) { return !!(proto->flags & IPS_PROTO_FLAG_SDMA); } static inline int ips_proto_is_disabled_sdma(struct ips_proto *proto) { return !!(proto->flags & IPS_PROTO_FLAG_SPIO); } /* * Test the payload length against the lmc_disable_low and lmc_disable_hi * values, to determine if a transfer of this size should use LMC LIDs. * Set the IPS_SEND_FLAG_NO_LMC flag in the scb. */ static inline void ips_set_LMC_LID_choice(struct ips_proto *proto, ips_scb_t *scb, uint32_t len) { if ((len >= proto->ips_lmc_disable_low) && (len <= proto->ips_lmc_disable_high)) { PSM2_LOG_MSG("DISABLE LMC paylen %u\n", len); scb->scb_flags |= IPS_SEND_FLAG_NO_LMC; } return; } /* * Endpoint address, encapsulates per-endpoint protocol metadata * * Directly implements the ptl epaddr. */ typedef psm2_error_t(*ips_flow_flush_fn_t) (struct ips_flow *, int *nflushed); /** * ips_flow is a structure that combines all information regarding a send * from one endpoint to another one. Specifically, it is the place where * the Maximum Transmission Unit for a send is calculated, given how many * factors could possibly influence the MTU calculation. See ips_flow_init * documentation for more details. 
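 * (a small illustration of the MTU-minimum calculation follows)
 */

/*
 * Hedged sketch, not the PSM2 implementation: the fragment size of a
 * flow is the minimum of every MTU involved, per the comment above; PIO
 * sends additionally clamp to the local PIO buffer size (see
 * ips_flow_init).  demo_flow_frag_size is a hypothetical helper.
 */
static inline uint16_t demo_flow_frag_size(uint16_t remote_ep_mtu,
					   uint16_t path_mtu,
					   uint16_t local_ep_mtu)
{
	uint16_t frag = remote_ep_mtu;
	if (path_mtu < frag)
		frag = path_mtu;
	if (local_ep_mtu < frag)
		frag = local_ep_mtu;
	return frag;
}

/* The real flow structure follows.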
*/ struct ips_flow { SLIST_ENTRY(ips_flow) next; /* List of flows with pending acks */ ips_flow_flush_fn_t flush; /* flush function for this flow */ struct ips_epaddr *ipsaddr; /* back pointer, remote endpoint */ ips_path_rec_t *path; /* Path to use for flow */ uint16_t frag_size; /* < This flow's fragment size, calculated as the < minimum of all relevant MTUs involved */ uint16_t flowid:2; /* flow id: pio(0) or dma(1) or tidflow(2) */ uint16_t transfer:3; /* spio or sdma */ uint16_t protocol:3; /* go-back-n or tidflow */ uint16_t flags:8; /* flow state flags */ uint16_t cca_ooo_pkts; /* cca out of order packets */ uint16_t cwin; /* Size of congestion window */ uint16_t ack_interval; /* interval to ack packets */ uint16_t ack_counter; /* counter to ack packets */ int16_t credits; /* Current credits available to send on flow */ uint32_t ack_index; /* Index of the last ACK message type in pending message queue */ psmi_seqnum_t xmit_seq_num; /* transmit packet sequence number */ psmi_seqnum_t xmit_ack_num; /* acked packet sequence number */ psmi_seqnum_t recv_seq_num; /* received packet sequence number */ psmi_timer *timer_send; /* timer for frames that got a busy PIO */ psmi_timer *timer_ack; /* timer for unacked frames */ STAILQ_HEAD(ips_scb_unackedq, ips_scb) scb_unacked; /* unacked queue */ SLIST_HEAD(ips_scb_pendlist, ips_scb) scb_pend; /* pending queue */ #ifdef PSM_DEBUG uint32_t scb_num_pending; /* pending scb counter */ uint32_t scb_num_unacked; /* unacked scb counter */ #endif }; #define IPS_FLOW_MSG_TOGGLE_OOO_MASK (1 << 0) /* ooo msg check */ #define IPS_FLOW_MSG_TOGGLE_UNEXP_MASK (1 << 1) /* unexp msg check */ /* * Make sure ips_epaddr_t and psm2_epaddr_t can be converted to each other. */ struct ips_epaddr { struct psm2_epaddr epaddr; /* inlined psm level epaddr */ struct ips_msgctl *msgctl; /* ips level msg control */ struct ips_epaddr *next; /* linklist */ struct ips_flow flows[EP_FLOW_LAST - 1]; /* pio and dma */ ips_path_grp_t *pathgrp; /* pointer to slid/dlid group in hash */ uint32_t connidx_outgoing; /* peer's connection idx */ uint32_t connidx_incoming; /* my connection idx */ uint16_t ctrl_msg_queued; /* bitmap of queued control messages to be sent */ uint32_t window_rv; /* RNDV window size per connection */ uint8_t hpp_index; /* high priority index */ uint8_t context; /* real context value */ uint8_t subcontext; /* sub context, 3 bits, 5 bits for future */ uint8_t msg_toggle; /* only 2 bits used, 6 bits for future */ /* this portion is only for connect/disconnect */ uint64_t s_timeout; /* used as a time in close */ uint32_t runid_key; /* peer process pid */ uint32_t credit:2; /* credit to connect/disconnect: 0 or 1 */ uint32_t cstate_outgoing:3; /* connection state to, max 7 */ uint32_t cstate_incoming:3; /* connection state from, max 7 */ uint32_t delay_in_ms:8; /* disconnect delay in ms */ uint32_t cerror_outgoing:8; /* error code during connection */ uint32_t cerror_incoming:8; /* error code during connection */ }; /* * ips_msgctl_t is a per-connection struct.
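 * (first, a hedged sketch of the epaddr conversion noted above)
 */

/*
 * Hedged sketch, not PSM2 code: struct ips_epaddr inlines struct
 * psm2_epaddr as its *first* member, so a pointer to one can be cast to
 * a pointer to the other, as the "can be converted" comment above
 * relies on.  The demo_* types below are hypothetical stand-ins.
 */
struct demo_inner { int id; };
struct demo_outer {
	struct demo_inner inner;	/* must be the first member */
	int extra;
};

static inline struct demo_inner *demo_as_inner(struct demo_outer *o)
{
	return (struct demo_inner *)o;	/* same address as &o->inner */
}

static inline struct demo_outer *demo_as_outer(struct demo_inner *i)
{
	return (struct demo_outer *)i;	/* valid only because of member order */
}

/* The real per-connection message control structure follows.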
*/ struct ips_msgctl { struct ips_epaddr master_epaddr; /* Master rail's epaddr */ struct ips_epaddr *ipsaddr_next; /* next ipsaddr to send packet */ uint16_t mq_send_seqnum; /* next sending message sequence */ uint16_t mq_recv_seqnum; /* next receiving message sequence */ uint16_t am_send_seqnum; /* next sending message sequence */ uint16_t am_recv_seqnum; /* next receiving message sequence */ uint16_t ipsaddr_count; /* number of ipsaddr to use */ uint16_t outoforder_count; /* number of outoforder messages */ }; static inline __attribute__ ((unused)) void IPS_MCTXT_APPEND(ips_epaddr_t *head, ips_epaddr_t *node) { ips_epaddr_t *cur; /* The new node is inserted before head. */ node->next = head; /* Circle around the linked list to head's predecessor and update. */ for (cur = head; cur->next != head; cur = cur->next); cur->next = node; } static inline __attribute__ ((unused)) void IPS_MCTXT_REMOVE(ips_epaddr_t *node) { ips_epaddr_t *cur; /* Circle around to node's predecessor and update. */ for (cur = node; cur->next != node; cur = cur->next); cur->next = node->next; node->next = node; } /* * Initialize a flow, setting its attributes. Selects the path the flow will * use as well as calculates the flow's fragment size defined as: * - min(remote EP MTU, selected path's MTU, local EP MTU) for DMA sends * - min(remote EP MTU, selected path's MTU, local EP MTU, local PIO bufsize) for PIO sends */ void MOCKABLE(ips_flow_init)(struct ips_flow *flow, struct ips_proto *proto, ips_epaddr_t *ipsaddr, psm_transfer_type_t transfer_type, psm_protocol_type_t protocol, ips_path_type_t path_type, uint32_t flow_index); MOCK_DCL_EPILOGUE(ips_flow_init); void ips_scb_prepare_flow(ips_scb_t *scb, ips_epaddr_t *ipsaddr, struct ips_flow *flow); void MOCKABLE(ips_proto_flow_enqueue)(struct ips_flow *flow, ips_scb_t *scb); MOCK_DCL_EPILOGUE(ips_proto_flow_enqueue); psm2_error_t ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed); psm2_error_t ips_proto_flow_flush_dma(struct ips_flow *flow, int *nflushed); /* Wrapper for enqueue + flush */ psm2_error_t ips_proto_scb_pio_send(struct ips_flow *flow, ips_scb_t *scb); void ips_proto_scb_dma_enqueue(struct ips_proto *proto, ips_scb_t *scb); psm2_error_t ips_proto_scb_dma_flush(struct ips_proto *proto, ips_epaddr_t *ipsaddr, int *nflushed); psm2_error_t ips_proto_dma_wait_until(struct ips_proto *proto, ips_scb_t *scb); psm2_error_t ips_proto_dma_completion_update(struct ips_proto *proto); psm2_error_t ips_dma_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, ips_scb_t *scb, void *payload, uint32_t paylen, uint32_t have_cksum, uint32_t cksum); /* * Protocol receive processing * */ /* Error handling for unknown packet, packet is unknown when epid doesn't match * in epstate table */ int ips_proto_process_unknown(const struct ips_recvhdrq_event *rcv_ev); /* Exposed for fastpath only */ int ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev); int ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev); /* Handling error cases */ int ips_proto_process_packet_error(struct ips_recvhdrq_event *rcv_ev); /* * Protocol exception handling and frame dumps */ void ips_proto_get_rhf_errstring(uint32_t err, char *msg, size_t len); void ips_proto_dump_err_stats(struct ips_proto *proto); void ips_proto_show_rhf_errors(const uint32_t *rhdr); void ips_proto_show_header(struct ips_message_header *p_hdr, char *msg); void ips_proto_dump_frame(void *frame, int lenght, char *message); void ips_proto_dump_data(void *data, int data_length); void 
ips_proto_dump_eager(uint32_t *curr_rcv_hdr); /* * Checksum of ips packets */ uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc); /* * Matched-Queue processing and sends */ psm2_error_t ips_proto_mq_push_cts_req(struct ips_proto *proto, psm2_mq_req_t req); psm2_error_t ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req); int ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev); int ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev); int ips_proto_mq_handle_tiny(struct ips_recvhdrq_event *rcv_ev); int ips_proto_mq_handle_short(struct ips_recvhdrq_event *rcv_ev); int ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev); void ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl); int ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev); psm2_error_t ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags, psm2_mq_tag_t *tag, const void *ubuf, uint32_t len); psm2_error_t ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags_user, uint32_t flags_internal, psm2_mq_tag_t *tag, const void *ubuf, uint32_t len, void *context, psm2_mq_req_t *req_o); psm2_error_t ips_proto_msg_size_thresh_query (enum psm2_info_query_thresh_et, uint32_t *out, psm2_mq_t mq, psm2_epaddr_t); int ips_proto_am(struct ips_recvhdrq_event *rcv_ev); /* * IPS packet service routine table. */ typedef int (*ips_packet_service_fn_t)(struct ips_recvhdrq_event *rcv_ev); extern ips_packet_service_fn_t ips_packet_service_routine[OPCODE_FUTURE_FROM-OPCODE_RESERVED]; /* IBTA feature related functions (path record, sl2sc etc.) */ psm2_error_t ips_ibta_init_sl2sc_table(struct ips_proto *proto); psm2_error_t ips_ibta_link_updown_event(struct ips_proto *proto); psm2_error_t MOCKABLE(ips_ibta_init)(struct ips_proto *proto); MOCK_DCL_EPILOGUE(ips_ibta_init); psm2_error_t ips_ibta_fini(struct ips_proto *proto); PSMI_ALWAYS_INLINE( struct psm_hal_sdma_req_info * psmi_get_sdma_req_info(struct ips_scb *scb, size_t *extra)) { *extra = 0; #ifdef PSM_CUDA if (PSMI_IS_DRIVER_GPUDIRECT_DISABLED) return (struct psm_hal_sdma_req_info *)(((char *)&scb->pbc) - (sizeof(struct psm_hal_sdma_req_info) - PSM_HAL_CUDA_SDMA_REQ_INFO_EXTRA)); *extra = PSM_HAL_CUDA_SDMA_REQ_INFO_EXTRA; #endif return (struct psm_hal_sdma_req_info *)(((char *)&scb->pbc) - (sizeof(struct psm_hal_sdma_req_info))); } #ifdef PSM_CUDA PSMI_ALWAYS_INLINE( uint32_t ips_cuda_next_window(uint32_t max_window, uint32_t offset, uint32_t len)) { uint32_t window_len; window_len = len - offset; if (window_len >= max_window) window_len = max_window; return window_len; } #endif /* Determine if the FECN bit is set (IBTA 1.2.1 CCA Annex A) */ static __inline__ uint8_t _is_cca_fecn_set(const struct ips_message_header *p_hdr) { return (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_FECN_SHIFT) & 0x1; } /* Determine if the BECN bit is set (IBTA 1.2.1 CCA Annex A) */ static __inline__ uint8_t _is_cca_becn_set(const struct ips_message_header *p_hdr) { return (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_BECN_SHIFT) & 0x1; } #endif /* _IPS_PROTO_H */ opa-psm2-PSM2_11.2.185/ptl_ips/ips_proto_am.c000066600000000000000000000447731370564314600205470ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation.
This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include "psm_user.h" #include "psm2_hal.h" #include "psm2_am.h" #include "psm_am_internal.h" #include "psm_mq_internal.h" #include "ips_proto.h" #include "ips_expected_proto.h" #include "ips_proto_help.h" struct ips_am_token { struct psmi_am_token tok; /* ptl-specific token stuff */ struct ips_epaddr *epaddr_rail; struct ips_proto_am *proto_am; }; struct ips_am_message { struct ips_message_header p_hdr; struct ips_am_message *next; struct ips_epaddr *ipsaddr; struct ips_proto_am *proto_am; uint64_t *payload; uint32_t paylen; uint16_t seqnum; }; /* These variables are shared for all packet flows in a PSM process; they are * shared across multiple rails. There is no single AM object to hang these * off of, so they are declared here as globals. */ static struct { struct ips_am_message head; struct ips_am_message *tail; } ips_am_outoforder_q; static mpool_t ips_am_msg_pool; /* This calculation ensures that the number of reply slots will always be at * least twice as large + 1 as the number of request slots. This is optimal: the * minimum amount required is actually only twice as many, but it is much * slower. 
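 *
 * Worked example (illustrative): with num_send_slots = 64,
 *   num_rep_slots = 64*2/3 + 1 = 42 + 1 = 43   (integer division)
 *   num_req_slots = 64 - 43 = 21
 * and 43 >= 2*21 + 1, so the reply pool is at least twice the request
 * pool plus one, as required above.  (Integer division rounds down,
 * which is what guarantees the bound for every num_send_slots.)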
*/ #define calc_optimal_num_reply_slots(nslots) (((nslots)*2 / 3) + 1) psm2_error_t MOCKABLE(ips_proto_am_init)(struct ips_proto *proto, int num_send_slots, uint32_t imm_size, struct ips_proto_am *proto_am) { psm2_error_t err = PSM2_OK; int send_buf_size = psmi_hal_get_pio_size(proto->ep->context.psm_hw_ctxt); int num_rep_slots = calc_optimal_num_reply_slots(num_send_slots); int num_req_slots = num_send_slots - num_rep_slots; proto_am->proto = proto; /* In a node pair, the number of reply send buffers on at least one of * the nodes must be at least double the number (optimal: double + 1) of * send descriptors on the other node. While this constraint applies * only to the reply send buffers, allowing the caller to tune only the * number of request send buffers would be awkward, as they have no * knowledge of the subdivision of the memory into separate mempools for * requests and replies. It's an internal concern at this point. */ if ((err = ips_scbctrl_init(&proto->ep->context, num_req_slots, num_req_slots, imm_size, send_buf_size, NULL, NULL, &proto_am->scbc_request))) goto fail; if ((err = ips_scbctrl_init(&proto->ep->context, num_rep_slots, num_rep_slots, imm_size, send_buf_size, NULL, NULL, &proto_am->scbc_reply))) goto fail; if (ips_am_msg_pool == NULL) { union psmi_envvar_val max_msgs; ips_am_outoforder_q.head.next = NULL; ips_am_outoforder_q.tail = &ips_am_outoforder_q.head; psmi_getenv("PSM2_AM_MAX_OOO_MSGS", "Maximum number of OOO Active Messages to queue before dropping.", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)1024, &max_msgs); ips_am_msg_pool = psmi_mpool_create( sizeof(struct ips_am_message), 32, max_msgs.e_uint, 0, UNDEFINED, NULL, NULL); } fail: return err; } MOCK_DEF_EPILOGUE(ips_proto_am_init); psm2_error_t ips_proto_am_fini(struct ips_proto_am *proto_am) { ips_scbctrl_fini(&proto_am->scbc_request); ips_scbctrl_fini(&proto_am->scbc_reply); if (ips_am_msg_pool != NULL) { psmi_mpool_destroy(ips_am_msg_pool); ips_am_msg_pool = NULL; } return PSM2_OK; } /* Fill in AM capabilities parameters */ psm2_error_t ips_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters) { int max_nargs = min(1 << IPS_AM_HDR_NARGS_BITS, PSMI_AM_MAX_ARGS); int max_payload = psmi_hal_get_pio_size(ep->context.psm_hw_ctxt) - ((max_nargs - IPS_AM_HDR_NARGS) * sizeof(psm2_amarg_t)); if (parameters == NULL) { return PSM2_PARAM_ERR; } parameters->max_handlers = 1 << IPS_AM_HDR_HIDX_BITS; parameters->max_nargs = max_nargs; parameters->max_request_short = max_payload; parameters->max_reply_short = max_payload; return PSM2_OK; } static psm2_error_t am_short_reqrep(ips_scb_t *scb, struct ips_epaddr *ipsaddr, psm2_amarg_t *args, int nargs, uint8_t opcode, void *src, size_t len, int flags, int pad_bytes) { int i, hdr_qwords = IPS_AM_HDR_NARGS; struct ips_proto *proto = ((psm2_epaddr_t)ipsaddr)->proto; psmi_assert(proto->msgflowid < EP_FLOW_LAST); struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid]; /* There are a limited number of bits for nargs in the header, making overflow very easy. Make sure the values match. */ psmi_assert(nargs == scb->ips_lrh.amhdr_nargs); _HFI_VDBG("%s src=%p len=%d, nargs=%d\n", ((opcode == OPCODE_AM_REQUEST) || (opcode == OPCODE_AM_REQUEST_NOREPLY)) ? 
"req" : "rep", src, (int)len, nargs); if (nargs == 1) { /* fastpath */ scb->ips_lrh.data[0].u64w0 = args[0].u64w0; hdr_qwords--; } else if (nargs > 1) { /* Easily unrollable but leave as is in case we can increase * qwords on the chip in the near future */ for (i = 0; i < IPS_AM_HDR_NARGS; i++, hdr_qwords--) scb->ips_lrh.data[i].u64w0 = args[i].u64w0; if (nargs > IPS_AM_HDR_NARGS) { /* Slow case -- we don't have iovec and not enough * space in the message header, so we have to copy the * user's arguments even if the payload is marked ASYNC */ uintptr_t bufp = (uintptr_t) ips_scb_buffer(scb); size_t arg_payload_len = sizeof(psm2_amarg_t) * (nargs - IPS_AM_HDR_NARGS); psmi_mq_mtucpy((void *)bufp, &args[IPS_AM_HDR_NARGS], arg_payload_len); bufp += arg_payload_len; scb->payload_size = arg_payload_len; if (src != NULL && len > 0) { psmi_mq_mtucpy((void *)bufp, src, len); scb->payload_size += len; } psmi_assert(pad_bytes < (1 << IPS_AM_HDR_LEN_BITS)); scb->payload_size += pad_bytes; scb->ips_lrh.amhdr_len = pad_bytes; goto send_scb; } } if (len == 0) { scb->payload_size = 0; scb->ips_lrh.amhdr_len = 0; } else if (len <= (hdr_qwords << 3)) { /* Inline the payload into the header. */ /* This path CANNOT handle length = 0 due to limited space in the header. If IPS_SEND_FLAG_AMISTINY is set, an amhdr_len value of 0 means a full payload, i.e. 1 << IPS_AM_HDR_LEN_BITS bytes of packed payload. */ psmi_assert(len > 0); psmi_mq_mtucpy(&scb->ips_lrh. data[IPS_AM_HDR_NARGS - hdr_qwords], src, len); scb->payload_size = 0; psmi_assert(len <= (1 << IPS_AM_HDR_LEN_BITS)); scb->ips_lrh.amhdr_len = len & ((1 << IPS_AM_HDR_LEN_BITS) - 1); scb->scb_flags |= IPS_SEND_FLAG_AMISTINY; } else { /* Whatever's left requires a separate payload */ if (ips_scb_buffer(scb) == NULL) /* Just attach the buffer */ ips_scb_buffer(scb) = src; else /* May need to re-xmit user data, keep it around */ psmi_mq_mtucpy(ips_scb_buffer(scb), src, len); psmi_assert(pad_bytes < (1 << IPS_AM_HDR_LEN_BITS)); scb->payload_size = len + pad_bytes; scb->ips_lrh.amhdr_len = pad_bytes; } send_scb: ips_scb_opcode(scb) = opcode; scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->am_send_seqnum++; ips_proto_flow_enqueue(flow, scb); flow->flush(flow, NULL); return PSM2_OK; } static inline int calculate_pad_bytes(size_t len) { /* Align to dword (4 bytes) */ size_t dword_aligned_len = (len + 3) & ~3; return dword_aligned_len - len; } static inline void ips_am_scb_init(ips_scb_t *scb, uint8_t handler, int nargs, int pad_bytes, psm2_am_completion_fn_t completion_fn, void *completion_ctxt) { psmi_assert(pad_bytes < (1 << IPS_AM_HDR_LEN_BITS)); scb->completion_am = completion_fn; scb->cb_param = completion_ctxt; scb->ips_lrh.amhdr_hidx = handler; scb->ips_lrh.amhdr_len = pad_bytes; scb->ips_lrh.amhdr_nargs = nargs; scb->ips_lrh.flags = 0; if (completion_fn) scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; return; } psm2_error_t ips_am_short_request(psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, int flags, psm2_am_completion_fn_t completion_fn, void *completion_ctxt) { struct ips_proto_am *proto_am = &epaddr->proto->proto_am; psm2_error_t err; ips_scb_t *scb; ips_epaddr_t *ipsaddr; int pad_bytes = calculate_pad_bytes(len); int payload_sz = (nargs << 3); if_pt(!(flags & PSM2_AM_FLAG_ASYNC)) payload_sz += len; if (payload_sz > (IPS_AM_HDR_NARGS << 3)) { /* Payload can't fit in header, allocate buffer to carry data */ int arg_sz = (nargs > IPS_AM_HDR_NARGS) ? 
((nargs - IPS_AM_HDR_NARGS) << 3) : 0; /* len + pad_bytes + overflow_args */ PSMI_BLOCKUNTIL(epaddr->ptlctl->ep, err, ((scb = ips_scbctrl_alloc( &proto_am->scbc_request, 1, len + pad_bytes + arg_sz, IPS_SCB_FLAG_ADD_BUFFER)) != NULL)); } else { PSMI_BLOCKUNTIL(epaddr->ptlctl->ep, err, ((scb = ips_scbctrl_alloc_tiny( &proto_am->scbc_request)) != NULL)); } psmi_assert_always(scb != NULL); ips_am_scb_init(scb, handler, nargs, pad_bytes, completion_fn, completion_ctxt); /* Select the next ipsaddr for multi-rail */ ipsaddr = ((ips_epaddr_t *)epaddr)->msgctl->ipsaddr_next; ipsaddr->msgctl->ipsaddr_next = ipsaddr->next; return am_short_reqrep(scb, ipsaddr, args, nargs, (flags & PSM2_AM_FLAG_NOREPLY) ? OPCODE_AM_REQUEST_NOREPLY : OPCODE_AM_REQUEST, src, len, flags, pad_bytes); } psm2_error_t ips_am_short_reply(psm2_am_token_t tok, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, int flags, psm2_am_completion_fn_t completion_fn, void *completion_ctxt) { struct ips_am_token *token = (struct ips_am_token *)tok; struct ips_proto_am *proto_am = token->proto_am; struct ips_epaddr *ipsaddr = token->epaddr_rail; int pad_bytes = calculate_pad_bytes(len); int scb_flags = 0; ips_scb_t *scb; if (!token->tok.can_reply) { _HFI_ERROR("Invalid AM reply for request!"); return PSM2_AM_INVALID_REPLY; } psmi_assert(ips_scbctrl_avail(&proto_am->scbc_reply)); if ((nargs << 3) + len <= (IPS_AM_HDR_NARGS << 3)) { scb = ips_scbctrl_alloc_tiny(&proto_am->scbc_reply); } else { int payload_sz = (nargs << 3); payload_sz += (flags & PSM2_AM_FLAG_ASYNC) ? 0 : (len + pad_bytes); scb_flags |= (payload_sz > (IPS_AM_HDR_NARGS << 3)) ? IPS_SCB_FLAG_ADD_BUFFER : 0; scb = ips_scbctrl_alloc(&proto_am->scbc_reply, 1, payload_sz, scb_flags); } psmi_assert_always(scb != NULL); ips_am_scb_init(scb, handler, nargs, pad_bytes, completion_fn, completion_ctxt); am_short_reqrep(scb, ipsaddr, args, nargs, OPCODE_AM_REPLY, src, len, flags, pad_bytes); return PSM2_OK; } /* Prepares and runs a handler from a receive event. */ static int ips_am_run_handler(const struct ips_message_header *p_hdr, struct ips_epaddr *ipsaddr, struct ips_proto_am *proto_am, uint64_t *payload, uint32_t paylen) { struct ips_am_token token; int nargs = p_hdr->amhdr_nargs; int ret; struct psm2_ep_am_handle_entry *hentry; psm2_amarg_t *args = (psm2_amarg_t *)p_hdr->data; token.tok.flags = p_hdr->flags; token.tok.epaddr_incoming = (psm2_epaddr_t)&ipsaddr->msgctl->master_epaddr; token.tok.can_reply = (_get_proto_hfi_opcode(p_hdr) == OPCODE_AM_REQUEST); token.epaddr_rail = ipsaddr; token.proto_am = proto_am; if (token.tok.flags & IPS_SEND_FLAG_AMISTINY) { /* Payload is packed into header after args */ payload = (uint64_t *)&p_hdr->data[nargs].u64; paylen = p_hdr->amhdr_len; /* Interpret amhdr_len == 0 as 16 bytes of payload */ if (paylen == 0) paylen = 1 << IPS_AM_HDR_LEN_BITS; } else { if (nargs > IPS_AM_HDR_NARGS) { /* Args are split across header and payload */ int payload_args_len = (nargs - IPS_AM_HDR_NARGS) * sizeof(psm2_amarg_t); args = alloca(PSMI_AM_MAX_ARGS * sizeof(psm2_amarg_t)); args[0].u64 = p_hdr->data[0].u64; args[1].u64 = p_hdr->data[1].u64; memcpy(&args[2], payload, payload_args_len); payload += nargs - IPS_AM_HDR_NARGS; paylen -= payload_args_len; } /* Subtract off padding bytes (dword padding) for non-TINY. 
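 * (for non-TINY packets, amhdr_len carries the dword pad count that
 * am_short_reqrep() added to reach dword alignment)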
*/ paylen -= p_hdr->amhdr_len; } hentry = psm_am_get_handler_function(proto_am->proto->ep, p_hdr->amhdr_hidx); /* Note a guard here for hentry != NULL is not needed because at * initialization, a psmi_assert_always() assure the entry will be * non-NULL. */ if (likely(hentry->version == PSM2_AM_HANDLER_V2)) { psm2_am_handler_2_fn_t hfn2 = (psm2_am_handler_2_fn_t)hentry->hfn; ret = hfn2(&token, args, nargs, payload, paylen, hentry->hctx); } else { psm2_am_handler_fn_t hfn1 = (psm2_am_handler_fn_t)hentry->hfn; ret = hfn1(&token, args, nargs, payload, paylen); } return ret; } static int ips_proto_am_handle_outoforder_queue() { struct ips_am_message *msg, *prev; int ret = IPS_RECVHDRQ_CONTINUE; prev = &ips_am_outoforder_q.head; msg = ips_am_outoforder_q.head.next; while (msg != NULL) { struct ips_epaddr *ipsaddr = msg->ipsaddr; if (ipsaddr->msgctl->am_recv_seqnum != msg->seqnum) { prev = msg; msg = msg->next; continue; } ipsaddr->msgctl->am_recv_seqnum++; if (ips_am_run_handler(&msg->p_hdr, ipsaddr, msg->proto_am, msg->payload, msg->paylen)) ret = IPS_RECVHDRQ_BREAK; prev->next = msg->next; if (prev->next == NULL) ips_am_outoforder_q.tail = prev; psmi_mq_sysbuf_free(msg->proto_am->proto->mq, msg->payload); psmi_mpool_put(msg); msg = prev->next; } return ret; } static void ips_proto_am_queue_msg(struct ips_am_message *msg) { msg->next = NULL; ips_am_outoforder_q.tail->next = msg; ips_am_outoforder_q.tail = msg; } int ips_proto_am(struct ips_recvhdrq_event *rcv_ev) { struct ips_message_header *p_hdr = rcv_ev->p_hdr; struct ips_epaddr *ipsaddr = rcv_ev->ipsaddr; struct ips_proto_am *proto_am = &rcv_ev->proto->proto_am; ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); struct ips_flow *flow; struct ips_am_message *msg = NULL; int ret = IPS_RECVHDRQ_CONTINUE; enum ips_msg_order msgorder; psmi_assert(flowid < EP_FLOW_LAST); flow = &ipsaddr->flows[flowid]; /* * Based on AM request/reply traffic pattern, if we don't have a reply * scb slot then we can't process the request packet, we just silently * drop it. Otherwise, it will be a deadlock. note: * ips_proto_is_expected_or_nak() can not be called in this case. */ if (_get_proto_hfi_opcode(p_hdr) == OPCODE_AM_REQUEST && !ips_scbctrl_avail(&proto_am->scbc_reply)) return IPS_RECVHDRQ_CONTINUE; if (!ips_proto_is_expected_or_nak(rcv_ev)) return IPS_RECVHDRQ_CONTINUE; uint16_t send_msgseq = __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK; msgorder = ips_proto_check_msg_order(ipsaddr, flow, send_msgseq, &ipsaddr->msgctl->am_recv_seqnum); if (msgorder == IPS_MSG_ORDER_FUTURE) return IPS_RECVHDRQ_REVISIT; else if (msgorder == IPS_MSG_ORDER_FUTURE_RECV) { uint64_t *msg_payload; uint64_t *payload = ips_recvhdrq_event_payload(rcv_ev); uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev); psmi_assert(paylen == 0 || payload); msg = psmi_mpool_get(ips_am_msg_pool); if (unlikely(msg == NULL)) { /* Out of memory, drop the packet. */ flow->recv_seq_num.psn_num = (flow->recv_seq_num.psn_num - 1) & rcv_ev->proto->psn_mask; return IPS_RECVHDRQ_BREAK; } msg_payload = psmi_mq_sysbuf_alloc( proto_am->proto->mq, ips_recvhdrq_event_paylen(rcv_ev)); if (unlikely(msg_payload == NULL)) { /* Out of memory, drop the packet. 
*/ flow->recv_seq_num.psn_num = (flow->recv_seq_num.psn_num - 1) & rcv_ev->proto->psn_mask; psmi_mpool_put(msg); return IPS_RECVHDRQ_BREAK; } memcpy(&msg->p_hdr, p_hdr, sizeof(struct ips_message_header)); memcpy(msg_payload, payload, paylen); msg->payload = msg_payload; msg->ipsaddr = ipsaddr; msg->proto_am = proto_am; msg->paylen = paylen; msg->seqnum = __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK; ips_proto_am_queue_msg(msg); } else if ((msgorder == IPS_MSG_ORDER_EXPECTED) || (msgorder == IPS_MSG_ORDER_EXPECTED_MATCH)) { uint64_t *payload = ips_recvhdrq_event_payload(rcv_ev); uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev); psmi_assert(paylen == 0 || payload); if (ips_am_run_handler(p_hdr, ipsaddr, proto_am, payload, paylen)) ret = IPS_RECVHDRQ_BREAK; ips_proto_am_handle_outoforder_queue(); } /* Look if the handler replied, if it didn't, ack the request */ if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); ips_proto_process_ack(rcv_ev); return ret; } opa-psm2-PSM2_11.2.185/ptl_ips/ips_proto_am.h000066400000000000000000000065151370564314600205460ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ #ifndef _IPS_PROTO_AM_H #define _IPS_PROTO_AM_H #include "psm_user.h" #include "ips_scb.h" struct ips_proto_am { struct ips_proto *proto; /* back pointer */ struct ips_scbctrl scbc_request; struct ips_scbctrl scbc_reply; }; psm2_error_t ips_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters); psm2_error_t ips_am_short_reply(psm2_am_token_t tok, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, int flags, psm2_am_completion_fn_t completion_fn, void *completion_ctxt); psm2_error_t ips_am_short_request(psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, int flags, psm2_am_completion_fn_t completion_fn, void *completion_ctxt); psm2_error_t MOCKABLE(ips_proto_am_init)(struct ips_proto *proto, int num_send_slots, uint32_t imm_size, struct ips_proto_am *proto_am); MOCK_DCL_EPILOGUE(ips_proto_am_init); psm2_error_t ips_proto_am_fini(struct ips_proto_am *proto_am); #endif /* _IPS_PROTO_AM_H */ opa-psm2-PSM2_11.2.185/ptl_ips/ips_proto_connect.c000066400000000000000000001355231370564314600215770ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include "psm_user.h" #include "psm2_hal.h" #include "ips_proto.h" #include "psm_mq_internal.h" #include "ips_proto_internal.h" /* * define connection version. 
this is the basic version, optimized * version will be added later for scalability. */ #define IPS_CONNECT_VERNO 0x0001 struct ips_connect_hdr { uint16_t connect_verno; /* should be ver 1 */ uint16_t psm_verno; /* should be 2.0 */ uint32_t connidx; /* ignore if 0xffffffff */ uint64_t epid; /* epid of connector process */ }; struct ips_connect_reqrep { uint16_t connect_verno; /* should be ver 1 */ uint16_t psm_verno; /* should be 2.0 */ uint32_t connidx; /* ignore if 0xffffffff */ uint64_t epid; /* epid of connector process */ /* above should be same as ips_connect_hdr */ uint16_t connect_result; /* error code */ uint16_t sl; /* service level for matching */ uint16_t mtu; /* receive payload */ uint16_t job_pkey; /* partition key for verification */ uint32_t runid_key; /* one-time stamp connect key */ uint32_t initpsn; /* initial psn for flow */ char hostname[128]; /* sender's hostname string */ }; /* Startup protocol in PSM/IPS * * Start timer. * * For all nodes to connect to: * Grab connect lock * Look up epid in table * MATCH. * assert cstate_outgoing != CONNECT_WAITING (no re-entrancy) * If cstate_outgoing == CONNECT_DONE * return the already connected address. * else * assert cstate_outgoing == CONNECT_NONE * assert cstate_incoming == CONNECT_DONE * cstate_outgoing := CONNECT_WAITING * assert connidx_outgoing != UNKNOWN && connidx_incoming != UNKNOWN * req->connidx := epaddr->connidx_incoming * add to list of pending connect. * NO MATCH * allocate epaddr and put in table * cstate_outgoing := CONNECT_WAITING * cstate_incoming := CONNECT_NONE * connidx_outgoing := UNKNOWN * req->connidx := epaddr->connidx_incoming := NEW connidx integer * add to list of pending connect * Release connect lock * * expected_connect_count = ep->total_connect_count + num_to_connect * while (expected_connect_count != ep->total_connect_count) * check for timeout * progress(); * * For all connection requests received (within progress loop) * If uuid doesn't match, NAK the connect and skip request * Grab connect lock * Look up epid in table * MATCH * if cstate_incoming == CONNECT_DONE * req->connidx := epaddr->connidx_incoming * compose reply and send again (this is a dupe request). * else * assert cstate_incoming == CONNECT_NONE * assert cstate_outgoing == (CONNECT_WAITING | CONNECT_DONE) * cstate_incoming := CONNECT_DONE * epaddr->connidx_outgoing := req->connidx * req->connidx := epaddr->connidx_incoming * NO MATCH * allocate epaddr and put in table * cstate_incoming := CONNECT_DONE * epaddr->connidx_outgoing = req->connidx; * rep->connidx := epaddr->connidx_incoming := NEW connidx integer * compose connect reply and send * Release connect lock * * For all connection replies received: * If connect_result != 0, process error and skip. * assert cstate_outgoing == CONNECT_WAITING * if cstate_incoming == CONNECT_DONE * assert rep->connidx == epaddr->connidx_outgoing * else * epaddr->connidx_outgoing := rep->connidx * cstate_outgoing := CONNECT_DONE * ep->total_connect_count ++ * * * Fill in a connection request: * 1. Set connect protocol version and PSM versions * 2. Set the uuid attached to current endpoint and add the job_pkey * the node wishes to communicate post-connect. * 3. Set our mtu, bitwidth and endianness to detect inconsistencies * */ /** * Configure flows for an ipsaddr.
* * @arg ipsaddr - the ipsaddr to configure the flows for * @arg proto - the protocol used * * @pre proto's flags must be set * * Flows should be configured: * - immediately upon creation of an ipsaddr * - whenever a connection is established and the receiver's characteristics * (e.g. mtu) become known */ ustatic void ips_ipsaddr_configure_flows(struct ips_epaddr *ipsaddr, struct ips_proto *proto) { /* PIO flow uses the normal priority path, to separate low * priority path for bulk sdma data packets */ ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], proto, ipsaddr, PSM_TRANSFER_PIO, PSM_PROTOCOL_GO_BACK_N, IPS_PATH_NORMAL_PRIORITY, EP_FLOW_GO_BACK_N_PIO); /* DMA flow uses the low priority path, multi MTU sized eager * message uses the same flow to transfer to avoid out of order. */ ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA], proto, ipsaddr, PSM_TRANSFER_DMA, PSM_PROTOCOL_GO_BACK_N, IPS_PATH_LOW_PRIORITY, EP_FLOW_GO_BACK_N_DMA); } /* * Teardown any unnecessary timers that could still be active and assign NULL * to pointers in flow structs. We do this mainly for PIO and DMA flows. * TidFlow teardowns are conducted in ips_protoexp_fini() */ static void ips_flow_fini(struct ips_epaddr *ipsaddr, struct ips_proto *proto) { struct ips_flow *flow; int i; for (i = 0; i < EP_FLOW_TIDFLOW; i++) { flow = &ipsaddr->flows[i]; /* Cancel any stale flow->timers in flight */ if (flow->timer_ack) { psmi_timer_cancel(proto->timerq, flow->timer_ack); flow->timer_ack = NULL; } if (flow->timer_send) { psmi_timer_cancel(proto->timerq, flow->timer_send); flow->timer_send = NULL; } flow->flush = NULL; flow->path = NULL; flow->ipsaddr = NULL; } } static psm2_epaddr_t ips_alloc_epaddr(struct ips_proto *proto, int master, psm2_epid_t epid, const char *hostname, uint16_t hfi_type, unsigned long timeout); /* * Given a connection request, set mtu, communication index and hdr length * parameters. * * The most subtle parameter is the mtu. When set as 'req->mtu', the mtu * is our connecting peer's declared mtu (which may not be the same as our * mtu). The approach is to take the smaller of both mtus when communicating * with that peer. Also, when using pio, the size can be further restricted by * the pio send buffer sizes (i.e. 4K IB MTU but only 2K PIO buffers). */ static psm2_error_t ips_ipsaddr_set_req_params(struct ips_proto *proto, ips_epaddr_t *ipsaddr, const struct ips_connect_reqrep *req, uint32_t paylen) { psm2_ep_t ep; psm2_epaddr_t epaddr; psm2_error_t err = PSM2_OK; int i, start, count; uint64_t *data; psmi_assert_always(req->mtu > 0); uint16_t common_mtu = min(req->mtu, proto->epinfo.ep_mtu); int ptype, pidx; /* * Make RNDV window size being dependent on MTU size; * This is due to fact that number of send packets * within a given window must not exceed 2048 (@ref PSM_TID_MAX_PKTS). * Use smaller of two values: * unified MTU * PSM_TID_MAX_PKTS vs already configured window size. */ ipsaddr->window_rv = min(common_mtu * PSM_TID_MAX_PKTS, proto->mq->hfi_base_window_rv); /* * For static routes i.e. "none" path resolution update all paths to * have the same profile (mtu, sl etc.). * * For path record queries the epr_mtu and epr_sl are setup correctly * from the path itself. 
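 *
 * A minimal sketch of the MTU negotiation feeding the loop below, using
 * hypothetical sizes (common_mtu is computed above from the request):
 *
 *   uint16_t local_mtu = 8192;                   // proto->epinfo.ep_mtu
 *   uint16_t peer_mtu = 4096;                    // req->mtu, peer's declared MTU
 *   uint16_t common = min(local_mtu, peer_mtu);  // 4096, used in both directions
 *   // "none" path resolution: every path takes common as-is;
 *   // real path record queries: each path keeps min(common, pr_mtu)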
*/ for (ptype = IPS_PATH_LOW_PRIORITY; ptype < IPS_PATH_MAX_PRIORITY; ptype++) for (pidx = 0; pidx < ipsaddr->pathgrp->pg_num_paths[ptype]; pidx++) { if (proto->ep->path_res_type == PSM2_PATH_RES_NONE) { ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu = common_mtu; } else { ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu = min(common_mtu, ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu); } } /* * We've got updated mtu/path records, need to re-initialize the flows to take * into account _real_ (updated) remote endpoint characteristics */ ips_ipsaddr_configure_flows(ipsaddr, proto); /* * Save peer's info. */ ipsaddr->connidx_outgoing = req->connidx; ipsaddr->runid_key = req->runid_key; /* ipsaddr->initpsn = req->initpsn; */ err = psmi_epid_set_hostname(psm2_epid_nid(((psm2_epaddr_t) ipsaddr)->epid), (char *)req->hostname, 0); if (err) return err; /* * Check if there is other rails to setup. */ paylen -= sizeof(struct ips_connect_reqrep); if (paylen == 0) return PSM2_OK; /* * Yes, other rail's gid/epid is attached. */ if (paylen % (sizeof(uint64_t) + sizeof(psm2_epid_t))) { return PSM2_INTERNAL_ERR; } count = paylen / (sizeof(uint64_t) + sizeof(psm2_epid_t)); if (count > HFI_MAX_RAILS) return PSM2_INTERNAL_ERR; /* * Both side are ordered, so just search from small to big. */ start = 0; data = (uint64_t *) (req + 1); ep = proto->ep->mctxt_next; struct drand48_data drand48_data; srand48_r((long int)(ipsaddr->epaddr.epid + proto->ep->epid), &drand48_data); /* Loop over all slave endpoints */ while (ep != ep->mctxt_master) { for (i = start; i < count; i++) { /* There is a gid match, create the epaddr */ if (data[2 * i] == ep->gid_hi) { epaddr = ips_alloc_epaddr(&((struct ptl_ips *)(ep->ptl_ips.ptl))->proto, 0, data[2 * i + 1], NULL, PSMI_HFI_TYPE_OPA1, 5000); if (epaddr == NULL) return PSM2_NO_MEMORY; /* link the ipsaddr */ IPS_MCTXT_APPEND(ipsaddr, (ips_epaddr_t *) epaddr); /* Setup message control info to the same struct */ ((ips_epaddr_t *) epaddr)->msgctl = ipsaddr->msgctl; ipsaddr->msgctl->ipsaddr_count++; /* randomize the rail to start traffic */ long int rnum; lrand48_r(&drand48_data, &rnum); if ((rnum % count) == i) { ipsaddr->msgctl->ipsaddr_next = (ips_epaddr_t *) epaddr; } /* update the starting point, * all previous ones are not valid anymore */ start = i + 1; break; } } ep = ep->mctxt_next; } return PSM2_OK; } static psm2_error_t ips_proto_send_ctrl_message_request(struct ips_proto *proto, struct ips_flow *flow, uint8_t message_type, uint16_t *msg_queue_mask, uint64_t timeout) { psm2_error_t err = PSM2_OK; ips_scb_t ctrlscb; /* msg header plus gid+epid for all rails plus checksum */ char payload[sizeof(struct ips_connect_reqrep) + 16*HFI_MAX_RAILS + PSM_CRC_SIZE_IN_BYTES]; uint32_t paylen; ctrlscb.scb_flags = 0; paylen = ips_proto_build_connect_message(proto, flow->ipsaddr, message_type, payload); psmi_assert_always(paylen <= sizeof(payload)); do { err = ips_proto_send_ctrl_message(flow, message_type, msg_queue_mask, &ctrlscb, payload, paylen); if (err == PSM2_OK) { break; } if ((err = psmi_err_only(psmi_poll_internal(proto->ep, 1)))) { break; } } while (get_cycles() < timeout); return err; } static psm2_error_t ips_proto_send_ctrl_message_reply(struct ips_proto *proto, struct ips_flow *flow, uint8_t message_type, uint16_t *msg_queue_mask) { /* This will try up to 100 times until the message is sent. The code * is persistent because dropping replies will lead to a lack of * overall progress on the connection/disconnection. 
We do not want * to poll from here, and we cannot afford a lengthy timeout, since * this is called from the receive path. */ psm2_error_t err = PSM2_OK; int i; ips_scb_t ctrlscb; /* msg header plus gid+epid for all rails plus checksum */ char payload[sizeof(struct ips_connect_reqrep) + 16*HFI_MAX_RAILS + PSM_CRC_SIZE_IN_BYTES]; uint32_t paylen; ctrlscb.scb_flags = 0; paylen = ips_proto_build_connect_message(proto, flow->ipsaddr, message_type, payload); psmi_assert_always(paylen <= sizeof(payload)); for (i = 0; i < 100; i++) { err = ips_proto_send_ctrl_message(flow, message_type, msg_queue_mask, &ctrlscb, payload, paylen); if (err == PSM2_OK) { break; } } return err; } int ips_proto_build_connect_message(struct ips_proto *proto, ips_epaddr_t *ipsaddr, uint8_t opcode, void *payload) { struct ips_connect_hdr *hdr = (struct ips_connect_hdr *)payload; struct ips_connect_reqrep *req = (struct ips_connect_reqrep *)payload; uint32_t paylen = 0; psmi_assert_always(proto != NULL); hdr->connect_verno = IPS_CONNECT_VERNO; hdr->psm_verno = PSMI_VERNO; hdr->connidx = (uint32_t) ipsaddr->connidx_incoming; hdr->epid = proto->ep->epid; switch (opcode) { case OPCODE_CONNECT_REPLY: case OPCODE_CONNECT_REQUEST: if (opcode == OPCODE_CONNECT_REQUEST) { req->connect_result = PSM2_OK; req->runid_key = proto->runid_key; } else { req->connect_result = ipsaddr->cerror_incoming; req->runid_key = ipsaddr->runid_key; } req->sl = proto->epinfo.ep_sl; req->mtu = proto->epinfo.ep_mtu; req->job_pkey = proto->epinfo.ep_pkey; strncpy(req->hostname, psmi_gethostname(), sizeof(req->hostname) - 1); req->hostname[sizeof(req->hostname) - 1] = '\0'; paylen = sizeof(struct ips_connect_reqrep); /* Attach all multi-context subnetids and epids. */ if (proto->ep->mctxt_master == proto->ep) { psm2_ep_t ep = proto->ep->mctxt_next; uint64_t *data = (uint64_t *) (req + 1); while (ep != proto->ep) { *data = ep->gid_hi; paylen += sizeof(uint64_t); data++; *data = ep->epid; paylen += sizeof(uint64_t); data++; ep = ep->mctxt_next; } } break; case OPCODE_DISCONNECT_REQUEST: case OPCODE_DISCONNECT_REPLY: paylen = sizeof(struct ips_connect_hdr); break; default: psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unexpected/unhandled connection opcode 0x%x\n", opcode); break; } return paylen; } void MOCKABLE(ips_flow_init)(struct ips_flow *flow, struct ips_proto *proto, ips_epaddr_t *ipsaddr, psm_transfer_type_t transfer_type, psm_protocol_type_t protocol, ips_path_type_t path_type, uint32_t flow_index) { psmi_assert_always(protocol < PSM_PROTOCOL_LAST); psmi_assert_always(flow_index < EP_FLOW_LAST); SLIST_NEXT(flow, next) = NULL; if (transfer_type == PSM_TRANSFER_PIO) { flow->flush = ips_proto_flow_flush_pio; } else { flow->flush = ips_proto_flow_flush_dma; } flow->path = ips_select_path(proto, path_type, ipsaddr, ipsaddr->pathgrp); /* Select the fragment size for this flow. Flow is the common * denominator between the local endpoint, the remote endpoint, * the path between those and whether it's a PIO or DMA send. * Hence, it "owns" the maximum transmission unit in its frag_size * member. 
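 *
 * A condensed sketch of the selection below, with hypothetical sizes:
 *
 *   uint32_t frag = min(proto->epinfo.ep_mtu, flow->path->pr_mtu);
 *   // e.g. min(8192, 4096) = 4096 for an SDMA flow
 *   if (transfer_type == PSM_TRANSFER_PIO)
 *       frag = min(frag, proto->epinfo.ep_piosize);
 *   // e.g. min(4096, 2048) = 2048 when PIO buffers are 2K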
*/ /* min of local MTU and path MTU */ flow->frag_size = min(proto->epinfo.ep_mtu, flow->path->pr_mtu); /* if PIO, need to consider local pio buffer size */ if (transfer_type == PSM_TRANSFER_PIO) { flow->frag_size = min(flow->frag_size, proto->epinfo.ep_piosize); _HFI_VDBG("[ipsaddr=%p] PIO flow->frag_size: %u = min(" "proto->epinfo.ep_mtu(%u), flow->path->pr_mtu(%u), proto->epinfo.ep_piosize(%u))\n", ipsaddr, flow->frag_size, proto->epinfo.ep_mtu, flow->path->pr_mtu, proto->epinfo.ep_piosize); } else { _HFI_VDBG("[ipsaddr=%p] SDMA flow->frag_size: %u = min(" "proto->epinfo.ep_mtu(%u), flow->path->pr_mtu(%u))\n", ipsaddr, flow->frag_size, proto->epinfo.ep_mtu, flow->path->pr_mtu); } flow->ipsaddr = ipsaddr; flow->transfer = transfer_type; flow->protocol = protocol; flow->flowid = flow_index; flow->xmit_seq_num.psn_val = 0; flow->recv_seq_num.psn_val = 0; flow->xmit_ack_num.psn_val = 0; flow->flags = 0; flow->cca_ooo_pkts = 0; flow->credits = flow->cwin = proto->flow_credits; flow->ack_interval = max((proto->flow_credits >> 2) - 1, 1); flow->ack_counter = 0; #ifdef PSM_DEBUG flow->scb_num_pending = 0; flow->scb_num_unacked = 0; #endif flow->timer_ack = NULL; flow->timer_send = NULL; STAILQ_INIT(&flow->scb_unacked); SLIST_INIT(&flow->scb_pend); return; } MOCK_DEF_EPILOGUE(ips_flow_init); static psm2_epaddr_t ips_alloc_epaddr(struct ips_proto *proto, int master, psm2_epid_t epid, const char *hostname, uint16_t hfi_type, unsigned long timeout) { psm2_error_t err = PSM2_OK; psm2_epaddr_t epaddr; ips_epaddr_t *ipsaddr; ips_path_grp_t *pathgrp; uint16_t lid; /* The PSM/PTL-level epaddr, ips-level epaddr, and per-peer msgctl * structures are collocated in memory for performance reasons -- this is * why ips allocates memory for all three together. * * The PSM/PTL structure data is filled in upon successfully ep connect in * ips_ptl_connect(). */ if (master) { struct ips_msgctl *msgctl; /* Although an ips_msgtl is allocated here, it can be safely casted to both an ips_epaddr and a psm2_epaddr. It is eventually freed as an ips_epaddr. */ msgctl = (struct ips_msgctl *)psmi_calloc(proto->ep, PER_PEER_ENDPOINT, 1, sizeof(struct ips_msgctl)); if (msgctl == NULL) return NULL; ipsaddr = &msgctl->master_epaddr; epaddr = (psm2_epaddr_t) ipsaddr; ipsaddr->msgctl = msgctl; /* initialize items in ips_msgctl_t */ msgctl->ipsaddr_next = ipsaddr; msgctl->mq_send_seqnum = 0; msgctl->mq_recv_seqnum = 0; msgctl->am_send_seqnum = 0; msgctl->am_recv_seqnum = 0; msgctl->ipsaddr_count = 1; msgctl->outoforder_count = 0; } else { epaddr = (psm2_epaddr_t) psmi_calloc(proto->ep, PER_PEER_ENDPOINT, 1, sizeof(struct ips_epaddr)); psmi_assert_always(epaddr); ipsaddr = (ips_epaddr_t *) epaddr; } epaddr->ptlctl = ((struct ptl_ips *)(proto->ptl))->ctl; epaddr->proto = proto; epaddr->epid = epid; /* IPS-level epaddr */ ipsaddr->next = ipsaddr; ipsaddr->ctrl_msg_queued = 0; ipsaddr->msg_toggle = 0; /* Actual context of peer */ ipsaddr->context = PSMI_EPID_GET_CONTEXT(epid); /* Subcontext */ ipsaddr->subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); /* Get path record for tuple */ lid = PSMI_EPID_GET_LID(epid); err = proto->ibta.get_path_rec(proto, proto->epinfo.ep_base_lid, __cpu_to_be16(lid), hfi_type, timeout, &pathgrp); if (err != PSM2_OK) { psmi_free(epaddr); return NULL; } ipsaddr->pathgrp = pathgrp; /* Setup high priority path index, control messages use the high * priority CONTROL path. 
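 *
 * A condensed sketch of the policy dispatch below (names shortened from
 * the ones used in this function; the modulo spreads peers across the
 * available high-priority paths):
 *
 *   if (adaptive)         hpp_index = 0;  // path varies per message later
 *   else if (static_dst)  hpp_index = peer_context % num_high_prio_paths;
 *   else if (static_src)  hpp_index = my_context % num_high_prio_paths;
 *   else                  hpp_index = 0;  // base LID routing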
*/ if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) ipsaddr->hpp_index = 0; else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST) ipsaddr->hpp_index = ipsaddr->context % ipsaddr->pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY]; else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC) ipsaddr->hpp_index = proto->epinfo.ep_context % ipsaddr->pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY]; else /* Base LID */ ipsaddr->hpp_index = 0; /* * Set up the flows on this ipsaddr */ ips_ipsaddr_configure_flows(ipsaddr, proto); /* clear connection state. */ ipsaddr->cstate_outgoing = CSTATE_NONE; ipsaddr->cstate_incoming = CSTATE_NONE; /* Add epaddr to PSM's epid table */ psmi_epid_add(proto->ep, epaddr->epid, epaddr); psmi_assert(psmi_epid_lookup(proto->ep, epaddr->epid) == epaddr); return epaddr; } static void ips_free_epaddr(psm2_epaddr_t epaddr, struct ips_proto *proto) { ips_epaddr_t *ipsaddr = (ips_epaddr_t *) epaddr; ips_flow_fini(ipsaddr, proto); _HFI_VDBG("epaddr=%p,ipsaddr=%p,connidx_incoming=%d\n", epaddr, ipsaddr, ipsaddr->connidx_incoming); psmi_epid_remove(epaddr->proto->ep, epaddr->epid); ips_epstate_del(epaddr->proto->epstate, ipsaddr->connidx_incoming); psmi_free(epaddr); return; } static psm2_error_t ptl_handle_connect_req(struct ips_proto *proto, psm2_epaddr_t epaddr, struct ips_connect_reqrep *req, uint32_t paylen); psm2_error_t ips_proto_process_connect(struct ips_proto *proto, uint8_t opcode, struct ips_message_header *p_hdr, void *payload, uint32_t paylen) { struct ips_connect_hdr *hdr = (struct ips_connect_hdr *)payload; psm2_epaddr_t epaddr; ips_epaddr_t *ipsaddr; psm2_error_t err = PSM2_OK; PSMI_LOCK_ASSERT(proto->mq->progress_lock); epaddr = psmi_epid_lookup(proto->ep, hdr->epid); ipsaddr = epaddr ? (ips_epaddr_t *) epaddr : NULL; switch (opcode) { case OPCODE_CONNECT_REQUEST: err = ptl_handle_connect_req(proto, epaddr, (struct ips_connect_reqrep *)hdr, paylen); break; case OPCODE_CONNECT_REPLY: { struct ips_connect_reqrep *req = (struct ips_connect_reqrep *)payload; if (!ipsaddr || req->runid_key != proto->runid_key) { _HFI_PRDBG ("Unknown connectrep (ipsaddr=%p, %d,%d) from epid %d:%d:%d\n", ipsaddr, req->runid_key, proto->runid_key, (int)PSMI_EPID_GET_LID(hdr->epid), (int)PSMI_EPID_GET_CONTEXT(hdr->epid), (int)PSMI_EPID_GET_SUBCONTEXT(hdr->epid)); } else if (ipsaddr->cstate_outgoing != CSTATE_OUTGOING_WAITING) { /* possible dupe */ _HFI_VDBG("connect dupe, expected %d got %d\n", CSTATE_OUTGOING_WAITING, ipsaddr->cstate_outgoing); } else { /* Reply to our request for connection (i.e. outgoing connection) */ if (ipsaddr->cstate_incoming != CSTATE_ESTABLISHED) { err = ips_ipsaddr_set_req_params(proto, ipsaddr, req, paylen); if (err) goto fail; } ipsaddr->cstate_outgoing = CSTATE_ESTABLISHED; ipsaddr->cerror_outgoing = req->connect_result; } } break; case OPCODE_DISCONNECT_REQUEST: { ips_epaddr_t ipsaddr_f; /* fake a ptl addr */ int epaddr_do_free = 0; psmi_assert_always(paylen == sizeof(struct ips_connect_hdr)); _HFI_VDBG("Got a disconnect from %s\n", psmi_epaddr_get_name(hdr->epid)); proto->num_disconnect_requests++; /* It's possible to get a disconnection request on a ipsaddr that * we've since removed if the request is a dupe. Instead of * silently dropping the packet, we "echo" the request in the * reply. 
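 *
 * In brief (condensed from the branch below): a throwaway on-stack
 * ips_epaddr is faked up just well enough to carry one control message,
 * so the duplicate requester still gets its DISCONNECT_REPLY:
 *
 *   if (ipsaddr == NULL) {              // peer state already freed
 *       ipsaddr = &ipsaddr_f;           // stack-local, zero-initialized
 *       ipsaddr_f.ctrl_msg_queued = ~0; // never queue on this fake addr
 *       // ... resolve the path, init one PIO flow, then echo the reply
 *   }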
*/ if (ipsaddr == NULL) { ips_path_grp_t *pathgrp; uint16_t lid; ipsaddr = &ipsaddr_f; memset(&ipsaddr_f, 0, sizeof(ips_epaddr_t)); ipsaddr_f.context = PSMI_EPID_GET_CONTEXT(hdr->epid); ipsaddr_f.subcontext = PSMI_EPID_GET_SUBCONTEXT(hdr->epid); /* Get path record for peer */ lid = PSMI_EPID_GET_LID(hdr->epid); err = proto->ibta.get_path_rec(proto, proto->epinfo. ep_base_lid, __cpu_to_be16(lid), PSMI_HFI_TYPE_OPA1, 3000, &pathgrp); if (err != PSM2_OK) goto fail; ipsaddr_f.pathgrp = pathgrp; ((psm2_epaddr_t) &ipsaddr_f)->ptlctl = ((struct ptl_ips *)(proto->ptl))->ctl; ((psm2_epaddr_t) &ipsaddr_f)->proto = proto; /* If the send fails because of pio_busy, don't let ips queue * the request on an invalid ipsaddr, just drop the reply */ ipsaddr_f.ctrl_msg_queued = ~0; psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); ips_flow_init(&ipsaddr_f. flows[proto->msgflowid], proto, &ipsaddr_f, PSM_TRANSFER_PIO, PSM_PROTOCOL_GO_BACK_N, IPS_PATH_LOW_PRIORITY, EP_FLOW_GO_BACK_N_PIO); _HFI_VDBG ("Disconnect on unknown epaddr, just echo request\n"); } else if (ipsaddr->cstate_incoming != CSTATE_NONE) { ipsaddr->cstate_incoming = CSTATE_NONE; proto->num_connected_incoming--; if (ipsaddr->cstate_outgoing == CSTATE_NONE) { epaddr_do_free = 1; } } psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); ips_proto_send_ctrl_message_reply(proto, &ipsaddr-> flows[proto-> msgflowid], OPCODE_DISCONNECT_REPLY, &ipsaddr-> ctrl_msg_queued); /* We can safely free the ipsaddr if required since disconnect * messages are never enqueued so no reference to ipsaddr is kept */ if (epaddr_do_free) { ips_free_epaddr(epaddr, proto); epaddr = NULL; } } break; case OPCODE_DISCONNECT_REPLY: if (!ipsaddr) { _HFI_VDBG ("Unknown disconnect reply from epid %d:%d.%d\n", (int)PSMI_EPID_GET_LID(hdr->epid), (int)PSMI_EPID_GET_CONTEXT(hdr->epid), (int)PSMI_EPID_GET_SUBCONTEXT(hdr->epid)); break; } else if (ipsaddr->cstate_outgoing == CSTATE_OUTGOING_WAITING_DISC) { ipsaddr->cstate_outgoing = CSTATE_OUTGOING_DISCONNECTED; /* Freed in disconnect() if cstate_incoming == NONE */ } /* else dupe reply */ break; default: psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unexpected/unhandled connect opcode 0x%x\n", opcode); } fail: return err; } static psm2_error_t ptl_handle_connect_req(struct ips_proto *proto, psm2_epaddr_t epaddr, struct ips_connect_reqrep *req, uint32_t paylen) { ips_epaddr_t *ipsaddr; psm2_error_t err = PSM2_OK; uint16_t connect_result; int newconnect = 0; if (req->epid == proto->ep->epid) { psmi_handle_error(PSMI_EP_NORETURN, PSM2_EPID_NETWORK_ERROR, "Network connectivity problem: Locally detected duplicate " "LIDs 0x%04x on hosts %s and %s. (Exiting)", (uint32_t) psm2_epid_nid(req->epid), psmi_epaddr_get_hostname(req->epid), psmi_gethostname()); /* XXX no return */ abort(); } else if (epaddr == NULL) { /* new ep connect before we call into connect */ newconnect = 1; if ((epaddr = ips_alloc_epaddr(proto, 1, req->epid, req->hostname, PSMI_HFI_TYPE_OPA1, 5000)) == NULL) { err = PSM2_NO_MEMORY; goto fail; } } else if (((ips_epaddr_t *) epaddr)->cstate_incoming == CSTATE_ESTABLISHED) { ipsaddr = (ips_epaddr_t *) epaddr; /* Duplicate lid detection. */ if (ipsaddr->runid_key == req->runid_key) goto do_reply; /* duplicate request, not duplicate lid */ else { /* Some out of context message. Just drop it */ if (!proto->done_warning) { psmi_syslog(proto->ep, 1, LOG_INFO, "Non-fatal connection problem: Received an out-of-context " "connection message from host %s LID=0x%x context=%d. 
(Ignoring)", req->hostname, (int)psm2_epid_nid(req->epid), psm2_epid_context(req->epid)); proto->done_warning = 1; } goto no_reply; } } else if (((ips_epaddr_t *) epaddr)->cstate_outgoing == CSTATE_NONE) { /* pre-created epaddr in multi-rail */ psmi_assert_always(epaddr->proto->ep != epaddr->proto->ep->mctxt_master); newconnect = 1; } ipsaddr = (ips_epaddr_t *) epaddr; psmi_assert_always(ipsaddr->cstate_incoming == CSTATE_NONE); /* Check connect version and psm version */ if (req->connect_verno < 0x0001) { psmi_handle_error(PSMI_EP_NORETURN, PSM2_EPID_INVALID_VERSION, "Connect protocol (%x,%x) is obsolete and incompatible", (req->connect_verno >> 8) & 0xff, req->connect_verno & 0xff); connect_result = PSM2_EPID_INVALID_CONNECT; } else if (!psmi_verno_isinteroperable(req->psm_verno)) { connect_result = PSM2_EPID_INVALID_VERSION; } else if (!(proto->flags & IPS_PROTO_FLAG_QUERY_PATH_REC) && proto->epinfo.ep_pkey != HFI_DEFAULT_P_KEY && proto->epinfo.ep_pkey != req->job_pkey) { connect_result = PSM2_EPID_INVALID_PKEY; } else if (req->sl != proto->epinfo.ep_sl) { connect_result = PSM2_EPID_INVALID_CONNECT; _HFI_ERROR("Connection error: Service Level mismatch (local:%d, remote:%d)\n", proto->epinfo.ep_sl, req->sl); } else { connect_result = PSM2_OK; if (ipsaddr->cstate_outgoing == CSTATE_NONE) { ips_epstate_idx idx; psmi_assert_always(newconnect == 1); err = ips_epstate_add(proto->epstate, ipsaddr, &idx); if (err) goto fail; ipsaddr->connidx_incoming = idx; } } /* Incoming connection request */ if (ipsaddr->cstate_outgoing != CSTATE_ESTABLISHED) { err = ips_ipsaddr_set_req_params(proto, ipsaddr, req, paylen); if (err) goto fail; } ipsaddr->cstate_incoming = CSTATE_ESTABLISHED; ipsaddr->cerror_incoming = connect_result; ipsaddr->runid_key = req->runid_key; proto->num_connected_incoming++; do_reply: psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); ips_proto_send_ctrl_message_reply(proto, &ipsaddr->flows[proto->msgflowid], OPCODE_CONNECT_REPLY, &ipsaddr->ctrl_msg_queued); no_reply: fail: return err; } psm2_error_t ips_proto_connect(struct ips_proto *proto, int numep, const psm2_epid_t *array_of_epid, const int *array_of_epid_mask, psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr, uint64_t timeout_in) { int i, n, n_first; psm2_error_t err = PSM2_OK; psm2_epaddr_t epaddr; ips_epaddr_t *ipsaddr; ips_epstate_idx idx; int numep_toconnect = 0, numep_left; union psmi_envvar_val credits_intval; int connect_credits; psmi_getenv("PSM2_CONNECT_CREDITS", "End-point connect request credits.", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)100, &credits_intval); connect_credits = credits_intval.e_uint; PSMI_LOCK_ASSERT(proto->mq->progress_lock); /* All timeout values are in cycles */ uint64_t t_start = get_cycles(); /* Print a timeout at the warning interval */ union psmi_envvar_val warn_intval; uint64_t to_warning_interval; uint64_t to_warning_next; /* Setup warning interval */ psmi_getenv("PSM2_CONNECT_WARN_INTERVAL", "Period in seconds to warn if connections are not completed." "Default is 300 seconds, 0 to disable", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)300, &warn_intval); to_warning_interval = nanosecs_to_cycles(warn_intval.e_uint * SEC_ULL); to_warning_next = t_start + to_warning_interval; /* Some sanity checks */ psmi_assert_always(array_of_epid_mask != NULL); /* First pass: make sure array of errors is at least fully defined */ for (i = 0; i < numep; i++) { _HFI_VDBG("epid-connect=%s connect to %d:%d:%d\n", array_of_epid_mask[i] ? 
"YES" : " NO", (int)PSMI_EPID_GET_LID(array_of_epid[i]), (int)PSMI_EPID_GET_CONTEXT(array_of_epid[i]), (int)PSMI_EPID_GET_SUBCONTEXT(array_of_epid[i])); if (array_of_epid_mask[i]) { array_of_errors[i] = PSM2_EPID_UNKNOWN; array_of_epaddr[i] = NULL; } } /* Second pass: see what to connect and what is connectable. */ for (i = 0, numep_toconnect = 0; i < numep; i++) { if (!array_of_epid_mask[i]) continue; /* Can't send to epid on same lid if not loopback */ if ((psm2_epid_nid(proto->ep->epid) == psm2_epid_nid(array_of_epid[i])) && !(proto->flags & IPS_PROTO_FLAG_LOOPBACK)) { array_of_errors[i] = PSM2_EPID_UNREACHABLE; continue; } if ((PSMI_EPID_VERSION == PSMI_EPID_V2) && (PSMI_GET_SUBNET_ID(proto->ep->gid_hi) != PSMI_EPID_GET_SUBNET_ID(array_of_epid[i]))) { psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, " Trying to connect to a HFI (subnet id - %"PRIu64")on a" " different subnet - %"PRIu64" \n", PSMI_GET_SUBNET_ID(proto->ep->gid_hi), PSMI_EPID_GET_SUBNET_ID(array_of_epid[i])); } epaddr = psmi_epid_lookup(proto->ep, array_of_epid[i]); if (epaddr == NULL) { /* We're sending a connect request message before some other node * has sent its connect message */ epaddr = ips_alloc_epaddr(proto, 1, array_of_epid[i], NULL, PSMI_HFI_TYPE_OPA1, (timeout_in / 1000000UL)); if (epaddr == NULL) { err = PSM2_NO_MEMORY; goto fail; } ipsaddr = (ips_epaddr_t *) epaddr; err = ips_epstate_add(proto->epstate, ipsaddr, &idx); if (err) goto fail; ipsaddr->connidx_incoming = idx; } else if (((ips_epaddr_t *) epaddr)->cstate_outgoing != CSTATE_NONE) { /* already connected */ psmi_assert_always(((ips_epaddr_t *) epaddr)-> cstate_outgoing == CSTATE_ESTABLISHED); array_of_errors[i] = PSM2_EPID_ALREADY_CONNECTED; array_of_epaddr[i] = epaddr; continue; } else if (((ips_epaddr_t *) epaddr)->cstate_incoming == CSTATE_NONE) { /* pre-created epaddr in multi-rail */ psmi_assert_always(epaddr->proto->ep != epaddr->proto->ep->mctxt_master); ipsaddr = (ips_epaddr_t *) epaddr; err = ips_epstate_add(proto->epstate, ipsaddr, &idx); if (err) goto fail; ipsaddr->connidx_incoming = idx; } else { /* We've already received a connect request message from a remote * peer, it's time to send our own. */ ipsaddr = (ips_epaddr_t *) epaddr; /* No re-entrancy sanity check and makes sure we are not connected * twice (caller's precondition) */ psmi_assert(ipsaddr->cstate_outgoing == CSTATE_NONE); psmi_assert(ipsaddr->cstate_incoming != CSTATE_NONE); } ipsaddr->cstate_outgoing = CSTATE_OUTGOING_WAITING; ipsaddr->cerror_outgoing = PSM2_OK; array_of_epaddr[i] = epaddr; ipsaddr->s_timeout = get_cycles(); ipsaddr->delay_in_ms = 1; ipsaddr->credit = 0; numep_toconnect++; } /* Second pass: do the actual connect. * PSM2_EPID_UNKNOWN: Not connected yet. * PSM2_EPID_UNREACHABLE: Not to be connected. * PSM2_OK: Successfully connected. * Start sending connect messages at a random index between 0 and numep-1 */ numep_left = numep_toconnect; n_first = ((uint32_t) get_cycles()) % numep; while (numep_left > 0) { for (n = 0; n < numep; n++) { int keep_polling = 1; i = (n_first + n) % numep; if (!array_of_epid_mask[i]) continue; switch (array_of_errors[i]) { case PSM2_EPID_UNREACHABLE: case PSM2_EPID_ALREADY_CONNECTED: case PSM2_OK: continue; default: break; } psmi_assert_always(array_of_epaddr[i] != NULL); ipsaddr = (ips_epaddr_t *) array_of_epaddr[i]; if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED) { /* This is not the real error code, we only set OK here * so we know to stop polling for the reply. 
The actual * error is in ipsaddr->cerror_outgoing */ array_of_errors[i] = PSM2_OK; numep_left--; connect_credits++; ipsaddr->credit = 0; continue; } while (keep_polling) { if (!psmi_cycles_left(t_start, timeout_in)) { err = PSM2_TIMEOUT; goto err_timeout; } if (to_warning_interval && get_cycles() >= to_warning_next) { #if _HFI_DEBUGGING uint64_t waiting_time = 0; if (_HFI_INFO_ON) { waiting_time = cycles_to_nanosecs( get_cycles() - t_start) / SEC_ULL; } #endif const char *first_name = NULL; int num_waiting = 0; for (i = 0; i < numep; i++) { if (!array_of_epid_mask[i] || array_of_errors[i] != PSM2_EPID_UNKNOWN) continue; if (!first_name) first_name = psmi_epaddr_get_name (array_of_epid[i]); num_waiting++; } if (_HFI_INFO_ON) { if (first_name) { _HFI_INFO_ALWAYS ("Couldn't connect to %s (and %d others). " "Time elapsed %02i:%02i:%02i. Still trying...\n", first_name, num_waiting, (int)(waiting_time / 3600), (int)((waiting_time / 60) - ((waiting_time / 3600) * 60)), (int)(waiting_time - ((waiting_time / 60) * 60))); } } to_warning_next = get_cycles() + to_warning_interval; } if (get_cycles() > ipsaddr->s_timeout) { if (!ipsaddr->credit && connect_credits) { ipsaddr->credit = 1; connect_credits--; } if (ipsaddr->credit) { _HFI_VDBG ("Connect req to %u:%u:%u\n", __be16_to_cpu(ipsaddr-> pathgrp->pg_base_dlid), ipsaddr->context, ipsaddr->subcontext); psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); if ( ips_proto_send_ctrl_message_request (proto, &ipsaddr-> flows[proto->msgflowid], OPCODE_CONNECT_REQUEST, &ipsaddr->ctrl_msg_queued, 0) == PSM2_OK) { keep_polling = 0; ipsaddr->delay_in_ms = min(100, ipsaddr-> delay_in_ms << 1); ipsaddr->s_timeout = get_cycles() + nanosecs_to_cycles (ipsaddr-> delay_in_ms * MSEC_ULL); } /* If not, send got "busy", keep trying */ } else { keep_polling = 0; } } if ((err = psmi_err_only(psmi_poll_internal (proto->ep, 1)))) goto fail; if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED) { /* This is not the real error code, we only set OK here * so we know to stop polling for the reply. The actual * error is in ipsaddr->cerror_outgoing */ array_of_errors[i] = PSM2_OK; numep_left--; connect_credits++; ipsaddr->credit = 0; break; } } } } err_timeout: /* Find the worst error to report */ for (i = 0; i < numep; i++) { if (!array_of_epid_mask[i]) continue; switch (array_of_errors[i]) { /* These are benign */ case PSM2_EPID_UNREACHABLE: case PSM2_EPID_ALREADY_CONNECTED: break; case PSM2_EPID_UNKNOWN: array_of_errors[i] = PSM2_TIMEOUT; err = psmi_error_cmp(err, PSM2_TIMEOUT); break; case PSM2_OK: /* Restore the real connect error */ ipsaddr = (ips_epaddr_t *) array_of_epaddr[i]; array_of_errors[i] = ipsaddr->cerror_outgoing; psmi_assert_always(ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED); if (ipsaddr->cerror_outgoing != PSM2_OK) { err = psmi_error_cmp(err, ipsaddr->cerror_outgoing); ips_free_epaddr(array_of_epaddr[i], proto); array_of_epaddr[i] = NULL; } else { proto->num_connected_outgoing++; psmi_assert_always(ipsaddr->pathgrp-> pg_path[0] [IPS_PATH_HIGH_PRIORITY]-> pr_mtu > 0); } break; default: break; } } fail: return err; } /* Repercussions on MQ. * * If num_connected==0, everything that exists in the posted queue should * complete and the error must be marked epid_was_closed. 
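 *
 * A condensed sketch of the graceful path implemented below (a forced
 * close skips the drain, sends one DISCONNECT_REQUEST per peer, and
 * immediately forces the state to OUTGOING_DISCONNECTED):
 *
 *   while (peers_left && get_cycles() < timeout) {
 *       // only peers whose PIO/DMA scb_unacked queues have drained
 *       // take a credit and move to CSTATE_OUTGOING_WAITING_DISC
 *       send(OPCODE_DISCONNECT_REQUEST);   // resent with backoff
 *       psmi_poll_internal(ep, 1);         // the reply moves the state to
 *   }                                      // CSTATE_OUTGOING_DISCONNECTED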
* */ psm2_error_t ips_proto_disconnect(struct ips_proto *proto, int force, int numep, psm2_epaddr_t array_of_epaddr[], const int array_of_epaddr_mask[], psm2_error_t array_of_errors[], uint64_t timeout_in) { ips_epaddr_t *ipsaddr; int numep_left, numep_todisc, i, n; int n_first; int has_pending; uint64_t timeout; psm2_error_t err = PSM2_OK; uint64_t reqs_sent = 0; union psmi_envvar_val credits_intval; int disconnect_credits; uint64_t t_warning, t_start; union psmi_envvar_val warn_intval; unsigned warning_secs; /* In case of a forced close, we cancel whatever timers are pending * on the proto so that we don't have zombie timers coming back * after the internal structures of PSM2 have been destroyed */ if (force) { struct psmi_timer *t_cursor; TAILQ_FOREACH(t_cursor, &proto->timerq->timerq, timer) { psmi_timer_cancel(proto->timerq, t_cursor); } } psmi_assert_always(numep > 0); psmi_getenv("PSM2_DISCONNECT_CREDITS", "End-point disconnect request credits.", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)100, &credits_intval); disconnect_credits = credits_intval.e_uint; /* Setup warning interval */ psmi_getenv("PSM2_DISCONNECT_WARN_INTERVAL", "Period in seconds to warn if disconnections are not completed." "Default is 300 seconds, 0 to disable.", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)300, &warn_intval); warning_secs = warn_intval.e_uint; PSMI_LOCK_ASSERT(proto->mq->progress_lock); /* First pass: see what to disconnect and what is disconnectable */ for (i = 0, numep_todisc = 0; i < numep; i++) { if (!array_of_epaddr_mask[i]) continue; psmi_assert_always(array_of_epaddr[i]->ptlctl->ptl == proto->ptl); ipsaddr = (ips_epaddr_t *) array_of_epaddr[i]; ipsaddr->credit = 0; if (ipsaddr->cstate_outgoing == CSTATE_NONE) { array_of_errors[i] = PSM2_OK; continue; } else { psmi_assert_always(ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED); } _HFI_VDBG("disconnecting %p\n", array_of_epaddr[i]); array_of_errors[i] = PSM2_EPID_UNKNOWN; numep_todisc++; } if (numep_todisc == 0) goto success; /* Wait for everyone to ack previous packets before putting */ if (timeout_in == 0) timeout = ~0ULL; else timeout = get_cycles() + nanosecs_to_cycles(timeout_in); t_start = get_cycles(); t_warning = t_start + nanosecs_to_cycles(warning_secs * SEC_ULL); n_first = ((uint32_t) get_cycles()) % numep; if (!force) { numep_left = numep_todisc; do { for (n = 0; n < numep; n++) { i = (n_first + n) % numep; if (!array_of_epaddr_mask[i] || array_of_errors[i] == PSM2_OK) continue; ipsaddr = (ips_epaddr_t *) array_of_epaddr[i]; switch (ipsaddr->cstate_outgoing) { case CSTATE_OUTGOING_DISCONNECTED: array_of_errors[i] = PSM2_OK; numep_left--; disconnect_credits++; ipsaddr->credit = 0; continue; case CSTATE_OUTGOING_WAITING_DISC: if (ipsaddr->s_timeout > get_cycles()) continue; ipsaddr->delay_in_ms = min(100, ipsaddr->delay_in_ms << 1); ipsaddr->s_timeout = get_cycles() + nanosecs_to_cycles(ipsaddr-> delay_in_ms * MSEC_ULL); psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); ips_proto_send_ctrl_message_request (proto, &ipsaddr->flows[proto->msgflowid], OPCODE_DISCONNECT_REQUEST, &ipsaddr->ctrl_msg_queued, timeout); reqs_sent++; break; case CSTATE_ESTABLISHED: /* Still pending acks, hold off for now */ has_pending = !STAILQ_EMPTY(&ipsaddr->flows [EP_FLOW_GO_BACK_N_PIO]. scb_unacked) || !STAILQ_EMPTY(&ipsaddr->flows [EP_FLOW_GO_BACK_N_DMA]. 
scb_unacked); if (has_pending) continue; if (!ipsaddr->credit && disconnect_credits) { ipsaddr->credit = 1; disconnect_credits--; } if (!ipsaddr->credit) continue; ipsaddr->delay_in_ms = 1; ipsaddr->cstate_outgoing = CSTATE_OUTGOING_WAITING_DISC; ipsaddr->s_timeout = get_cycles() + nanosecs_to_cycles(MSEC_ULL); psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); ips_proto_send_ctrl_message_request (proto, &ipsaddr->flows[proto->msgflowid], OPCODE_DISCONNECT_REQUEST, &ipsaddr->ctrl_msg_queued, timeout); reqs_sent++; break; default: psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unhandled/unknown close state %d", ipsaddr->cstate_outgoing); break; } } if (numep_left == 0) break; if ((err = psmi_err_only(psmi_poll_internal(proto->ep, 1)))) goto fail; if (warning_secs && get_cycles() > t_warning) { _HFI_INFO ("graceful close in progress for %d/%d peers " "(elapsed=%d millisecs,timeout=%d millisecs,reqs=%lld)\n", numep_left, numep_todisc, (int)(cycles_to_nanosecs (get_cycles() - t_start) / MSEC_ULL), (int)(timeout_in / MSEC_ULL), (unsigned long long)reqs_sent); t_warning = get_cycles() + nanosecs_to_cycles(warning_secs * SEC_ULL); } } while (timeout > get_cycles()); if (numep_left > 0) { err = PSM2_TIMEOUT; for (i = 0; i < numep; i++) { if (!array_of_epaddr_mask[i]) continue; if (array_of_errors[i] == PSM2_EPID_UNKNOWN) { array_of_errors[i] = PSM2_TIMEOUT; _HFI_VDBG ("disc timeout on index %d, epaddr %s\n", i, psmi_epaddr_get_name (array_of_epaddr[i]->epid)); } } _HFI_PRDBG("graceful close incomplete for %d/%d peers " "(elapsed=%d millisecs,timeout=%d millisecs,reqs=%lld)\n", numep_left, numep_todisc, (int)(cycles_to_nanosecs (get_cycles() - t_start) / MSEC_ULL), (int)(timeout_in / MSEC_ULL), (unsigned long long)reqs_sent); } else _HFI_PRDBG ("graceful close complete from %d peers in %d millisecs, reqs_sent=%lld\n", numep_todisc, (int)(cycles_to_nanosecs(get_cycles() - t_start) / MSEC_ULL), (unsigned long long)reqs_sent); } else { psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); for (n = 0; n < numep; n++) { i = (n_first + n) % numep; if (!array_of_epaddr_mask[i]) continue; ipsaddr = (ips_epaddr_t *) array_of_epaddr[i]; psmi_assert_always(ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED); ips_proto_send_ctrl_message_request(proto, &ipsaddr-> flows[proto->msgflowid], OPCODE_DISCONNECT_REQUEST, &ipsaddr->ctrl_msg_queued, 0); /* Force state to DISCONNECTED */ ipsaddr->cstate_outgoing = CSTATE_OUTGOING_DISCONNECTED; array_of_errors[i] = PSM2_OK; } _HFI_VDBG("non-graceful close complete from %d peers\n", numep); } for (i = 0; i < numep; i++) { if (!array_of_epaddr_mask[i] || array_of_errors[i] != PSM2_OK) continue; ipsaddr = (ips_epaddr_t *) array_of_epaddr[i]; if (ipsaddr->cstate_outgoing == CSTATE_NONE) continue; psmi_assert_always(ipsaddr->cstate_outgoing == CSTATE_OUTGOING_DISCONNECTED); proto->num_connected_outgoing--; /* Remote disconnect req arrived already, remove this epid. If it * hasn't arrived yet, that's okay, we'll pick it up later and just * mark our connect-to status as being "none". 
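 *
 * In brief (sketch of the branch below):
 *
 *   if (ipsaddr->cstate_incoming == CSTATE_NONE)  // peer also done:
 *       ips_free_epaddr(epaddr, proto);           //   free right away
 *   else                                          // peer still connected
 *       ipsaddr->cstate_outgoing = CSTATE_NONE;   //   to us: free later,
 *                                                 //   on its own request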
*/ if (ipsaddr->cstate_incoming == CSTATE_NONE) { ips_free_epaddr(array_of_epaddr[i], proto); array_of_epaddr[i] = NULL; } else ipsaddr->cstate_outgoing = CSTATE_NONE; } fail: success: return err; } int ips_proto_isconnected(ips_epaddr_t *ipsaddr) { if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED || ipsaddr->cstate_incoming == CSTATE_ESTABLISHED) return 1; else return 0; } opa-psm2-PSM2_11.2.185/ptl_ips/ips_proto_dump.c000066400000000000000000000200241370564314600211000ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ #include "psm_user.h" #include "psm2_hal.h" #include "ips_proto.h" #include "ips_expected_proto.h" #include "ips_proto_help.h" void ips_proto_dump_frame(void *frame, int length, char *message) { uint8_t *raw_frame = frame; int counter; char default_message[] = ""; if (!message) message = default_message; printf("\nHex dump of %i bytes at %p from %s\n", length, frame, message); for (counter = 0; counter < length; counter++) { if ((counter % 16) == 0) printf("\n"); if ((counter % 4) == 0) printf(" "); printf("%02X ", raw_frame[counter]); } printf("\n"); } void ips_proto_dump_data(void *data, int data_length) { int counter; uint8_t *payload = (uint8_t *) data; printf("\nHex dump of data, length = %i\n", data_length); for (counter = 0; counter < data_length; counter++) { if ((counter % 16) == 0) printf("\n %04d: ", counter); if ((counter % 4) == 0) printf(" "); printf("%02X ", payload[counter]); } printf("\n"); } void ips_proto_show_header(struct ips_message_header *p_hdr, char *msg) { psmi_seqnum_t ack_seq_num; printf("\nHeader decoding in hex: %s\n", msg ? msg : ""); printf("LRH: VL4-LVer4-SL4-Res2-LNH2: %x\n", __be16_to_cpu(p_hdr->lrh[0])); printf("LRH: DLID %x\n", __be16_to_cpu(p_hdr->lrh[1])); printf("LRH: Res4-PktLen12 %x\n", __be16_to_cpu(p_hdr->lrh[2])); printf("LRH: SLID %x\n", __be16_to_cpu(p_hdr->lrh[3])); printf("BTH: OpCode8-SE1-M1-PC2-TVer4-Pkey16 %x\n", __be32_to_cpu(p_hdr->bth[0])); printf("BTH: F1-B1-Res6-DestQP24 %x\n", __be32_to_cpu(p_hdr->bth[1])); printf("BTH: A1-PSN31 %x\n", __be32_to_cpu(p_hdr->bth[2])); printf("IPH: jkey-hcrc %x\n", __le32_to_cpu(p_hdr->khdr.kdeth1)); printf("IPH: kver-sh-intr-tidctrl-tid-om-offset %x\n", __le32_to_cpu(p_hdr->khdr.kdeth0)); printf("opcode %x\n", _get_proto_hfi_opcode(p_hdr)); ack_seq_num.psn_num = p_hdr->ack_seq_num; if (GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0))) printf("TidFlow Flow: %x, Gen: %x, Seq: %x\n", (__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_FLOWID_SHIFT) & HFI_BTH_FLOWID_MASK, (__be32_to_cpu(p_hdr->bth[2]) >> HFI_BTH_GEN_SHIFT) & HFI_BTH_GEN_MASK, (__be32_to_cpu(p_hdr->bth[2]) >> HFI_BTH_SEQ_SHIFT) & HFI_BTH_SEQ_MASK); else if (ips_proto_flowid(p_hdr) == EP_FLOW_TIDFLOW) printf("ack_seq_num gen %x, seq %x\n", ack_seq_num.psn_gen, ack_seq_num.psn_seq); else printf("ack_seq_num %x\n", ack_seq_num.psn_num); printf("src_rank/connidx %x\n", p_hdr->connidx); if (GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0))) printf("tid_session_gen %d\n", p_hdr->exp_rdescid_genc); printf("flags %x\n", p_hdr->flags); } /* Linux doesn't have strlcat; this is a stripped down implementation */ /* not super-efficient, but we use it rarely, and only for short strings */ /* not fully standards conforming!
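 *
 * A usage sketch against a hypothetical buffer, showing the truncation
 * behavior the error-string builders below rely on:
 *
 *   char msg[8] = "ab";
 *   size_t r = strlcat(msg, "cdefghij", sizeof(msg));
 *   // msg is now "abcdefg" (NUL-terminated, never overflows), r == 8;
 *   // a standards-conforming strlcat would have returned 10 here
 *   // (the length it tried to create), hence the caveat above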
*/ static size_t strlcat(char *d, const char *s, size_t l) { int dlen = strlen(d), slen, max; if (l <= dlen) /* bug */ return l; slen = strlen(s); max = l - (dlen + 1); if (slen > max) slen = max; memcpy(d + dlen, s, slen); d[dlen + slen] = '\0'; return dlen + slen + 1; /* standard says to return full length, not actual */ } /* decode RHF errors; only used one place now, may want more later */ void ips_proto_get_rhf_errstring(uint32_t err, char *msg, size_t len) { *msg = '\0'; /* if no errors, and so don't need to check what's first */ if (err & PSMI_HAL_RHF_ERR_ICRC) strlcat(msg, "icrcerr ", len); if (err & PSMI_HAL_RHF_ERR_ECC) strlcat(msg, "eccerr ", len); if (err & PSMI_HAL_RHF_ERR_LEN) strlcat(msg, "lenerr ", len); if (err & PSMI_HAL_RHF_ERR_TID) strlcat(msg, "tiderr ", len); if (err & PSMI_HAL_RHF_ERR_DC) strlcat(msg, "dcerr ", len); if (err & PSMI_HAL_RHF_ERR_DCUN) strlcat(msg, "dcuncerr ", len); if (err & PSMI_HAL_RHF_ERR_KHDRLEN) strlcat(msg, "khdrlenerr ", len); } void ips_proto_dump_err_stats(struct ips_proto *proto) { char err_stat_msg[2048]; char tmp_buf[128]; int len = sizeof(err_stat_msg); if (!(hfi_debug & __HFI_PKTDBG)) return; *err_stat_msg = '\0'; if (proto->error_stats.num_icrc_err || proto->error_stats.num_ecc_err || proto->error_stats.num_len_err || proto->error_stats.num_tid_err || proto->error_stats.num_dc_err || proto->error_stats.num_dcunc_err || proto->error_stats.num_khdrlen_err) { snprintf(tmp_buf, sizeof(tmp_buf), "ERROR STATS: "); if (proto->error_stats.num_icrc_err) { snprintf(tmp_buf, sizeof(tmp_buf), "ICRC: %" PRIu64 " ", proto->error_stats.num_icrc_err); strlcat(err_stat_msg, tmp_buf, len); } if (proto->error_stats.num_ecc_err) { snprintf(tmp_buf, sizeof(tmp_buf), "ECC: %" PRIu64 " ", proto->error_stats.num_ecc_err); strlcat(err_stat_msg, tmp_buf, len); } if (proto->error_stats.num_len_err) { snprintf(tmp_buf, sizeof(tmp_buf), "LEN: %" PRIu64 " ", proto->error_stats.num_len_err); strlcat(err_stat_msg, tmp_buf, len); } if (proto->error_stats.num_tid_err) { snprintf(tmp_buf, sizeof(tmp_buf), "TID: %" PRIu64 " ", proto->error_stats.num_tid_err); strlcat(err_stat_msg, tmp_buf, len); } if (proto->error_stats.num_dc_err) { snprintf(tmp_buf, sizeof(tmp_buf), "DC: %" PRIu64 " ", proto->error_stats.num_dc_err); strlcat(err_stat_msg, tmp_buf, len); } if (proto->error_stats.num_dcunc_err) { snprintf(tmp_buf, sizeof(tmp_buf), "DCUNC: %" PRIu64 " ", proto->error_stats.num_dcunc_err); strlcat(err_stat_msg, tmp_buf, len); } if (proto->error_stats.num_khdrlen_err) { snprintf(tmp_buf, sizeof(tmp_buf), "KHDRLEN: %" PRIu64 " ", proto->error_stats.num_khdrlen_err); strlcat(err_stat_msg, tmp_buf, len); } strlcat(err_stat_msg, "\n", len); } else strlcat(err_stat_msg, "No previous errors.\n", len); _HFI_ERROR("%s", err_stat_msg); } opa-psm2-PSM2_11.2.185/ptl_ips/ips_proto_expected.c000066400000000000000000002627521370564314600217540ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2016 Intel Corporation. All rights reserved. */ #include "psm_user.h" #include "psm2_hal.h" #include "ips_scb.h" #include "ips_tid.h" #include "ips_tidflow.h" #include "ips_proto.h" #include "ips_expected_proto.h" #include "ips_proto_help.h" #include "psm_mq_internal.h" /* * Timer callbacks. When we need work to be done out of the receive process * loop, we schedule work on timers to be done at a later time. 
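*/

/*
 * A minimal sketch of that deferral pattern, using hypothetical names
 * (the example_* identifiers are not part of PSM2, and the type of the
 * timer queue is assumed here): work discovered in the receive path is
 * queued, and a psmi_timer is armed so the work is retried on a later
 * progress pass instead of blocking the receive loop.
 */
#if 0	/* illustrative sketch only, never compiled */
struct example_work {
	STAILQ_ENTRY(example_work) next;
};
STAILQ_HEAD(example_pendq, example_work);

struct example_ctx {
	struct psmi_timer_ctrl *timerq;		/* assumed type of the timerq field */
	struct example_pendq pend_workq;	/* deferred work items */
};

static psm2_error_t example_try_one(struct example_work *w);

static psm2_error_t
example_pend_work_callback(struct psmi_timer *timer, uint64_t current)
{
	struct example_ctx *ctx = (struct example_ctx *)timer->context;

	while (!STAILQ_EMPTY(&ctx->pend_workq)) {
		/* Try one queued item; on transient resource exhaustion,
		 * leave it queued and re-arm the timer for a later retry. */
		if (example_try_one(STAILQ_FIRST(&ctx->pend_workq)) != PSM2_OK) {
			psmi_timer_request(ctx->timerq, timer,
					   PSMI_TIMER_PRIO_1);
			break;
		}
		STAILQ_REMOVE_HEAD(&ctx->pend_workq, next);
	}
	return PSM2_OK;
}
#endif

/* The real deferral callbacks are declared below and follow this shape.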
*/ static psm2_error_t ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current); static psm2_error_t ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current); static void ips_protoexp_do_tf_seqerr(void *vpprotoexp /* actually: struct ips_protoexp *protoexp */, void *vptidrecvc /* actually: struct ips_tid_recv_desc *tidrecvc */, struct ips_message_header *p_hdr); static void ips_protoexp_do_tf_generr(void *vpprotoexp /* actually: struct ips_protoexp *protoexp */, void *vptidrecvc /* actually: struct ips_tid_recv_desc *tidrecvc */, struct ips_message_header *p_hdr); static void ips_tid_scbavail_callback(struct ips_scbctrl *scbc, void *context); static void ips_tid_avail_callback(struct ips_tid *tidc, void *context); static void ips_tidflow_avail_callback(struct ips_tf *tfc, void *context); /* Defined at the ptl-level (breaks abstractions but needed for shared vs * non-shared contexts */ extern int ips_ptl_recvq_isempty(const struct ptl *ptl); static psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc); static psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc); #ifdef PSM_CUDA static void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp, struct ips_tid_send_desc *tidsendc); static void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, psm2_mq_req_t req, struct ips_tid_send_desc *tidsendc, struct ips_cuda_hostbuf *chb_prev, uint32_t tsess_srcoff, uint32_t tsess_length, uint32_t tsess_unaligned_start, psm2_chb_match_type_t type); #endif psm2_error_t MOCKABLE(ips_protoexp_init)(const psmi_context_t *context, const struct ips_proto *proto, uint32_t protoexp_flags, int num_of_send_bufs, int num_of_send_desc, struct ips_protoexp **protoexp_o) { struct ips_protoexp *protoexp = NULL; uint32_t tidmtu_max; psm2_error_t err = PSM2_OK; protoexp = (struct ips_protoexp *) psmi_calloc(context->ep, UNDEFINED, 1, sizeof(struct ips_protoexp)); if (protoexp == NULL) { err = PSM2_NO_MEMORY; goto fail; } *protoexp_o = protoexp; protoexp->ptl = (const struct ptl *)proto->ptl; protoexp->proto = (struct ips_proto *)proto; protoexp->timerq = proto->timerq; srand48_r((long int) getpid(), &protoexp->tidflow_drand48_data); protoexp->tid_flags = protoexp_flags; if (context->ep->memmode == PSMI_MEMMODE_MINIMAL) { protoexp->tid_flags |= IPS_PROTOEXP_FLAG_CTS_SERIALIZED; } { /* * Adjust the session window size so that tid-grant message can * fit into a single frag size packet for single transfer, PSM * must send tid-grant message with a single packet. */ uint32_t fragsize, winsize; if (proto->flags & IPS_PROTO_FLAG_SDMA) fragsize = proto->epinfo.ep_mtu; else fragsize = proto->epinfo.ep_piosize; winsize = 2 * PSMI_PAGESIZE /* bytes per tid-pair */ /* space in packet */ * min((fragsize - sizeof(ips_tid_session_list)), /* space in tidsendc/tidrecvc descriptor */ PSM_TIDLIST_BUFSIZE) / sizeof(uint32_t); /* convert to tid-pair */ if (proto->mq->hfi_base_window_rv > winsize) proto->mq->hfi_base_window_rv = winsize; } /* Must be initialized already */ /* Comment out because of Klockwork scanning critical error. 
CQ 11/16/2012 psmi_assert_always(proto->ep != NULL && proto->ep->mq != NULL && proto->ep->mq->rreq_pool != NULL && proto->ep->mq->sreq_pool != NULL); */ psmi_assert_always(proto->timerq != NULL); /* These request pools are managed by the MQ component */ protoexp->tid_sreq_pool = proto->ep->mq->sreq_pool; protoexp->tid_rreq_pool = proto->ep->mq->rreq_pool; /* tid traffic xfer type */ if (proto->flags & IPS_PROTO_FLAG_SPIO) protoexp->tid_xfer_type = PSM_TRANSFER_PIO; else protoexp->tid_xfer_type = PSM_TRANSFER_DMA; /* ctrl ack/nak xfer type */ if (proto->flags & IPS_PROTO_FLAG_SDMA) protoexp->ctrl_xfer_type = PSM_TRANSFER_DMA; else protoexp->ctrl_xfer_type = PSM_TRANSFER_PIO; /* Initialize tid flow control. */ err = ips_tf_init(protoexp, context, &protoexp->tfc, ips_tidflow_avail_callback); if (err != PSM2_OK) goto fail; if (proto->flags & IPS_PROTO_FLAG_SPIO) tidmtu_max = proto->epinfo.ep_piosize; else tidmtu_max = proto->epinfo.ep_mtu; protoexp->tid_send_fragsize = tidmtu_max; if ((err = ips_tid_init(context, protoexp, ips_tid_avail_callback, protoexp))) goto fail; if ((err = ips_scbctrl_init(context, num_of_send_desc, 0, 0, 0, ips_tid_scbavail_callback, protoexp, &protoexp->tid_scbc_rv))) goto fail; { /* Determine interval to generate headers (relevant only when header * suppression is enabled) else headers will always be generated. * * The PSM2_EXPECTED_HEADERS environment variable can specify the * packet interval to generate headers at. Else a header packet is * generated every * min(PSM_DEFAULT_EXPECTED_HEADER, window_size/tid_send_fragsize). * Note: A header is always generated for the last packet in the flow. */ union psmi_envvar_val env_exp_hdr; uint32_t defval = min(PSM_DEFAULT_EXPECTED_HEADER, proto->mq->hfi_base_window_rv / protoexp->tid_send_fragsize); psmi_getenv("PSM2_EXPECTED_HEADERS", "Interval to generate expected protocol headers", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)defval, &env_exp_hdr); protoexp->hdr_pkt_interval = env_exp_hdr.e_uint; /* Account for flow credits - Should try to have atleast 4 headers * generated per window. */ protoexp->hdr_pkt_interval = max(min (protoexp->hdr_pkt_interval, proto->flow_credits >> 2), 1); if (protoexp->hdr_pkt_interval != env_exp_hdr.e_uint) { _HFI_VDBG ("Overriding PSM2_EXPECTED_HEADERS=%u to be '%u'\n", env_exp_hdr.e_uint, protoexp->hdr_pkt_interval); } } { union psmi_envvar_val env_rts_cts_interleave; psmi_getenv("PSM2_RTS_CTS_INTERLEAVE", "Interleave the handling of RTS to provide a fair distribution between multiple senders", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)0, &env_rts_cts_interleave); if (env_rts_cts_interleave.e_uint) protoexp->tid_flags |= IPS_PROTOEXP_FLAG_RTS_CTS_INTERLEAVE; } /* Send descriptors. * * There can be up to 2^32 of these send descriptors. We conservatively * allocate 256 but large node configurations can allocate up to sdesc_num * of these (they are about 2k each). * We impose a theoretical limit of 2^30. */ { struct psmi_rlimit_mpool rlim = TID_SENDSESSIONS_LIMITS; uint32_t maxsz, chunksz; if ((err = psmi_parse_mpool_env(protoexp->proto->mq, 1, &rlim, &maxsz, &chunksz))) goto fail; protoexp->tid_desc_send_pool = psmi_mpool_create(sizeof(struct ips_tid_send_desc), chunksz, maxsz, 0, DESCRIPTORS, NULL, NULL); if (protoexp->tid_desc_send_pool == NULL) { err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, "Couldn't allocate tid descriptor memory pool"); goto fail; } } /* Receive descriptors are an array in tidflow structure. 
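 * (There is one receive descriptor per tid flow -- see the
 * protoexp->tfc.tidrecvc[] lookups elsewhere in this file -- so their
 * storage is owned by the tidflow table rather than by a separate pool.)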
*/ /* This pool can never be smaller than the max number of rreqs that can be * allocated. */ { uint32_t rreq_per_chunk, rreq_max; psmi_assert_always(protoexp->proto->mq->rreq_pool != NULL); psmi_mpool_get_obj_info(protoexp->proto->mq->rreq_pool, &rreq_per_chunk, &rreq_max); protoexp->tid_getreq_pool = psmi_mpool_create(sizeof(struct ips_tid_get_request), rreq_per_chunk, rreq_max, 0, DESCRIPTORS, NULL, NULL); if (protoexp->tid_getreq_pool == NULL) { err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, "Couldn't allocate getreq descriptor memory pool"); goto fail; } } /* Timers to handle requeueing of work out of the receive path */ psmi_timer_entry_init(&protoexp->timer_send, ips_tid_pendsend_timer_callback, protoexp); STAILQ_INIT(&protoexp->pend_sendq); psmi_timer_entry_init(&protoexp->timer_getreqs, ips_tid_pendtids_timer_callback, protoexp); STAILQ_INIT(&protoexp->pend_getreqsq); protoexp->tid_page_offset_mask = PSMI_PAGESIZE - 1; protoexp->tid_page_mask = ~(PSMI_PAGESIZE - 1); /* * After ips_tid_init(), we know if we use tidcache or not. * if tid cache is used, we can't use tid debug. */ #ifdef PSM_DEBUG if (protoexp->tidc.tid_array == NULL) protoexp->tid_flags |= IPS_PROTOEXP_FLAG_TID_DEBUG; #endif if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) { int i; protoexp->tid_info = (struct ips_tidinfo *) psmi_calloc(context->ep, UNDEFINED, IPS_TID_MAX_TIDS, sizeof(struct ips_tidinfo)); if (protoexp->tid_info == NULL) { err = PSM2_NO_MEMORY; goto fail; } for (i = 0; i < IPS_TID_MAX_TIDS; i++) { protoexp->tid_info[i].state = TIDSTATE_FREE; protoexp->tid_info[i].tidrecvc = NULL; protoexp->tid_info[i].tid = 0xFFFFFFFF; } } else protoexp->tid_info = NULL; #ifdef PSM_CUDA { if (PSMI_IS_CUDA_ENABLED && !(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) { struct psmi_rlimit_mpool rlim = CUDA_HOSTBUFFER_LIMITS; uint32_t maxsz, chunksz, max_elements; if ((err = psmi_parse_mpool_env(protoexp->proto->mq, 1, &rlim, &maxsz, &chunksz))) goto fail; /* the maxsz is the amount in MB, not the number of entries, * since the element size depends on the window size */ max_elements = (maxsz*1024*1024) / proto->mq->hfi_base_window_rv; /* mpool requires max_elements to be power of 2. round down. 
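 * (Worked example: for max_elements = 1000, __builtin_clz(1000) on a
 * 32-bit int is 22, so 31 - 22 = 9 and 1 << 9 = 512, the largest power
 * of two not exceeding 1000.)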
*/ max_elements = 1 << (31 - __builtin_clz(max_elements)); protoexp->cuda_hostbuf_recv_cfg.bufsz = proto->mq->hfi_base_window_rv; protoexp->cuda_hostbuf_pool_recv = psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), chunksz, max_elements, 0, UNDEFINED, NULL, NULL, psmi_cuda_hostbuf_alloc_func, (void *) &protoexp->cuda_hostbuf_recv_cfg); if (protoexp->cuda_hostbuf_pool_recv == NULL) { err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, "Couldn't allocate CUDA host receive buffer pool"); goto fail; } protoexp->cuda_hostbuf_small_recv_cfg.bufsz = CUDA_SMALLHOSTBUF_SZ; protoexp->cuda_hostbuf_pool_small_recv = psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), chunksz, max_elements, 0, UNDEFINED, NULL, NULL, psmi_cuda_hostbuf_alloc_func, (void *) &protoexp->cuda_hostbuf_small_recv_cfg); if (protoexp->cuda_hostbuf_pool_small_recv == NULL) { err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, "Couldn't allocate CUDA host small receive buffer pool"); goto fail; } PSMI_CUDA_CALL(cuStreamCreate, &protoexp->cudastream_recv, CU_STREAM_NON_BLOCKING); STAILQ_INIT(&protoexp->cudapend_getreqsq); } else { protoexp->cuda_hostbuf_pool_recv = NULL; protoexp->cuda_hostbuf_pool_small_recv = NULL; } } #endif psmi_assert(err == PSM2_OK); return err; fail: #ifdef PSM_CUDA if (protoexp != NULL && protoexp->cuda_hostbuf_pool_recv != NULL) psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_recv); if (protoexp != NULL && protoexp->cuda_hostbuf_pool_small_recv != NULL) psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_small_recv); #endif if (protoexp != NULL && protoexp->tid_getreq_pool != NULL) psmi_mpool_destroy(protoexp->tid_getreq_pool); if (protoexp != NULL && protoexp->tid_desc_send_pool != NULL) psmi_mpool_destroy(protoexp->tid_desc_send_pool); if (protoexp != NULL) ips_scbctrl_fini(&protoexp->tid_scbc_rv); if (protoexp != NULL) psmi_free(protoexp); return err; } MOCK_DEF_EPILOGUE(ips_protoexp_init); psm2_error_t ips_protoexp_fini(struct ips_protoexp *protoexp) { psm2_error_t err = PSM2_OK; #ifdef PSM_CUDA if(PSMI_IS_CUDA_ENABLED && !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) { psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_small_recv); psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_recv); PSMI_CUDA_CALL(cuStreamDestroy, protoexp->cudastream_recv); } #endif psmi_mpool_destroy(protoexp->tid_getreq_pool); psmi_mpool_destroy(protoexp->tid_desc_send_pool); if ((err = ips_scbctrl_fini(&protoexp->tid_scbc_rv))) goto fail; if ((err = ips_tid_fini(&protoexp->tidc))) goto fail; if ((err = ips_tf_fini(&protoexp->tfc))) goto fail; if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) psmi_free(protoexp->tid_info); psmi_free(protoexp); fail: return err; } /* New scbs now available. If we have pending sends or pending get requests, * turn on the timer so it can be processed. */ static void ips_tid_scbavail_callback(struct ips_scbctrl *scbc, void *context) { struct ips_protoexp *protoexp = (struct ips_protoexp *)context; if (!STAILQ_EMPTY(&protoexp->pend_sendq)) psmi_timer_request(protoexp->timerq, &protoexp->timer_send, PSMI_TIMER_PRIO_1); if (!STAILQ_EMPTY(&protoexp->pend_getreqsq)) psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); return; } /* New Tids are available. If there are pending get requests put the * get timer on the timerq so it can be processed. 
*/ static void ips_tid_avail_callback(struct ips_tid *tidc, void *context) { struct ips_protoexp *protoexp = (struct ips_protoexp *)context; if (!STAILQ_EMPTY(&protoexp->pend_getreqsq)) psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); return; } /* New Tid Flows are available. If there are pending get requests put the * get timer on the timerq so it can be processed. */ static void ips_tidflow_avail_callback(struct ips_tf *tfc, void *context) { struct ips_protoexp *protoexp = (struct ips_protoexp *)context; if (!STAILQ_EMPTY(&protoexp->pend_getreqsq)) { psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); } return; } /* * The tid get request is always issued from within the receive progress loop, * which is why we always enqueue the request instead of issuing it directly. * Eventually, if we expose tid_get to users, we will want to differentiate * when the request comes from the receive progress loop from cases where the * tid_get is issued directly from user code. * */ psm2_error_t ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, void *buf, uint32_t length, psm2_epaddr_t epaddr, uint32_t remote_tok, uint32_t flags, ips_tid_completion_callback_t callback, void *context) { struct ips_tid_get_request *getreq; int count, tids, tidflows; uint64_t nbytes; PSM2_LOG_MSG("entering"); psmi_assert((((ips_epaddr_t *) epaddr)->window_rv % PSMI_PAGESIZE) == 0); getreq = (struct ips_tid_get_request *) psmi_mpool_get(protoexp->tid_getreq_pool); /* We can't *really* run out of these here because we always allocate as * much as available receive reqs */ if_pf(getreq == NULL) { PSM2_LOG_MSG("leaving"); psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Ran out of 'getreq' descriptors"); } getreq->tidgr_protoexp = protoexp; getreq->tidgr_epaddr = epaddr; getreq->tidgr_lbuf = buf; getreq->tidgr_length = length; getreq->tidgr_sendtoken = remote_tok; getreq->tidgr_ucontext = context; getreq->tidgr_callback = callback; getreq->tidgr_offset = 0; getreq->tidgr_bytesdone = 0; getreq->tidgr_flags = flags; #ifdef PSM_CUDA psm2_mq_req_t req = (psm2_mq_req_t)context; if ((req->is_buf_gpu_mem && !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) || ((req->is_buf_gpu_mem && (protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) && gpudirect_recv_threshold && length > gpudirect_recv_threshold))) { getreq->cuda_hostbuf_used = 1; getreq->tidgr_cuda_bytesdone = 0; STAILQ_INIT(&getreq->pend_cudabuf); } else getreq->cuda_hostbuf_used = 0; #endif /* nbytes is the bytes each channel should transfer. 
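 * (For example, splitting length = 1048576 bytes over count = 3 channels,
 * and assuming a 4KB PSMI_PAGESIZE, gives ceil(1048576 / 3) = 349526,
 * which PSMI_ALIGNUP rounds up to 352256 bytes per channel.)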
*/ count = ((ips_epaddr_t *) epaddr)->msgctl->ipsaddr_count; #ifdef PSM_CUDA if (req->is_buf_gpu_mem) nbytes = PSMI_ALIGNUP((length + count - 1) / count, PSMI_GPU_PAGESIZE); else #endif nbytes = PSMI_ALIGNUP((length + count - 1) / count, PSMI_PAGESIZE); getreq->tidgr_rndv_winsz = min(nbytes, ((ips_epaddr_t *) epaddr)->window_rv); /* must be within the tid window size */ if (getreq->tidgr_rndv_winsz > PSM_TID_WINSIZE) getreq->tidgr_rndv_winsz = PSM_TID_WINSIZE; STAILQ_INSERT_TAIL(&protoexp->pend_getreqsq, getreq, tidgr_next); tids = ips_tid_num_available(&protoexp->tidc); tidflows = ips_tf_available(&protoexp->tfc); if (tids > 0 && tidflows > 0) ips_tid_pendtids_timer_callback(&protoexp->timer_getreqs, 0); else if (tids != -1 && tidflows != -1) psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); PSM2_LOG_MSG("leaving"); return PSM2_OK; } /* List of perf events */ #define _ips_logeventid_tid_send_reqs 0 /* out of tid send descriptors */ #define ips_logevent_id(event) _ips_logeventid_ ## event #define ips_logevent(proto, event, ptr) ips_logevent_inner(proto, ips_logevent_id(event), ptr) static void ips_logevent_inner(struct ips_proto *proto, int eventid, void *context) { uint64_t t_now = get_cycles(); switch (eventid) { case ips_logevent_id(tid_send_reqs):{ psm2_epaddr_t epaddr = (psm2_epaddr_t) context; proto->psmi_logevent_tid_send_reqs.count++; if (t_now >= proto->psmi_logevent_tid_send_reqs.next_warning) { psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_OK, "Non-fatal temporary exhaustion of send tid dma descriptors " "(elapsed=%.3fs, source LID=0x%x/context=%d, count=%lld)", (double) cycles_to_nanosecs(t_now - proto-> t_init) / 1.0e9, (int)psm2_epid_nid(epaddr-> epid), (int)psm2_epid_context(epaddr-> epid), (long long)proto-> psmi_logevent_tid_send_reqs. count); proto->psmi_logevent_tid_send_reqs. next_warning = t_now + sec_2_cycles(proto-> psmi_logevent_tid_send_reqs. interval_secs); } } break; default: break; } return; } /* * Expected Protocol. * * We're granted tids (as part of a tid get request) and expected to fulfill * the request by associating the request's sendtoken to a tid send descriptor. * * It's possible to be out of tid send descriptors when somehow all allocated * descriptors can't complete all of their sends. For example, the targets of * the sends may be busy in computation loops and not processing incoming * packets. 
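*/

/*
 * Completions for these descriptors come back carrying an index plus a
 * generation count. A sketch of the validity check used when such a
 * completion arrives (the helper name and the opaque pool parameter are
 * hypothetical; the pool accessors and ptl_arg_t fields are the ones used
 * in this file): the index maps back to a pool slot, and the generation
 * count rejects handles left over from an earlier use of the same slot.
 * This mirrors the check in ips_protoexp_recv_tid_completion() below.
 */
#if 0	/* illustrative sketch only, never compiled */
static int example_descid_is_live(void *pool, ptl_arg_t handle)
{
	void *obj = psmi_mpool_find_obj_by_index(pool, handle._desc_idx);
	ptl_arg_t cur;

	if (obj == NULL)
		return 0;		/* index out of range */
	cur.u64 = 0;
	psmi_mpool_get_obj_index_gen_count(obj, &cur._desc_idx,
					   &cur._desc_genc);
	return cur.u64 == handle.u64;	/* false if the slot was reused */
}
#endif

/* The send/receive entry points of the expected protocol follow.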
*/ void ips_protoexp_send_tid_grant(struct ips_tid_recv_desc *tidrecvc) { ips_epaddr_t *ipsaddr = tidrecvc->ipsaddr; struct ips_proto *proto = tidrecvc->protoexp->proto; psmi_assert(proto->msgflowid < EP_FLOW_LAST); struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid]; ips_scb_t *scb; scb = tidrecvc->grantscb; ips_scb_opcode(scb) = OPCODE_LONG_CTS; scb->ips_lrh.khdr.kdeth0 = 0; scb->ips_lrh.mdata = tidrecvc->tidflow_genseq.psn_val; scb->ips_lrh.data[0] = tidrecvc->rdescid; scb->ips_lrh.data[1].u32w1 = tidrecvc->getreq->tidgr_length; scb->ips_lrh.data[1].u32w0 = tidrecvc->getreq->tidgr_sendtoken; ips_scb_buffer(scb) = (void *)&tidrecvc->tid_list; ips_scb_length(scb) = tidrecvc->tsess_tidlist_length; PSM2_LOG_EPM(OPCODE_LONG_CTS,PSM2_LOG_TX, proto->ep->epid, flow->ipsaddr->epaddr.epid ,"tidrecvc->getreq->tidgr_sendtoken; %d", tidrecvc->getreq->tidgr_sendtoken); ips_proto_flow_enqueue(flow, scb); flow->flush(flow, NULL); } void ips_protoexp_send_tid_completion(struct ips_tid_recv_desc *tidrecvc, ptl_arg_t sdescid) { ips_epaddr_t *ipsaddr = tidrecvc->ipsaddr; struct ips_proto *proto = tidrecvc->protoexp->proto; psmi_assert(proto->msgflowid < EP_FLOW_LAST); struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid]; ips_scb_t *scb; PSM2_LOG_EPM(OPCODE_EXPTID_COMPLETION,PSM2_LOG_TX, proto->ep->epid, flow->ipsaddr->epaddr.epid ,"sdescid._desc_idx: %d", sdescid._desc_idx); scb = tidrecvc->completescb; ips_scb_opcode(scb) = OPCODE_EXPTID_COMPLETION; scb->ips_lrh.khdr.kdeth0 = 0; scb->ips_lrh.data[0] = sdescid; /* Attached tidflow gen/seq */ scb->ips_lrh.mdata = tidrecvc->tidflow_genseq.psn_val; ips_proto_flow_enqueue(flow, scb); flow->flush(flow, NULL); if (tidrecvc->protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) { flow->flags &= ~IPS_FLOW_FLAG_SKIP_CTS; /* Let the next CTS be processed */ ips_tid_pendtids_timer_callback(&tidrecvc->protoexp->timer_getreqs, 0); /* and make explicit progress for it. */ } } #ifdef PSM_CUDA static void psmi_deallocate_chb(struct ips_cuda_hostbuf* chb) { PSMI_CUDA_CALL(cuMemFreeHost, chb->host_buf); PSMI_CUDA_CALL(cuEventDestroy, chb->copy_status); psmi_free(chb); return; } #endif int ips_protoexp_recv_tid_completion(struct ips_recvhdrq_event *rcv_ev) { struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; struct ips_message_header *p_hdr = rcv_ev->p_hdr; struct ips_epaddr *ipsaddr = rcv_ev->ipsaddr; ptl_arg_t desc_id = p_hdr->data[0]; struct ips_tid_send_desc *tidsendc; PSM2_LOG_MSG("entering"); PSM2_LOG_EPM(OPCODE_EXPTID_COMPLETION,PSM2_LOG_RX,rcv_ev->ipsaddr->epaddr.epid, rcv_ev->proto->ep->mq->ep->epid,"desc_id._desc_idx: %d",desc_id._desc_idx); if (!ips_proto_is_expected_or_nak(rcv_ev)) { PSM2_LOG_MSG("leaving"); return IPS_RECVHDRQ_CONTINUE; } if (__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, &ipsaddr->flows[ips_proto_flowid(p_hdr)]); ips_proto_process_ack(rcv_ev); /* * Get the session send descriptor and complete. 
*/ tidsendc = (struct ips_tid_send_desc *) psmi_mpool_find_obj_by_index(protoexp->tid_desc_send_pool, desc_id._desc_idx); _HFI_VDBG("desc_id=%d (%p)\n", desc_id._desc_idx, tidsendc); if (tidsendc == NULL) { _HFI_ERROR ("exptid comp: Index %d is out of range\n", desc_id._desc_idx); PSM2_LOG_MSG("leaving"); return IPS_RECVHDRQ_CONTINUE; } else { ptl_arg_t desc_tidsendc; psmi_mpool_get_obj_index_gen_count(tidsendc, &desc_tidsendc._desc_idx, &desc_tidsendc._desc_genc); _HFI_VDBG("desc_req:id=%d,gen=%d desc_sendc:id=%d,gen=%d\n", desc_id._desc_idx, desc_id._desc_genc, desc_tidsendc._desc_idx, desc_tidsendc._desc_genc); /* See if the reference is still live and valid */ if (desc_tidsendc.u64 != desc_id.u64) { _HFI_ERROR("exptid comp: Genc %d does not match\n", desc_id._desc_genc); PSM2_LOG_MSG("leaving"); return IPS_RECVHDRQ_CONTINUE; } } if (!STAILQ_EMPTY(&tidsendc->tidflow.scb_unacked)) { struct ips_message_header hdr; /* Hack to handle the tidflow */ hdr.data[0] = rcv_ev->p_hdr->data[0]; hdr.ack_seq_num = rcv_ev->p_hdr->mdata; hdr.khdr.kdeth0 = __cpu_to_le32(3 << HFI_KHDR_TIDCTRL_SHIFT); rcv_ev->p_hdr = &hdr; /* * This call should directly complete the tidflow * and free all scb on the unacked queue. */ ips_proto_process_ack(rcv_ev); /* Keep KW happy. */ rcv_ev->p_hdr = NULL; /* Prove that the scb will not leak in the unacked queue: */ psmi_assert(STAILQ_EMPTY(&tidsendc->tidflow.scb_unacked)); } psm2_mq_req_t req = tidsendc->mqreq; /* Check if we can complete the send request. */ req->send_msgoff += tidsendc->length; #ifdef PSM_CUDA if (req->cuda_hostbuf_used) { if (tidsendc->cuda_num_buf == 1) { tidsendc->cuda_hostbuf[0]->bytes_read += tidsendc->tid_list.tsess_length; if(tidsendc->cuda_hostbuf[0]->bytes_read == tidsendc->cuda_hostbuf[0]->size){ STAILQ_REMOVE(&req->sendreq_prefetch, tidsendc->cuda_hostbuf[0], ips_cuda_hostbuf, req_next); if (tidsendc->cuda_hostbuf[0]->is_tempbuf) psmi_deallocate_chb(tidsendc->cuda_hostbuf[0]); else { tidsendc->cuda_hostbuf[0]->req = NULL; tidsendc->cuda_hostbuf[0]->offset = 0; tidsendc->cuda_hostbuf[0]->bytes_read = 0; psmi_mpool_put(tidsendc->cuda_hostbuf[0]); } psmi_cuda_run_prefetcher(protoexp, tidsendc); } } else psmi_free(tidsendc->userbuf); } #endif if (req->send_msgoff == req->req_data.send_msglen) { psmi_mq_handle_rts_complete(req); } psmi_mpool_put(tidsendc); PSM2_LOG_MSG("leaving"); return IPS_RECVHDRQ_CONTINUE; } int ips_protoexp_data(struct ips_recvhdrq_event *rcv_ev) { struct ips_proto *proto = rcv_ev->proto; struct ips_protoexp *protoexp = proto->protoexp; struct ips_message_header *p_hdr = rcv_ev->p_hdr; struct ips_tid_recv_desc *tidrecvc; ptl_arg_t desc_id; psmi_seqnum_t sequence_num; psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID); PSM2_LOG_MSG("entering"); desc_id._desc_idx = ips_proto_flowid(p_hdr); PSM2_LOG_EPM(OPCODE_EXPTID,PSM2_LOG_RX,rcv_ev->ipsaddr->epaddr.epid, proto->ep->mq->ep->epid,"desc_id._desc_idx: %d", desc_id._desc_idx); desc_id._desc_genc = p_hdr->exp_rdescid_genc; tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; if (tidrecvc->rdescid._desc_genc != desc_id._desc_genc) { PSM2_LOG_MSG("leaving"); return IPS_RECVHDRQ_CONTINUE; /* skip */ } /* IBTA CCA handling for expected flow. 
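 *
 * A received FECN mark is not acted on directly here; the flow is only
 * flagged, and the next control packet back to the sender carries a BECN
 * so that the sender can throttle.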
*/ if (rcv_ev->is_congested & IPS_RECV_EVENT_FECN) { /* Mark flow to generate BECN in control packet */ tidrecvc->tidflow.flags |= IPS_FLOW_FLAG_GEN_BECN; /* Update stats for congestion encountered */ proto->epaddr_stats.congestion_pkts++; /* Clear FECN event */ rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; } sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]); if_pf (PSM_HAL_ERROR_OK != psmi_hal_tidflow_check_update_pkt_seq( protoexp,sequence_num,tidrecvc,p_hdr, ips_protoexp_do_tf_generr,ips_protoexp_do_tf_seqerr)) return IPS_RECVHDRQ_CONTINUE; /* Reset the swapped generation count as we received a valid packet */ tidrecvc->tidflow_nswap_gen = 0; /* Do some sanity checking */ psmi_assert_always(tidrecvc->state == TIDRECVC_STATE_BUSY); int recv_completion = (tidrecvc->recv_tidbytes == (p_hdr->exp_offset + ips_recvhdrq_event_paylen(rcv_ev))); /* If sender requested an ACK with the packet and it is not the last * packet, or if the incoming flow faced congestion, respond with an * ACK packet. The ACK when congested will have the BECN bit set. */ if (((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) && !recv_completion) || (tidrecvc->tidflow.flags & IPS_FLOW_FLAG_GEN_BECN)) { ips_scb_t ctrlscb; /* Ack sender with descriptor index */ ctrlscb.scb_flags = 0; ctrlscb.ips_lrh.data[0] = p_hdr->exp_sdescid; ctrlscb.ips_lrh.ack_seq_num = tidrecvc->tidflow_genseq.psn_val; ips_proto_send_ctrl_message(&tidrecvc->tidflow, OPCODE_ACK, &tidrecvc->ctrl_msg_queued, &ctrlscb, ctrlscb.cksum, 0); } /* If RSM is a HW capability, and RSM has found a TID packet marked * with FECN, the payload will be written to the eager buffer, and * we will have a payload pointer here. In that case, copy the payload * into the user's buffer. If RSM did not intercept this EXPTID * packet, the HFI will handle the packet payload. Possibly should * assert(0 < paylen < MTU). */ if (psmi_hal_has_cap(PSM_HAL_CAP_RSM_FECN_SUPP) && ips_recvhdrq_event_payload(rcv_ev) && ips_recvhdrq_event_paylen(rcv_ev)) psmi_mq_mtucpy(tidrecvc->buffer + p_hdr->exp_offset, ips_recvhdrq_event_payload(rcv_ev), ips_recvhdrq_event_paylen(rcv_ev)); /* If last packet then we are done. We send a tid transfer completion * packet back to sender, free all tids and close the current tidflow * as well as tidrecvc descriptor. * Note: If we were out of tidflow, this will invoke the callback to * schedule pending transfer. */ if (recv_completion) { /* copy unaligned data if any */ uint8_t *dst, *src; if (tidrecvc->tid_list.tsess_unaligned_start) { dst = (uint8_t *)tidrecvc->buffer; src = (uint8_t *)p_hdr->exp_ustart; #ifdef PSM_CUDA if (tidrecvc->is_ptr_gpu_backed) { PSMI_CUDA_CALL(cuMemcpyHtoD, (CUdeviceptr)dst, src, tidrecvc->tid_list.tsess_unaligned_start); } else #endif ips_protoexp_unaligned_copy(dst, src, tidrecvc->tid_list.tsess_unaligned_start); } if (tidrecvc->tid_list.tsess_unaligned_end) { dst = (uint8_t *)tidrecvc->buffer + tidrecvc->recv_msglen - tidrecvc->tid_list.tsess_unaligned_end; src = (uint8_t *)p_hdr->exp_uend; #ifdef PSM_CUDA if (tidrecvc->is_ptr_gpu_backed) { PSMI_CUDA_CALL(cuMemcpyHtoD, (CUdeviceptr)dst, src, tidrecvc->tid_list.tsess_unaligned_end); } else #endif ips_protoexp_unaligned_copy(dst, src, tidrecvc->tid_list.tsess_unaligned_end); } /* reply tid transfer completion packet to sender */ ips_protoexp_send_tid_completion(tidrecvc, p_hdr->exp_sdescid); /* Mark receive as done */ ips_tid_recv_free(tidrecvc); } PSM2_LOG_MSG("leaving"); return IPS_RECVHDRQ_CONTINUE; } #ifndef PSM_DEBUG # define ips_dump_tids(tid_list, msg, ...) 
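/* (In non-debug builds the dump helper expands to nothing, so the call
 * sites in this file need no #ifdef PSM_DEBUG guards of their own.) */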
#else static void ips_dump_tids(ips_tid_session_list *tid_list, const char *msg, ...) { char buf[256]; size_t off = 0; int i, num_tids = tid_list->tsess_tidcount; va_list argptr; va_start(argptr, msg); off += vsnprintf(buf, sizeof(buf) - off, msg, argptr); va_end(argptr); for (i = 0; i < num_tids && off < (sizeof(buf) - 1); i++) off += snprintf(buf + off, sizeof(buf) - off, "%d%s", IPS_TIDINFO_GET_TID(tid_list->tsess_list[i]), i < num_tids - 1 ? "," : ""); _HFI_VDBG("%s\n", buf); return; } #endif static void ips_expsend_tiderr(struct ips_tid_send_desc *tidsendc) { char buf[256]; size_t off = 0; int i; off += snprintf(buf + off, sizeof(buf) - off, "Remaining bytes: %d Member id %d is not in tid_session_id=%d :", tidsendc->remaining_tidbytes, tidsendc->tid_idx, tidsendc->rdescid._desc_idx); for (i = 0; i < tidsendc->tid_list.tsess_tidcount + 1; i++) off += snprintf(buf + off, sizeof(buf) - off, "%d,", IPS_TIDINFO_GET_TID(tidsendc->tid_list. tsess_list[i])); psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Trying to use tid idx %d and there are %d members: %s\n", tidsendc->tid_idx, tidsendc->tid_list.tsess_tidcount, buf); return; } #ifdef PSM_CUDA static psm2_error_t psmi_cuda_reclaim_hostbufs(struct ips_tid_get_request *getreq) { struct ips_protoexp *protoexp = getreq->tidgr_protoexp; struct ips_tid_getreq_cuda_hostbuf_pend *cmemcpyhead = &getreq->pend_cudabuf; struct ips_cuda_hostbuf *chb; CUresult status; /* Get the getreq's first memcpy op */ while (!STAILQ_EMPTY(cmemcpyhead)) { chb = STAILQ_FIRST(cmemcpyhead); PSMI_CUDA_CHECK_EVENT(chb->copy_status, status); if (status != CUDA_SUCCESS) { /* At least one of the copies is still * in progress. Schedule the timer, * then leave the CUDA progress phase * and check for other pending TID work. */ psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); return PSM2_OK_NO_PROGRESS; } /* The getreq's oldest cudabuf is done. Reclaim it. */ getreq->tidgr_cuda_bytesdone += chb->size; STAILQ_REMOVE_HEAD(cmemcpyhead, next); psmi_mpool_put(chb); } return PSM2_OK; } static struct ips_cuda_hostbuf* psmi_allocate_chb(uint32_t window_len) { struct ips_cuda_hostbuf* chb = (struct ips_cuda_hostbuf*) psmi_calloc(PSMI_EP_NONE, UNDEFINED, 1, sizeof(struct ips_cuda_hostbuf)); if (chb == NULL) { psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, "Couldn't allocate cuda host buffers "); } PSMI_CUDA_CALL(cuMemHostAlloc, (void **) &chb->host_buf, window_len, CU_MEMHOSTALLOC_PORTABLE); PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT); return chb; } static void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp, struct ips_tid_send_desc *tidsendc) { struct ips_proto *proto = protoexp->proto; struct ips_cuda_hostbuf *chb = NULL; psm2_mq_req_t req = tidsendc->mqreq; uint32_t offset, window_len; /* try to push the prefetcher forward */ if (req->prefetch_send_msgoff < req->req_data.send_msglen) { /* some data remains to be sent */ offset = req->prefetch_send_msgoff; window_len = ips_cuda_next_window(tidsendc->ipsaddr->window_rv, offset, req->req_data.buf_len); if (window_len <= CUDA_SMALLHOSTBUF_SZ) chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( proto->cuda_hostbuf_pool_small_send); if (chb == NULL) chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( proto->cuda_hostbuf_pool_send); /* were any buffers available for the prefetcher? 
*/ if (chb == NULL) return; req->prefetch_send_msgoff += window_len; chb->offset = offset; chb->size = window_len; chb->req = req; chb->gpu_buf = (CUdeviceptr) req->req_data.buf + offset; chb->bytes_read = 0; PSMI_CUDA_CALL(cuMemcpyDtoHAsync, chb->host_buf, chb->gpu_buf, window_len, proto->cudastream_send); PSMI_CUDA_CALL(cuEventRecord, chb->copy_status, proto->cudastream_send); STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next); return; } return; } static void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, psm2_mq_req_t req, struct ips_tid_send_desc *tidsendc, struct ips_cuda_hostbuf *chb_prev, uint32_t tsess_srcoff, uint32_t tsess_length, uint32_t tsess_unaligned_start, psm2_chb_match_type_t type) { struct ips_proto *proto = protoexp->proto; struct ips_cuda_hostbuf *chb = NULL; uint32_t offset, window_len, attached=0; /* try to push the prefetcher forward */ while (req->prefetch_send_msgoff < tsess_srcoff + tsess_length) { /* some data remains to be sent */ offset = req->prefetch_send_msgoff; window_len = ips_cuda_next_window(tidsendc->ipsaddr->window_rv, offset, req->req_data.buf_len); if (window_len <= CUDA_SMALLHOSTBUF_SZ) chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( proto->cuda_hostbuf_pool_small_send); if (chb == NULL) chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( proto->cuda_hostbuf_pool_send); /* were any buffers available? If not force allocate */ if (chb == NULL) { chb = psmi_allocate_chb(window_len); psmi_assert(chb); chb->is_tempbuf = 1; } req->prefetch_send_msgoff += window_len; chb->offset = offset; chb->size = window_len; chb->req = req; chb->gpu_buf = (CUdeviceptr) req->req_data.buf + offset; chb->bytes_read = 0; PSMI_CUDA_CALL(cuMemcpyDtoHAsync, chb->host_buf, chb->gpu_buf, window_len, proto->cudastream_send); PSMI_CUDA_CALL(cuEventRecord, chb->copy_status, proto->cudastream_send); STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next); if (type == PSMI_CUDA_PARTIAL_MATCH_FOUND) { if ((tsess_srcoff < chb->offset) && ((tsess_srcoff + tsess_length) > chb->offset)) { tidsendc->cuda_hostbuf[0] = chb_prev; tidsendc->cuda_hostbuf[1] = chb; tidsendc->cuda_num_buf = 2; void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED, tsess_length); tidsendc->userbuf = (void *)((uintptr_t) buffer); tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf + tsess_unaligned_start); return; } } else { if (attached) { tidsendc->cuda_hostbuf[0] = chb_prev; tidsendc->cuda_hostbuf[1] = chb; tidsendc->cuda_num_buf = 2; void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED, tsess_length); tidsendc->userbuf = (void *)((uintptr_t) buffer); tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf + tsess_unaligned_start); attached = 0; return; } if ((tsess_srcoff > chb->offset) && (tsess_srcoff < (chb->offset + chb->size)) && ((tsess_srcoff + tsess_length) > (chb->offset + chb->size))) { chb_prev = chb; attached = 1; chb = NULL; continue; } else if ((chb->offset <= tsess_srcoff) && ((tsess_srcoff + tsess_length) <= (chb->offset+chb->size))) { tidsendc->cuda_hostbuf[0] = chb; tidsendc->cuda_hostbuf[1] = NULL; tidsendc->cuda_num_buf = 1; tidsendc->userbuf = (void *)((uintptr_t) chb->host_buf + tsess_srcoff - chb->offset); tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf + tsess_unaligned_start ); return; } else chb = NULL; } } } static psm2_chb_match_type_t psmi_find_match_in_prefeteched_chb(struct ips_cuda_hostbuf* chb, ips_tid_session_list *tid_list, uint32_t prefetch_send_msgoff) { /* To get a match: * 1. Tid list offset + length is contained within a chb * 2. 
Tid list offset + length is contained within * the prefetched offset of this req. * 3. Tid list offset + length is partially prefetched * within one chb. (A partial match) */ if (chb->offset <= tid_list->tsess_srcoff) { if ((chb->offset + chb->size) >= (tid_list->tsess_srcoff + tid_list->tsess_length)) { return PSMI_CUDA_FULL_MATCH_FOUND; } else { if((chb->offset + chb->size) > tid_list->tsess_srcoff){ if(((chb->offset + (2 * chb->size)) > (tid_list->tsess_srcoff + tid_list->tsess_length)) && ((prefetch_send_msgoff) >= (tid_list->tsess_srcoff + tid_list->tsess_length))){ return PSMI_CUDA_SPLIT_MATCH_FOUND; } else if((tid_list->tsess_srcoff + tid_list->tsess_length) > prefetch_send_msgoff) { return PSMI_CUDA_PARTIAL_MATCH_FOUND; } } } } return PSMI_CUDA_CONTINUE; } #endif psm2_error_t ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, ips_epaddr_t *ipsaddr, psm2_mq_req_t req, ptl_arg_t rdescid, uint32_t tidflow_genseq, ips_tid_session_list *tid_list, uint32_t tid_list_size) { struct ips_tid_send_desc *tidsendc; uint32_t i, j, *src, *dst; PSM2_LOG_MSG("entering"); psmi_assert(tid_list_size > sizeof(ips_tid_session_list)); psmi_assert(tid_list_size <= sizeof(tidsendc->filler)); psmi_assert(tid_list->tsess_tidcount > 0); psmi_assert((rdescid._desc_genc>>16) == 0); tidsendc = (struct ips_tid_send_desc *) psmi_mpool_get(protoexp->tid_desc_send_pool); if (tidsendc == NULL) { PSM2_LOG_MSG("leaving"); ips_logevent(protoexp->proto, tid_send_reqs, ipsaddr); return PSM2_EP_NO_RESOURCES; } req->ptl_req_ptr = (void *)tidsendc; tidsendc->protoexp = protoexp; /* Uniquely identify this send descriptor in space and time */ tidsendc->sdescid._desc_idx = psmi_mpool_get_obj_index(tidsendc); tidsendc->sdescid._desc_genc = psmi_mpool_get_obj_gen_count(tidsendc); tidsendc->rdescid = rdescid; tidsendc->ipsaddr = ipsaddr; tidsendc->mqreq = req; /* * Copy received tidinfo to local tidsendc buffer. * while doing the copy, we try to merge the tids based on * following rules: * 1. both tids are virtually contiguous(i and i+1 in the array); * 2. both tids have the same tidpair value; * 3. first tid (i) has tidctrl=1; * 4. second tid (i+1) has tidctrl=2; * 5. total length does not exceed 512 pages (2M); * 6. The h/w supports merged tid_ctrl's. * * The restriction of 512 pages comes from the limited number * of bits we have for KDETH.OFFSET: * - The entire mapping space provided through TIDs is to be * viewed as a zero-based address mapping. * - We have 15 bits in KDETH offset field through which we * can address upto a maximum of 2MB. * (with 64-byte offset mode or KDETH.OM = 1) * - Assuming a 4KB page size, 2MB/4KB = 512 pages. */ psmi_mq_mtucpy_host_mem(&tidsendc->tid_list, tid_list, sizeof(ips_tid_session_list)); ips_dump_tids(tid_list, "Received %d tids: ", tid_list->tsess_tidcount); if (psmi_hal_has_cap(PSM_HAL_CAP_MERGED_TID_CTRLS)) { src = tid_list->tsess_list; dst = tidsendc->tid_list.tsess_list; dst[0] = src[0]; j = 0; i = 1; while (i < tid_list->tsess_tidcount) { if ((((dst[j]>>IPS_TIDINFO_TIDCTRL_SHIFT)+1) == (src[i]>>IPS_TIDINFO_TIDCTRL_SHIFT)) && (((dst[j]&IPS_TIDINFO_LENGTH_MASK)+ (src[i]&IPS_TIDINFO_LENGTH_MASK)) <= PSM_MAX_NUM_PAGES_IN_TIDPAIR)) { /* merge 'i' to 'j' * (We need to specify "tidctrl" value as 3 * if we merge the individual tid-pairs. 
* Doing that here) */ dst[j] += (2 << IPS_TIDINFO_TIDCTRL_SHIFT) + (src[i] & IPS_TIDINFO_LENGTH_MASK); i++; if (i == tid_list->tsess_tidcount) break; } j++; /* copy 'i' to 'j' */ dst[j] = src[i]; i++; } tidsendc->tid_list.tsess_tidcount = j + 1; tid_list = &tidsendc->tid_list; } else { tidsendc->tid_list.tsess_tidcount = tid_list->tsess_tidcount; psmi_mq_mtucpy(&tidsendc->tid_list.tsess_list, tid_list->tsess_list, tid_list->tsess_tidcount * sizeof(tid_list->tsess_list[0])); tid_list = &tidsendc->tid_list; } /* Initialize tidflow for window. Use path requested by remote endpoint */ ips_flow_init(&tidsendc->tidflow, protoexp->proto, ipsaddr, protoexp->tid_xfer_type, PSM_PROTOCOL_TIDFLOW, IPS_PATH_LOW_PRIORITY, EP_FLOW_TIDFLOW); tidsendc->tidflow.xmit_seq_num.psn_val = tidflow_genseq; tidsendc->tidflow.xmit_ack_num.psn_val = tidflow_genseq; tidsendc->userbuf = (void *)((uintptr_t) req->req_data.buf + tid_list->tsess_srcoff); tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf + tid_list->tsess_unaligned_start); tidsendc->length = tid_list->tsess_length; tidsendc->ctrl_msg_queued = 0; tidsendc->frag_size = min(protoexp->tid_send_fragsize, tidsendc->tidflow.frag_size); #ifdef PSM_CUDA /* Matching on previous prefetches and initiating next prefetch */ struct ips_cuda_hostbuf *chb = NULL, *chb_next = NULL; psm2_chb_match_type_t rc = PSMI_CUDA_CONTINUE; /* check if the prefetcher has a buffer ready to use */ tidsendc->cuda_hostbuf[0] = NULL; tidsendc->cuda_hostbuf[1] = NULL; tidsendc->cuda_num_buf = 0; if (req->cuda_hostbuf_used) { /* To get a match: * 1. Tid list offset + length is contained within a chb * 2. Tid list offset + length is contained within * the prefetched offset of this req. * 3. Tid list offset + length is partially prefetched * within one chb. 
(A partial match) */ STAILQ_FOREACH(chb, &req->sendreq_prefetch, req_next) { rc = psmi_find_match_in_prefeteched_chb(chb, tid_list, req->prefetch_send_msgoff); if (rc < PSMI_CUDA_CONTINUE) break; } if (rc == PSMI_CUDA_FULL_MATCH_FOUND) { tidsendc->userbuf = (void *)((uintptr_t) chb->host_buf+ tid_list->tsess_srcoff - chb->offset); tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf + tid_list->tsess_unaligned_start); /* now associate the buffer with the tidsendc */ tidsendc->cuda_hostbuf[0] = chb; tidsendc->cuda_hostbuf[1] = NULL; tidsendc->cuda_num_buf = 1; } else if (rc == PSMI_CUDA_SPLIT_MATCH_FOUND){ void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED, tid_list->tsess_length); tidsendc->userbuf = (void *)((uintptr_t) buffer); tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf + tid_list->tsess_unaligned_start); chb_next = STAILQ_NEXT(chb, req_next); tidsendc->cuda_hostbuf[0] = chb; tidsendc->cuda_hostbuf[1] = chb_next; tidsendc->cuda_num_buf = 2; } else if (rc == PSMI_CUDA_PARTIAL_MATCH_FOUND) { psmi_attach_chb_to_tidsendc(protoexp, req, tidsendc, chb, tid_list->tsess_srcoff, tid_list->tsess_length, tid_list->tsess_unaligned_start, rc); } else { psmi_attach_chb_to_tidsendc(protoexp, req, tidsendc, NULL, tid_list->tsess_srcoff, tid_list->tsess_length, tid_list->tsess_unaligned_start, PSMI_CUDA_CONTINUE); } } #endif /* frag size must be 64B multiples */ tidsendc->frag_size &= (~63); tidsendc->is_complete = 0; tidsendc->tid_idx = 0; tidsendc->frame_send = 0; tidsendc->tidbytes = 0; tidsendc->remaining_tidbytes = tid_list->tsess_length - tid_list->tsess_unaligned_start - tid_list->tsess_unaligned_end; tidsendc->remaining_bytes_in_tid = (IPS_TIDINFO_GET_LENGTH(tid_list->tsess_list[0]) << 12) - tid_list->tsess_tidoffset; tidsendc->offset_in_tid = tid_list->tsess_tidoffset; _HFI_EXP ("alloc tidsend=%4d tidrecv=%4d srcoff=%6d length=%6d,s=%d,e=%d\n", tidsendc->sdescid._desc_idx, rdescid._desc_idx, tid_list->tsess_srcoff, tid_list->tsess_length, tid_list->tsess_unaligned_start, tid_list->tsess_unaligned_end); ips_tid_send_exp(tidsendc); /* Add as a pending op and ring up the timer */ if (tidsendc->is_complete == 0) { STAILQ_INSERT_TAIL(&protoexp->pend_sendq, tidsendc, next); psmi_timer_request(protoexp->timerq, &protoexp->timer_send, PSMI_TIMER_PRIO_1); } PSM2_LOG_MSG("leaving"); /* Consider breaking out of progress engine here */ return PSM2_OK; } static ips_scb_t * ips_scb_prepare_tid_sendctrl(struct ips_flow *flow, struct ips_tid_send_desc *tidsendc) { struct ips_protoexp *protoexp = tidsendc->protoexp; uint32_t *tsess_list = tidsendc->tid_list.tsess_list; uint32_t tid, omode, offset, chunk_size; uint32_t startidx, endidx; uint32_t frame_len, nfrag; uint8_t *bufptr = tidsendc->buffer; ips_scb_t *scb; uint8_t is_payload_per_frag_leq_8dw = 0; /* If payload in the first and last nfrag is less then or equal * to 8DW we disable header suppression so as to detect uncorrectable * errors which will otherwise be non-detectable(since header is * suppressed we lose RHF.EccErr) */ if ((scb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0)) == NULL) return NULL; /* * Make sure the next offset is in 64B multiples with the tid. */ frame_len = min(tidsendc->remaining_bytes_in_tid, tidsendc->remaining_tidbytes); if (frame_len > tidsendc->frag_size) { frame_len = tidsendc->frag_size - (tidsendc->offset_in_tid & 63); } /* * Frame length is the amount of payload to be included in a particular * frag of the scb, so we check if frame len is less than or equal * to 8DW. 
If length is less then then or equal to 8DW for the first * frag then we avoid header suppression */ if (frame_len <= 32) is_payload_per_frag_leq_8dw = 1; /* * Using large offset mode based on offset length. */ if (tidsendc->offset_in_tid < 131072) { /* 2^15 * 4 */ psmi_assert((tidsendc->offset_in_tid % 4) == 0); offset = tidsendc->offset_in_tid / 4; omode = 0; } else { psmi_assert((tidsendc->offset_in_tid % 64) == 0); offset = tidsendc->offset_in_tid / 64; omode = 1; } startidx = tidsendc->tid_idx; tid = IPS_TIDINFO_GET_TID(tsess_list[startidx]); scb->ips_lrh.khdr.kdeth0 = (offset & HFI_KHDR_OFFSET_MASK) | (omode << HFI_KHDR_OM_SHIFT) | (tid << HFI_KHDR_TID_SHIFT); scb->tidctrl = IPS_TIDINFO_GET_TIDCTRL(tsess_list[startidx]); scb->tsess = (uint32_t *) &tsess_list[startidx]; /* * Payload and buffer address for current packet. payload_size * must be the first packet size because it is used to initialize * the packet header. */ scb->payload_size = frame_len; ips_scb_buffer(scb) = (void *)bufptr; scb->frag_size = tidsendc->frag_size; /* * Other packet fields. */ PSM2_LOG_EPM(OPCODE_EXPTID,PSM2_LOG_TX, protoexp->proto->ep->epid, flow->ipsaddr->epaddr.epid, "psmi_mpool_get_obj_index(tidsendc->mqreq): %d, tidsendc->rdescid._desc_idx: %d, tidsendc->sdescid._desc_idx: %d", psmi_mpool_get_obj_index(tidsendc->mqreq),tidsendc->rdescid._desc_idx,tidsendc->sdescid._desc_idx); ips_scb_opcode(scb) = OPCODE_EXPTID; scb->ips_lrh.exp_sdescid = tidsendc->sdescid; scb->ips_lrh.exp_rdescid_genc = (uint16_t)tidsendc->rdescid._desc_genc; scb->ips_lrh.exp_offset = tidsendc->tidbytes; scb->tidsendc = tidsendc; SLIST_NEXT(scb, next) = NULL; /* * Loop over the tid session list, count the frag number and payload size. */ nfrag = 1; chunk_size = frame_len; while (1) { /* Record last tididx used */ endidx = tidsendc->tid_idx; /* Check if all tidbytes are done */ tidsendc->remaining_tidbytes -= frame_len; if (!tidsendc->remaining_tidbytes) { /* We do another frame length check for the last frag */ if (frame_len <= 32) is_payload_per_frag_leq_8dw = 1; break; } /* Update in current tid */ tidsendc->remaining_bytes_in_tid -= frame_len; tidsendc->offset_in_tid += frame_len; psmi_assert((tidsendc->offset_in_tid >= 128*1024) ? 
((tidsendc->offset_in_tid % 64) == 0) : ((tidsendc->offset_in_tid % 4) == 0)); /* Done with this tid, move on to the next tid */ if (!tidsendc->remaining_bytes_in_tid) { tidsendc->tid_idx++; psmi_assert_always(tidsendc->tid_idx < tidsendc->tid_list.tsess_tidcount); tidsendc->remaining_bytes_in_tid = IPS_TIDINFO_GET_LENGTH(tsess_list [tidsendc->tid_idx]) << 12; tidsendc->offset_in_tid = 0; } /* For PIO, only single packet per scb allowed */ if (flow->transfer == PSM_TRANSFER_PIO) { break; } frame_len = min(tidsendc->remaining_bytes_in_tid, tidsendc->remaining_tidbytes); if (frame_len > tidsendc->frag_size) frame_len = tidsendc->frag_size; nfrag++; chunk_size += frame_len; } scb->nfrag = nfrag; if (nfrag > 1) { scb->nfrag_remaining = scb->nfrag; scb->chunk_size = scb->chunk_size_remaining = chunk_size; } scb->tsess_length = (endidx - startidx + 1) * sizeof(uint32_t); /* Keep track of latest buffer location so we restart at the * right location, if we don't complete the transfer */ tidsendc->buffer = bufptr + chunk_size; tidsendc->tidbytes += chunk_size; if (flow->transfer == PSM_TRANSFER_DMA && psmi_hal_has_cap(PSM_HAL_CAP_DMA_HSUPP_FOR_32B_MSGS)) { is_payload_per_frag_leq_8dw = 0; } /* If last packet, we want a completion notification */ if (!tidsendc->remaining_tidbytes) { /* last packet/chunk, attach unaligned data */ uint8_t *dst, *src; if (tidsendc->tid_list.tsess_unaligned_start) { dst = (uint8_t *)scb->ips_lrh.exp_ustart; src = (uint8_t *)tidsendc->userbuf; #ifdef PSM_CUDA if (IS_TRANSFER_BUF_GPU_MEM(scb) && !tidsendc->mqreq->cuda_hostbuf_used) { PSMI_CUDA_CALL(cuMemcpyDtoH, dst, (CUdeviceptr)src, tidsendc->tid_list.tsess_unaligned_start); } else #endif ips_protoexp_unaligned_copy(dst, src, tidsendc->tid_list.tsess_unaligned_start); } if (tidsendc->tid_list.tsess_unaligned_end) { dst = (uint8_t *)&scb->ips_lrh.exp_uend; src = (uint8_t *)tidsendc->userbuf + tidsendc->length - tidsendc->tid_list.tsess_unaligned_end; #ifdef PSM_CUDA if (IS_TRANSFER_BUF_GPU_MEM(scb) && !tidsendc->mqreq->cuda_hostbuf_used) { PSMI_CUDA_CALL(cuMemcpyDtoH, dst, (CUdeviceptr)src, tidsendc->tid_list.tsess_unaligned_end); } else #endif ips_protoexp_unaligned_copy(dst, src, tidsendc->tid_list.tsess_unaligned_end); } /* * If the number of fragments is greater then one and * "no header suppression" flag is unset then we go * ahead and suppress the header */ if ((scb->nfrag > 1) && (!is_payload_per_frag_leq_8dw)) scb->scb_flags |= IPS_SEND_FLAG_HDRSUPP; else scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; tidsendc->is_complete = 1; } else { /* Do not suppress header every hdr_pkt_interval */ if ((++tidsendc->frame_send % protoexp->hdr_pkt_interval) == 0) /* Request an ACK */ scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; else { if (!is_payload_per_frag_leq_8dw) { /* Request hdr supp */ scb->scb_flags |= IPS_SEND_FLAG_HDRSUPP; } } /* assert only single packet per scb */ psmi_assert(scb->nfrag == 1); } #ifdef PSM_CUDA if (tidsendc->mqreq->is_buf_gpu_mem && /* request's buffer comes from GPU realm */ !tidsendc->mqreq->cuda_hostbuf_used) { /* and it was NOT moved to HOST memory */ scb->mq_req = tidsendc->mqreq; /* so let's mark it per scb, not to check its locality again */ ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU; } #endif return scb; } /* * Returns: * * PSM2_OK: scb was allocated for at least one frame, the packet may be queued * or actually sent. * * PSM2_OK_NO_PROGRESS: Reached a limit on the maximum number of sends we allow * to be enqueued before polling receive queue. 
* * PSM2_EP_NO_RESOURCES: No scbs, available, a callback will be issued when more * scbs become available. * * PSM2_TIMEOUT: PIO-busy or DMA-busy, stop trying to send for now. * */ static psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc) { ips_scb_t *scb = NULL; psm2_error_t err = PSM2_OK, err_f; struct ips_protoexp *protoexp = tidsendc->protoexp; struct ips_proto *proto = protoexp->proto; struct ips_flow *flow = &tidsendc->tidflow; #ifdef PSM_CUDA struct ips_cuda_hostbuf *chb, *chb_next; CUresult chb_status; uint32_t offset_in_chb, i; for (i = 0; i < tidsendc->cuda_num_buf; i++) { chb = tidsendc->cuda_hostbuf[i]; if (chb) { PSMI_CUDA_CHECK_EVENT(chb->copy_status, chb_status); if (chb_status != CUDA_SUCCESS) { err = PSM2_OK_NO_PROGRESS; PSM2_LOG_MSG("leaving"); return err; } } } if (tidsendc->cuda_num_buf == 2) { chb = tidsendc->cuda_hostbuf[0]; chb_next = tidsendc->cuda_hostbuf[1]; offset_in_chb = tidsendc->tid_list.tsess_srcoff - chb->offset; /* Copying data from multiple cuda * host buffers into a bounce buffer. */ memcpy(tidsendc->buffer, chb->host_buf + offset_in_chb, chb->size-offset_in_chb); memcpy(tidsendc->buffer+ chb->size - offset_in_chb, chb_next->host_buf, tidsendc->tid_list.tsess_srcoff + tidsendc->tid_list.tsess_length - chb_next->offset); chb->bytes_read += chb->size - offset_in_chb; chb_next->bytes_read += tidsendc->tid_list.tsess_srcoff + tidsendc->tid_list.tsess_length - chb_next->offset; if(chb->bytes_read == chb->size) { STAILQ_REMOVE(&tidsendc->mqreq->sendreq_prefetch, chb, ips_cuda_hostbuf, req_next); if (chb->is_tempbuf) psmi_deallocate_chb(chb); else { chb->req = NULL; chb->offset = 0; chb->bytes_read = 0; psmi_mpool_put(chb); } psmi_cuda_run_prefetcher(protoexp, tidsendc); } if(chb_next->bytes_read == chb_next->size) { STAILQ_REMOVE(&tidsendc->mqreq->sendreq_prefetch, chb_next, ips_cuda_hostbuf, req_next); if (chb_next->is_tempbuf) psmi_deallocate_chb(chb_next); else{ chb_next->req = NULL; chb_next->offset = 0; chb_next->bytes_read = 0; psmi_mpool_put(chb_next); } psmi_cuda_run_prefetcher(protoexp, tidsendc); } } #endif /* * We aggressively try to grab as many scbs as possible, enqueue them to a * flow and flush them when either we're out of scbs our we've completely * filled the send request. 
*/ while (!tidsendc->is_complete) { if_pf(tidsendc->tid_list.tsess_tidcount && (tidsendc->tid_idx >= tidsendc->tid_list.tsess_tidcount || tidsendc->tid_idx < 0)) ips_expsend_tiderr(tidsendc); if ((scb = ips_scb_prepare_tid_sendctrl(flow, tidsendc)) == NULL) { proto->stats.scb_exp_unavail_cnt++; err = PSM2_EP_NO_RESOURCES; break; } else { ips_proto_flow_enqueue(flow, scb); } } if (!SLIST_EMPTY(&flow->scb_pend)) { /* Something to flush */ int num_sent; err_f = flow->flush(flow, &num_sent); if (err != PSM2_EP_NO_RESOURCES) { /* PSM2_EP_NO_RESOURCES is reserved for out-of-scbs */ if (err_f == PSM2_EP_NO_RESOURCES) err = PSM2_TIMEOUT; /* force a resend reschedule */ else if (err_f == PSM2_OK && num_sent > 0 && !ips_ptl_recvq_isempty(protoexp->ptl)) err = PSM2_OK_NO_PROGRESS; /* force a rcvhdrq service */ } } PSM2_LOG_MSG("leaving"); return err; } static psm2_error_t ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current) { struct ips_protoexp *protoexp = (struct ips_protoexp *)timer->context; struct ips_tid_send_pend *phead = &protoexp->pend_sendq; struct ips_tid_send_desc *tidsendc; psm2_error_t err = PSM2_OK; while (!STAILQ_EMPTY(phead)) { tidsendc = STAILQ_FIRST(phead); err = ips_tid_send_exp(tidsendc); if (tidsendc->is_complete) STAILQ_REMOVE_HEAD(phead, next); if (err == PSM2_OK) { /* Was able to complete the send, keep going */ } else if (err == PSM2_EP_NO_RESOURCES) { /* No more sendbufs available, sendbuf callback will requeue this * timer */ break; } else if (err == PSM2_TIMEOUT) { /* Always a case of try later: * On PIO flow, means no send pio bufs available * On DMA flow, means kernel can't queue request or would have to block */ psmi_timer_request(protoexp->proto->timerq, &protoexp->timer_send, get_cycles() + protoexp->proto->timeout_send); break; } else { /* Forced to reschedule later so we can check receive queue */ psmi_assert(err == PSM2_OK_NO_PROGRESS); psmi_timer_request(protoexp->proto->timerq, &protoexp->timer_send, PSMI_TIMER_PRIO_1); break; } } return PSM2_OK; } /* Right now, in the kernel we are allowing for virtually non-contiguous pages, in a single call, and we are therefore locking one page at a time, but since the intended use of this routine is for a single group of virtually contiguous pages, that should change to improve performance. That means possibly changing the calling MPI code. Doing so gets rid of some of the loop stuff here, and in the driver, and allows for a single call to the core VM code in the kernel, rather than one per page, definitely improving performance. */ static psm2_error_t ips_tid_recv_alloc_frag(struct ips_protoexp *protoexp, struct ips_tid_recv_desc *tidrecvc, uint32_t nbytes_this) { ips_tid_session_list *tid_list = &tidrecvc->tid_list; uintptr_t bufptr = (uintptr_t) tidrecvc->buffer; uint32_t size = nbytes_this; psm2_error_t err = PSM2_OK; uintptr_t pageaddr; uint32_t tidoff, pageoff, pagelen, reglen, num_tids; psmi_assert(size >= 4); /* * The following calculation does not work when size < 4 * and bufptr is byte aligned, it can get negative value. */ tid_list->tsess_unaligned_start = (bufptr & 3) ? 
(4 - (bufptr & 3)) : 0;
	size -= tid_list->tsess_unaligned_start;
	bufptr += tid_list->tsess_unaligned_start;
	tid_list->tsess_unaligned_end = size & 3;
	size -= tid_list->tsess_unaligned_end;

	psmi_assert(size > 0);

#ifdef PSM_CUDA
	/* The driver pins GPU pages when GPU Direct RDMA is used for TID
	 * receives; to accommodate this, the calculations of pageaddr,
	 * pagelen and pageoff below take the GPU page size into
	 * consideration. */
	if (tidrecvc->is_ptr_gpu_backed) {
		uint64_t page_mask = ~(PSMI_GPU_PAGESIZE -1);
		uint32_t page_offset_mask = (PSMI_GPU_PAGESIZE -1);
		pageaddr = bufptr & page_mask;
		pagelen = (uint32_t) (PSMI_GPU_PAGESIZE +
				      ((bufptr + size - 1) & page_mask) -
				      (bufptr & page_mask));
		tidoff = pageoff = (uint32_t) (bufptr & page_offset_mask);
	} else
#endif
	{
		pageaddr = bufptr & protoexp->tid_page_mask;
		pagelen = (uint32_t) (PSMI_PAGESIZE +
				      ((bufptr + size - 1) & protoexp->tid_page_mask) -
				      (bufptr & protoexp->tid_page_mask));
		tidoff = pageoff = (uint32_t) (bufptr & protoexp->tid_page_offset_mask);
	}

	reglen = pagelen;
	if (protoexp->tidc.tid_array) {
		if ((err = ips_tidcache_acquire(&protoexp->tidc,
			    (void *)pageaddr, &reglen,
			    (uint32_t *) tid_list->tsess_list, &num_tids,
			    &tidoff
#ifdef PSM_CUDA
			    , tidrecvc->is_ptr_gpu_backed
#endif
			    )))
			goto fail;
	} else {
		if ((err = ips_tid_acquire(&protoexp->tidc,
			    (void *)pageaddr, &reglen,
			    (uint32_t *) tid_list->tsess_list, &num_tids
#ifdef PSM_CUDA
			    , tidrecvc->is_ptr_gpu_backed
#endif
			    )))
			goto fail;
	}

	/*
	 * PSM2 currently provides enough storage space to hold up to
	 * 1024 tids (PSM_TIDLIST_BUFSIZE), so make sure we don't get
	 * more than we can hold from the tidcache here.
	 *
	 * The reason for 1024 tids comes from the PSM_TID_WINSIZE value
	 * (currently 4MB); if that macro ever changes in the future,
	 * PSM_TIDLIST_BUFSIZE needs a matching change as well.
	 *
	 * Assuming a 4KB page size, to be able to receive
	 * a message of 4MB size, we'd need a maximum of 4MB/4KB = 1024 tids.
	 */
	psmi_assert(num_tids > 0);
	psmi_assert(num_tids <= (PSM_TID_WINSIZE/PSM_TIDLIST_BUFSIZE));
	if (reglen > pagelen) {
		err = psmi_handle_error(protoexp->tidc.context->ep,
			    PSM2_EP_DEVICE_FAILURE,
			    "PSM tid registration: "
			    "registered more pages than asked");
		goto fail;
	} else if (reglen < pagelen) {
		/*
		 * The driver registered fewer pages; update PSM records.
		 */
		tid_list->tsess_unaligned_end = 0;
		tidrecvc->recv_tidbytes = reglen - pageoff;
		tidrecvc->recv_msglen = tid_list->tsess_unaligned_start +
		    tidrecvc->recv_tidbytes;
	} else {
		tidrecvc->recv_tidbytes = size;
		tidrecvc->recv_msglen = nbytes_this;
	}

	tid_list->tsess_tidcount = num_tids;
	tid_list->tsess_tidoffset = tidoff;

	ips_dump_tids(tid_list, "Registered %d tids: ", num_tids);

fail:
	return err;
}

static psm2_error_t
ips_tid_recv_alloc(struct ips_protoexp *protoexp,
		   ips_epaddr_t *ipsaddr,
		   const struct ips_tid_get_request *getreq,
		   uint32_t nbytes_this, struct ips_tid_recv_desc **ptidrecvc)
{
	psm2_error_t err;
	ips_scb_t *grantscb, *completescb;
	struct ips_tid_recv_desc *tidrecvc;

	PSM2_LOG_MSG("entering");
	/* Allocate all necessary resources. */

	/* 1. allocate a tid grant scb. */
	grantscb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0);
	if (grantscb == NULL) {
		/* ips_tid_scbavail_callback() will reschedule */
		PSM2_LOG_MSG("leaving");
		return PSM2_EP_NO_RESOURCES;
	}

	/* 2. allocate a tid complete scb.
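	 * (Both control scbs are reserved before the tidflow entry and the
	 * tids themselves are taken, so every later failure path can unwind
	 * by releasing exactly what was acquired, in reverse order.)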
*/ completescb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0); if (completescb == NULL) { ips_scbctrl_free(grantscb); /* ips_tid_scbavail_callback() will reschedule */ PSM2_LOG_MSG("leaving"); return PSM2_EP_NO_RESOURCES; } /* 3. allocate a tid flow entry. */ err = ips_tf_allocate(&protoexp->tfc, &tidrecvc); if (err != PSM2_OK) { ips_scbctrl_free(completescb); ips_scbctrl_free(grantscb); /* Unable to get a tidflow for expected protocol. */ psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); PSM2_LOG_MSG("leaving"); return err; } #ifdef PSM_CUDA psm2_mq_req_t req = (psm2_mq_req_t)getreq->tidgr_ucontext; if (req->is_buf_gpu_mem) tidrecvc->is_ptr_gpu_backed = !getreq->cuda_hostbuf_used; else tidrecvc->is_ptr_gpu_backed = req->is_buf_gpu_mem; /* 4. allocate a cuda bounce buffer, if required */ struct ips_cuda_hostbuf *chb = NULL; if (getreq->cuda_hostbuf_used) { if (nbytes_this <= CUDA_SMALLHOSTBUF_SZ) chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( protoexp->cuda_hostbuf_pool_small_recv); if (chb == NULL) chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( protoexp->cuda_hostbuf_pool_recv); if (chb == NULL) { /* Unable to get a cudahostbuf for TID. * Release the resources we're holding and reschedule.*/ ips_tf_deallocate(&protoexp->tfc, tidrecvc->rdescid._desc_idx); ips_scbctrl_free(completescb); ips_scbctrl_free(grantscb); psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); PSM2_LOG_MSG("leaving"); return PSM2_EP_NO_RESOURCES; } tidrecvc->cuda_hostbuf = chb; tidrecvc->buffer = chb->host_buf; chb->size = 0; chb->gpu_buf = (CUdeviceptr) getreq->tidgr_lbuf + getreq->tidgr_offset; } else { chb = NULL; tidrecvc->buffer = (void *)((uintptr_t) getreq->tidgr_lbuf + getreq->tidgr_offset); tidrecvc->cuda_hostbuf = NULL; } #else tidrecvc->buffer = (void *)((uintptr_t) getreq->tidgr_lbuf + getreq->tidgr_offset); #endif /* 5. allocate some tids from driver. */ err = ips_tid_recv_alloc_frag(protoexp, tidrecvc, nbytes_this); if (err != PSM2_OK) { #ifdef PSM_CUDA if (chb) psmi_mpool_put(chb); #endif ips_tf_deallocate(&protoexp->tfc, tidrecvc->rdescid._desc_idx); ips_scbctrl_free(completescb); ips_scbctrl_free(grantscb); /* Unable to register tids */ psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); PSM2_LOG_MSG("leaving"); return err; } if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) { int num_tids = tidrecvc->tid_list.tsess_tidcount; int tid, i; for (i = 0; i < num_tids; i++) { tid = IPS_TIDINFO_GET_TID(tidrecvc->tid_list. tsess_list[i]) * 2 + IPS_TIDINFO_GET_TIDCTRL(tidrecvc->tid_list. tsess_list[i]) - 1; psmi_assert(protoexp->tid_info[tid].state == TIDSTATE_FREE); psmi_assert(protoexp->tid_info[tid].tidrecvc == NULL); psmi_assert(protoexp->tid_info[tid].tid == 0xFFFFFFFF); protoexp->tid_info[tid].state = TIDSTATE_USED; protoexp->tid_info[tid].tidrecvc = tidrecvc; protoexp->tid_info[tid].tid = tidrecvc->tid_list.tsess_list[i]; } } /* Initialize recv descriptor */ tidrecvc->ipsaddr = ipsaddr; tidrecvc->getreq = (struct ips_tid_get_request *)getreq; /* Initialize tidflow, instead calling generic routine: ips_flow_init(&tidrecvc->tidflow, protoexp->proto, ipsaddr, protoexp->ctrl_xfer_type, PSM_PROTOCOL_TIDFLOW, IPS_PATH_LOW_PRIORITY, EP_FLOW_TIDFLOW); * only reset following necessary field. 
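 */

/*
 * [Added illustration, not part of the original source] The allocation
 * sequence above (grant scb, complete scb, tidflow, tids, plus the
 * optional CUDA bounce buffer) releases everything acquired so far in
 * reverse order whenever a later step fails, then reschedules via a
 * timer. A minimal standalone sketch of that acquire-with-unwind
 * pattern, using hypothetical acquire()/release() resources:
 */
#include <stdio.h>

static int acquire(const char *name) { printf("acquire %s\n", name); return 0; }
static void release(const char *name) { printf("release %s\n", name); }

static int setup_all(void)
{
	if (acquire("grantscb"))
		goto fail_grant;
	if (acquire("completescb"))
		goto fail_complete;
	if (acquire("tidflow"))
		goto fail_tidflow;
	return 0;		/* every resource is now held */

fail_tidflow:
	release("completescb");	/* unwind in reverse acquisition order */
fail_complete:
	release("grantscb");
fail_grant:
	return -1;		/* caller can retry/reschedule later */
}

int main(void)
{
	return setup_all() ? 1 : 0;
}

/*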
*/ tidrecvc->tidflow.ipsaddr = ipsaddr; tidrecvc->tidflow.flags = 0; tidrecvc->tidflow_nswap_gen = 0; tidrecvc->tidflow_genseq.psn_gen = tidrecvc->tidflow_active_gen; tidrecvc->tidflow_genseq.psn_seq = 0; /* Always start sequence number at 0 (zero), in order to prevent wraparound sequence numbers */ psmi_hal_tidflow_set_entry( tidrecvc->rdescid._desc_idx, tidrecvc->tidflow_genseq.psn_gen, tidrecvc->tidflow_genseq.psn_seq, tidrecvc->context->psm_hw_ctxt); tidrecvc->tid_list.tsess_srcoff = getreq->tidgr_offset; tidrecvc->tid_list.tsess_length = tidrecvc->recv_msglen; tidrecvc->ctrl_msg_queued = 0; tidrecvc->state = TIDRECVC_STATE_BUSY; tidrecvc->stats.nSeqErr = 0; tidrecvc->stats.nGenErr = 0; tidrecvc->stats.nReXmit = 0; tidrecvc->stats.nErrChkReceived = 0; /* This gets sent out as a control message, so we need to force 4-byte IB * alignment */ tidrecvc->tsess_tidlist_length = (uint16_t) PSMI_ALIGNUP((sizeof(ips_tid_session_list) + (tidrecvc->tid_list.tsess_tidcount * sizeof(uint32_t))), 4); _HFI_EXP("alloc tidrecv=%d, paylen=%d, ntid=%d\n", tidrecvc->rdescid._desc_idx, tidrecvc->tsess_tidlist_length, tidrecvc->tid_list.tsess_tidcount); tidrecvc->grantscb = grantscb; tidrecvc->completescb = completescb; *ptidrecvc = tidrecvc; /* return to caller */ PSM2_LOG_MSG("leaving"); return PSM2_OK; } static psm2_error_t ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) { struct ips_tid_get_pend *phead = &((struct ips_protoexp *)timer->context)->pend_getreqsq; struct ips_protoexp *protoexp; struct ips_tid_get_request *getreq; struct ips_tid_recv_desc *tidrecvc; ips_epaddr_t *ipsaddr; uint32_t nbytes_this, count; int ret; PSM2_LOG_MSG("entering"); #ifdef PSM_CUDA if (!(((struct ips_protoexp *)timer->context)->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) || ((((struct ips_protoexp *)timer->context)->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) && gpudirect_recv_threshold)) { /* Before processing pending TID requests, first try to free up * any CUDA host buffers that are now idle. */ struct ips_tid_get_cudapend *cphead = &((struct ips_protoexp *)timer->context)->cudapend_getreqsq; psm2_error_t err; /* See if any CUDA memcpys are in progress. Grab the first getreq... */ while (!STAILQ_EMPTY(cphead)) { getreq = STAILQ_FIRST(cphead); err = psmi_cuda_reclaim_hostbufs(getreq); if (err == PSM2_OK_NO_PROGRESS) goto cudapend_exit; /* This pending cuda getreq has no more CUDA ops queued up. * Either it's completely done, or the CUDA copies have caught * up with the TID data xfer, but the TID xfer itself is not * finished. */ if (getreq->tidgr_cuda_bytesdone == getreq->tidgr_length) { /* TID xfer is done. * We should only get here if: * this involved a cuda copy, and * the TID xfer is done. */ psmi_assert(getreq->cuda_hostbuf_used); psmi_assert(getreq->tidgr_length == getreq->tidgr_offset); /* Remove from the cudapend list, and reclaim */ getreq->tidgr_protoexp = NULL; getreq->tidgr_epaddr = NULL; STAILQ_REMOVE_HEAD(cphead, tidgr_next); /* mark the req as done */ if (getreq->tidgr_callback) getreq->tidgr_callback(getreq->tidgr_ucontext); psmi_mpool_put(getreq); } else break; /* CUDA xfers in progress. Leave.
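 */

/*
 * [Added illustration, not part of the original source] The loop above
 * reclaims finished CUDA getreqs strictly from the head of the pending
 * FIFO and stops at the first one still in flight, so completions are
 * delivered in order. A standalone sketch of that head-first drain:
 */
#include <assert.h>
#include <stddef.h>

struct req { int done; struct req *next; };

static struct req *drain(struct req *head)
{
	while (head && head->done)
		head = head->next;	/* pop completed head, keep order */
	return head;			/* first busy request, or NULL */
}

int main(void)
{
	struct req c = { 0, NULL };
	struct req b = { 1, &c };
	struct req a = { 1, &b };

	assert(drain(&a) == &c);	/* a and b reclaimed; c still busy */
	return 0;
}

/*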
*/ } } cudapend_exit: #endif while (!STAILQ_EMPTY(phead)) { getreq = STAILQ_FIRST(phead); ipsaddr = (ips_epaddr_t *) (getreq->tidgr_epaddr); count = ipsaddr->msgctl->ipsaddr_count; ipsaddr_next: ipsaddr = ipsaddr->msgctl->ipsaddr_next; ipsaddr->msgctl->ipsaddr_next = ipsaddr->next; protoexp = ((psm2_epaddr_t) ipsaddr)->proto->protoexp; if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) { psmi_assert(protoexp->proto->msgflowid < EP_FLOW_LAST); struct ips_flow *flow = &ipsaddr->flows[protoexp->proto->msgflowid]; if (flow->flags & IPS_FLOW_FLAG_SKIP_CTS) { break; /* skip sending next CTS */ } } #ifdef PSM_CUDA if (getreq->cuda_hostbuf_used) { /* If this is a large transfer, we may be able to * start reclaiming before all of the data is sent. */ psmi_cuda_reclaim_hostbufs(getreq); } #endif /* * Calculate the next window size, avoid the last * window too small. */ nbytes_this = getreq->tidgr_length - getreq->tidgr_offset; if (nbytes_this >= 2 * getreq->tidgr_rndv_winsz) nbytes_this = getreq->tidgr_rndv_winsz; else if (nbytes_this > getreq->tidgr_rndv_winsz) nbytes_this /= 2; /* * If there is a next window and the next window * length is greater than PAGESIZE, make sure the window * starts on a page boundary. */ #ifdef PSM_CUDA psm2_mq_req_t req = (psm2_mq_req_t)getreq->tidgr_ucontext; if (req->is_buf_gpu_mem){ if (((getreq->tidgr_offset + nbytes_this) < getreq->tidgr_length) && nbytes_this > PSMI_GPU_PAGESIZE) { uint32_t pageoff = (((uintptr_t)getreq->tidgr_lbuf) & (PSMI_GPU_PAGESIZE - 1)) + getreq->tidgr_offset + nbytes_this; nbytes_this -= pageoff & (PSMI_GPU_PAGESIZE - 1); } } else #endif { if ((getreq->tidgr_offset + nbytes_this) < getreq->tidgr_length && nbytes_this > PSMI_PAGESIZE) { uint32_t pageoff = (((uintptr_t)getreq->tidgr_lbuf) & (PSMI_PAGESIZE - 1)) + getreq->tidgr_offset + nbytes_this; nbytes_this -= pageoff & (PSMI_PAGESIZE - 1); } } psmi_assert(nbytes_this >= 4); psmi_assert(nbytes_this <= PSM_TID_WINSIZE); if ((ret = ips_tid_num_available(&protoexp->tidc)) <= 0) { /* We're out of tids. If this process used all the resource, * the free callback will reschedule the operation, otherwise, * we reschedule it here */ if (ret == 0) { psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); } } else if ((ret = ips_tf_available(&protoexp->tfc)) <= 0) { /* We're out of tidflow. If this process used all the resource, * the free callback will reschedule the operation, otherwise, * we reschedule it here */ if (ret == 0) { psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); } } else if (ips_tid_recv_alloc(protoexp, ipsaddr, getreq, nbytes_this, &tidrecvc) == PSM2_OK) { ips_protoexp_send_tid_grant(tidrecvc); if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) { /* * Once the CTS was sent, we mark it per 'flow' object * not to proceed with next CTSes until that one is done. */ struct ips_proto *proto = tidrecvc->protoexp->proto; psmi_assert(proto->msgflowid < EP_FLOW_LAST); struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid]; flow->flags |= IPS_FLOW_FLAG_SKIP_CTS; } /* * nbytes_this is the asked length for this session, * ips_tid_recv_alloc() might register less pages, the * real length is in tidrecvc->recv_msglen. 
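 */

/*
 * [Added illustration, not part of the original source] A standalone
 * sketch of the window sizing rule a few lines above: take a full
 * rendezvous window while at least two windows of data remain, and
 * split an in-between remainder in half so the final window is never
 * disproportionately small (the sizes used here are arbitrary):
 */
#include <assert.h>
#include <stdint.h>

static uint32_t next_window(uint32_t winsz, uint32_t length, uint32_t offset)
{
	uint32_t nbytes = length - offset;

	if (nbytes >= 2 * winsz)
		nbytes = winsz;		/* plenty left: one full window */
	else if (nbytes > winsz)
		nbytes /= 2;		/* split the tail into two halves */
	return nbytes;
}

int main(void)
{
	assert(next_window(128, 1024, 0) == 128);	/* many windows left */
	assert(next_window(128, 192, 0) == 96);		/* 96 + 96, not 128 + 64 */
	return 0;
}

/*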
*/ getreq->tidgr_offset += tidrecvc->recv_msglen; psmi_assert(getreq->tidgr_offset <= getreq->tidgr_length); _HFI_VDBG("GRANT tididx=%d srcoff=%d nbytes=%d/%d\n", tidrecvc->rdescid._desc_idx, getreq->tidgr_offset, tidrecvc->recv_msglen, getreq->tidgr_length); if (getreq->tidgr_offset == getreq->tidgr_length) { #ifdef PSM_CUDA if (getreq->cuda_hostbuf_used) { /* this completes the tid xfer setup. move to the pending cuda ops queue, set the timer to catch completion */ STAILQ_REMOVE_HEAD(phead, tidgr_next); STAILQ_INSERT_TAIL( &getreq->tidgr_protoexp->cudapend_getreqsq, getreq, tidgr_next); psmi_timer_request(getreq->tidgr_protoexp->timerq, &getreq->tidgr_protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); continue; } #endif getreq->tidgr_protoexp = NULL; getreq->tidgr_epaddr = NULL; STAILQ_REMOVE_HEAD(phead, tidgr_next); continue; /* try next grant request */ } else if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_RTS_CTS_INTERLEAVE) { /* In case of multi rail, PSM sends one CTS per request * per card after which the request is moved to the end * of the queue. */ count--; if (count) goto ipsaddr_next; STAILQ_REMOVE_HEAD(phead, tidgr_next); STAILQ_INSERT_TAIL(phead, getreq ,tidgr_next); continue; } /* created a tidrecvc, reset count */ count = ipsaddr->msgctl->ipsaddr_count; goto ipsaddr_next; /* try next fragment on next ipsaddr */ } /* * We need to loop until we can't get a tidrecvc on all * ipsaddrs, then the callbacks on the home protoexp where * getreq is linked can resume this routine. Otherwise, we * might make this getreq to be orphaned and cause deadlock. */ count--; if (count) goto ipsaddr_next; break; } PSM2_LOG_MSG("leaving"); return PSM2_OK; /* XXX err-broken */ } #ifdef PSM_CUDA static void psmi_cudamemcpy_tid_to_device(struct ips_tid_recv_desc *tidrecvc) { struct ips_protoexp *protoexp = tidrecvc->protoexp; struct ips_cuda_hostbuf *chb; chb = tidrecvc->cuda_hostbuf; chb->size += tidrecvc->recv_tidbytes + tidrecvc->tid_list.tsess_unaligned_start + tidrecvc->tid_list.tsess_unaligned_end; PSMI_CUDA_CALL(cuMemcpyHtoDAsync, chb->gpu_buf, chb->host_buf, tidrecvc->recv_tidbytes + tidrecvc->tid_list.tsess_unaligned_start + tidrecvc->tid_list.tsess_unaligned_end, protoexp->cudastream_recv); PSMI_CUDA_CALL(cuEventRecord, chb->copy_status, protoexp->cudastream_recv); STAILQ_INSERT_TAIL(&tidrecvc->getreq->pend_cudabuf, chb, next); tidrecvc->cuda_hostbuf = NULL; ips_tid_pendtids_timer_callback(&tidrecvc->getreq->tidgr_protoexp->timer_getreqs,0); } #endif static psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc) { struct ips_protoexp *protoexp = tidrecvc->protoexp; struct ips_tid_get_request *getreq = tidrecvc->getreq; int tidcount = tidrecvc->tid_list.tsess_tidcount; psm2_error_t err = PSM2_OK; psmi_assert(getreq != NULL); psmi_assert(tidcount > 0); psmi_assert(tidrecvc->state == TIDRECVC_STATE_BUSY); #ifdef PSM_CUDA if (tidrecvc->cuda_hostbuf) psmi_cudamemcpy_tid_to_device(tidrecvc); #endif if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) { int tid, i; for (i = 0; i < tidcount; i++) { tid = IPS_TIDINFO_GET_TID(tidrecvc->tid_list. tsess_list[i]) * 2 + IPS_TIDINFO_GET_TIDCTRL(tidrecvc->tid_list. 
tsess_list[i]) - 1; psmi_assert(protoexp->tid_info[tid].state == TIDSTATE_USED); psmi_assert(protoexp->tid_info[tid].tidrecvc == tidrecvc); psmi_assert(protoexp->tid_info[tid].tid == tidrecvc->tid_list.tsess_list[i]); protoexp->tid_info[tid].state = TIDSTATE_FREE; protoexp->tid_info[tid].tidrecvc = NULL; protoexp->tid_info[tid].tid = 0xFFFFFFFF; } } ips_dump_tids(&tidrecvc->tid_list, "Deregistered %d tids: ", tidrecvc->tid_list.tsess_tidcount); if (protoexp->tidc.tid_array) { if ((err = ips_tidcache_release(&protoexp->tidc, tidrecvc->tid_list.tsess_list, tidcount))) goto fail; } else { if ((err = ips_tid_release(&protoexp->tidc, tidrecvc->tid_list.tsess_list, tidcount))) goto fail; } getreq->tidgr_bytesdone += tidrecvc->recv_msglen; _HFI_EXP("req=%p bytes=%d/%d\n", getreq->tidgr_ucontext, getreq->tidgr_bytesdone, getreq->tidgr_length); tidrecvc->state = TIDRECVC_STATE_FREE; /* finally free the tidflow */ ips_tf_deallocate(&protoexp->tfc, tidrecvc->rdescid._desc_idx); if (getreq->tidgr_bytesdone == getreq->tidgr_length) { #ifdef PSM_CUDA /* if cuda, we handle callbacks when the cuda xfer is done */ if (!getreq->cuda_hostbuf_used) { if (getreq->tidgr_callback) getreq->tidgr_callback(getreq->tidgr_ucontext); psmi_mpool_put(getreq); } #else if (getreq->tidgr_callback) getreq->tidgr_callback(getreq->tidgr_ucontext); psmi_mpool_put(getreq); #endif } else { /* We just released some tids. * If requests are waiting on tids to be * freed, queue up the timer */ if (getreq->tidgr_offset < getreq->tidgr_length) { ips_tid_pendtids_timer_callback(&getreq-> tidgr_protoexp-> timer_getreqs, 0); } } if (!STAILQ_EMPTY(&protoexp->pend_getreqsq)) { psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); } fail: return err; } void ips_protoexp_handle_tiderr(const struct ips_recvhdrq_event *rcv_ev) { struct ips_tid_recv_desc *tidrecvc; struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; struct ips_message_header *p_hdr = rcv_ev->p_hdr; ptl_arg_t desc_id; int tidpair = (__le32_to_cpu(p_hdr->khdr.kdeth0) >> HFI_KHDR_TID_SHIFT) & HFI_KHDR_TID_MASK; int tidctrl = (__le32_to_cpu(p_hdr->khdr.kdeth0) >> HFI_KHDR_TIDCTRL_SHIFT) & HFI_KHDR_TIDCTRL_MASK; int tid0, tid1, tid; psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID); /* Expected sends not enabled */ if (protoexp == NULL) return; /* Not doing extra tid debugging or not really a tiderr */ if (!(protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) || !(psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf) & PSMI_HAL_RHF_ERR_TID)) return; if (psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) != PSM_HAL_RHF_RX_TYPE_EXPECTED) { _HFI_ERROR("receive type %d is not " "expected in tid debugging\n", psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf)); return; } desc_id._desc_idx = ips_proto_flowid(p_hdr); desc_id._desc_genc = p_hdr->exp_rdescid_genc; tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; if (tidctrl != 3) tid0 = tid1 = tidpair * 2 + tidctrl - 1; else { tid0 = tidpair * 2; tid1 = tid0 + 1; } for (tid = tid0; tid <= tid1; tid++) { if (protoexp->tid_info[tid].state == TIDSTATE_USED) continue; char buf[128]; char *s = "invalid (not even in table)"; if (tidrecvc->rdescid._desc_genc == desc_id._desc_genc) s = "valid"; else { snprintf(buf, sizeof(buf) - 1, "wrong generation (gen=%d,received=%d)", tidrecvc->rdescid._desc_genc, desc_id._desc_genc); buf[sizeof(buf) - 1] = '\0'; s = buf; } if (protoexp->tid_info[tid].tidrecvc != tidrecvc) { _HFI_ERROR ("tid %d not a known member of tidsess %d\n", tid, desc_id._desc_idx); } _HFI_ERROR("tid %d 
is marked unused (session=%d): %s\n", tid, desc_id._desc_idx, s); } return; } void ips_protoexp_handle_data_err(const struct ips_recvhdrq_event *rcv_ev) { struct ips_tid_recv_desc *tidrecvc; struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; struct ips_message_header *p_hdr = rcv_ev->p_hdr; int hdr_err = psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf) & PSMI_HAL_RHF_ERR_KHDRLEN; uint8_t op_code = _get_proto_hfi_opcode(p_hdr); char pktmsg[128]; char errmsg[256]; psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID); /* Expected sends not enabled */ if (protoexp == NULL) return; ips_proto_get_rhf_errstring(psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf), pktmsg, sizeof(pktmsg)); snprintf(errmsg, sizeof(errmsg), "%s pkt type opcode 0x%x at hd=0x%x %s\n", (psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == PSM_HAL_RHF_RX_TYPE_EAGER) ? "Eager" : (psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == PSM_HAL_RHF_RX_TYPE_EXPECTED) ? "Expected" : (psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == PSM_HAL_RHF_RX_TYPE_NON_KD) ? "Non-kd" : "", op_code, rcv_ev->recvq->state->hdrq_head, pktmsg); if (!hdr_err) { ptl_arg_t desc_id; psmi_seqnum_t sequence_num; desc_id._desc_idx = ips_proto_flowid(p_hdr); desc_id._desc_genc = p_hdr->exp_rdescid_genc; tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; if (tidrecvc->rdescid._desc_genc != desc_id._desc_genc) { /* Print this at very verbose level. Noisy links can have a few of * these! */ _HFI_VDBG ("Data Error Pkt and Recv Generation Mismatch: %s", errmsg); return; /* skip */ } if (tidrecvc->state == TIDRECVC_STATE_FREE) { _HFI_EPDBG ("Data Error Pkt for a Completed Rendezvous: %s", errmsg); return; /* skip */ } /* See if CRC error for a previous packet */ sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]); if (sequence_num.psn_gen == tidrecvc->tidflow_genseq.psn_gen) { /* Try to recover the flow by restarting from previous known good * sequence (possible if the packet with CRC error is after the "known * good PSN" else we can't restart the flow. */ return ips_protoexp_do_tf_seqerr(protoexp, tidrecvc, p_hdr); } else { /* Print this at very verbose level */ _HFI_VDBG ("Data Error Packet. GenMismatch: Yes. Tidrecvc: %p. " "Pkt Gen.Seq: %d.%d, TF Gen.Seq: %d.%d. 
%s\n", tidrecvc, sequence_num.psn_gen, sequence_num.psn_seq, tidrecvc->tidflow_genseq.psn_gen, tidrecvc->tidflow_genseq.psn_seq, errmsg); } } else { _HFI_VDBG("HDR_ERROR: %s\n", errmsg); } } psm2_error_t ips_protoexp_flow_newgen(struct ips_tid_recv_desc *tidrecvc) { psmi_assert_always(tidrecvc->state == TIDRECVC_STATE_BUSY); ips_tfgen_allocate(&tidrecvc->protoexp->tfc, tidrecvc->rdescid._desc_idx, &tidrecvc->tidflow_active_gen); /* Update tidflow table with new generation number */ tidrecvc->tidflow_genseq.psn_gen = tidrecvc->tidflow_active_gen; psmi_hal_tidflow_set_entry( tidrecvc->rdescid._desc_idx, tidrecvc->tidflow_genseq.psn_gen, tidrecvc->tidflow_genseq.psn_seq, tidrecvc->context->psm_hw_ctxt); /* Increment swapped generation count for tidflow */ tidrecvc->tidflow_nswap_gen++; return PSM2_OK; } void ips_protoexp_handle_tf_seqerr(const struct ips_recvhdrq_event *rcv_ev) { struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; struct ips_message_header *p_hdr = rcv_ev->p_hdr; struct ips_tid_recv_desc *tidrecvc; ptl_arg_t desc_id; psmi_assert_always(protoexp != NULL); psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID); desc_id._desc_idx = ips_proto_flowid(p_hdr); desc_id._desc_genc = p_hdr->exp_rdescid_genc; tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; if (tidrecvc->rdescid._desc_genc == desc_id._desc_genc && tidrecvc->state == TIDRECVC_STATE_BUSY) ips_protoexp_do_tf_seqerr(protoexp, tidrecvc, p_hdr); return; } static void ips_protoexp_do_tf_seqerr(void *vpprotoexp /* actually: struct ips_protoexp *protoexp */, void *vptidrecvc /* actually: struct ips_tid_recv_desc *tidrecvc */, struct ips_message_header *p_hdr) { struct ips_protoexp *protoexp = (struct ips_protoexp *) vpprotoexp; struct ips_tid_recv_desc *tidrecvc = (struct ips_tid_recv_desc *) vptidrecvc; psmi_seqnum_t sequence_num, tf_sequence_num; ips_scb_t ctrlscb; /* Update stats for sequence errors */ tidrecvc->stats.nSeqErr++; sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]); /* Only care about sequence error for currently active generation */ if (tidrecvc->tidflow_active_gen != sequence_num.psn_gen) return; /* If a "large" number of swapped generation we are loosing packets * for this flow. Request throttling of tidflow by generating a * BECN. With header suppression we will miss some FECN packet * on OPA hence keeping track of swapped generation is another * mechanism to do congestion control for tidflows. * * For mismatched sender/receiver/link speeds we can get into a * deadly embrace where minimal progress is made due to generation * mismatch errors. This can occur if we wrap around the generation * count without making progress. Hence in cases where the swapped * generation count is > 254 stop sending BECN (and the NAK) so the * send -> receiver pipeline is flushed with an error check and things * can sync up. This should be an extremely rare event. */ if_pf(tidrecvc->tidflow_nswap_gen >= 254) return; /* Do not send NAK. Let error check kick in. */ if_pf((tidrecvc->tidflow_nswap_gen > 4) && (protoexp->proto->flags & IPS_PROTO_FLAG_CCA)) { _HFI_CCADBG("Generating BECN. Number of swapped gen: %d.\n", tidrecvc->tidflow_nswap_gen); /* Mark flow to generate BECN in control packet */ tidrecvc->tidflow.flags |= IPS_FLOW_FLAG_GEN_BECN; /* Update stats for congestion encountered */ protoexp->proto->epaddr_stats.congestion_pkts++; } /* Get the latest seq from hardware tidflow table, if that value is * reliable. 
The value is not reliable if context sharing is used, * because context sharing might drop packet even though hardware * has received it successfully. The hardware table may also be * incorrect if RSM is intercepting TID & FECN & SH packets. * We can handle this condition by taking the most recent PSN whether * it comes from the tidflow table or from PSM's own accounting. */ if (!tidrecvc->context->tf_ctrl) { uint64_t tf; uint32_t seqno=0; psmi_hal_tidflow_get(tidrecvc->rdescid._desc_idx, &tf, tidrecvc->context->psm_hw_ctxt); psmi_hal_tidflow_get_seqnum(tf, &seqno); tf_sequence_num.psn_val = seqno; if (psmi_hal_has_cap(PSM_HAL_CAP_RSM_FECN_SUPP)) { if (tf_sequence_num.psn_val > tidrecvc->tidflow_genseq.psn_seq) tidrecvc->tidflow_genseq.psn_seq = tf_sequence_num.psn_seq; } else tidrecvc->tidflow_genseq.psn_seq = tf_sequence_num.psn_seq; } /* Swap generation for the flow. */ ips_protoexp_flow_newgen(tidrecvc); ctrlscb.scb_flags = 0; ctrlscb.ips_lrh.data[0] = p_hdr->exp_sdescid; /* Keep peer generation but use my last received sequence */ sequence_num.psn_seq = tidrecvc->tidflow_genseq.psn_seq; ctrlscb.ips_lrh.ack_seq_num = sequence_num.psn_val; /* My new generation and last received sequence */ ctrlscb.ips_lrh.data[1].u32w0 = tidrecvc->tidflow_genseq.psn_val; ips_proto_send_ctrl_message(&tidrecvc->tidflow, OPCODE_NAK, &tidrecvc->ctrl_msg_queued, &ctrlscb, ctrlscb.cksum, 0); /* Update stats for retransmit */ tidrecvc->stats.nReXmit++; return; } void ips_protoexp_handle_tf_generr(const struct ips_recvhdrq_event *rcv_ev) { struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; struct ips_message_header *p_hdr = rcv_ev->p_hdr; struct ips_tid_recv_desc *tidrecvc; ptl_arg_t desc_id; psmi_assert_always(protoexp != NULL); psmi_assert(_get_proto_hfi_opcode(p_hdr) == OPCODE_EXPTID); /* For a generation error our NAK crossed on the wire or this is a stale * packet. Error recovery should sync things up again. Just drop this * packet. */ desc_id._desc_idx = ips_proto_flowid(p_hdr); desc_id._desc_genc = p_hdr->exp_rdescid_genc; tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; if (tidrecvc->rdescid._desc_genc == desc_id._desc_genc && tidrecvc->state == TIDRECVC_STATE_BUSY) ips_protoexp_do_tf_generr(protoexp, tidrecvc, p_hdr); return; } static void ips_protoexp_do_tf_generr(void *vpprotoexp /* actually: struct ips_protoexp *protoexp */, void *vptidrecvc /* actually: struct ips_tid_recv_desc *tidrecvc */, struct ips_message_header *p_hdr) { struct ips_tid_recv_desc *tidrecvc = (struct ips_tid_recv_desc *) vptidrecvc; /* Update stats for generation errors */ tidrecvc->stats.nGenErr++; /* If packet faced congestion we may want to generate * a CN packet to rate control sender. */ return; } opa-psm2-PSM2_11.2.185/ptl_ips/ips_proto_header.h000066400000000000000000000151331370564314600213750ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef _IPS_PROTO_HEADER_H #define _IPS_PROTO_HEADER_H /* Although defined as macros, the *_BITS values below are NOT meant to be changed. They are defined this way so that their values are written in exactly one place. These macros are used in struct ips_message_header below, as well as in the active messages code for the purpose of establishing how many arguments/handlers are supported, and to assert that values written into the header fields are not too large for the number of bits available. The preprocessor check below ensures less than 32 bits are used. */ /* Number of bits to use for the amhdr_len field. */ #define IPS_AM_HDR_LEN_BITS 4 /* Number of bits to use for the amhdr_hidx field. Bounds the number of * handlers supported (1 << IPS_AM_HDR_HIDX_BITS). */ #define IPS_AM_HDR_HIDX_BITS 8 /* Number of bits to use for the amhdr_nargs field. Bounds the number of arguments supported (1 << IPS_AM_HDR_NARGS_BITS). */ #define IPS_AM_HDR_NARGS_BITS 4 #if (IPS_AM_HDR_LEN_BITS + IPS_AM_HDR_HIDX_BITS + IPS_AM_HDR_NARGS_BITS) > 32 #error "Bad IPS header definition: AM fields must use 32 bits or less" #endif /* Number of AM arguments that can be packets into struct_ips_message_header. Remaining arguments up to the max (1 << IPS_AM_HDR_NARGS_BITS) are placed in the data payload. 
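 */

/*
 * [Added illustration, not part of the original source] The three AM
 * subfields declared above must share one 32-bit header word. A
 * standalone sketch of that packing, with the field widths mirrored
 * from IPS_AM_HDR_{LEN,NARGS,HIDX}_BITS and the same "32 bits or less"
 * guard expressed as a compile-time check:
 */
#include <stdint.h>

#define AM_LEN_BITS   4
#define AM_NARGS_BITS 4
#define AM_HIDX_BITS  8

struct am_hdr {
	uint32_t amhdr_len:AM_LEN_BITS;		/* payload length */
	uint32_t amhdr_nargs:AM_NARGS_BITS;	/* argument count */
	uint32_t amhdr_hidx:AM_HIDX_BITS;	/* handler index */
};

/* Fails to compile if the widths ever exceed the 32-bit budget. */
typedef char am_hdr_fits[(AM_LEN_BITS + AM_NARGS_BITS + AM_HIDX_BITS <= 32) ? 1 : -1];

/*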
*/ #define IPS_AM_HDR_NARGS \ (sizeof(((struct ips_message_header *)0)->data) / sizeof(psm2_amarg_t)) /* The actual size of the message header is determined by three paramters: * IPS_HEADER_QUEUE_IWORDS (fixed at 5 by hardware) * OPA words contain LRH and BTH * IPS_HEADER_QUEUE_HWORDS (fixed at 2 by ips protocol) * IPS hardware words contain ips-protocol-specific data * IPS_HEADER_QUEUE_UWORDS (fixed at 7 by ips protocol) * IPS user words contain ips-protocol-specific data * * The header message size is determined to as IWORDS + HWORDS + UWORDS */ struct ips_message_header { __be16 lrh[4]; __be32 bth[3]; /* fields below this point are in host byte order */ struct hfi_kdeth khdr; struct { __u32 flags:6; __u32 connidx:26; /* connection idx */ }; union { struct { struct { __u32 ack_seq_num:31; __u32 reserved:1; }; union { struct { /* for active message */ __u32 amhdr_len:IPS_AM_HDR_LEN_BITS; __u32 amhdr_nargs:IPS_AM_HDR_NARGS_BITS; __u32 amhdr_hidx:IPS_AM_HDR_HIDX_BITS; }; __u32 mdata; /* for misc data */ }; /* Inline arguments and/or message payload */ union { ptl_arg_t data[2]; __u32 uwords[4]; }; }; /* for message header packet only */ struct { __u32 pad1; __u32 tag[3]; /* 96 bits psm tag */ ptl_arg_t hdr_data; }; /* for expected tid packet only */ struct { __u8 exp_ustart[3]; /* unaligned start bytes */ __u8 exp_uend[3]; /* unaligned end bytes */ __u16 exp_rdescid_genc; /* tidrecvc gen count */ ptl_arg_t exp_sdescid; /* sender descriptor id */ __u32 exp_cksum; /* optional checksum */ __u32 exp_offset; /* packet offset */ }; }; }; /* * OpCodes in BTH[0], 24-31 bits. Order is important!!! */ #define OPCODE_RESERVED 0xC0 /* reserved */ #define OPCODE_TINY 0xC1 /* 0 <= msglen <= 8 */ #define OPCODE_SHORT 0xC2 /* 8 < msglen <= MTU */ #define OPCODE_EAGER 0xC3 /* eager packet */ #define OPCODE_LONG_RTS 0xC4 /* ready to send */ #define OPCODE_LONG_CTS 0xC5 /* confirm to send */ #define OPCODE_LONG_DATA 0xC6 /* long data packets */ #define OPCODE_EXPTID 0xC7 /* expected tid data */ #define OPCODE_EXPTID_COMPLETION 0xC8 /* expected tid completion */ #define OPCODE_ACK 0xC9 /* explicit ACK packet */ #define OPCODE_NAK 0xCA /* explicit NAK packet */ #define OPCODE_BECN 0xCB /* congestion control */ #define OPCODE_ERR_CHK 0xCC /* query eager receiving */ #define OPCODE_ERR_CHK_GEN 0xCD /* query tid receiving */ #define OPCODE_CONNECT_REQUEST 0xCE /* connect request */ #define OPCODE_CONNECT_REPLY 0xCF /* connect reply */ #define OPCODE_DISCONNECT_REQUEST 0xD0 /* disconnect request */ #define OPCODE_DISCONNECT_REPLY 0xD1 /* disconnect reply */ #define OPCODE_AM_REQUEST_NOREPLY 0xD2 /* AM request w/o reply */ #define OPCODE_AM_REQUEST 0xD3 /* AM request */ #define OPCODE_AM_REPLY 0xD4 /* AM reply */ #define OPCODE_FUTURE_FROM 0xD5 /* reserved for expansion */ #define OPCODE_FUTURE_TO 0xDF /* reserved for expansion */ #endif /* _IPS_PROTO_HEADER_H */ opa-psm2-PSM2_11.2.185/ptl_ips/ips_proto_help.h000066400000000000000000000442271370564314600211030ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2017 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2017 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef _IPS_PROTO_HELP_H #define _IPS_PROTO_HELP_H #include "ptl_ips.h" /* hfi_opcode is not the ips-level opcode. 
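 */

/*
 * [Added illustration, not part of the original source] A standalone
 * sketch of the extraction done by _get_proto_hfi_opcode() below: BTH
 * word 0 is big-endian on the wire, and the opcode table in
 * ips_proto_header.h places the opcode in bits 24-31, so the shift and
 * mask values are assumed to be 24 and 0xff here:
 */
#include <arpa/inet.h>	/* htonl/ntohl */
#include <assert.h>
#include <stdint.h>

static uint8_t bth_opcode(uint32_t bth0_be)
{
	return (ntohl(bth0_be) >> 24) & 0xff;	/* top byte of BTH[0] */
}

int main(void)
{
	assert(bth_opcode(htonl(0xC3000000u)) == 0xC3);	/* OPCODE_EAGER */
	return 0;
}

/*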
*/ PSMI_ALWAYS_INLINE( uint8_t _get_proto_hfi_opcode(const struct ips_message_header *p_hdr)) { return ((__be32_to_cpu(p_hdr->bth[0]) >> HFI_BTH_OPCODE_SHIFT) & HFI_BTH_OPCODE_MASK); } PSMI_ALWAYS_INLINE( uint8_t ips_flow_gen_ackflags(ips_scb_t *scb, struct ips_flow *flow)) { /* * Setup ACK request if more than ack_interval packets * have not been requested an ACK */ if (scb->scb_flags & IPS_SEND_FLAG_ACKREQ || scb->nfrag > 1) { flow->ack_counter = 0; } else { flow->ack_counter++; if (flow->ack_counter > flow->ack_interval) { flow->ack_counter = 0; scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; } } /* Bottom 6 bits wind up in protocol header fields, other bits * control other aspects of packet composition */ return (uint8_t) (scb->scb_flags & IPS_SEND_FLAG_PROTO_OPTS); } PSMI_ALWAYS_INLINE( ips_epaddr_flow_t ips_proto_flowid(struct ips_message_header *p_hdr)) { return (ips_epaddr_flow_t) ((__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_FLOWID_SHIFT) & HFI_BTH_FLOWID_MASK); } PSMI_ALWAYS_INLINE( int ips_do_cksum(struct ips_proto *proto, struct ips_message_header *p_hdr, void *payload, uint32_t paylen, uint32_t *cksum)) { uint16_t paywords; /* Update the payload words in header */ paywords = (sizeof(struct ips_message_header) + paylen + PSM_CRC_SIZE_IN_BYTES + HFI_CRC_SIZE_IN_BYTES) >> BYTE2DWORD_SHIFT; p_hdr->lrh[2] = __cpu_to_be16(paywords & HFI_LRH_PKTLEN_MASK); /* Need to regenerate KDETH checksum after updating payload length */ /* ips_kdeth_cksum(p_hdr); */ *cksum = 0xffffffff; /* Checksum header */ *cksum = ips_crc_calculate(sizeof(struct ips_message_header), (uint8_t *) p_hdr, *cksum); /* Checksum payload (if any) */ if (paylen) { psmi_assert_always(payload); *cksum = ips_crc_calculate(paylen, (uint8_t *) payload, *cksum); } return 0; } PSMI_ALWAYS_INLINE( uint32_t ips_proto_dest_context_from_header(struct ips_proto *proto, struct ips_message_header *p_hdr)) { return (__be32_to_cpu(p_hdr->bth[1]) & 0xFF); } PSMI_ALWAYS_INLINE( void ips_proto_hdr(struct ips_proto *proto, struct ips_epaddr *ipsaddr, struct ips_flow *flow, ips_scb_t *scb, uint8_t flags)) { uint16_t slid, dlid; uint32_t paywords = (sizeof(struct ips_message_header) + scb->payload_size + HFI_CRC_SIZE_IN_BYTES) >> BYTE2DWORD_SHIFT; struct ips_message_header *p_hdr = &scb->ips_lrh; #if 0 /* * This scb has been used by this connection last time, * so some of the header fields are already set. */ if (scb->flow == flow) { p_hdr->lrh[2] = __cpu_to_be16(paywords & HFI_LRH_PKTLEN_MASK); p_hdr->bth[0] = __cpu_to_be32(flow->path->pr_pkey | (scb-> opcode << BTH_OPCODE_SHIFT) | (extra_bytes << BTH_EXTRA_BYTE_SHIFT)); p_hdr->bth[2] = __cpu_to_be32(flow->xmit_seq_num. 
psn | (scb->scb_flags & IPS_SEND_FLAG_ACKREQ)); p_hdr->khdr.kdeth0 = __cpu_to_le32(scb->offset | (scb-> offset_mode << HFI_KHDR_OM_SHIFT) | (scb-> tid << HFI_KHDR_TID_SHIFT) | (scb-> tidctrl << HFI_KHDR_TIDCTRL_SHIFT) | (scb-> flags & IPS_SEND_FLAG_INTR) | (scb-> flags & IPS_SEND_FLAG_HDR_SUPPRESS) | (IPS_PROTO_VERSION << HFI_KHDR_KVER_SHIFT)); /* ips_kdeth_cksum(p_hdr); // Generate KDETH checksum */ p_hdr->ack_seq_num = flow->recv_seq_num.psn; p_hdr->flags = flags; return; } #endif slid = flow->path->pr_slid; dlid = flow->path->pr_dlid; if (scb->scb_flags & IPS_SEND_FLAG_NO_LMC) { slid = ipsaddr->pathgrp->pg_base_slid; dlid = ipsaddr->pathgrp->pg_base_dlid; } /* Setup LRH fields */ p_hdr->lrh[0] = __cpu_to_be16(HFI_LRH_BTH | ((flow->path->pr_sl & HFI_LRH_SL_MASK) << HFI_LRH_SL_SHIFT) | ((proto->sl2sc[flow->path->pr_sl] & HFI_LRH_SC_MASK) << HFI_LRH_SC_SHIFT)); p_hdr->lrh[1] = dlid; p_hdr->lrh[2] = __cpu_to_be16(paywords & HFI_LRH_PKTLEN_MASK); p_hdr->lrh[3] = slid; /* Setup BTH fields */ p_hdr->bth[0] = __cpu_to_be32(flow->path->pr_pkey | (scb->opcode << HFI_BTH_OPCODE_SHIFT)); p_hdr->bth[2] = __cpu_to_be32(flow->xmit_seq_num.psn_num | (scb->scb_flags & IPS_SEND_FLAG_ACKREQ)); if (scb->tidctrl) { /* expected receive packet */ psmi_assert(scb->tidsendc != NULL); p_hdr->bth[1] = __cpu_to_be32(ipsaddr->context | (ipsaddr-> subcontext << HFI_BTH_SUBCTXT_SHIFT) | (scb->tidsendc-> rdescid._desc_idx << HFI_BTH_FLOWID_SHIFT) | (proto->epinfo. ep_baseqp << HFI_BTH_QP_SHIFT)); /* Setup KHDR fields */ p_hdr->khdr.kdeth0 = __cpu_to_le32(p_hdr->khdr.kdeth0 | (scb->tidctrl << HFI_KHDR_TIDCTRL_SHIFT) | (scb->scb_flags & IPS_SEND_FLAG_INTR) | (scb->scb_flags & IPS_SEND_FLAG_HDRSUPP) | (IPS_PROTO_VERSION << HFI_KHDR_KVER_SHIFT)); } else { /* eager receive packet */ p_hdr->bth[1] = __cpu_to_be32(ipsaddr->context | (ipsaddr-> subcontext << HFI_BTH_SUBCTXT_SHIFT) | (flow->flowid << HFI_BTH_FLOWID_SHIFT) | (proto->epinfo. 
ep_baseqp << HFI_BTH_QP_SHIFT)); /* Setup KHDR fields */ p_hdr->khdr.kdeth0 = __cpu_to_le32(p_hdr->khdr.kdeth0 | (scb->scb_flags & IPS_SEND_FLAG_INTR) | (IPS_PROTO_VERSION << HFI_KHDR_KVER_SHIFT)); p_hdr->ack_seq_num = flow->recv_seq_num.psn_num; } p_hdr->khdr.job_key = __cpu_to_le32(proto->epinfo.ep_jkey); p_hdr->connidx = ipsaddr->connidx_outgoing; p_hdr->flags = flags; scb->flow = flow; return; } /* * Assumes that the following fields are already set in scb: * payload * payload_size * flags */ PSMI_INLINE( void ips_scb_prepare_flow_inner(struct ips_proto *proto, struct ips_epaddr *ipsaddr, struct ips_flow *flow, ips_scb_t *scb)) { psmi_assert((scb->payload_size & 3) == 0); ips_proto_hdr(proto, ipsaddr, flow, scb, ips_flow_gen_ackflags(scb, flow)); scb->ack_timeout = proto->epinfo.ep_timeout_ack; scb->abs_timeout = TIMEOUT_INFINITE; scb->scb_flags |= IPS_SEND_FLAG_PENDING; if (flow->protocol == PSM_PROTOCOL_TIDFLOW) { flow->xmit_seq_num.psn_seq += scb->nfrag; scb->seq_num = flow->xmit_seq_num; scb->seq_num.psn_seq--; } else { flow->xmit_seq_num.psn_num = (flow->xmit_seq_num.psn_num + scb->nfrag) & proto->psn_mask; scb->seq_num.psn_num = (flow->xmit_seq_num.psn_num - 1) & proto->psn_mask; } return; } PSMI_ALWAYS_INLINE( void ips_proto_epaddr_stats_set(struct ips_proto *proto, uint8_t msgtype)) { switch (msgtype) { case OPCODE_ACK: break; case OPCODE_ERR_CHK: case OPCODE_ERR_CHK_GEN: proto->epaddr_stats.err_chk_send++; break; case OPCODE_NAK: proto->epaddr_stats.nak_send++; break; case OPCODE_CONNECT_REQUEST: proto->epaddr_stats.connect_req++; break; case OPCODE_DISCONNECT_REQUEST: proto->epaddr_stats.disconnect_req++; break; default: break; } return; } /* * Exported there solely for inlining is_expected_or_nak and mq_tiny handling */ extern psm2_error_t ips_proto_send_ctrl_message(struct ips_flow *flow, uint8_t message_type, uint16_t *msg_queue_mask, ips_scb_t *ctrlscb, void *payload, uint32_t paylen); PSMI_ALWAYS_INLINE( void ips_proto_send_ack(struct ips_recvhdrq *recvq, struct ips_flow *flow)) { if_pt(recvq->proto->flags & IPS_PROTO_FLAG_COALESCE_ACKS) { if (flow->flags & IPS_FLOW_FLAG_PENDING_NAK) { flow->flags &= ~IPS_FLOW_FLAG_PENDING_NAK; /* ACK clears NAK */ } else if (!(flow->flags & IPS_FLOW_FLAG_PENDING_ACK)) { SLIST_INSERT_HEAD(&recvq->pending_acks, flow, next); } flow->flags |= IPS_FLOW_FLAG_PENDING_ACK; } else { ips_scb_t ctrlscb; ctrlscb.scb_flags = 0; ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num; /* Coalesced ACKs disabled. Send ACK immediately */ ips_proto_send_ctrl_message(flow, OPCODE_ACK, &flow->ipsaddr->ctrl_msg_queued, &ctrlscb, ctrlscb.cksum, 0); } } PSMI_ALWAYS_INLINE( void ips_proto_send_nak(struct ips_recvhdrq *recvq, struct ips_flow *flow)) { if_pt(recvq->proto->flags & IPS_PROTO_FLAG_COALESCE_ACKS) { if (flow->flags & IPS_FLOW_FLAG_PENDING_ACK) { flow->flags &= ~IPS_FLOW_FLAG_PENDING_ACK; /* NAK clears ACK */ } else if (!(flow->flags & IPS_FLOW_FLAG_PENDING_NAK)) { SLIST_INSERT_HEAD(&recvq->pending_acks, flow, next); } flow->flags |= IPS_FLOW_FLAG_PENDING_NAK; } else { ips_scb_t ctrlscb; ctrlscb.scb_flags = 0; ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num; /* Coalesced ACKs disabled. Send NAK immediately */ ips_proto_send_ctrl_message(flow, OPCODE_NAK, &flow->ipsaddr->ctrl_msg_queued, &ctrlscb, ctrlscb.cksum, 0); } } /* return 1 if packet is next expected in flow * return 0 if packet is not next expected in flow (and nak packet). 
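 */

/*
 * [Added illustration, not part of the original source] The function
 * below classifies a packet as past, expected, or future by casting the
 * 16-bit sequence difference to a signed type, which stays correct
 * across the wrap at 0xffff. A standalone sketch of that test:
 */
#include <assert.h>
#include <stdint.h>

static int16_t seq_diff(uint16_t a, uint16_t b)
{
	return (int16_t)(a - b);	/* >0: a ahead of b; <0: behind */
}

int main(void)
{
	assert(seq_diff(5, 3) > 0);		/* plainly ahead */
	assert(seq_diff(2, 0xfffe) > 0);	/* still ahead across the wrap */
	assert(seq_diff(0xfffe, 2) < 0);	/* and the reverse is behind */
	return 0;
}

/*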
*/ PSMI_ALWAYS_INLINE( int ips_proto_is_expected_or_nak(struct ips_recvhdrq_event *rcv_ev)) { struct ips_proto *proto = rcv_ev->proto; ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; struct ips_message_header *p_hdr = rcv_ev->p_hdr; ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); struct ips_flow *flow; psmi_seqnum_t sequence_num; psmi_assert((flowid == EP_FLOW_GO_BACK_N_PIO) || (flowid == EP_FLOW_GO_BACK_N_DMA) ); flow = &ipsaddr->flows[flowid]; /* If packet faced congestion generate BECN in NAK. */ if_pf((rcv_ev->is_congested & IPS_RECV_EVENT_FECN) && ((flow->cca_ooo_pkts & 0xf) == 0)) { /* Generate a BECN for every 16th OOO packet marked with a FECN. */ flow->flags |= IPS_FLOW_FLAG_GEN_BECN; flow->cca_ooo_pkts++; rcv_ev->proto->epaddr_stats.congestion_pkts++; rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; /* Clear FECN event */ } sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]); if_pf(flow->recv_seq_num.psn_num == sequence_num.psn_num) { flow->flags &= ~IPS_FLOW_FLAG_NAK_SEND; flow->recv_seq_num.psn_num = (flow->recv_seq_num.psn_num + 1) & proto->psn_mask; flow->cca_ooo_pkts = 0; /* don't process ack, caller will do it. */ return 1; } int16_t diff = (int16_t) (sequence_num.psn_num - flow->recv_seq_num.psn_num); if (diff > 0) { if (!(flow->flags & IPS_FLOW_FLAG_NAK_SEND)) { /* Queue/Send NAK to peer */ ips_proto_send_nak((struct ips_recvhdrq *) rcv_ev->recvq, flow); flow->flags |= IPS_FLOW_FLAG_NAK_SEND; flow->cca_ooo_pkts = 0; } else if (proto->flags & IPS_PROTO_FLAG_CCA) { flow->cca_ooo_pkts = diff; if (flow->cca_ooo_pkts > flow->ack_interval) { ips_scb_t ctrlscb; rcv_ev->proto->epaddr_stats.congestion_pkts++; flow->flags |= IPS_FLOW_FLAG_GEN_BECN; _HFI_CCADBG ("BECN Generation. Expected: %d, Got: %d.\n", flow->recv_seq_num.psn_num, sequence_num.psn_num); ctrlscb.scb_flags = 0; ctrlscb.ips_lrh.data[0].u32w0 = flow->cca_ooo_pkts; /* Send Control message to throttle flow. Will clear flow flag and * reset cca_ooo_pkts. */ ips_proto_send_ctrl_message(flow, OPCODE_BECN, &flow->ipsaddr-> ctrl_msg_queued, &ctrlscb, ctrlscb.cksum, 0); } } } /* process ack if packet is not in sequence. */ ips_proto_process_ack(rcv_ev); return 0; } /* * Note, some code depends on the literal values specified in this enum. */ enum ips_msg_order { IPS_MSG_ORDER_PAST = 3, /* Old message, recv & drop */ IPS_MSG_ORDER_EXPECTED_MATCH = 2, /* Expected message, recv on match */ IPS_MSG_ORDER_EXPECTED = 1, /* Expected message, always recv */ IPS_MSG_ORDER_FUTURE_RECV = 0, /* Future message, buffer in OOO Q */ IPS_MSG_ORDER_FUTURE = -1, /* Future message, leave on RHQ */ }; PSMI_ALWAYS_INLINE( enum ips_msg_order ips_proto_check_msg_order(ips_epaddr_t *ipsaddr, struct ips_flow *flow, uint16_t send_seqnum, uint16_t *recv_seqnum)) { int16_t diff = (int16_t) (*recv_seqnum - send_seqnum); if (likely(diff == 0)) { *recv_seqnum += 1; ipsaddr->msg_toggle ^= IPS_FLOW_MSG_TOGGLE_UNEXP_MASK; if (ipsaddr->msg_toggle & IPS_FLOW_MSG_TOGGLE_UNEXP_MASK) return IPS_MSG_ORDER_EXPECTED_MATCH; return IPS_MSG_ORDER_EXPECTED; } else if (diff > 0) { return IPS_MSG_ORDER_PAST; } ipsaddr->msg_toggle ^= IPS_FLOW_MSG_TOGGLE_OOO_MASK; if (!(ipsaddr->msg_toggle & IPS_FLOW_MSG_TOGGLE_OOO_MASK)) { /* * Second time to see the same ooo message, receive and put * into OOO queue. */ return IPS_MSG_ORDER_FUTURE_RECV; } /* The first time to see an OOO message, leave it there and try * next time. But we need to revert back the receiving flow PSN. 
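 */

/*
 * [Added illustration, not part of the original source] The revert
 * below steps the receive PSN backwards with "(psn - 1) & psn_mask",
 * i.e. modular arithmetic over the PSN space. A standalone sketch,
 * assuming a 24-bit mask purely for the example:
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	const uint32_t psn_mask = 0xffffff;	/* 2^24 - 1 (assumed width) */
	uint32_t psn = 0;

	psn = (psn - 1) & psn_mask;	/* decrement past 0 wraps to the top */
	assert(psn == 0xffffff);
	psn = (psn + 1) & psn_mask;	/* increment wraps back around */
	assert(psn == 0);
	return 0;
}

/*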
*/ uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask; flow->recv_seq_num.psn_num = (flow->recv_seq_num.psn_num - 1) & psn_mask; return IPS_MSG_ORDER_FUTURE; } PSMI_INLINE( int ips_proto_process_packet(const struct ips_recvhdrq_event *rcv_ev)) { uint32_t index; #ifdef PSM_FI /* NOTE: Fault injection will currently not work with hardware * suppression. See note below for reason why as we currently * do not update the hardware tidflow table if FI is dropping * the packet. * * We need to look into the packet before dropping it and * if it's an expected packet AND we have hardware suppression * then we need to update the hardware tidflow table and the * associated tidrecvc state to fake having received a packet * until some point in the window defined by the loss rate. * This way the subsequent err chk will be NAKd and we can resync * the flow with the sender. * * Note: For real errors the hardware generates seq/gen errors * which are handled appropriately by the protocol. */ if_pf(PSMI_FAULTINJ_ENABLED()) { PSMI_FAULTINJ_STATIC_DECL(fi_recv, "recvlost", 1, IPS_FAULTINJ_RECVLOST); if (psmi_faultinj_is_fault(fi_recv)) return IPS_RECVHDRQ_CONTINUE; } #endif /* #ifdef PSM_FI */ /* see file ips_proto_header.h for details */ index = _get_proto_hfi_opcode(rcv_ev->p_hdr) - OPCODE_RESERVED; if (index >= (OPCODE_FUTURE_FROM - OPCODE_RESERVED)) index = 0; return ips_packet_service_routine[index] ((struct ips_recvhdrq_event *)rcv_ev); } /* * Breaks header encapsulation but needed in mq sends so we can pay * "near-equal" attention to putting sends on the wire and servicing the * receive queue. */ PSMI_ALWAYS_INLINE( psm2_error_t ips_recv_progress_if_busy(ptl_t *ptl_gen, psm2_error_t err)) { struct ptl_ips *ptl = (struct ptl_ips *) ptl_gen; if (err == PSM2_EP_NO_RESOURCES) { ptl->ctl->ep_poll(ptl_gen, 0); return PSM2_OK; } else return err; } /* Find next lowest power of a two for a 32 bit number*/ PSMI_ALWAYS_INLINE( unsigned int ips_next_low_pow2(unsigned int v)) { const unsigned int b[] = { 0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000 }; const unsigned int S[] = { 1, 2, 4, 8, 16 }; register unsigned int r = 1; int i; for (i = 4; i >= 0; i--) { if (v & b[i]) { v >>= S[i]; r <<= S[i]; } } return r; } PSMI_ALWAYS_INLINE( ips_path_rec_t * ips_select_path(struct ips_proto *proto, ips_path_type_t path_type, ips_epaddr_t *ipsaddr, ips_path_grp_t *pathgrp)) { uint32_t path_idx; if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) { /* If dispersive routes are configured then select the routes * in round robin order. We may want to use congestion * information to select the least lightly loaded path. */ path_idx = pathgrp->pg_next_path[path_type]; if (++pathgrp->pg_next_path[path_type] >= pathgrp->pg_num_paths[path_type]) pathgrp->pg_next_path[path_type] = 0; } else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST) path_idx = /* Key on destination context */ ipsaddr->context % pathgrp->pg_num_paths[path_type]; else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC) path_idx = /* Key off src context */ proto->epinfo.ep_context % pathgrp->pg_num_paths[path_type]; else /* Base LID routed - Default in Infinhfi 2.5 (Oct 09). */ path_idx = 0; return pathgrp->pg_path[path_idx][path_type]; } #endif /* _IPS_PROTO_HELP_H */ opa-psm2-PSM2_11.2.185/ptl_ips/ips_proto_internal.h000066400000000000000000000076021370564314600217630ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. 
GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef _IPS_PROTO_INTERNAL_H #define _IPS_PROTO_INTERNAL_H #include "ips_expected_proto.h" #include "ips_proto_help.h" /* * Connect protocol. * * On receive, handled by upcalling into the connect interface. * On send, handled by ips_proto by having connect compose the message. 
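 */

/*
 * [Added illustration, not part of the original source] Packet handlers
 * declared here are dispatched by opcode through a flat table (see
 * ips_proto_process_packet() in ips_proto_help.h): subtract
 * OPCODE_RESERVED, clamp anything out of range to slot 0, and call
 * through. A standalone sketch using the same 0xC0..0xD4 range:
 */
#include <assert.h>
#include <stdint.h>

enum { OP_FIRST = 0xC0, OP_LIMIT = 0xD5, OP_COUNT = OP_LIMIT - OP_FIRST };

static int handle_unknown(void) { return -1; }
static int handle_tiny(void) { return 1; }

typedef int (*handler_fn)(void);

static handler_fn table[OP_COUNT] = {
	handle_unknown,	/* 0xC0, also serves as the catch-all slot */
	handle_tiny,	/* 0xC1 */
	/* remaining slots would be filled in the same way */
};

static int dispatch(uint8_t opcode)
{
	uint32_t index = (uint32_t)opcode - OP_FIRST;

	if (index >= OP_COUNT)
		index = 0;	/* unknown/out-of-range opcodes go to slot 0 */
	return table[index]();
}

int main(void)
{
	assert(dispatch(0xC1) == 1);	/* known opcode */
	assert(dispatch(0x42) == -1);	/* below range: catch-all */
	return 0;
}

/*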
*/ psm2_error_t ips_proto_process_connect(struct ips_proto *proto, uint8_t opcode, struct ips_message_header *p_hdr, void *payload, uint32_t paylen); int ips_proto_build_connect_message(struct ips_proto *proto, ips_epaddr_t *ptladdr, uint8_t opcode, void *payload); psm2_error_t ips_proto_timer_ack_callback(struct psmi_timer *, uint64_t); psm2_error_t ips_proto_timer_send_callback(struct psmi_timer *, uint64_t); psm2_error_t ips_proto_timer_ctrlq_callback(struct psmi_timer *, uint64_t); psm2_error_t ips_proto_timer_pendq_callback(struct psmi_timer *, uint64_t); psm2_error_t ips_cca_timer_callback(struct psmi_timer *current_timer, uint64_t current); psm2_error_t ips_cca_adjust_rate(ips_path_rec_t *path_rec, int cct_increment); void ips_proto_rv_scbavail_callback(struct ips_scbctrl *scbc, void *context); psm2_error_t ips_proto_recv_init(struct ips_proto *proto); psm2_error_t ips_proto_recv_fini(struct ips_proto *proto); int ips_proto_process_err_chk(struct ips_recvhdrq_event *rcv_ev); int ips_proto_process_err_chk_gen(struct ips_recvhdrq_event *rcv_ev); int ips_proto_process_becn(struct ips_recvhdrq_event *rcv_ev); int ips_proto_connect_disconnect(struct ips_recvhdrq_event *rcv_ev); int ips_proto_process_unknown_opcode(struct ips_recvhdrq_event *rcv_ev); #endif /* _IPS_PROTO_INTERNAL_H */ opa-psm2-PSM2_11.2.185/ptl_ips/ips_proto_mq.c000066400000000000000000001573011370564314600205610ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ #include "psm_user.h" #include "psm2_hal.h" #ifdef PSM_CUDA #include "psm_gdrcpy.h" #endif #include "ips_scb.h" #include "ips_proto.h" #include "psm_mq_internal.h" #include "ips_expected_proto.h" #include "ips_proto_help.h" PSMI_NEVER_INLINE(ips_scb_t * ips_poll_scb(struct ips_proto *proto, int npkts, int len, uint32_t flags, int istiny)) { ips_scb_t *scb = NULL; psmi_assert(npkts > 0); psm2_error_t err; proto->stats.scb_egr_unavail_cnt++; PSMI_BLOCKUNTIL(proto->ep, err, ((scb = (istiny ? ips_scbctrl_alloc_tiny(&proto->scbc_egr) : ips_scbctrl_alloc(&proto->scbc_egr, npkts, len, flags))) != NULL)); psmi_assert(scb != NULL); return scb; } PSMI_ALWAYS_INLINE(ips_scb_t *mq_alloc_tiny(struct ips_proto *proto)) { ips_scb_t *scb = ips_scbctrl_alloc_tiny(&proto->scbc_egr); /* common case should branch right through */ if_pt(scb != NULL) return scb; else return ips_poll_scb(proto, 1, 0, 0, 1); } PSMI_ALWAYS_INLINE( ips_scb_t * mq_alloc_pkts(struct ips_proto *proto, int npkts, int len, uint32_t flags)) { psmi_assert(npkts > 0); ips_scb_t *scb = ips_scbctrl_alloc(&proto->scbc_egr, npkts, len, flags); if_pt(scb != NULL) { return scb; } else { return ips_poll_scb(proto, npkts, len, flags, 0 /* not tiny scb */); } } static int ips_proto_mq_eager_complete(void *reqp, uint32_t nbytes) { psm2_mq_req_t req = (psm2_mq_req_t) reqp; /* This code path is executed when the send is on a device buffer * and the receive is completed using eager buffers. As there is no * completion notification sent to the sender, this is the only place * where send side chb's can be freed and put back into the mpool. */ #ifdef PSM_CUDA struct ips_cuda_hostbuf *chb; if (req->cuda_hostbuf_used) { while (!STAILQ_EMPTY(&req->sendreq_prefetch)) { /* If any prefetched buffers weren't used, they must be reclaimed here. */ chb = STAILQ_FIRST(&req->sendreq_prefetch); STAILQ_REMOVE_HEAD(&req->sendreq_prefetch, req_next); psmi_mpool_put(chb); } } #endif req->send_msgoff += nbytes; /* * the reason to use >= is because * we may have DW pad in nbytes. 
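 */

/*
 * [Added illustration, not part of the original source] Why '>=' rather
 * than '==' in the completion test below: eager fragments are padded up
 * to 4-byte multiples, so the bytes reported back as sent can overshoot
 * the real message length. A standalone worked example:
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	const uint32_t msglen = 10;	/* what the application sent */
	uint32_t done = 0;

	done += (msglen + 3) & ~3u;	/* fragment completes as 12 bytes */
	assert(done >= msglen);		/* 12 >= 10: message complete */
	assert(done != msglen);		/* an '==' test would never fire */
	return 0;
}

/*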
*/ if (req->send_msgoff >= req->req_data.send_msglen) { req->state = MQ_STATE_COMPLETE; ips_barrier(); if(!psmi_is_req_internal(req)) mq_qq_append(&req->mq->completed_q, req); } return IPS_RECVHDRQ_CONTINUE; } static int ips_proto_mq_rv_complete(void *reqp) { psm2_mq_req_t req = (psm2_mq_req_t) reqp; psmi_mq_handle_rts_complete(req); return IPS_RECVHDRQ_CONTINUE; } static void ips_proto_mq_rv_complete_exp(void *reqp) { ips_proto_mq_rv_complete(reqp); return; } PSMI_ALWAYS_INLINE( void ips_shortcpy(void *vdest, const void *vsrc, uint32_t nchars)) { unsigned char *dest = vdest; const unsigned char *src = vsrc; #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(vdest) || PSMI_IS_CUDA_MEM((void *) vsrc))) { PSMI_CUDA_CALL(cuMemcpy, (CUdeviceptr)vdest, (CUdeviceptr)vsrc, nchars); return; } #endif if (nchars >> 2) hfi_dwordcpy((uint32_t *) dest, (uint32_t *) src, nchars >> 2); dest += (nchars >> 2) << 2; src += (nchars >> 2) << 2; switch (nchars & 0x03) { case 3: *dest++ = *src++; case 2: *dest++ = *src++; case 1: *dest++ = *src++; } return; } #ifdef PSM_CUDA PSMI_ALWAYS_INLINE( void ips_shortcpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars)) { unsigned char *dest = vdest; const unsigned char *src = vsrc; if (nchars >> 2) hfi_dwordcpy((uint32_t *) dest, (uint32_t *) src, nchars >> 2); dest += (nchars >> 2) << 2; src += (nchars >> 2) << 2; switch (nchars & 0x03) { case 3: *dest++ = *src++; case 2: *dest++ = *src++; case 1: *dest++ = *src++; } return; } #endif extern psm2_error_t ips_ptl_poll(ptl_t *ptl, int _ignored); /* * Mechanism to capture PIO-ing or DMA-ing the MQ message envelope * * Recoverable errors: * PSM2_OK: If PIO, envelope is sent. * If DMA, all queued up packets on flow were flushed. * * Recoverable errors converted to PSM2_OK just before return: * PSM2_OK_NO_PROGRESS: DMA-only, flushed 1 but not all queued packets. * PSM2_EP_NO_RESOURCES: * If PIO, no pio available or cable currently pulled. * If DMA, can be that no scb's available to handle unaligned packets * or writev returned a recoverable error (no mem for * descriptors, dma interrupted or no space left in dma queue). * * Unrecoverable errors (PIO or DMA). * PSM2_EP_DEVICE_FAILURE: Unexpected error calling writev(), chip failure, * rxe/txe parity error. * PSM2_EP_NO_NETWORK: No network, no lid, ... */ PSMI_ALWAYS_INLINE( psm2_error_t ips_mq_send_envelope(struct ips_proto *proto, struct ips_flow *flow, struct ips_scb *scb, int do_flush)) { psm2_error_t err = PSM2_OK; ips_proto_flow_enqueue(flow, scb); if ((flow->transfer == PSM_TRANSFER_PIO) || do_flush) err = flow->flush(flow, NULL); if (do_flush) err = ips_recv_progress_if_busy(proto->ptl, err); /* As per the PSM error model (or lack thereof), PSM clients expect to see * only PSM2_OK as a recoverable error */ if (err == PSM2_EP_NO_RESOURCES || err == PSM2_OK_NO_PROGRESS) err = PSM2_OK; return err; } /* * We don't use message striping for middle message protocol, * Tests on sandy-bridge two HFIs show lower bandwidth if * message striping is used. 
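 */

/*
 * [Added illustration, not part of the original source] In
 * ips_ptl_mq_eager() below, one scb can describe a whole chunk that is
 * sent as several wire fragments; the fragment count is the usual
 * round-up division by the flow fragment size. A standalone sketch of
 * that accounting:
 */
#include <assert.h>
#include <stdint.h>

static uint32_t nfrag(uint32_t pktlen, uint32_t frag_size)
{
	return (pktlen + frag_size - 1) / frag_size;	/* ceiling division */
}

int main(void)
{
	assert(nfrag(8192, 8192) == 1);	/* exactly one full fragment */
	assert(nfrag(8193, 8192) == 2);	/* one extra byte needs a second */
	return 0;
}

/*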
*/ ustatic psm2_error_t ips_ptl_mq_eager(struct ips_proto *proto, psm2_mq_req_t req, struct ips_flow *flow, psm2_mq_tag_t *tag, const void *ubuf, uint32_t len) { ips_epaddr_t *ipsaddr = flow->ipsaddr; psm2_error_t err = PSM2_OK; uintptr_t buf = (uintptr_t) ubuf; uint32_t nbytes_left, pktlen, offset, chunk_size; uint16_t msgseq, padding; ips_scb_t *scb; uint32_t is_non_dw_mul_allowed = 0; psmi_assert(len > 0); psmi_assert(req != NULL); if (flow->transfer == PSM_TRANSFER_DMA) { psmi_assert((proto->flags & IPS_PROTO_FLAG_SPIO) == 0); /* max chunk size is the rv window size */ chunk_size = ipsaddr->window_rv; if (psmi_hal_has_cap(PSM_HAL_CAP_NON_DW_MULTIPLE_MSG_SIZE)) is_non_dw_mul_allowed = 1; } else { psmi_assert((proto->flags & IPS_PROTO_FLAG_SDMA) == 0); chunk_size = flow->frag_size; } msgseq = ipsaddr->msgctl->mq_send_seqnum++; nbytes_left = len; offset = 0; do { if (is_non_dw_mul_allowed) { /* No need to care about padding if non-double word * multiple message size is allowed. */ padding = 0; } else { padding = nbytes_left & 0x3; } if (padding) { psmi_assert(nbytes_left > flow->frag_size); /* over reading should be OK on sender because * the padding area is within the whole buffer, * receiver will discard the extra bytes via * padcnt in packet header */ padding = 4 - padding; pktlen = flow->frag_size - padding; } else { pktlen = min(chunk_size, nbytes_left); psmi_assert(((pktlen & 0x3) == 0) || (is_non_dw_mul_allowed)); } scb = mq_alloc_pkts(proto, 1, 0, 0); psmi_assert(scb != NULL); ips_scb_opcode(scb) = OPCODE_EAGER; ips_set_LMC_LID_choice(proto, scb, len); scb->ips_lrh.khdr.kdeth0 = msgseq; ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag); scb->ips_lrh.hdr_data.u32w1 = len; scb->ips_lrh.hdr_data.u32w0 = offset; /* initial offset */ _HFI_VDBG ("payload=%p, thislen=%d, frag_size=%d, nbytes_left=%d\n", (void *)buf, pktlen, flow->frag_size, nbytes_left); ips_scb_buffer(scb) = (void *)buf; #ifdef PSM_CUDA /* PSM would never send packets using eager protocol * if GPU Direct RDMA is turned off, which makes setting * these flags safe. */ if (req->is_buf_gpu_mem) { ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU; ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; } #endif buf += pktlen; offset += pktlen; nbytes_left -= pktlen; pktlen += padding; psmi_assert(((pktlen & 0x3) == 0) || (is_non_dw_mul_allowed)); scb->frag_size = flow->frag_size; scb->nfrag = (pktlen + flow->frag_size - 1) / flow->frag_size; if (scb->nfrag > 1) { ips_scb_length(scb) = flow->frag_size; scb->nfrag_remaining = scb->nfrag; scb->chunk_size = scb->chunk_size_remaining = pktlen; } else ips_scb_length(scb) = pktlen; if (nbytes_left == 0) { /* last segment/packet */ ips_scb_cb(scb) = ips_proto_mq_eager_complete; ips_scb_cb_param(scb) = req; /* Set ACKREQ if single packet per scb. For multi * packets per scb, it is SDMA, driver will set * ACKREQ in last packet, we only need ACK for * last packet. */ if (scb->nfrag == 1) ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ; } else { req->send_msgoff += pktlen; } ips_proto_flow_enqueue(flow, scb); if (flow->transfer == PSM_TRANSFER_PIO) { /* we need to flush the pio pending queue as quick as possible */ err = flow->flush(flow, NULL); } } while (nbytes_left); /* after all sdma setup, flush sdma queue, * we want one system call to handle as many scbs as possible. */ if (flow->transfer == PSM_TRANSFER_DMA) { err = flow->flush(flow, NULL); } /* Before return, try to make some progress as long as the operation is * not a fast path isend. 
If this is a fast path isend we cannot call * progress functions since that will cause recursion into recvhdrq_progress * and cause messages to be lost. Instead, for fast path if the operation * was successfully enqueued, but flush returned PSM2_OK_NO_PROGRESS we return * PSM2_OK since the user will progress the queue once the fast path call is * complete. */ if (err == PSM2_EP_NO_RESOURCES || err == PSM2_OK_NO_PROGRESS) { if (likely(!(req->flags_internal & PSMI_REQ_FLAG_FASTPATH))) { err = ips_recv_progress_if_busy(proto->ptl, PSM2_EP_NO_RESOURCES); } else if (err == PSM2_EP_NO_RESOURCES) { err = PSM2_OK; } } return err; } static psm2_error_t ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, ips_epaddr_t *ipsaddr, const void *buf, uint32_t len) { psmi_assert(proto->msgflowid < EP_FLOW_LAST); struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid]; psm2_error_t err = PSM2_OK; ips_scb_t *scb; PSM2_LOG_MSG("entering"); req->req_data.buf = (void *)buf; req->req_data.buf_len = len; req->req_data.send_msglen = len; req->recv_msgoff = 0; req->rts_peer = (psm2_epaddr_t) ipsaddr; scb = mq_alloc_pkts(proto, 1, 0, 0); psmi_assert(scb); ips_scb_opcode(scb) = OPCODE_LONG_RTS; ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ; if (req->type & MQE_TYPE_WAITING) ips_scb_flags(scb) |= IPS_SEND_FLAG_BLOCKING; scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++; ips_scb_copy_tag(scb->ips_lrh.tag, req->req_data.tag.tag); scb->ips_lrh.hdr_data.u32w1 = len; scb->ips_lrh.hdr_data.u32w0 = psmi_mpool_get_obj_index(req); if (len <= flow->frag_size && #ifdef PSM_CUDA !req->is_buf_gpu_mem && #endif !(len & 0x3)) { ips_scb_buffer(scb) = (void *)buf; ips_scb_length(scb) = len; req->send_msgoff = len; } else { ips_scb_length(scb) = 0; req->send_msgoff = 0; } #ifdef PSM_CUDA /* Used to indicate to the receiver that the send * is issued on a device buffer. This helps the * receiver select TID instead of using eager buffers. */ if (req->is_buf_gpu_mem) { ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; scb->mq_req = req; /* request comes from GPU domain (device) ... */ } req->cuda_hostbuf_used = 0; if ((!(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) && req->is_buf_gpu_mem && (len > GPUDIRECT_THRESH_RV)) || ((proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) && req->is_buf_gpu_mem && (len > gpudirect_send_threshold))) { /* send from intermediate host buffer */ struct ips_cuda_hostbuf *chb; uint32_t offset, window_len; int prefetch_lookahead = 0; STAILQ_INIT(&req->sendreq_prefetch); offset = 0; req->cuda_hostbuf_used = 1; /* start prefetching */ req->prefetch_send_msgoff = 0; while ((offset < len) && (prefetch_lookahead < proto->cuda_prefetch_limit)) { chb = NULL; window_len = ips_cuda_next_window(ipsaddr->window_rv, offset, len); if (window_len <= CUDA_SMALLHOSTBUF_SZ) chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( proto->cuda_hostbuf_pool_small_send); if (chb == NULL) chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( proto->cuda_hostbuf_pool_send); /* any buffers available? 
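 *
 * (Illustration with assumed sizes: for a 512 KB GPU send with a
 * 128 KB rendezvous window, the loop above stages up to
 * cuda_prefetch_limit windows of 128 KB each, preferring the small
 * host-buffer pool when a window fits in CUDA_SMALLHOSTBUF_SZ; when
 * both pools are exhausted we simply stop prefetching, hence the
 * break below.)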
*/ if (chb == NULL) break; req->prefetch_send_msgoff += window_len; chb->offset = offset; chb->size = window_len; chb->req = req; chb->gpu_buf = (CUdeviceptr) buf + offset; chb->bytes_read = 0; PSMI_CUDA_CALL(cuMemcpyDtoHAsync, chb->host_buf, chb->gpu_buf, window_len, proto->cudastream_send); PSMI_CUDA_CALL(cuEventRecord, chb->copy_status, proto->cudastream_send); STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next); offset += window_len; prefetch_lookahead++; } } #endif PSM2_LOG_EPM_COND((len > proto->mq->hfi_thresh_rv) && proto->protoexp, OPCODE_LONG_RTS,PSM2_LOG_TX,proto->ep->epid, req->rts_peer->epid, "scb->ips_lrh.hdr_data.u32w0: %d",scb->ips_lrh.hdr_data.u32w0); /* If this is a fast path isend, then we cannot poll or * allow progressing of the mq from within the fast path * call otherwise messages will be lost. Therefore given fast path * we will avoid calling poll_internal and not set PSMI_TRUE which would * call ips_recv_progress_if_busy. */ if (unlikely(req->flags_internal & PSMI_REQ_FLAG_FASTPATH)) { if ((err = ips_mq_send_envelope(proto, flow, scb, PSMI_FALSE))) goto fail; } else { if ((err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE))) goto fail; /* Assume that we already put a few rndv requests in flight. This helps * for bibw microbenchmarks and doesn't hurt the 'blocking' case since * we're going to poll anyway */ psmi_poll_internal(proto->ep, 1); } fail: _HFI_VDBG ("[rndv][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p/%d]: %s\n", psmi_epaddr_get_name(proto->ep->epid), psmi_epaddr_get_name(req->rts_peer->epid), buf, len, req->req_data.tag.tag[0], req->req_data.tag.tag[1], req->req_data.tag.tag[2], req, psmi_mpool_get_obj_index(req), psm2_error_get_string(err)); PSM2_LOG_MSG("leaving"); return err; } #ifdef PSM_CUDA static inline int psmi_cuda_is_buffer_gpu_mem(void *ubuf) { return (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf)); } static inline int psmi_cuda_is_needed_rendezvous(struct ips_proto *proto, uint32_t len) { if (!(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) || !PSMI_IS_GDR_COPY_ENABLED || len < 1 || len > cuda_thresh_rndv){ return 1; } return 0; } #endif /* Find the correct flow (PIO/DMA) */ static inline ips_epaddr_flow_t flow_select_type(struct ips_proto *proto, uint32_t len, int gpu_mem, uint32_t eager_thresh) { ips_epaddr_flow_t flow_type; uint32_t pio_gdr_threshold; #ifdef PSM_CUDA if (gpu_mem) { pio_gdr_threshold = gdr_copy_threshold_send; } else #endif { pio_gdr_threshold = eager_thresh; } if (len <= pio_gdr_threshold) { /* PIO or GDRcopy */ flow_type = EP_FLOW_GO_BACK_N_PIO; /* * If PIO was disabled through the environment variable, * override the flow value. */ if (unlikely(ips_proto_is_disabled_pio(proto))) flow_type = EP_FLOW_GO_BACK_N_DMA; } else { /* Send DMA */ flow_type = EP_FLOW_GO_BACK_N_DMA; /* * If Send DMA was disabled through the environment variable, * override the flow value. 
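 *
 * Net effect, as a sketch (assuming default settings): lengths up to
 * the PIO/GDR-copy threshold pick EP_FLOW_GO_BACK_N_PIO, larger
 * lengths pick EP_FLOW_GO_BACK_N_DMA, and the PIO/SDMA disable
 * overrides above and below each flip the choice to the remaining
 * flow.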
*/ if (unlikely(ips_proto_is_disabled_sdma(proto))) flow_type = EP_FLOW_GO_BACK_N_PIO; } return flow_type; } psm2_error_t ips_proto_msg_size_thresh_query (enum psm2_info_query_thresh_et qt, uint32_t *out, psm2_mq_t mq, psm2_epaddr_t epaddr) { struct ptl_ips *ptl = (struct ptl_ips *) epaddr->ptlctl->ptl; psm2_error_t rv = PSM2_INTERNAL_ERR; switch (qt) { case PSM2_INFO_QUERY_THRESH_IPS_PIO_DMA: *out = ptl->proto.iovec_thresh_eager; rv = PSM2_OK; break; case PSM2_INFO_QUERY_THRESH_IPS_TINY: *out = mq->hfi_thresh_tiny; rv = PSM2_OK; break; case PSM2_INFO_QUERY_THRESH_IPS_PIO_FRAG_SIZE: { ips_epaddr_t *ipsaddr = ((ips_epaddr_t *) epaddr)->msgctl->ipsaddr_next; *out = ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].frag_size; } rv = PSM2_OK; break; case PSM2_INFO_QUERY_THRESH_IPS_DMA_FRAG_SIZE: { ips_epaddr_t *ipsaddr = ((ips_epaddr_t *) epaddr)->msgctl->ipsaddr_next; *out = ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA].frag_size; } rv = PSM2_OK; break; case PSM2_INFO_QUERY_THRESH_IPS_RNDV: *out = mq->hfi_thresh_rv; rv = PSM2_OK; break; default: break; } return rv; } psm2_error_t ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user, uint32_t flags_internal, psm2_mq_tag_t *tag, const void *ubuf, uint32_t len, void *context, psm2_mq_req_t *req_o) { psm2_error_t err = PSM2_OK; ips_epaddr_flow_t flow_type; struct ips_proto *proto; struct ips_flow *flow; ips_epaddr_t *ipsaddr; ips_scb_t *scb; psm2_mq_req_t req; req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); if_pf(req == NULL) return PSM2_NO_MEMORY; _HFI_VDBG("(req=%p) ubuf=%p len=%u\n", req, ubuf, len); req->flags_user = flags_user; req->flags_internal = flags_internal; ipsaddr = ((ips_epaddr_t *) mepaddr)->msgctl->ipsaddr_next; ipsaddr->msgctl->ipsaddr_next = ipsaddr->next; proto = ((psm2_epaddr_t) ipsaddr)->proto; psmi_assert(proto->msgflowid < EP_FLOW_LAST); req->req_data.send_msglen = len; req->req_data.tag = *tag; req->req_data.context = context; #ifdef PSM_CUDA req->is_buf_gpu_mem = psmi_cuda_is_buffer_gpu_mem((void*)ubuf); req->cuda_hostbuf_used = 0; if (req->is_buf_gpu_mem) { psmi_cuda_set_attr_sync_memops(ubuf); if (psmi_cuda_is_needed_rendezvous(proto, len)) goto do_rendezvous; } #else req->is_buf_gpu_mem = 0; #endif flow_type = flow_select_type(proto, len, req->is_buf_gpu_mem, proto->iovec_thresh_eager); flow = &ipsaddr->flows[flow_type]; if (flags_user & PSM2_MQ_FLAG_SENDSYNC) { goto do_rendezvous; } else if (len <= mq->hfi_thresh_tiny) { scb = mq_alloc_tiny(proto); psmi_assert(scb); ips_scb_opcode(scb) = OPCODE_TINY; ips_set_LMC_LID_choice(proto, scb, len); scb->ips_lrh.khdr.kdeth0 = ((len & HFI_KHDR_TINYLEN_MASK) << HFI_KHDR_TINYLEN_SHIFT) | ipsaddr->msgctl->mq_send_seqnum++; ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag); const void *user_buffer = ubuf; #ifdef PSM_CUDA if (req->is_buf_gpu_mem) { /* The following function pins the GPU pages * and mmaps the pages into the process virtual * space. This allows PSM to issue a standard * memcpy to move data between HFI resources * and the GPU */ ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)ubuf, len, 0, proto); } mq_copy_tiny_host_mem((uint32_t *) &scb->ips_lrh.hdr_data, (uint32_t *) user_buffer, len); #else mq_copy_tiny((uint32_t *) &scb->ips_lrh.hdr_data, (uint32_t *) user_buffer, len); #endif /* If this is a fast path isend, then we cannot allow * progressing of the mq from within the fast path * call otherwise messages will be lost.
Therefore given fast path * we will set PSMI_FALSE which will prevent the call to * ips_recv_progress_if_busy. */ err = ips_mq_send_envelope(proto, flow, scb, !(flags_internal & PSMI_REQ_FLAG_FASTPATH)); if (err != PSM2_OK) return err; /* We can mark this op complete since all the data is now copied * into an SCB that remains live until it is remotely acked */ req->state = MQ_STATE_COMPLETE; mq_qq_append(&mq->completed_q, req); _HFI_VDBG ("[itiny][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p]\n", psmi_epaddr_get_name(mq->ep->epid), psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2], req); } else if (len <= flow->frag_size) { uint32_t paylen = len & ~0x3; scb = mq_alloc_pkts(proto, 1, 0, 0); psmi_assert(scb); ips_scb_opcode(scb) = OPCODE_SHORT; ips_set_LMC_LID_choice(proto, scb, len); scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++; scb->ips_lrh.hdr_data.u32w1 = len; ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag); const void * user_buffer = ubuf; #ifdef PSM_CUDA if (req->is_buf_gpu_mem && len <= gdr_copy_threshold_send){ ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)ubuf, len , 0, proto); } #endif ips_scb_buffer(scb) = (void *)user_buffer; ips_scb_length(scb) = paylen; if (len > paylen) { /* there are nonDW bytes, copy to header */ mq_copy_tiny ((uint32_t *)&scb->ips_lrh.hdr_data.u32w0, (uint32_t *)((uintptr_t)ubuf + paylen), len - paylen); /* for complete callback */ req->send_msgoff = len - paylen; } else { req->send_msgoff = 0; } /* * Need ack for send side completion because we * send from user buffer. */ ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ; #ifdef PSM_CUDA if (req->is_buf_gpu_mem && len > gdr_copy_threshold_send) { ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU; } #endif /* If this is a fast path isend, then we cannot allow * progressing of the mq from within the fast path * call otherwise messages will be lost. Therefore given fast path * we will set PSMI_FALSE which will prevent the call to * ips_recv_progress_if_busy. */ err = ips_mq_send_envelope(proto, flow, scb, !(flags_internal & PSMI_REQ_FLAG_FASTPATH)); if (err != PSM2_OK) return err; /* * It should be OK to check the buffer address in * 'scb' to be changed, when this scb is done, the * address is set to NULL when scb is put back to * scb pool. Even if the same scb is re-used, it * is not possible to set to this 'buf' address. 
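 *
 * Concretely: the progress run inside ips_mq_send_envelope() may
 * already have processed the ACK for this scb, in which case
 * ips_scb_buffer(scb) is NULL by the time we test it below and the
 * request completes immediately; otherwise it still equals
 * user_buffer and completion is deferred to the eager-complete
 * callback.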
*/ if (ips_scb_buffer(scb) == (void *)user_buffer) { /* continue to send from user buffer */ ips_scb_cb(scb) = ips_proto_mq_eager_complete; ips_scb_cb_param(scb) = req; } else { /* mark the message done */ req->state = MQ_STATE_COMPLETE; mq_qq_append(&mq->completed_q, req); } _HFI_VDBG ("[ishrt][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p]\n", psmi_epaddr_get_name(mq->ep->epid), psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2], req); } else if (len <= mq->hfi_thresh_rv) { req->send_msgoff = 0; err = ips_ptl_mq_eager(proto, req, flow, tag, ubuf, len); if (err != PSM2_OK) return err; _HFI_VDBG ("[ilong][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p]\n", psmi_epaddr_get_name(mq->ep->epid), psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2], req); } else { /* skip eager accounting below */ do_rendezvous: err = ips_ptl_mq_rndv(proto, req, ipsaddr, ubuf, len); *req_o = req; return err; } *req_o = req; mq->stats.tx_num++; mq->stats.tx_eager_num++; mq->stats.tx_eager_bytes += len; return err; } psm2_error_t ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, psm2_mq_tag_t *tag, const void *ubuf, uint32_t len) { psm2_error_t err = PSM2_OK; ips_epaddr_flow_t flow_type; struct ips_proto *proto; struct ips_flow *flow; ips_epaddr_t *ipsaddr; ips_scb_t *scb; int gpu_mem = 0; _HFI_VDBG("ubuf=%p len=%u\n", ubuf, len); ipsaddr = ((ips_epaddr_t *) mepaddr)->msgctl->ipsaddr_next; ipsaddr->msgctl->ipsaddr_next = ipsaddr->next; proto = ((psm2_epaddr_t) ipsaddr)->proto; psmi_assert(proto->msgflowid < EP_FLOW_LAST); #ifdef PSM_CUDA gpu_mem = psmi_cuda_is_buffer_gpu_mem((void*)ubuf); if (gpu_mem) { psmi_cuda_set_attr_sync_memops(ubuf); if (psmi_cuda_is_needed_rendezvous(proto, len)) goto do_rendezvous; } #endif flow_type = flow_select_type(proto, len, gpu_mem, proto->iovec_thresh_eager_blocking); flow = &ipsaddr->flows[flow_type]; if (flags & PSM2_MQ_FLAG_SENDSYNC) { goto do_rendezvous; } else if (len <= mq->hfi_thresh_tiny) { scb = mq_alloc_tiny(proto); psmi_assert(scb); ips_scb_opcode(scb) = OPCODE_TINY; ips_set_LMC_LID_choice(proto, scb, len); scb->ips_lrh.khdr.kdeth0 = ((len & HFI_KHDR_TINYLEN_MASK) << HFI_KHDR_TINYLEN_SHIFT) | ipsaddr->msgctl->mq_send_seqnum++; ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag); #ifdef PSM_CUDA const void *user_buffer = ubuf; if (gpu_mem){ /* The following function pins the GPU pages * and mmaps the pages into the process virtual * space.
This allows PSM to issue a standard * memcpy to move data between HFI resources * and the GPU */ ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)ubuf, len, 0, proto); } mq_copy_tiny_host_mem((uint32_t *) &scb->ips_lrh.hdr_data, (uint32_t *) user_buffer, len); #else mq_copy_tiny ((uint32_t *) &scb->ips_lrh.hdr_data, (uint32_t *) ubuf, len); #endif err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE); if (err != PSM2_OK) return err; _HFI_VDBG("[tiny][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n", psmi_epaddr_get_name(mq->ep->epid), psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]); } else if (len <= flow->frag_size) { uint32_t paylen = len & ~0x3; scb = mq_alloc_pkts(proto, 1, 0, 0); psmi_assert(scb); ips_scb_opcode(scb) = OPCODE_SHORT; ips_set_LMC_LID_choice(proto, scb, len); scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++; scb->ips_lrh.hdr_data.u32w1 = len; ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag); const void * user_buffer = ubuf; #ifdef PSM_CUDA if (gpu_mem && len <= gdr_copy_threshold_send) { ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)ubuf, len, 0, proto); } #endif ips_scb_buffer(scb) = (void *)user_buffer; ips_scb_length(scb) = paylen; if (len > paylen) { /* there are nonDW bytes, copy to header */ mq_copy_tiny ((uint32_t *)&scb->ips_lrh.hdr_data.u32w0, (uint32_t *)((uintptr_t)ubuf + paylen), len - paylen); } /* * Need ack for send side completion because we * send from user buffer. */ ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ; #ifdef PSM_CUDA if (gpu_mem && len > gdr_copy_threshold_send) { ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU; } #endif err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE); if (err != PSM2_OK) return err; /* * It should be OK to check the buffer address in * 'scb' to be changed, when this scb is done, the * address is set to NULL when scb is put back to * scb pool. Even if the same scb is re-used, it * is not possible to set to this 'ubuf' address. 
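 *
 * Decision sketch for the blocking path below: with a PIO flow, a
 * payload no larger than proto->scb_bufsize, and a bounce buffer
 * available from ips_scbctrl_bufalloc(), the payload is copied and
 * the call returns at once; in every other case (SDMA flow, oversized
 * payload, or no bounce buffer) we must spin in PSMI_BLOCKUNTIL until
 * the scb lets go of the user buffer.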
*/ if (ips_scb_buffer(scb) == (void *)user_buffer) { if (flow->transfer != PSM_TRANSFER_PIO || paylen > proto->scb_bufsize || !ips_scbctrl_bufalloc(scb)) { /* sdma transfer (can't change user buffer), * or, payload is larger than bounce buffer, * or, can't allocate bounce buffer, * send from user buffer till complete */ PSMI_BLOCKUNTIL(mq->ep, err, ips_scb_buffer(scb) != (void*)user_buffer); if (err > PSM2_OK_NO_PROGRESS) return err; err = PSM2_OK; } else { /* copy to bounce buffer */ #ifdef PSM_CUDA ips_shortcpy_host_mem #else ips_shortcpy #endif (ips_scb_buffer(scb), (void*)user_buffer, paylen); } } _HFI_VDBG("[shrt][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n", psmi_epaddr_get_name(mq->ep->epid), psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]); } else if (len <= mq->hfi_thresh_rv) { psm2_mq_req_t req; /* Block until we can get a req */ PSMI_BLOCKUNTIL(mq->ep, err, (req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND))); if (err > PSM2_OK_NO_PROGRESS) return err; #ifdef PSM_CUDA req->cuda_hostbuf_used = 0; if (gpu_mem) { req->is_buf_gpu_mem = 1; } else req->is_buf_gpu_mem = 0; #endif req->type |= MQE_TYPE_WAITING; req->req_data.send_msglen = len; req->req_data.tag = *tag; req->send_msgoff = 0; req->flags_user = flags; req->flags_internal |= PSMI_REQ_FLAG_IS_INTERNAL; err = ips_ptl_mq_eager(proto, req, flow, tag, ubuf, len); if (err != PSM2_OK) return err; psmi_mq_wait_internal(&req); _HFI_VDBG("[long][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n", psmi_epaddr_get_name(mq->ep->epid), psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]); } else { psm2_mq_req_t req; do_rendezvous: /* Block until we can get a req */ PSMI_BLOCKUNTIL(mq->ep, err, (req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND))); if (err > PSM2_OK_NO_PROGRESS) return err; req->type |= MQE_TYPE_WAITING; req->req_data.tag = *tag; req->flags_user = flags; req->flags_internal |= PSMI_REQ_FLAG_IS_INTERNAL; #ifdef PSM_CUDA if (gpu_mem) { req->is_buf_gpu_mem = 1; } else req->is_buf_gpu_mem = 0; #endif err = ips_ptl_mq_rndv(proto, req, ipsaddr, ubuf, len); if (err != PSM2_OK) return err; psmi_mq_wait_internal(&req); return err; /* skip accounting, done separately at completion time */ } mq->stats.tx_num++; mq->stats.tx_eager_num++; mq->stats.tx_eager_bytes += len; return err; } static psm2_error_t ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) { psm2_epaddr_t epaddr = req->rts_peer; struct ips_proto *proto = epaddr->proto; /* We have a match. * We may already set with first packet, * If we're doing eager-based r-v, just send back the sreq and length and * have the sender complete the send. */ PSM2_LOG_MSG("entering"); #ifdef PSM_CUDA /* Cases where we do not use TIDs: * 1) Recv on a host buffer, Send on a gpu buffer and len is less than 3 bytes * 2) Recv on a host buffer, Send on a host buffer and len is less than hfi_thresh_rv * 3) Recv on gpu buf and len is less than 3 bytes * 4) Expected protocol not initialized. 
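 *
 * As a predicate sketch mirroring the test below (the operand names
 * are shorthand, not real variables):
 *   use_eager = (host recv && gpu send && len <= GPUDIRECT_THRESH_RV)
 *            || (host recv && host send && len <= hfi_thresh_rv)
 *            || (gpu recv && len <= GPUDIRECT_THRESH_RV)
 *            || (proto->protoexp == NULL);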
*/ if ((!req->is_buf_gpu_mem && ((req->is_sendbuf_gpu_mem && req->req_data.recv_msglen <= GPUDIRECT_THRESH_RV)|| (!req->is_sendbuf_gpu_mem && req->req_data.recv_msglen <= proto->mq->hfi_thresh_rv))) || (req->is_buf_gpu_mem && req->req_data.recv_msglen <= GPUDIRECT_THRESH_RV) || proto->protoexp == NULL) { /* no expected tid receive */ #else if (req->req_data.recv_msglen <= proto->mq->hfi_thresh_rv ||/* less than rv threshold */ proto->protoexp == NULL) { /* no expected tid receive */ #endif /* there is no order requirement, try to push the CTS request * directly; if that fails, queue it for a later retry. */ if (ips_proto_mq_push_cts_req(proto, req) != PSM2_OK) { struct ips_pend_sends *pends = &proto->pend_sends; struct ips_pend_sreq *sreq = psmi_mpool_get(proto->pend_sends_pool); psmi_assert(sreq != NULL); if (sreq == NULL) { PSM2_LOG_MSG("leaving"); return PSM2_NO_MEMORY; } sreq->type = IPS_PENDSEND_EAGER_REQ; sreq->req = req; STAILQ_INSERT_TAIL(&pends->pendq, sreq, next); psmi_timer_request(proto->timerq, &pends->timer, PSMI_TIMER_PRIO_1); } } else { ips_protoexp_tid_get_from_token(proto->protoexp, req->req_data.buf, req->req_data.recv_msglen, epaddr, req->rts_reqidx_peer, req->type & MQE_TYPE_WAITING_PEER ? IPS_PROTOEXP_TIDGET_PEERWAIT : 0, ips_proto_mq_rv_complete_exp, req); } PSM2_LOG_MSG("leaving"); return PSM2_OK; } psm2_error_t ips_proto_mq_push_cts_req(struct ips_proto *proto, psm2_mq_req_t req) { ips_epaddr_t *ipsaddr = (ips_epaddr_t *) (req->rts_peer); struct ips_flow *flow; ips_scb_t *scb; ptl_arg_t *args; PSM2_LOG_MSG("entering"); psmi_assert(proto->msgflowid < EP_FLOW_LAST); flow = &ipsaddr->flows[proto->msgflowid]; scb = ips_scbctrl_alloc(&proto->scbc_egr, 1, 0, 0); if (scb == NULL) { PSM2_LOG_MSG("leaving"); return PSM2_OK_NO_PROGRESS; } args = (ptl_arg_t *) scb->ips_lrh.data; ips_scb_opcode(scb) = OPCODE_LONG_CTS; scb->ips_lrh.khdr.kdeth0 = 0; args[0].u32w0 = psmi_mpool_get_obj_index(req); args[1].u32w1 = req->req_data.recv_msglen; args[1].u32w0 = req->rts_reqidx_peer; PSM2_LOG_EPM(OPCODE_LONG_CTS,PSM2_LOG_TX, proto->ep->epid, flow->ipsaddr->epaddr.epid ,"req->rts_reqidx_peer: %d", req->rts_reqidx_peer); ips_proto_flow_enqueue(flow, scb); flow->flush(flow, NULL); /* have already received enough bytes */ if (req->recv_msgoff == req->req_data.recv_msglen) { ips_proto_mq_rv_complete(req); } PSM2_LOG_MSG("leaving"); return PSM2_OK; } psm2_error_t ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) { psm2_error_t err = PSM2_OK; uintptr_t buf = (uintptr_t) req->req_data.buf + req->recv_msgoff; ips_epaddr_t *ipsaddr = (ips_epaddr_t *) (req->rts_peer); uint32_t nbytes_left = req->req_data.send_msglen - req->recv_msgoff; uint32_t nbytes_sent = 0; uint32_t nbytes_this, chunk_size; uint16_t frag_size, unaligned_bytes; struct ips_flow *flow; ips_scb_t *scb; psmi_assert(nbytes_left > 0); PSM2_LOG_MSG("entering."); if ( #ifdef PSM_CUDA req->is_buf_gpu_mem || #endif req->req_data.send_msglen > proto->iovec_thresh_eager) { /* use SDMA transfer */ psmi_assert((proto->flags & IPS_PROTO_FLAG_SPIO) == 0); flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA]; frag_size = flow->path->pr_mtu; /* max chunk size is the rv window size */ chunk_size = ipsaddr->window_rv; } else { /* use PIO transfer */ psmi_assert((proto->flags & IPS_PROTO_FLAG_SDMA) == 0); flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; chunk_size = frag_size = flow->frag_size; } do { /* * don't try to call a progression routine such as: * ips_recv_progress_if_busy() in this loop, * it will cause a recursive call of this function.
*/ /* * When tid code path is enabled, we don’t allocate scbc_rv * objects. If the message is less than the hfi_thresh_rv, * we normally use eager protocol to do the transfer. * However, if it is sync send, we use the rendezvous * rts/cts/rts-data protocol. * In this case, because scbc_rv is null, * we use scbc_egr instead. */ scb = ips_scbctrl_alloc(proto->scbc_rv ? proto->scbc_rv : &proto->scbc_egr, 1, 0, 0); if (scb == NULL) { err = PSM2_OK_NO_PROGRESS; break; } ips_scb_opcode(scb) = OPCODE_LONG_DATA; scb->ips_lrh.khdr.kdeth0 = 0; scb->ips_lrh.data[0].u32w0 = req->rts_reqidx_peer; scb->ips_lrh.data[1].u32w1 = req->req_data.send_msglen; /* attached unaligned bytes into packet header */ unaligned_bytes = nbytes_left & 0x3; if (unaligned_bytes) { mq_copy_tiny((uint32_t *)&scb->ips_lrh.mdata, (uint32_t *)buf, unaligned_bytes); /* position to send */ buf += unaligned_bytes; req->recv_msgoff += unaligned_bytes; psmi_assert(req->recv_msgoff < 4); /* for complete callback */ req->send_msgoff += unaligned_bytes; nbytes_left -= unaligned_bytes; nbytes_sent += unaligned_bytes; } scb->ips_lrh.data[1].u32w0 = req->recv_msgoff; ips_scb_buffer(scb) = (void *)buf; scb->frag_size = frag_size; nbytes_this = min(chunk_size, nbytes_left); if (nbytes_this > 0) scb->nfrag = (nbytes_this + frag_size - 1) / frag_size; else scb->nfrag = 1; if (scb->nfrag > 1) { ips_scb_length(scb) = frag_size; scb->nfrag_remaining = scb->nfrag; scb->chunk_size = scb->chunk_size_remaining = nbytes_this; } else ips_scb_length(scb) = nbytes_this; buf += nbytes_this; req->recv_msgoff += nbytes_this; nbytes_sent += nbytes_this; nbytes_left -= nbytes_this; if (nbytes_left == 0) { /* because of scb callback, use eager complete */ ips_scb_cb(scb) = ips_proto_mq_eager_complete; ips_scb_cb_param(scb) = req; /* Set ACKREQ if single packet per scb. For multi * packets per scb, it is SDMA, driver will set * ACKREQ in last packet, we only need ACK for * last packet. */ if (scb->nfrag == 1) ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ; } else { req->send_msgoff += nbytes_this; } ips_proto_flow_enqueue(flow, scb); if (flow->transfer == PSM_TRANSFER_PIO) { /* we need to flush the pio pending queue as quick as possible */ flow->flush(flow, NULL); } } while (nbytes_left); /* for sdma, if some bytes are queued, flush them */ if (flow->transfer == PSM_TRANSFER_DMA && nbytes_sent) { flow->flush(flow, NULL); } PSM2_LOG_MSG("leaving."); return err; } int ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev) { struct ips_message_header *p_hdr = rcv_ev->p_hdr; struct ips_proto *proto = rcv_ev->proto; psm2_mq_t mq = proto->ep->mq; struct ips_flow *flow; psm2_mq_req_t req; uint32_t paylen; /* * if PSN does not match, drop the packet. */ PSM2_LOG_MSG("entering"); if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev)) { PSM2_LOG_MSG("leaving"); return IPS_RECVHDRQ_CONTINUE; } req = psmi_mpool_find_obj_by_index(mq->sreq_pool, p_hdr->data[1].u32w0); psmi_assert(req != NULL); /* * if there is payload, it is expected tid protocol * with tid session info as the payload. 
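 *
 * Dispatch sketch: paylen > 0 means this CTS carries an
 * ips_tid_session_list and the transfer continues over the expected
 * (TID) path; paylen == 0 means an eager CTS, and the sender streams
 * the rest of the message as OPCODE_LONG_DATA packets through
 * ips_proto_mq_push_rts_data().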
*/ paylen = ips_recvhdrq_event_paylen(rcv_ev); if (paylen > 0) { ips_tid_session_list *payload = ips_recvhdrq_event_payload(rcv_ev); psmi_assert(paylen == 0 || payload); PSM2_LOG_EPM(OPCODE_LONG_CTS,PSM2_LOG_RX,rcv_ev->ipsaddr->epaddr.epid, mq->ep->epid,"p_hdr->data[1].u32w0 %d", p_hdr->data[1].u32w0); proto->epaddr_stats.tids_grant_recv++; psmi_assert(p_hdr->data[1].u32w1 > mq->hfi_thresh_rv); psmi_assert(proto->protoexp != NULL); /* ptl_req_ptr will be set to each tidsendc */ if (req->ptl_req_ptr == NULL) { req->req_data.send_msglen = p_hdr->data[1].u32w1; } psmi_assert(req->req_data.send_msglen == p_hdr->data[1].u32w1); if (ips_tid_send_handle_tidreq(proto->protoexp, rcv_ev->ipsaddr, req, p_hdr->data[0], p_hdr->mdata, payload, paylen) == 0) { proto->psmi_logevent_tid_send_reqs.next_warning = 0; } else { flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)]; flow->recv_seq_num.psn_num -= 1; /* Decrement seq number to NAK proper CTS */ ips_proto_send_nak((struct ips_recvhdrq *)rcv_ev->recvq, flow); static unsigned int msg_cnt = 0; if (msg_cnt++ == 0) { /* Report the message only once */ _HFI_INFO("PSM2 memory shortage detected. Please consider modifying PSM2_MEMORY setting\n"); } return PSM2_EP_NO_RESOURCES; } } else { req->rts_reqidx_peer = p_hdr->data[0].u32w0; /* eager receive only */ req->req_data.send_msglen = p_hdr->data[1].u32w1; if (req->send_msgoff >= req->req_data.send_msglen) { /* already sent enough bytes, may truncate so using >= */ ips_proto_mq_rv_complete(req); } else if (ips_proto_mq_push_rts_data(proto, req) != PSM2_OK) { /* there is no order requirement, tried to push RTS data * directly and not done, so queue it for later try. */ struct ips_pend_sreq *sreq = psmi_mpool_get(proto->pend_sends_pool); psmi_assert(sreq != NULL); sreq->type = IPS_PENDSEND_EAGER_DATA; sreq->req = req; STAILQ_INSERT_TAIL(&proto->pend_sends.pendq, sreq, next); /* Make sure it's processed by timer */ psmi_timer_request(proto->timerq, &proto->pend_sends.timer, PSMI_TIMER_PRIO_1); } } flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)]; if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); ips_proto_process_ack(rcv_ev); PSM2_LOG_MSG("leaving"); return IPS_RECVHDRQ_CONTINUE; } int ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev) { int ret = IPS_RECVHDRQ_CONTINUE; struct ips_message_header *p_hdr = rcv_ev->p_hdr; ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)]; psm2_mq_t mq = rcv_ev->proto->mq; ips_msgctl_t *msgctl = ipsaddr->msgctl; enum ips_msg_order msgorder; char *payload; uint32_t paylen; psm2_mq_req_t req; /* * if PSN does not match, drop the packet. */ PSM2_LOG_MSG("entering"); if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev)) { PSM2_LOG_MSG("leaving"); return IPS_RECVHDRQ_CONTINUE; } msgorder = ips_proto_check_msg_order(ipsaddr, flow, __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK, &ipsaddr->msgctl->mq_recv_seqnum); if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE)) { PSM2_LOG_MSG("leaving"); return IPS_RECVHDRQ_REVISIT; } payload = ips_recvhdrq_event_payload(rcv_ev); paylen = ips_recvhdrq_event_paylen(rcv_ev); /* either no payload or whole message */ psmi_assert(paylen == 0 || paylen >= p_hdr->data[1].u32w1); /* * We can't have past message sequence here. 
For eager message, * it must always have an eager queue matching because even in * truncation case the code logic will wait till all packets * have been received. */ psmi_assert(msgorder != IPS_MSG_ORDER_PAST); _HFI_VDBG("tag=%llx reqidx_peer=%d, msglen=%d\n", (long long)p_hdr->data[0].u64, p_hdr->data[1].u32w0, p_hdr->data[1].u32w1); int rc = psmi_mq_handle_rts(mq, (psm2_epaddr_t) &ipsaddr->msgctl-> master_epaddr, (psm2_mq_tag_t *) p_hdr->tag, p_hdr->data[1].u32w1, payload, paylen, msgorder, ips_proto_mq_rts_match_callback, &req); if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) { uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask; flow->recv_seq_num.psn_num = (flow->recv_seq_num.psn_num - 1) & psn_mask; ipsaddr->msgctl->mq_recv_seqnum--; PSM2_LOG_MSG("leaving"); return IPS_RECVHDRQ_REVISIT; } req->rts_peer = (psm2_epaddr_t) ipsaddr; req->rts_reqidx_peer = p_hdr->data[1].u32w0; if (req->req_data.send_msglen > mq->hfi_thresh_rv) { PSM2_LOG_EPM(OPCODE_LONG_RTS,PSM2_LOG_RX,req->rts_peer->epid,mq->ep->epid, "req->rts_reqidx_peer: %d",req->rts_reqidx_peer); } if (p_hdr->flags & IPS_SEND_FLAG_BLOCKING) req->type |= MQE_TYPE_WAITING_PEER; #ifdef PSM_CUDA if (p_hdr->flags & IPS_SEND_FLAG_USER_BUF_GPU) req->is_sendbuf_gpu_mem = 1; else req->is_sendbuf_gpu_mem = 0; #endif if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) { /* for out of order matching only */ req->msg_seqnum = __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK; req->ptl_req_ptr = (void *)msgctl; msgctl->outoforder_count++; mq_qq_append(&mq->outoforder_q, req); ret = IPS_RECVHDRQ_BREAK; } else { ipsaddr->msg_toggle = 0; if (rc == MQ_RET_MATCH_OK) ips_proto_mq_rts_match_callback(req, 1); /* XXX if blocking, break out of progress loop */ if (msgctl->outoforder_count) ips_proto_mq_handle_outoforder_queue(mq, msgctl); if (rc == MQ_RET_UNEXP_OK) ret = IPS_RECVHDRQ_BREAK; } if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); ips_proto_process_ack(rcv_ev); PSM2_LOG_MSG("leaving"); return ret; } int ips_proto_mq_handle_tiny(struct ips_recvhdrq_event *rcv_ev) { int ret = IPS_RECVHDRQ_CONTINUE; struct ips_message_header *p_hdr = rcv_ev->p_hdr; ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)]; psm2_mq_t mq = rcv_ev->proto->mq; ips_msgctl_t *msgctl = ipsaddr->msgctl; enum ips_msg_order msgorder; char *payload; uint32_t paylen; psm2_mq_req_t req; /* * if PSN does not match, drop the packet. */ if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev)) return IPS_RECVHDRQ_CONTINUE; msgorder = ips_proto_check_msg_order(ipsaddr, flow, __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK, &ipsaddr->msgctl->mq_recv_seqnum); if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE)) return IPS_RECVHDRQ_REVISIT; payload = (void *)&p_hdr->hdr_data; paylen = (__le32_to_cpu(p_hdr->khdr.kdeth0) >> HFI_KHDR_TINYLEN_SHIFT) & HFI_KHDR_TINYLEN_MASK; /* * We can't have past message sequence here. For eager message, * it must always have an eager queue matching because even in * truncation case the code logic will wait till all packets * have been received. */ psmi_assert(msgorder != IPS_MSG_ORDER_PAST); _HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n", p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2], OPCODE_TINY, p_hdr->hdr_data.u32w1); /* store in req below too! 
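 *
 * The tiny payload length rides in kdeth0; e.g. for a hypothetical
 * 7-byte message (byte-order conversion elided):
 *   sender:   kdeth0 = ((7 & HFI_KHDR_TINYLEN_MASK)
 *                        << HFI_KHDR_TINYLEN_SHIFT) | msgseq;
 *   receiver: paylen = (kdeth0 >> HFI_KHDR_TINYLEN_SHIFT)
 *                        & HFI_KHDR_TINYLEN_MASK;    -> 7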
*/ int rc = psmi_mq_handle_envelope(mq, (psm2_epaddr_t) &ipsaddr->msgctl->master_epaddr, (psm2_mq_tag_t *) p_hdr->tag, paylen, 0, payload, paylen, msgorder, OPCODE_TINY, &req); if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) { uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask; flow->recv_seq_num.psn_num = (flow->recv_seq_num.psn_num - 1) & psn_mask; ipsaddr->msgctl->mq_recv_seqnum--; return IPS_RECVHDRQ_REVISIT; } if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) { /* for out of order matching only */ req->msg_seqnum = __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK; req->ptl_req_ptr = (void *)msgctl; msgctl->outoforder_count++; mq_qq_append(&mq->outoforder_q, req); ret = IPS_RECVHDRQ_BREAK; } else { ipsaddr->msg_toggle = 0; if (msgctl->outoforder_count) ips_proto_mq_handle_outoforder_queue(mq, msgctl); if (rc == MQ_RET_UNEXP_OK) ret = IPS_RECVHDRQ_BREAK; } if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); ips_proto_process_ack(rcv_ev); return ret; } int ips_proto_mq_handle_short(struct ips_recvhdrq_event *rcv_ev) { int ret = IPS_RECVHDRQ_CONTINUE; struct ips_message_header *p_hdr = rcv_ev->p_hdr; ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)]; psm2_mq_t mq = rcv_ev->proto->mq; ips_msgctl_t *msgctl = ipsaddr->msgctl; enum ips_msg_order msgorder; char *payload; uint32_t paylen; psm2_mq_req_t req; /* * if PSN does not match, drop the packet. */ if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev)) return IPS_RECVHDRQ_CONTINUE; msgorder = ips_proto_check_msg_order(ipsaddr, flow, __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK, &ipsaddr->msgctl->mq_recv_seqnum); if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE)) return IPS_RECVHDRQ_REVISIT; payload = ips_recvhdrq_event_payload(rcv_ev); paylen = ips_recvhdrq_event_paylen(rcv_ev); psmi_assert(paylen == 0 || payload); /* * We can't have past message sequence here. For eager message, * it must always have an eager queue matching because even in * truncation case the code logic will wait till all packets * have been received. */ psmi_assert(msgorder != IPS_MSG_ORDER_PAST); _HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n", p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2], OPCODE_SHORT, p_hdr->hdr_data.u32w1); /* store in req below too! 
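 *
 * On MQ_RET_UNEXP_NO_RESOURCES the code below rewinds both sequence
 * spaces so this packet can be replayed: e.g., with an assumed
 * psn_mask of 0xffffff and a receive PSN of 0, the PSN rolls back to
 * 0xffffff, mq_recv_seqnum is decremented, and IPS_RECVHDRQ_REVISIT
 * asks the recvhdrq loop to present the same header again.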
*/ int rc = psmi_mq_handle_envelope(mq, (psm2_epaddr_t) &ipsaddr->msgctl->master_epaddr, (psm2_mq_tag_t *) p_hdr->tag, p_hdr->hdr_data.u32w1, p_hdr->hdr_data.u32w0, payload, paylen, msgorder, OPCODE_SHORT, &req); if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) { uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask; flow->recv_seq_num.psn_num = (flow->recv_seq_num.psn_num - 1) & psn_mask; ipsaddr->msgctl->mq_recv_seqnum--; return IPS_RECVHDRQ_REVISIT; } if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) { /* for out of order matching only */ req->msg_seqnum = __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK; req->ptl_req_ptr = (void *)msgctl; msgctl->outoforder_count++; mq_qq_append(&mq->outoforder_q, req); ret = IPS_RECVHDRQ_BREAK; } else { ipsaddr->msg_toggle = 0; if (msgctl->outoforder_count) ips_proto_mq_handle_outoforder_queue(mq, msgctl); if (rc == MQ_RET_UNEXP_OK) ret = IPS_RECVHDRQ_BREAK; } if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); ips_proto_process_ack(rcv_ev); return ret; } int ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev) { int ret = IPS_RECVHDRQ_CONTINUE; struct ips_message_header *p_hdr = rcv_ev->p_hdr; ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)]; psm2_mq_t mq = rcv_ev->proto->mq; ips_msgctl_t *msgctl = ipsaddr->msgctl; enum ips_msg_order msgorder; char *payload; uint32_t paylen; psm2_mq_req_t req; /* * if PSN does not match, drop the packet. */ if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev)) return IPS_RECVHDRQ_CONTINUE; msgorder = ips_proto_check_msg_order(ipsaddr, flow, __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK, &ipsaddr->msgctl->mq_recv_seqnum); if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE)) return IPS_RECVHDRQ_REVISIT; payload = ips_recvhdrq_event_payload(rcv_ev); paylen = ips_recvhdrq_event_paylen(rcv_ev); psmi_assert(paylen == 0 || payload); if (msgorder == IPS_MSG_ORDER_PAST || msgorder == IPS_MSG_ORDER_FUTURE_RECV) { req = mq_eager_match(mq, msgctl, __le32_to_cpu(p_hdr->khdr.kdeth0)&HFI_KHDR_MSGSEQ_MASK); /* * It is future message sequence or past message sequence, * and there is request matching in eager queue, we handle * the packet data and return. We can't go continue to * match envelope. * Past message sequence must always have a matching!!! * error is caught below. */ if (req) { #ifdef PSM_CUDA if (PSMI_USE_GDR_COPY(req, req->req_data.send_msglen)) { req->req_data.buf = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)req->user_gpu_buffer, req->req_data.send_msglen, 1, rcv_ev->proto); } #endif psmi_mq_handle_data(mq, req, p_hdr->data[1].u32w0, payload, paylen); if (msgorder == IPS_MSG_ORDER_FUTURE_RECV) ret = IPS_RECVHDRQ_BREAK; if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) ips_proto_send_ack((struct ips_recvhdrq *) rcv_ev->recvq, flow); ips_proto_process_ack(rcv_ev); return ret; } psmi_assert(msgorder == IPS_MSG_ORDER_FUTURE_RECV); /* * For future message sequence, since there is no eager * queue matching yet, this must be the first packet for * the message sequence. And of course, expected message * sequence is always the first packet for the sequence. */ } /* * We can't have past message sequence here. 
For eager message, * it must always have an eager queue matching because even in * truncation case the code logic will wait till all packets * have been received. */ psmi_assert(msgorder != IPS_MSG_ORDER_PAST); _HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n", p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2], OPCODE_EAGER, p_hdr->hdr_data.u32w1); /* store in req below too! */ int rc = psmi_mq_handle_envelope(mq, (psm2_epaddr_t) &ipsaddr->msgctl->master_epaddr, (psm2_mq_tag_t *) p_hdr->tag, p_hdr->hdr_data.u32w1, p_hdr->hdr_data.u32w0, payload, paylen, msgorder, OPCODE_EAGER, &req); if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) { uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask; flow->recv_seq_num.psn_num = (flow->recv_seq_num.psn_num - 1) & psn_mask; ipsaddr->msgctl->mq_recv_seqnum--; return IPS_RECVHDRQ_REVISIT; } /* for both outoforder matching and eager matching */ req->msg_seqnum = __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK; req->ptl_req_ptr = (void *)msgctl; if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) { msgctl->outoforder_count++; mq_qq_append(&mq->outoforder_q, req); ret = IPS_RECVHDRQ_BREAK; } else { ipsaddr->msg_toggle = 0; if (msgctl->outoforder_count) ips_proto_mq_handle_outoforder_queue(mq, msgctl); if (rc == MQ_RET_UNEXP_OK) ret = IPS_RECVHDRQ_BREAK; } if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); ips_proto_process_ack(rcv_ev); return ret; } /* * Progress the out of order queue to see if any message matches * current receiving sequence number. */ void ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl) { psm2_mq_req_t req; do { req = mq_ooo_match(&mq->outoforder_q, msgctl, msgctl->mq_recv_seqnum); if (req == NULL) return; msgctl->outoforder_count--; msgctl->mq_recv_seqnum++; psmi_mq_handle_outoforder(mq, req); } while (msgctl->outoforder_count > 0); return; } int ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev) { struct ips_message_header *p_hdr = rcv_ev->p_hdr; psm2_mq_t mq = rcv_ev->proto->mq; char *payload; uint32_t paylen; psm2_mq_req_t req; struct ips_flow *flow; /* * if PSN does not match, drop the packet. */ if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev)) return IPS_RECVHDRQ_CONTINUE; req = psmi_mpool_find_obj_by_index(mq->rreq_pool, p_hdr->data[0].u32w0); psmi_assert(req != NULL); psmi_assert(p_hdr->data[1].u32w1 == req->req_data.send_msglen); /* * if a packet has very small offset, it must have unaligned data * attached in the packet header, and this must be the first packet * for that message. 
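 *
 * Example (hypothetical 10-byte rendezvous payload): the sender first
 * stashes send_msglen & 0x3 = 2 unaligned bytes in the header's mdata
 * at offset 0, so the first data packet arrives with
 * p_hdr->data[1].u32w0 == 2 -- caught by the offset < 4 test below --
 * and the remaining 8 bytes follow as dword-aligned payload.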
*/ if (p_hdr->data[1].u32w0 < 4 && p_hdr->data[1].u32w0 > 0) { psmi_assert(p_hdr->data[1].u32w0 == (req->req_data.send_msglen&0x3)); mq_copy_tiny((uint32_t *)req->req_data.buf, (uint32_t *)&p_hdr->mdata, p_hdr->data[1].u32w0); req->send_msgoff += p_hdr->data[1].u32w0; } payload = ips_recvhdrq_event_payload(rcv_ev); paylen = ips_recvhdrq_event_paylen(rcv_ev); psmi_assert(paylen == 0 || payload); psmi_mq_handle_data(mq, req, p_hdr->data[1].u32w0, payload, paylen); flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)]; if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); ips_proto_process_ack(rcv_ev); return IPS_RECVHDRQ_CONTINUE; } opa-psm2-PSM2_11.2.185/ptl_ips/ips_proto_params.h000066400000000000000000000223331370564314600214300ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/ #ifndef _IPS_PROTO_PARAMS_H #define _IPS_PROTO_PARAMS_H /* * send method: dma, pio; * recv method: tid, egr; * * send-recv mode combinations: 1=on, 0=off * A: dma:1, pio=1, tid=1, egr=1; * B: dma:0, pio=1, tid=1, egr=1; * C: dma:1, pio=0, tid=1, egr=1; * D: dma:1, pio=1, tid=0, egr=1; * E: dma:0, pio=1, tid=0, egr=1; * F: dma:1, pio=0, tid=0, egr=1; * * message packet type: * T: tiny; S: short; E: eager; * LR: long rts; LC: long cts; LD: long data; * ED: expected data; EC: expected completion; * C: ctrl msg; * * send,recv method for each packet type and each send-recv mode * ------------------------------------------------------------------- * | | A | B | C | D | E | F | * ------------------------------------------------------------------- * | T | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr | * ------------------------------------------------------------------- * | S | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr | * ------------------------------------------------------------------- * | E | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |threshold * ------------------------------------------------------------------- * | LR | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr | * ------------------------------------------------------------------- * | LC | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr | * ------------------------------------------------------------------- * | LD | x | x | x | pio,egr | pio,egr | dma,egr |threshold * ------------------------------------------------------------------- * | ED | dma,tid | pio,tid | dma,tid | x | x | x | * ------------------------------------------------------------------- * | EC | pio,egr | pio,egr | dma,egr | x | x | x | * ------------------------------------------------------------------- * | C | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr | * ------------------------------------------------------------------- */ /* Constants */ #define BYTE2DWORD_SHIFT 2 #define LOWER_16_BITS 0xFFFF #define PSM_CACHE_LINE_BYTES 64 #define PSM2_FLOW_CREDITS 64 #define PSM_CRC_SIZE_IN_BYTES 8 /* * version of protocol header (known to chip also). * This value for OPA is defined in spec. */ #define IPS_PROTO_VERSION 0x1 /* time conversion macros */ #define us_2_cycles(us) nanosecs_to_cycles(1000ULL*(us)) #define ms_2_cycles(ms) nanosecs_to_cycles(1000000ULL*(ms)) #define sec_2_cycles(sec) nanosecs_to_cycles(1000000000ULL*(sec)) /* Per-flow flags */ #define IPS_FLOW_FLAG_NAK_SEND 0x01 #define IPS_FLOW_FLAG_PENDING_ACK 0x02 #define IPS_FLOW_FLAG_PENDING_NAK 0x04 #define IPS_FLOW_FLAG_GEN_BECN 0x08 #define IPS_FLOW_FLAG_CONGESTED 0x10 #define IPS_FLOW_FLAG_SKIP_CTS 0x20 /* tid session expected send flags */ #define EXP_SEND_FLAG_CLEAR_ALL 0x00 #define EXP_SEND_FLAG_FREE_TIDS 0x01 #define TIMEOUT_INFINITE 0xFFFFFFFFFFFFFFFFULL /* 64 bit all-one's */ /* * scb flags for wire, * Only the lower 6 bits are wire-protocol options */ #define IPS_SEND_FLAG_NONE 0x00 #define IPS_SEND_FLAG_BLOCKING 0x01 /* blocking send */ #define IPS_SEND_FLAG_PKTCKSUM 0x02 /* Has packet checksum */ #define IPS_SEND_FLAG_AMISTINY 0x04 /* AM is tiny, exclusive */ #ifdef PSM_CUDA /* This flag is used to indicate to the receiver when * the send is issued on a device buffer. This helps in * selecting the TID path on the receive side regardless of * the receive buffer's locality. It is used * in a special case where the send is on a device * buffer and the receive is on a host buffer.
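 *
 * Usage sketch (mirrors ips_proto_mq.c, illustrative only):
 *   sender:   ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU;
 *   receiver: req->is_sendbuf_gpu_mem =
 *                 !!(p_hdr->flags & IPS_SEND_FLAG_USER_BUF_GPU);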
*/ #define IPS_SEND_FLAG_USER_BUF_GPU 0x08 #endif #define IPS_SEND_FLAG_PROTO_OPTS 0x3f /* only 6bits wire flags */ /* scb flags */ #define IPS_SEND_FLAG_PENDING 0x0100 #define IPS_SEND_FLAG_PERSISTENT 0x0200 #define IPS_SEND_FLAG_NO_LMC 0x0400 #ifdef PSM_CUDA /* This flag is used to indicate if the send is on * a GPU buffer. This helps PIO/SDMA paths to detect * if payload is GPU buffer without having to call * cudaGetPointerAttribute. */ #define IPS_SEND_FLAG_PAYLOAD_BUF_GPU 0x0800 #endif /* 0x10000000, interrupt when done */ #define IPS_SEND_FLAG_INTR (1< 0) proto->stray_warn_interval = sec_2_cycles(interval_secs); else proto->stray_warn_interval = 0; return PSM2_OK; } psm2_error_t ips_proto_recv_fini(struct ips_proto *proto) { ips_report_strays(proto); return PSM2_OK; } #define cycles_to_sec_f(cycles) \ (((double)cycles_to_nanosecs(cycles)) / 1000000000.0) struct ips_stray_epid { psm2_epid_t epid; uint32_t err_check_bad_sent; uint32_t ipv4_addr; uint32_t pid; uint32_t num_messages; uint64_t t_warn_next; uint64_t t_first; uint64_t t_last; }; static void ips_report_strays(struct ips_proto *proto) { struct ips_stray_epid *sepid; struct psmi_eptab_iterator itor; psmi_epid_itor_init(&itor, PSMI_EP_CROSSTALK); #if _HFI_DEBUGGING double t_first = 0; double t_last = 0; double t_runtime = 0; if (_HFI_INFO_ON) { t_runtime = cycles_to_sec_f(proto->t_fini - proto->t_init); } #endif while ((sepid = psmi_epid_itor_next(&itor))) { char ipbuf[INET_ADDRSTRLEN], *ip = NULL; char bufpid[32]; uint32_t lid = psm2_epid_nid(sepid->epid); #if _HFI_DEBUGGING if (_HFI_INFO_ON) { t_first = cycles_to_sec_f(sepid->t_first - proto->t_init); t_last = cycles_to_sec_f(sepid->t_last - proto->t_init); } #endif if (sepid->ipv4_addr) ip = (char *) inet_ntop(AF_INET, &sepid->ipv4_addr, ipbuf, sizeof(ipbuf)); if (!ip) snprintf(ipbuf, sizeof(ipbuf), "%d (%x)", lid, lid); if (sepid->pid) snprintf(bufpid, sizeof(bufpid), "PID=%d", sepid->pid); else snprintf(bufpid, sizeof(bufpid), "PID unknown"); if (_HFI_INFO_ON) { _HFI_INFO_ALWAYS ("Process %s on host %s=%s sent %d stray message(s) and " "was told so %d time(s) (first stray message at %.1fs " "(%d%%), last at %.1fs (%d%%) into application run)\n", bufpid, ip ? "IP" : "LID", ipbuf, sepid->num_messages, sepid->err_check_bad_sent, t_first, (int)(t_first * 100.0 / t_runtime), t_last, (int)(t_last * 100.0 / t_runtime)); } psmi_epid_remove(PSMI_EP_CROSSTALK, sepid->epid); psmi_free(sepid); } psmi_epid_itor_fini(&itor); return; } /* New scbs now available. If we have pending sends because we were out of * scbs, put the pendq on the timerq so it can be processed. 
*/ void ips_proto_rv_scbavail_callback(struct ips_scbctrl *scbc, void *context) { struct ips_proto *proto = (struct ips_proto *)context; struct ips_pend_sreq *sreq = STAILQ_FIRST(&proto->pend_sends.pendq); if (sreq != NULL) psmi_timer_request(proto->timerq, &proto->pend_sends.timer, PSMI_TIMER_PRIO_1); return; } psm2_error_t ips_proto_timer_pendq_callback(struct psmi_timer *timer, uint64_t current) { psm2_error_t err = PSM2_OK; struct ips_pend_sends *pend_sends = (struct ips_pend_sends *)timer->context; struct ips_pendsendq *phead = &pend_sends->pendq; struct ips_proto *proto = (struct ips_proto *)pend_sends->proto; struct ips_pend_sreq *sreq; while (!STAILQ_EMPTY(phead)) { sreq = STAILQ_FIRST(phead); switch (sreq->type) { case IPS_PENDSEND_EAGER_REQ: err = ips_proto_mq_push_cts_req(proto, sreq->req); break; case IPS_PENDSEND_EAGER_DATA: err = ips_proto_mq_push_rts_data(proto, sreq->req); break; default: psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unknown pendq state %d\n", sreq->type); } if (err == PSM2_OK) { STAILQ_REMOVE_HEAD(phead, next); psmi_mpool_put(sreq); } else { /* out of scbs. wait for the next scb_avail callback */ /* printf("!!!!! breaking out of pendq progress\n"); */ break; } } return err; } PSMI_INLINE( int between(int first_seq, int last_seq, int seq)) { if (last_seq >= first_seq) { if (seq < first_seq || seq > last_seq) { return 0; } } else { if (seq > last_seq && seq < first_seq) { return 0; } } return 1; } PSMI_INLINE( int pio_dma_ack_valid(struct ips_proto *proto, struct ips_flow *flow, psmi_seqnum_t ack_seq_num)) { uint32_t last_num; struct ips_scb_unackedq *unackedq = &flow->scb_unacked; if (STAILQ_EMPTY(unackedq)) return 0; /* scb_pend will be moved back when a nak is received, but * the packet may actually be received and acked after the nak, * so we use the tail of the unacked queue, which may include packets * not yet sent out; this is overly conservative, but it should be OK. */ last_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num.psn_num; return between(flow->xmit_ack_num.psn_num, last_num, ack_seq_num.psn_num); } PSMI_INLINE( struct ips_flow * get_tidflow(struct ips_proto *proto, ips_epaddr_t *ipsaddr, struct ips_message_header *p_hdr, psmi_seqnum_t ack_seq_num)) { struct ips_protoexp *protoexp = proto->protoexp; ptl_arg_t desc_id = p_hdr->data[0]; struct ips_tid_send_desc *tidsendc; ptl_arg_t desc_tidsendc; struct ips_flow *flow; uint32_t last_seq; struct ips_scb_unackedq *unackedq; tidsendc = (struct ips_tid_send_desc *) psmi_mpool_find_obj_by_index(protoexp->tid_desc_send_pool, desc_id._desc_idx); if (tidsendc == NULL) { _HFI_ERROR ("OPCODE_ACK: Index %d is out of range in tidflow ack\n", desc_id._desc_idx); return NULL; } /* Ensure generation matches */ psmi_mpool_get_obj_index_gen_count(tidsendc, &desc_tidsendc._desc_idx, &desc_tidsendc._desc_genc); if (desc_tidsendc.u64 != desc_id.u64) return NULL; /* Ensure ack is within window */ flow = &tidsendc->tidflow; unackedq = &flow->scb_unacked; /* No unacked scbs */ if (STAILQ_EMPTY(unackedq)) return NULL; /* Generation for ack should match */ if (STAILQ_FIRST(unackedq)->seq_num.psn_gen != ack_seq_num.psn_gen) return NULL; /* scb_pend will be moved back when a nak is received, but * the packet may actually be received and acked after the nak, * so we use the tail of the unacked queue, which may include packets * not yet sent out; this is overly conservative, but it should be OK.
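 *
 * between() treats [first_seq, last_seq] as a circular window; as a
 * worked example with an assumed 8-bit sequence space,
 * between(250, 5, 2) == 1 (the window wrapped past zero) while
 * between(250, 5, 100) == 0.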
*/ last_seq = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num.psn_seq; if (between(flow->xmit_ack_num.psn_seq, last_seq, ack_seq_num.psn_seq) == 0) return NULL; return flow; } /* NAK post process for tid flow */ void ips_tidflow_nak_post_process(struct ips_proto *proto, struct ips_flow *flow) { ips_scb_t *scb; uint32_t first_seq, ack_seq; scb = STAILQ_FIRST(&flow->scb_unacked); first_seq = __be32_to_cpu(scb->ips_lrh.bth[2]) & HFI_BTH_SEQ_MASK; ack_seq = (flow->xmit_ack_num.psn_seq - 1) & HFI_BTH_SEQ_MASK; /* If the ack SEQ falls into a multi-packets scb, * don't re-send the packets already acked. */ if (scb->nfrag > 1 && between(first_seq, scb->seq_num.psn_seq, ack_seq)) { uint32_t om, offset_in_tid, remaining_bytes_in_tid; uint32_t npkt, pktlen, nbytes; uint32_t idx, loop; /* how many packets acked in this scb */ npkt = ((ack_seq - first_seq) & HFI_BTH_SEQ_MASK) + 1; /* Get offset/om from current packet header */ offset_in_tid = __le32_to_cpu(scb->ips_lrh.khdr.kdeth0) & HFI_KHDR_OFFSET_MASK; om = (__le32_to_cpu(scb->ips_lrh.khdr.kdeth0) >> HFI_KHDR_OM_SHIFT) & 0x1; if (om) offset_in_tid *= 64; else offset_in_tid *= 4; /* bytes remaining in current tid */ remaining_bytes_in_tid = (IPS_TIDINFO_GET_LENGTH(scb->tsess[0]) << 12) - offset_in_tid; /* packet length in current header */ pktlen = scb->payload_size; psmi_assert(min(remaining_bytes_in_tid, scb->frag_size) >= pktlen); psmi_assert((((__be16_to_cpu(scb->ips_lrh.lrh[2]) & HFI_LRH_PKTLEN_MASK) << BYTE2DWORD_SHIFT) - sizeof(struct ips_message_header) - HFI_CRC_SIZE_IN_BYTES) == pktlen); /* Loop to find the position to start */ idx = 0; nbytes = 0; loop = npkt; while (loop) { remaining_bytes_in_tid -= pktlen; offset_in_tid += pktlen; nbytes += pktlen; first_seq++; loop--; if (remaining_bytes_in_tid == 0) { idx++; remaining_bytes_in_tid = IPS_TIDINFO_GET_LENGTH(scb-> tsess[idx]) << 12; offset_in_tid = 0; } pktlen = min(remaining_bytes_in_tid, scb->frag_size); } psmi_assert((first_seq & HFI_BTH_SEQ_MASK) == ((ack_seq + 1) & HFI_BTH_SEQ_MASK)); /* 0. update scb info */ psmi_assert(scb->nfrag_remaining > npkt); scb->nfrag_remaining -= npkt; psmi_assert(scb->chunk_size_remaining > nbytes); scb->chunk_size_remaining -= nbytes; ips_scb_buffer(scb) = (void *)((char *)ips_scb_buffer(scb) + nbytes); /* 1. if last packet in sequence, set ACK, clear SH */ if (scb->nfrag_remaining == 1) { psmi_assert(scb->chunk_size_remaining <= scb->frag_size); scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; scb->scb_flags &= ~IPS_SEND_FLAG_HDRSUPP; /* last packet is what remaining */ pktlen = scb->chunk_size_remaining; } /* 2. set new packet sequence number */ scb->ips_lrh.bth[2] = __cpu_to_be32( ((first_seq & HFI_BTH_SEQ_MASK) << HFI_BTH_SEQ_SHIFT) | ((scb->seq_num.psn_gen & HFI_BTH_GEN_MASK) << HFI_BTH_GEN_SHIFT) | (scb->scb_flags & IPS_SEND_FLAG_ACKREQ)); /* 3. set new packet offset */ scb->ips_lrh.exp_offset += nbytes; /* 4. if packet length is changed, set new length */ if (scb->payload_size != pktlen) { scb->payload_size = pktlen; scb->ips_lrh.lrh[2] = __cpu_to_be16(( (scb->payload_size + sizeof(struct ips_message_header) + HFI_CRC_SIZE_IN_BYTES) >> BYTE2DWORD_SHIFT) & HFI_LRH_PKTLEN_MASK); } /* 5. set new tidctrl and tidinfo array */ scb->tsess = &scb->tsess[idx]; scb->tsess_length -= idx * sizeof(uint32_t); scb->tidctrl = IPS_TIDINFO_GET_TIDCTRL(scb->tsess[0]); /* 6. calculate new offset mode */ if (offset_in_tid < 131072) { /* 2^15 * 4 */ offset_in_tid /= 4; om = 0; } else { offset_in_tid /= 64; om = 1; } /* 7. 
set new tidinfo */ scb->ips_lrh.khdr.kdeth0 = __cpu_to_le32( (offset_in_tid & HFI_KHDR_OFFSET_MASK) | (om << HFI_KHDR_OM_SHIFT) | (IPS_TIDINFO_GET_TID(scb->tsess[0]) << HFI_KHDR_TID_SHIFT) | (scb->tidctrl << HFI_KHDR_TIDCTRL_SHIFT) | (scb->scb_flags & IPS_SEND_FLAG_INTR) | (scb->scb_flags & IPS_SEND_FLAG_HDRSUPP) | (IPS_PROTO_VERSION << HFI_KHDR_KVER_SHIFT)); } /* Update unacked scb's to use the new generation */ while (scb) { /* update with new generation */ scb->ips_lrh.bth[2] = __cpu_to_be32( (__be32_to_cpu(scb->ips_lrh.bth[2]) & (~(HFI_BTH_GEN_MASK << HFI_BTH_GEN_SHIFT))) | ((flow->xmit_seq_num.psn_gen & HFI_BTH_GEN_MASK) << HFI_BTH_GEN_SHIFT)); scb->seq_num.psn_gen = flow->xmit_seq_num.psn_gen; scb = SLIST_NEXT(scb, next); } } /* NAK post process for dma flow */ void ips_dmaflow_nak_post_process(struct ips_proto *proto, struct ips_flow *flow) { ips_scb_t *scb; uint32_t first_num, ack_num; uint16_t padding = 0; scb = STAILQ_FIRST(&flow->scb_unacked); first_num = __be32_to_cpu(scb->ips_lrh.bth[2]) & proto->psn_mask; ack_num = (flow->xmit_ack_num.psn_num - 1) & proto->psn_mask; /* If the ack PSN falls into a multi-packets scb, * don't re-send the packets already acked. */ psmi_assert(scb->nfrag > 1); if (between(first_num, scb->seq_num.psn_num, ack_num)) { uint32_t npkt, pktlen, nbytes; /* how many packets acked in this scb */ npkt = ((ack_num - first_num) & proto->psn_mask) + 1; /* how many bytes already acked in this scb, for eager receive * packets, all payload size is frag_size except the last packet * which is not acked yet */ pktlen = scb->frag_size; nbytes = (((ack_num - first_num) & proto->psn_mask) + 1) * pktlen; /* 0. update scb info */ psmi_assert(scb->nfrag_remaining > npkt); scb->nfrag_remaining -= npkt; psmi_assert(scb->chunk_size_remaining > nbytes); scb->chunk_size_remaining -= nbytes; ips_scb_buffer(scb) = (void *)((char *)ips_scb_buffer(scb) + nbytes); /* 1. if last packet in sequence, set IPS_SEND_FLAG_ACKREQ */ if (scb->chunk_size_remaining <= scb->frag_size) { psmi_assert(scb->nfrag_remaining == 1); scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; /* last packet is what remaining */ /* check if padding is required*/ padding = scb->chunk_size_remaining & 0x3; if_pf(padding) { /* how much to pad with also equals how many bytes we need * to rewind the source buffer offset by to keep it dw aligned */ padding = 4 - padding; ips_scb_buffer(scb) = (void *)((char*)ips_scb_buffer(scb) - padding); scb->chunk_size_remaining += padding; } pktlen = scb->chunk_size_remaining; } /* 2. set new packet sequence number */ scb->ips_lrh.bth[2] = __cpu_to_be32( ((ack_num + 1) & proto->psn_mask) | (scb->scb_flags & IPS_SEND_FLAG_ACKREQ)); /* 3. set new packet offset adjusted with padding */ scb->ips_lrh.hdr_data.u32w0 += nbytes - padding; /* 4. if packet length is changed, set new length */ if (scb->payload_size != pktlen) { scb->payload_size = pktlen; scb->ips_lrh.lrh[2] = __cpu_to_be16(( (scb->payload_size + sizeof(struct ips_message_header) + HFI_CRC_SIZE_IN_BYTES) >> BYTE2DWORD_SHIFT) & HFI_LRH_PKTLEN_MASK); } } } /* process an incoming ack message. 
Separate function to allow */ /* for better optimization by compiler */ int ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev) { struct ips_proto *proto = rcv_ev->proto; ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; struct ips_message_header *p_hdr = rcv_ev->p_hdr; struct ips_flow *flow = NULL; struct ips_scb_unackedq *unackedq; struct ips_scb_pendlist *scb_pend; psmi_seqnum_t ack_seq_num, last_seq_num; ips_epaddr_flow_t flowid; ips_scb_t *scb; uint32_t tidctrl; ack_seq_num.psn_num = p_hdr->ack_seq_num; tidctrl = GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0)); if (!tidctrl && ((flowid = ips_proto_flowid(p_hdr)) < EP_FLOW_TIDFLOW)) { ack_seq_num.psn_num = (ack_seq_num.psn_num - 1) & proto->psn_mask; psmi_assert(flowid < EP_FLOW_LAST); flow = &ipsaddr->flows[flowid]; if (!pio_dma_ack_valid(proto, flow, ack_seq_num)) goto ret; } else { ack_seq_num.psn_seq -= 1; flow = get_tidflow(proto, ipsaddr, p_hdr, ack_seq_num); if (!flow) /* Invalid ack for flow */ goto ret; } flow->xmit_ack_num.psn_num = p_hdr->ack_seq_num; unackedq = &flow->scb_unacked; scb_pend = &flow->scb_pend; if (STAILQ_EMPTY(unackedq)) goto ret; last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num; INC_TIME_SPEND(TIME_SPEND_USER2); /* For tidflow, psn_gen matches. So for all flows, tid/pio/dma, * we can used general psn_num to compare the PSN. */ while (between((scb = STAILQ_FIRST(unackedq))->seq_num.psn_num, last_seq_num.psn_num, ack_seq_num.psn_num) ) { /* take it out of the xmit queue and .. */ if (scb == SLIST_FIRST(scb_pend)) { #ifdef PSM_DEBUG flow->scb_num_pending--; #endif SLIST_REMOVE_HEAD(scb_pend, next); } STAILQ_REMOVE_HEAD(unackedq, nextq); #ifdef PSM_DEBUG flow->scb_num_unacked--; psmi_assert(flow->scb_num_unacked >= flow->scb_num_pending); #endif flow->credits += scb->nfrag; if (flow->transfer == PSM_TRANSFER_DMA && scb->dma_complete == 0) ips_proto_dma_wait_until(proto, scb); if (scb->callback) (*scb->callback) (scb->cb_param, scb->nfrag > 1 ? scb->chunk_size : scb->payload_size); if (!(scb->scb_flags & IPS_SEND_FLAG_PERSISTENT)) ips_scbctrl_free(scb); /* set all index pointer to NULL if all frames have been * acked */ if (STAILQ_EMPTY(unackedq)) { psmi_timer_cancel(proto->timerq, flow->timer_ack); flow->timer_ack = NULL; psmi_timer_cancel(proto->timerq, flow->timer_send); flow->timer_send = NULL; SLIST_FIRST(scb_pend) = NULL; psmi_assert(flow->scb_num_pending == 0); /* Reset congestion window - all packets ACK'd */ flow->credits = flow->cwin = proto->flow_credits; flow->ack_interval = max((flow->credits >> 2) - 1, 1); flow->flags &= ~IPS_FLOW_FLAG_CONGESTED; goto ret; } else if (flow->timer_ack == scb->timer_ack) { /* * Exchange timers with last scb on unackedq. * timer in scb is used by flow, cancelling current * timer and then requesting a new timer takes more * time, instead, we exchange the timer between current * freeing scb and the last scb on unacked queue. 
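	 * The swap below is O(1): the timer the flow is currently armed
	 * with ends up in the surviving (last) scb, and the freed scb
	 * carries the idle timer away with it.  Only the back-pointers
	 * (timer->context) must be repointed afterwards, which is done
	 * for both the ack and send timers of both scbs.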
*/ psmi_timer *timer; ips_scb_t *last = STAILQ_LAST(unackedq, ips_scb, nextq); timer = scb->timer_ack; scb->timer_ack = last->timer_ack; last->timer_ack = timer; timer = scb->timer_send; scb->timer_send = last->timer_send; last->timer_send = timer; scb->timer_ack->context = scb; scb->timer_send->context = scb; last->timer_ack->context = last; last->timer_send->context = last; } } psmi_assert(!STAILQ_EMPTY(unackedq)); /* sanity for above loop */ /* CCA: If flow is congested adjust rate */ if_pf(rcv_ev->is_congested & IPS_RECV_EVENT_BECN) { if ((flow->path->pr_ccti + proto->cace[flow->path->pr_sl].ccti_increase) <= proto->ccti_limit) { ips_cca_adjust_rate(flow->path, proto->cace[flow->path->pr_sl]. ccti_increase); /* Clear congestion event */ rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN; } } else { /* Increase congestion window if flow is not congested */ if_pf(flow->cwin < proto->flow_credits) { flow->credits += min(flow->cwin << 1, proto->flow_credits) - flow->cwin; flow->cwin = min(flow->cwin << 1, proto->flow_credits); flow->ack_interval = max((flow->credits >> 2) - 1, 1); } } /* Reclaimed some credits - attempt to flush flow */ if (!SLIST_EMPTY(scb_pend)) flow->flush(flow, NULL); /* * If the next packet has not even been put on the wire, cancel the * retransmission timer since we're still presumably waiting on free * pio bufs */ if (STAILQ_FIRST(unackedq)->abs_timeout == TIMEOUT_INFINITE) psmi_timer_cancel(proto->timerq, flow->timer_ack); ret: return IPS_RECVHDRQ_CONTINUE; } /* process an incoming nack message. Separate function to allow */ /* for better optimization by compiler */ int ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) { struct ips_proto *proto = rcv_ev->proto; ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; struct ips_message_header *p_hdr = rcv_ev->p_hdr; struct ips_flow *flow = NULL; struct ips_scb_unackedq *unackedq; struct ips_scb_pendlist *scb_pend; psmi_seqnum_t ack_seq_num, last_seq_num; psm_protocol_type_t protocol; ips_epaddr_flow_t flowid; ips_scb_t *scb; uint32_t tidctrl; INC_TIME_SPEND(TIME_SPEND_USER3); ack_seq_num.psn_num = p_hdr->ack_seq_num; tidctrl = GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0)); if (!tidctrl && ((flowid = ips_proto_flowid(p_hdr)) < EP_FLOW_TIDFLOW)) { protocol = PSM_PROTOCOL_GO_BACK_N; psmi_assert(flowid < EP_FLOW_LAST); flow = &ipsaddr->flows[flowid]; if (!pio_dma_ack_valid(proto, flow, ack_seq_num)) goto ret; ack_seq_num.psn_num = (ack_seq_num.psn_num - 1) & proto->psn_mask; flow->xmit_ack_num.psn_num = p_hdr->ack_seq_num; } else { protocol = PSM_PROTOCOL_TIDFLOW; flow = get_tidflow(proto, ipsaddr, p_hdr, ack_seq_num); if (!flow) goto ret; /* Invalid ack for flow */ ack_seq_num.psn_seq--; psmi_assert(flow->xmit_seq_num.psn_gen == ack_seq_num.psn_gen); psmi_assert(flow->xmit_ack_num.psn_gen == ack_seq_num.psn_gen); /* Update xmit_ack_num with both new generation and new * acked sequence; update xmit_seq_num with the new flow * generation, don't change the sequence number. */ flow->xmit_ack_num = (psmi_seqnum_t) p_hdr->data[1].u32w0; flow->xmit_seq_num.psn_gen = flow->xmit_ack_num.psn_gen; psmi_assert(flow->xmit_seq_num.psn_gen != ack_seq_num.psn_gen); } unackedq = &flow->scb_unacked; scb_pend = &flow->scb_pend; if (STAILQ_EMPTY(unackedq)) goto ret; last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num; proto->epaddr_stats.nak_recv++; _HFI_VDBG("got a nack %d on flow %d, " "first is %d, last is %d\n", ack_seq_num.psn_num, flow->flowid, STAILQ_EMPTY(unackedq) ? -1 : STAILQ_FIRST(unackedq)->seq_num. 
psn_num, STAILQ_EMPTY(unackedq) ? -1 : STAILQ_LAST(unackedq, ips_scb, nextq)-> seq_num.psn_num); /* For tidflow, psn_gen matches. So for all flows, tid/pio/dma, * we can use general psn_num to compare the PSN. */ while (between((scb = STAILQ_FIRST(unackedq))->seq_num.psn_num, last_seq_num.psn_num, ack_seq_num.psn_num) ) { /* take it out of the xmit queue and .. */ if (scb == SLIST_FIRST(scb_pend)) { #ifdef PSM_DEBUG flow->scb_num_pending--; #endif SLIST_REMOVE_HEAD(scb_pend, next); } STAILQ_REMOVE_HEAD(unackedq, nextq); #ifdef PSM_DEBUG flow->scb_num_unacked--; psmi_assert(flow->scb_num_unacked >= flow->scb_num_pending); #endif if (flow->transfer == PSM_TRANSFER_DMA && scb->dma_complete == 0) ips_proto_dma_wait_until(proto, scb); if (scb->callback) (*scb->callback) (scb->cb_param, scb->nfrag > 1 ? scb->chunk_size : scb->payload_size); if (!(scb->scb_flags & IPS_SEND_FLAG_PERSISTENT)) ips_scbctrl_free(scb); /* set all index pointer to NULL if all frames has been acked */ if (STAILQ_EMPTY(unackedq)) { psmi_timer_cancel(proto->timerq, flow->timer_ack); flow->timer_ack = NULL; psmi_timer_cancel(proto->timerq, flow->timer_send); flow->timer_send = NULL; SLIST_FIRST(scb_pend) = NULL; psmi_assert(flow->scb_num_pending == 0); /* Reset congestion window if all packets acknowledged */ flow->credits = flow->cwin = proto->flow_credits; flow->ack_interval = max((flow->credits >> 2) - 1, 1); flow->flags &= ~IPS_FLOW_FLAG_CONGESTED; goto ret; } else if (flow->timer_ack == scb->timer_ack) { /* * Exchange timers with last scb on unackedq. * timer in scb is used by flow, cancelling current * timer and then requesting a new timer takes more * time, instead, we exchange the timer between current * freeing scb and the last scb on unacked queue. */ psmi_timer *timer; ips_scb_t *last = STAILQ_LAST(unackedq, ips_scb, nextq); timer = scb->timer_ack; scb->timer_ack = last->timer_ack; last->timer_ack = timer; timer = scb->timer_send; scb->timer_send = last->timer_send; last->timer_send = timer; scb->timer_ack->context = scb; scb->timer_send->context = scb; last->timer_ack->context = last; last->timer_send->context = last; } } psmi_assert(!STAILQ_EMPTY(unackedq)); /* sanity for above loop */ if (protocol == PSM_PROTOCOL_TIDFLOW) ips_tidflow_nak_post_process(proto, flow); else if (scb->nfrag > 1) ips_dmaflow_nak_post_process(proto, flow); /* Always cancel ACK timer as we are going to restart the flow */ psmi_timer_cancel(proto->timerq, flow->timer_ack); /* What's now pending is all that was unacked */ SLIST_FIRST(scb_pend) = scb; #ifdef PSM_DEBUG flow->scb_num_pending = flow->scb_num_unacked; #endif while (scb && !(scb->scb_flags & IPS_SEND_FLAG_PENDING)) { /* Wait for the previous dma completion */ if (flow->transfer == PSM_TRANSFER_DMA && scb->dma_complete == 0) ips_proto_dma_wait_until(proto, scb); scb->scb_flags |= IPS_SEND_FLAG_PENDING; scb = SLIST_NEXT(scb, next); } /* If NAK with congestion bit set - delay re-transmitting and THEN adjust * CCA rate. */ if_pf(rcv_ev->is_congested & IPS_RECV_EVENT_BECN) { uint64_t offset; /* Clear congestion event and mark flow as congested */ rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN; flow->flags |= IPS_FLOW_FLAG_CONGESTED; /* For congested flow use slow start i.e. reduce congestion window. * For TIDFLOW we cannot reduce congestion window as peer expects * header packets at regular intervals (protoexp->hdr_pkt_interval). 
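	 * The randomized delay below is derived from the pending scb's
	 * ack timeout when it is finite: offset = ack_timeout / 2, and
	 * the send timer is re-armed offset * (rnum + 1.0) cycles in the
	 * future with rnum uniform in [0, 1), i.e. somewhere in
	 * [offset, 2 * offset).  This spreads retransmissions from
	 * multiple congested senders apart.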
*/ if (flow->protocol != PSM_PROTOCOL_TIDFLOW) flow->credits = flow->cwin = 1; else flow->credits = flow->cwin; flow->ack_interval = max((flow->credits >> 2) - 1, 1); /* During congestion cancel send timer and delay retransmission by * random interval */ psmi_timer_cancel(proto->timerq, flow->timer_send); if (SLIST_FIRST(scb_pend)->ack_timeout != TIMEOUT_INFINITE) offset = (SLIST_FIRST(scb_pend)->ack_timeout >> 1); else offset = 0; struct drand48_data drand48_data; srand48_r((long int)(ipsaddr->epaddr.epid + proto->ep->epid), &drand48_data); double rnum; drand48_r(&drand48_data, &rnum); psmi_timer_request(proto->timerq, flow->timer_send, (get_cycles() + (uint64_t) (offset * (rnum + 1.0)))); } else { int num_resent = 0; /* Reclaim all credits upto congestion window only */ flow->credits = flow->cwin; flow->ack_interval = max((flow->credits >> 2) - 1, 1); /* Flush pending scb's */ flow->flush(flow, &num_resent); proto->epaddr_stats.send_rexmit += num_resent; } ret: return IPS_RECVHDRQ_CONTINUE; } int ips_proto_process_err_chk(struct ips_recvhdrq_event *rcv_ev) { struct ips_recvhdrq *recvq = (struct ips_recvhdrq *)rcv_ev->recvq; struct ips_message_header *p_hdr = rcv_ev->p_hdr; ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); struct ips_flow *flow; psmi_seqnum_t seq_num; int16_t seq_off; INC_TIME_SPEND(TIME_SPEND_USER4); PSM2_LOG_MSG("entering"); psmi_assert(flowid < EP_FLOW_LAST); flow = &ipsaddr->flows[flowid]; recvq->proto->epaddr_stats.err_chk_recv++; /* Ignore FECN bit since this is the control path */ rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; seq_num.psn_val = __be32_to_cpu(p_hdr->bth[2]); seq_off = (int16_t) (flow->recv_seq_num.psn_num - seq_num.psn_num); if_pf(seq_off <= 0) { _HFI_VDBG("naking for seq=%d, off=%d on flowid %d\n", seq_num.psn_num, seq_off, flowid); if (seq_off < -flow->ack_interval) flow->flags |= IPS_FLOW_FLAG_GEN_BECN; ips_proto_send_nak(recvq, flow); flow->flags |= IPS_FLOW_FLAG_NAK_SEND; } else { ips_scb_t ctrlscb; ctrlscb.scb_flags = 0; ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num; ips_proto_send_ctrl_message(flow, OPCODE_ACK, &ipsaddr->ctrl_msg_queued, &ctrlscb, ctrlscb.cksum, 0); } PSM2_LOG_MSG("leaving"); return IPS_RECVHDRQ_CONTINUE; } int ips_proto_process_err_chk_gen(struct ips_recvhdrq_event *rcv_ev) { struct ips_recvhdrq *recvq = (struct ips_recvhdrq *)rcv_ev->recvq; struct ips_message_header *p_hdr = rcv_ev->p_hdr; struct ips_protoexp *protoexp = recvq->proto->protoexp; struct ips_tid_recv_desc *tidrecvc; psmi_seqnum_t err_seqnum, recvseq; ptl_arg_t desc_id = p_hdr->data[0]; ptl_arg_t send_desc_id = p_hdr->data[1]; int16_t seq_off; uint8_t ack_type; ips_scb_t ctrlscb; INC_TIME_SPEND(TIME_SPEND_USER4); PSM2_LOG_MSG("entering"); recvq->proto->epaddr_stats.err_chk_recv++; /* Ignore FECN bit since this is the control path */ rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; /* Get the flowgenseq for err chk gen */ err_seqnum.psn_val = __be32_to_cpu(p_hdr->bth[2]); /* Get receive descriptor */ psmi_assert(desc_id._desc_idx < HFI_TF_NFLOWS); tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; if (tidrecvc->rdescid._desc_genc != desc_id._desc_genc) { /* Receive descriptor mismatch in time and space. 
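	 * (The descriptor's generation count, rdescid._desc_genc,
	 * advances every time the receive descriptor slot is reused, so
	 * a mismatch means this err-chk-gen was aimed at an earlier
	 * incarnation of the slot.)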
* Stale err chk gen, drop packet */ _HFI_DBG ("ERR_CHK_GEN: gen mismatch Pkt: 0x%x, Current: 0x%x\n", desc_id._desc_genc, tidrecvc->rdescid._desc_genc); PSM2_LOG_MSG("leaving"); return IPS_RECVHDRQ_CONTINUE; } psmi_assert(tidrecvc->state == TIDRECVC_STATE_BUSY); /* * We change tidrecvc->tidflow_genseq here only when a new generation * is allocated and programmed into hardware. Otherwise we use local * variable recvseq to create the reply. */ recvseq = tidrecvc->tidflow_genseq; /* Get the latest seq from hardware tidflow table. But * only do this when context sharing is not used, because * context sharing might drop packet even though hardware * has received it successfully. */ if (!tidrecvc->context->tf_ctrl) { uint64_t tf; uint32_t seqno=0; psmi_hal_tidflow_get(tidrecvc->rdescid._desc_idx, &tf, tidrecvc->context->psm_hw_ctxt); psmi_hal_tidflow_get_seqnum(tf, &seqno); recvseq.psn_seq = seqno; } if (err_seqnum.psn_gen != recvseq.psn_gen) { ack_type = OPCODE_NAK; /* NAK without allocating a new generation */ /* My current generation and last received seq */ ctrlscb.ips_lrh.data[1].u32w0 = recvseq.psn_val; } else { /* Either lost packets or lost ack, we need to deal * with wrap around of the seq value from 2047 to 0 * because seq is only 11 bits */ seq_off = (int16_t)(err_seqnum.psn_seq - recvseq.psn_seq); if (seq_off < 0) seq_off += 2048; /* seq is 11 bits */ if (seq_off < 1024) { ack_type = OPCODE_NAK; /* NAK with allocating a new generation */ /* set latest seq */ tidrecvc->tidflow_genseq.psn_seq = recvseq.psn_seq; /* allocate and set a new generation */ ips_protoexp_flow_newgen(tidrecvc); /* get the new generation */ recvseq.psn_gen = tidrecvc->tidflow_genseq.psn_gen; /* My new generation and last received seq */ ctrlscb.ips_lrh.data[1].u32w0 = recvseq.psn_val; } else /* ACK with last received seq, * no need to set ips_lrh.data[1].u32w0 */ ack_type = OPCODE_ACK; } ctrlscb.scb_flags = 0; ctrlscb.ips_lrh.data[0].u64 = send_desc_id.u64; /* Keep peer generation but use my last received sequence */ err_seqnum.psn_seq = recvseq.psn_seq; ctrlscb.ips_lrh.ack_seq_num = err_seqnum.psn_val; /* May want to generate a BECN if a lot of swapped generations */ if_pf((tidrecvc->tidflow_nswap_gen > 4) && (protoexp->proto->flags & IPS_PROTO_FLAG_CCA)) { _HFI_CCADBG ("ERR_CHK_GEN: Generating BECN. 
Number of swapped generations: %d.\n", tidrecvc->tidflow_nswap_gen); /* Mark flow to generate BECN in control packet */ tidrecvc->tidflow.flags |= IPS_FLOW_FLAG_GEN_BECN; /* Update stats for congestion encountered */ recvq->proto->epaddr_stats.congestion_pkts++; } ips_proto_send_ctrl_message(&tidrecvc->tidflow, ack_type, &tidrecvc->ctrl_msg_queued, &ctrlscb, ctrlscb.cksum, 0); /* Update stats for expected window */ tidrecvc->stats.nErrChkReceived++; if (ack_type == OPCODE_NAK) tidrecvc->stats.nReXmit++; /* Update stats for retransmit (Sent a NAK) */ PSM2_LOG_MSG("leaving"); return IPS_RECVHDRQ_CONTINUE; } int ips_proto_process_becn(struct ips_recvhdrq_event *rcv_ev) { struct ips_proto *proto = rcv_ev->proto; struct ips_message_header *p_hdr = rcv_ev->p_hdr; ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; int flowid = ips_proto_flowid(p_hdr); struct ips_flow *flow; psmi_assert(flowid < EP_FLOW_LAST); flow = &ipsaddr->flows[flowid]; if ((flow->path->pr_ccti + proto->cace[flow->path->pr_sl].ccti_increase) <= proto->ccti_limit) { ips_cca_adjust_rate(flow->path, proto->cace[flow->path->pr_sl].ccti_increase); /* Clear congestion event */ rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN; } return IPS_RECVHDRQ_CONTINUE; } static void ips_bad_opcode(uint8_t op_code, struct ips_message_header *proto) { _HFI_DBG("Discarding message with bad opcode 0x%x\n", op_code); if (hfi_debug & __HFI_DBG) { ips_proto_show_header(proto, "received bad opcode"); ips_proto_dump_frame(proto, sizeof(struct ips_message_header), "Opcode error protocol header dump"); } } int ips_proto_process_unknown_opcode(struct ips_recvhdrq_event *rcv_ev) { struct ips_message_header *protocol_header = rcv_ev->p_hdr; struct ips_proto *proto = rcv_ev->proto; proto->stats.unknown_packets++; ips_bad_opcode(_get_proto_hfi_opcode(protocol_header), protocol_header); return IPS_RECVHDRQ_CONTINUE; } int ips_proto_connect_disconnect(struct ips_recvhdrq_event *rcv_ev) { psm2_error_t err = PSM2_OK; char *payload = ips_recvhdrq_event_payload(rcv_ev); uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev); psmi_assert(payload); err = ips_proto_process_connect(rcv_ev->proto, _get_proto_hfi_opcode(rcv_ev->p_hdr), rcv_ev->p_hdr, payload, paylen); if (err != PSM2_OK) psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Process connect/disconnect error: %d, opcode %d\n", err, _get_proto_hfi_opcode(rcv_ev->p_hdr)); return IPS_RECVHDRQ_CONTINUE; } /* Return 1 if packet is ok. */ /* Return 0 if packet should be skipped */ int ips_proto_process_unknown(const struct ips_recvhdrq_event *rcv_ev) { struct ips_message_header *p_hdr = rcv_ev->p_hdr; struct ips_proto *proto = rcv_ev->proto; psm2_ep_t ep_err; char *pkt_type; int opcode = (int)_get_proto_hfi_opcode(p_hdr); /* * If the protocol is disabled or not yet enabled, no processing happens * We set it t_init to 0 when disabling the protocol */ if (proto->t_init == 0) return IPS_RECVHDRQ_CONTINUE; /* Connect messages don't have to be from a known epaddr */ switch (opcode) { case OPCODE_CONNECT_REQUEST: case OPCODE_CONNECT_REPLY: case OPCODE_DISCONNECT_REQUEST: case OPCODE_DISCONNECT_REPLY: ips_proto_connect_disconnect( (struct ips_recvhdrq_event *)rcv_ev); return IPS_RECVHDRQ_CONTINUE; default: break; } /* Packet from "unknown" peer. Log the packet and payload if at appropriate * verbose level. 
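	 * (The dump below adds the two BTH pad-count bits from bth[0]
	 * back onto the payload length so the trace also covers the pad
	 * bytes.)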
*/ { char *payload = ips_recvhdrq_event_payload(rcv_ev); uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) + ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3); ips_proto_dump_err_stats(proto); if (hfi_debug & __HFI_PKTDBG) { ips_proto_dump_frame(rcv_ev->p_hdr, HFI_MESSAGE_HDR_SIZE, "header"); if (paylen) ips_proto_dump_frame(payload, paylen, "data"); } } /* Other messages are definitely crosstalk. */ /* out-of-context expected messages are always fatal */ if (psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == PSM_HAL_RHF_RX_TYPE_EXPECTED) { ep_err = PSMI_EP_NORETURN; pkt_type = "expected"; } else if (psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == PSM_HAL_RHF_RX_TYPE_EAGER) { ep_err = PSMI_EP_LOGEVENT; pkt_type = "eager"; } else { ep_err = PSMI_EP_NORETURN; pkt_type = "unknown"; } proto->stats.stray_packets++; /* If we have debug mode, print the complete packet every time */ if (hfi_debug & __HFI_PKTDBG) ips_proto_show_header(p_hdr, "invalid connidx"); /* At this point we are out of luck. */ psmi_handle_error(ep_err, PSM2_EPID_NETWORK_ERROR, "Received %s message(s) ptype=0x%x opcode=0x%x" " from an unknown process", pkt_type, psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf), opcode); return 0; /* Always skip this packet unless the above call was a noreturn * call */ } /* get the error string as a number and a string */ static void rhf_errnum_string(char *msg, size_t msglen, long err) { int len; char *errmsg; len = snprintf(msg, msglen, "RHFerror %lx: ", err); if (len > 0 && len < msglen) { errmsg = msg + len; msglen -= len; } else errmsg = msg; *errmsg = 0; ips_proto_get_rhf_errstring(err, errmsg, msglen); } /* * Error handling */ int ips_proto_process_packet_error(struct ips_recvhdrq_event *rcv_ev) { struct ips_proto *proto = rcv_ev->proto; int pkt_verbose_err = hfi_debug & __HFI_PKTDBG; int tiderr = psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf) & PSMI_HAL_RHF_ERR_TID; int tf_seqerr = psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf) & PSMI_HAL_RHF_ERR_TFSEQ; int tf_generr = psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf) & PSMI_HAL_RHF_ERR_TFGEN; int data_err = psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf) & (PSMI_HAL_RHF_ERR_ICRC | PSMI_HAL_RHF_ERR_ECC | PSMI_HAL_RHF_ERR_LEN | PSMI_HAL_RHF_ERR_DC | PSMI_HAL_RHF_ERR_DCUN | PSMI_HAL_RHF_ERR_KHDRLEN); char pktmsg[128]; *pktmsg = 0; /* * Tid errors on eager pkts mean we get a headerq overflow, perfectly * safe. Tid errors on expected or other packets means trouble. */ if (tiderr && psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == PSM_HAL_RHF_RX_TYPE_EAGER) { struct ips_message_header *p_hdr = rcv_ev->p_hdr; /* Payload dropped - Determine flow for this header and see if * we need to generate a NAK. * * ALL PACKET DROPS IN THIS CATEGORY CAN BE FLAGGED AS DROPPED DUE TO * CONGESTION AS THE EAGER BUFFER IS FULL. * * Possible eager packet type: * * Ctrl Message - ignore * MQ message - Can get flow and see if we need to NAK. * AM message - Can get flow and see if we need to NAK. 
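	 * In the NAK-capable cases the header still carries valid ack
	 * information, which is why ips_proto_process_ack() is invoked
	 * below even though the payload itself was dropped.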
*/ proto->stats.hdr_overflow++; if (data_err) return 0; switch (_get_proto_hfi_opcode(p_hdr)) { case OPCODE_TINY: case OPCODE_SHORT: case OPCODE_EAGER: case OPCODE_LONG_RTS: case OPCODE_LONG_CTS: case OPCODE_LONG_DATA: case OPCODE_AM_REQUEST: case OPCODE_AM_REQUEST_NOREPLY: case OPCODE_AM_REPLY: { ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); struct ips_epstate_entry *epstaddr; struct ips_flow *flow; psmi_seqnum_t sequence_num; int16_t diff; /* Obtain ipsaddr for packet */ epstaddr = ips_epstate_lookup(rcv_ev->recvq->epstate, rcv_ev->p_hdr->connidx); if_pf(epstaddr == NULL || epstaddr->ipsaddr == NULL) return 0; /* Unknown packet - drop */ rcv_ev->ipsaddr = epstaddr->ipsaddr; psmi_assert(flowid < EP_FLOW_LAST); flow = &rcv_ev->ipsaddr->flows[flowid]; sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]); diff = (int16_t) (sequence_num.psn_num - flow->recv_seq_num.psn_num); if (diff >= 0 && !(flow-> flags & IPS_FLOW_FLAG_NAK_SEND)) { /* Mark flow as congested and attempt to generate NAK */ flow->flags |= IPS_FLOW_FLAG_GEN_BECN; proto->epaddr_stats.congestion_pkts++; flow->flags |= IPS_FLOW_FLAG_NAK_SEND; flow->cca_ooo_pkts = 0; ips_proto_send_nak((struct ips_recvhdrq *)rcv_ev->recvq, flow); } /* Safe to process ACKs from header */ ips_proto_process_ack(rcv_ev); } break; case OPCODE_EXPTID: /* If RSM is matching packets that are TID&FECN&SH, * it is possible to have a EXPTID packet encounter * the eager full condition and have the payload * dropped (but the header delivered). * Treat this condition as a data error (corruption,etc) * and send a NAK. */ if (psmi_hal_has_cap(PSM_HAL_CAP_RSM_FECN_SUPP)) ips_protoexp_handle_data_err(rcv_ev); break; default: break; } } else if (tf_generr) /* handle generr, ignore tiderr if any */ ips_protoexp_handle_tf_generr(rcv_ev); else if (tf_seqerr) ips_protoexp_handle_tf_seqerr(rcv_ev); else if (tiderr) { /* tid error, but not on an eager pkt */ psm2_ep_t ep_err = PSMI_EP_LOGEVENT; uint16_t tid, offset; uint64_t t_now = get_cycles(); proto->tiderr_cnt++; /* Whether and how we will be logging this event */ if (proto->tiderr_max > 0 && proto->tiderr_cnt >= proto->tiderr_max) ep_err = PSMI_EP_NORETURN; else if (proto->tiderr_warn_interval != UINT64_MAX && proto->tiderr_tnext <= t_now) proto->tiderr_tnext = get_cycles() + proto->tiderr_warn_interval; else ep_err = NULL; if (ep_err != NULL) { rhf_errnum_string(pktmsg, sizeof(pktmsg), psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf)); tid = (__le32_to_cpu(rcv_ev->p_hdr->khdr.kdeth0) >> HFI_KHDR_TID_SHIFT) & HFI_KHDR_TID_MASK; offset = __le32_to_cpu(rcv_ev->p_hdr->khdr.kdeth0) & HFI_KHDR_OFFSET_MASK; psmi_handle_error(ep_err, PSM2_EP_DEVICE_FAILURE, "%s with tid=%d,offset=%d,count=%d: %s %s", "TID Error", tid, offset, proto->tiderr_cnt, pktmsg, ep_err == PSMI_EP_NORETURN ? "(Terminating...)" : ""); } ips_protoexp_handle_tiderr(rcv_ev); } else if (data_err) { #if _HFI_DEBUGGING if (_HFI_DBG_ON) { uint8_t op_code = _get_proto_hfi_opcode(rcv_ev->p_hdr); if (!pkt_verbose_err) { rhf_errnum_string(pktmsg, sizeof(pktmsg), psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf)); _HFI_DBG_ALWAYS ("Error %s pkt type opcode 0x%x at hd=0x%x %s\n", (psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == PSM_HAL_RHF_RX_TYPE_EAGER) ? "eager" : ( psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == PSM_HAL_RHF_RX_TYPE_EXPECTED) ? "expected" : (psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == PSM_HAL_RHF_RX_TYPE_NON_KD) ? 
"non-kd" : "", op_code, rcv_ev->recvq->state->hdrq_head, pktmsg); } } #endif if (psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf) == PSM_HAL_RHF_RX_TYPE_EXPECTED) ips_protoexp_handle_data_err(rcv_ev); } else { /* not a tid or data error -- some other error */ #if _HFI_DEBUGGING if (_HFI_DBG_ON) { uint8_t op_code = __be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 24 & 0xFF; if (!pkt_verbose_err) rhf_errnum_string(pktmsg, sizeof(pktmsg), psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf)); /* else RHFerr decode printed below */ _HFI_DBG_ALWAYS ("Error pkt type 0x%x opcode 0x%x at hd=0x%x %s\n", psmi_hal_rhf_get_rx_type(rcv_ev->psm_hal_rhf), op_code, rcv_ev->recvq->state->hdrq_head, pktmsg); } #endif } if (pkt_verbose_err) { if (!*pktmsg) rhf_errnum_string(pktmsg, sizeof(pktmsg), psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf)); ips_proto_show_header(rcv_ev->p_hdr, pktmsg); } return 0; } opa-psm2-PSM2_11.2.185/ptl_ips/ips_recvhdrq.c000066400000000000000000000624011370564314600205330ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ #include "psm_user.h" #include "psm2_hal.h" #include "ips_epstate.h" #include "ips_proto.h" #include "ips_expected_proto.h" #include "ips_proto_help.h" #include "ips_proto_internal.h" /* * Receive header queue initialization. 
*/ psm2_error_t ips_recvhdrq_init(const psmi_context_t *context, const struct ips_epstate *epstate, const struct ips_proto *proto, const struct ips_recvhdrq_callbacks *callbacks, uint32_t subcontext, struct ips_recvhdrq *recvq, struct ips_recvhdrq_state *recvq_state, psmi_hal_cl_q psm_hal_cl_hdrq) { psm2_error_t err = PSM2_OK; memset(recvq, 0, sizeof(*recvq)); recvq->proto = (struct ips_proto *)proto; recvq->state = recvq_state; recvq->context = context; recvq->subcontext = subcontext; recvq->psm_hal_cl_hdrq = psm_hal_cl_hdrq; pthread_spin_init(&recvq->hdrq_lock, PTHREAD_PROCESS_SHARED); recvq->hdrq_elemlast = ((psmi_hal_get_rx_hdr_q_cnt(context->psm_hw_ctxt) - 1) * (psmi_hal_get_rx_hdr_q_ent_size(context->psm_hw_ctxt) >> BYTE2DWORD_SHIFT)); recvq->epstate = epstate; recvq->recvq_callbacks = *callbacks; /* deep copy */ SLIST_INIT(&recvq->pending_acks); recvq->state->hdrq_head = 0; recvq->state->rcv_egr_index_head = NO_EAGER_UPDATE; recvq->state->num_hdrq_done = 0; recvq->state->num_egrq_done = 0; recvq->state->hdr_countdown = 0; recvq->state->hdrq_cachedlastscan = 0; { union psmi_envvar_val env_hdr_update; psmi_getenv("PSM2_HEAD_UPDATE", "header queue update interval (0 to update after all entries are processed). Default is 64", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val) 64, &env_hdr_update); /* Cap max header update interval to size of header/eager queue */ recvq->state->head_update_interval = min(env_hdr_update.e_uint, psmi_hal_get_rx_hdr_q_cnt(context->psm_hw_ctxt) - 1); recvq->state->egrq_update_interval = 1; } return err; } /* flush the eager buffers, by setting the eager index head to eager index tail if eager buffer queue is full. Called when we had eager buffer overflows (ERR_TID/HFI_RHF_H_TIDERR was set in RHF errors), and no good eager packets were received, so that eager head wasn't advanced. */ #if 0 static void ips_flush_egrq_if_required(struct ips_recvhdrq *recvq) { const uint32_t tail = ips_recvq_tail_get(&recvq->egrq); const uint32_t head = ips_recvq_head_get(&recvq->egrq); uint32_t egr_cnt = recvq->egrq.elemcnt; if ((head % egr_cnt) == ((tail + 1) % egr_cnt)) { _HFI_DBG("eager array full after overflow, flushing " "(head %llx, tail %llx)\n", (long long)head, (long long)tail); recvq->proto->stats.egr_overflow++; } return; } #endif /* * Helpers for ips_recvhdrq_progress. 
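 * These cover subcontext extraction from the BTH, dumping of invalid
 * packets, and per-error-bit statistics updates; all are inlined since
 * they sit on the receive fast path.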
*/ static __inline__ int _get_proto_subcontext(const struct ips_message_header *p_hdr) { return ((__be32_to_cpu(p_hdr->bth[1]) >> HFI_BTH_SUBCTXT_SHIFT) & HFI_BTH_SUBCTXT_MASK); } static __inline__ void _dump_invalid_pkt(struct ips_recvhdrq_event *rcv_ev) { char *payload = ips_recvhdrq_event_payload(rcv_ev); uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) + ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3); #ifdef PSM_DEBUG ips_proto_show_header((struct ips_message_header *) rcv_ev->p_hdr, "received invalid pkt"); #endif if (hfi_debug & __HFI_PKTDBG) { ips_proto_dump_frame(rcv_ev->p_hdr, HFI_MESSAGE_HDR_SIZE, "header"); if (paylen) ips_proto_dump_frame(payload, paylen, "data"); } } static __inline__ void _update_error_stats(struct ips_proto *proto, uint32_t err) { if (err & PSMI_HAL_RHF_ERR_ICRC) proto->error_stats.num_icrc_err++; if (err & PSMI_HAL_RHF_ERR_ECC) proto->error_stats.num_ecc_err++; if (err & PSMI_HAL_RHF_ERR_LEN) proto->error_stats.num_len_err++; if (err & PSMI_HAL_RHF_ERR_TID) proto->error_stats.num_tid_err++; if (err & PSMI_HAL_RHF_ERR_DC) proto->error_stats.num_dc_err++; if (err & PSMI_HAL_RHF_ERR_DCUN) proto->error_stats.num_dcunc_err++; if (err & PSMI_HAL_RHF_ERR_KHDRLEN) proto->error_stats.num_khdrlen_err++; } #ifdef PSM_DEBUG static int _check_headers(struct ips_recvhdrq_event *rcv_ev, psmi_hal_cl_q cl_q) { struct ips_recvhdrq *recvq = (struct ips_recvhdrq *)rcv_ev->recvq; struct ips_proto *proto = rcv_ev->proto; uint32_t *lrh = (uint32_t *) rcv_ev->p_hdr; uint32_t dest_context; const uint16_t pkt_dlid = __be16_to_cpu(rcv_ev->p_hdr->lrh[1]); const uint16_t base_dlid = __be16_to_cpu(recvq->proto->epinfo.ep_base_lid); /* Check that the receive header queue entry has a sane sequence number */ if (psmi_hal_check_rhf_sequence_number(psmi_hal_rhf_get_seq(rcv_ev->psm_hal_rhf)) != PSM_HAL_ERROR_OK) { unsigned int seqno=0; psmi_hal_get_rhf_expected_sequence_number(&seqno, cl_q, recvq->context->psm_hw_ctxt); psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "ErrPkt: Invalid header queue entry! RHF Sequence in Hdrq Seq: %d, Recvq State Seq: %d. LRH[0]: 0x%08x, LRH[1] (PktCount): 0x%08x\n", psmi_hal_rhf_get_seq(rcv_ev->psm_hal_rhf), seqno, lrh[0], lrh[1]); return -1; } /* Verify that the packet was destined for our context */ dest_context = ips_proto_dest_context_from_header(proto, rcv_ev->p_hdr); if_pf(dest_context != recvq->proto->epinfo.ep_context) { struct ips_recvhdrq_state *state = recvq->state; /* Packet not targeted at us. Drop packet and continue */ ips_proto_dump_err_stats(proto); _dump_invalid_pkt(rcv_ev); psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "ErrPkt: Received packet for context %d on context %d. Receive Header Queue offset: 0x%x. Exiting.\n", dest_context, recvq->proto->epinfo.ep_context, state->hdrq_head); return -1; } /* Verify that rhf packet length matches the length in LRH */ if_pf(psmi_hal_rhf_get_packet_length(rcv_ev->psm_hal_rhf) != (__be16_to_cpu(rcv_ev->p_hdr->lrh[2]) << BYTE2DWORD_SHIFT)) { _HFI_EPDBG ("ErrPkt: RHF Packet Len (0x%x) does not match LRH (0x%x).\n", psmi_hal_rhf_get_packet_length(rcv_ev->psm_hal_rhf) >> 2, __be16_to_cpu(rcv_ev->p_hdr->lrh[2])); ips_proto_dump_err_stats(proto); _dump_invalid_pkt(rcv_ev); return -1; } /* Verify that the DLID matches our local LID. 
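	 * With LMC, the port answers to a contiguous range of LIDs
	 * starting at the base LID, so any DLID within
	 * [base_dlid, base_dlid + (1 << ep_lmc)] is accepted as ours.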
*/ if_pf(!((base_dlid <= pkt_dlid) && (pkt_dlid <= (base_dlid + (1 << recvq->proto->epinfo.ep_lmc))))) { _HFI_EPDBG ("ErrPkt: DLID in LRH (0x%04x) does not match local LID (0x%04x) Skipping packet!\n", rcv_ev->p_hdr->lrh[1], recvq->proto->epinfo.ep_base_lid); ips_proto_dump_err_stats(proto); _dump_invalid_pkt(rcv_ev); return -1; } return 0; } #endif static __inline__ int do_pkt_cksum(struct ips_recvhdrq_event *rcv_ev) { char *payload = ips_recvhdrq_event_payload(rcv_ev); uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) + ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3); uint32_t *ckptr; uint32_t recv_cksum, cksum, dest_subcontext; /* With checksum every packet has a payload */ psmi_assert_always(payload); ckptr = (uint32_t *) (payload + paylen); recv_cksum = ckptr[0]; /* Calculate checksum hdr + payload (includes any padding words) */ cksum = 0xffffffff; cksum = ips_crc_calculate(HFI_MESSAGE_HDR_SIZE, (uint8_t *) rcv_ev->p_hdr, cksum); if (paylen) cksum = ips_crc_calculate(paylen, (uint8_t *) payload, cksum); if ((cksum != recv_cksum) || (ckptr[0] != ckptr[1])) { struct ips_epstate_entry *epstaddr; uint32_t lcontext; psmi_hal_cl_idx hd, tl; epstaddr = ips_epstate_lookup(rcv_ev->recvq->epstate, rcv_ev->p_hdr->connidx); epstaddr = (epstaddr && epstaddr->ipsaddr) ? epstaddr : NULL; lcontext = epstaddr ? rcv_ev->proto->epinfo.ep_context : -1; hd = psmi_hal_get_cl_q_head_index(PSM_HAL_CL_Q_RX_HDR_Q, rcv_ev->recvq->context->psm_hw_ctxt); tl = psmi_hal_get_cl_q_tail_index(PSM_HAL_CL_Q_RX_HDR_Q, rcv_ev->recvq->context->psm_hw_ctxt); dest_subcontext = _get_proto_subcontext(rcv_ev->p_hdr); _HFI_ERROR ("ErrPkt: SharedContext: %s. Local Context: %i, Checksum mismatch from LID %d! Received Checksum: 0x%08x, Expected: 0x%08x & 0x%08x. Opcode: 0x%08x, Error Flag: 0x%08x. hdrq hd 0x%x tl 0x%x rhf 0x%" PRIx64 ", rhfseq 0x%x\n", (dest_subcontext != rcv_ev->recvq->subcontext) ? "Yes" : "No", lcontext, epstaddr ? __be16_to_cpu(epstaddr->ipsaddr->pathgrp-> pg_base_dlid) : -1, cksum, ckptr[0], ckptr[1], _get_proto_hfi_opcode(rcv_ev->p_hdr), psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf), hd, tl, rcv_ev->psm_hal_rhf.raw_rhf, psmi_hal_rhf_get_seq(rcv_ev->psm_hal_rhf)); /* Dump packet */ _dump_invalid_pkt(rcv_ev); return 0; /* Packet checksum error */ } return 1; } PSMI_ALWAYS_INLINE( void process_pending_acks(struct ips_recvhdrq *recvq)) { ips_scb_t ctrlscb; struct ips_message_header *msg_hdr = NULL; /* If any pending acks, dispatch them now */ while (!SLIST_EMPTY(&recvq->pending_acks)) { struct ips_flow *flow = SLIST_FIRST(&recvq->pending_acks); SLIST_REMOVE_HEAD(&recvq->pending_acks, next); SLIST_NEXT(flow, next) = NULL; ctrlscb.scb_flags = 0; msg_hdr = &ctrlscb.ips_lrh; msg_hdr->ack_seq_num = flow->recv_seq_num.psn_num; if (flow->flags & IPS_FLOW_FLAG_PENDING_ACK) { psmi_assert_always((flow-> flags & IPS_FLOW_FLAG_PENDING_NAK) == 0); flow->flags &= ~IPS_FLOW_FLAG_PENDING_ACK; ips_proto_send_ctrl_message(flow, OPCODE_ACK, &flow->ipsaddr-> ctrl_msg_queued, &ctrlscb, ctrlscb.cksum, 0); } else { psmi_assert_always(flow-> flags & IPS_FLOW_FLAG_PENDING_NAK); flow->flags &= ~IPS_FLOW_FLAG_PENDING_NAK; ips_proto_send_ctrl_message(flow, OPCODE_NAK, &flow->ipsaddr-> ctrl_msg_queued, &ctrlscb, ctrlscb.cksum, 0); } } } /* * Core receive progress function * * recvhdrq_progress is the core function that services the receive header * queue and optionally, the eager queue. At the lowest level, it identifies * packets marked with errors by the chip and also detects and corrects when * eager overflow conditions occur. 
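 * In between, it verifies the optional end-to-end software checksum
 * appended to each packet when checksum mode is enabled (see
 * do_pkt_cksum() above).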
At the highest level, it queries the * 'epstate' interface to classify packets from "known" and "unknown" * endpoints. In order to support shared contexts, it can also handle packets * destined for other contexts (or "subcontexts"). */ psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq) { /* When PSM_PERF is enabled, the following line causes the PMU to start a stop watch to measure instruction cycles of the RX speedpath of PSM. The stop watch is stopped below. */ GENERIC_PERF_BEGIN(PSM_RX_SPEEDPATH_CTR); struct ips_recvhdrq_state *state = recvq->state; PSMI_CACHEALIGN struct ips_recvhdrq_event rcv_ev = {.proto = recvq->proto, .recvq = recvq }; struct ips_epstate_entry *epstaddr; uint32_t num_hdrq_done = 0; const uint32_t num_hdrq_todo = psmi_hal_get_rx_hdr_q_cnt(recvq->context->psm_hw_ctxt); uint32_t dest_subcontext; const uint32_t hdrq_elemsz = psmi_hal_get_rx_hdr_q_ent_size(recvq->context->psm_hw_ctxt) >> BYTE2DWORD_SHIFT; int ret = IPS_RECVHDRQ_CONTINUE; int done = 0, empty = 0; int do_hdr_update = 0; const psmi_hal_cl_q psm_hal_hdr_q = recvq->psm_hal_cl_hdrq; const psmi_hal_cl_q psm_hal_egr_q = psm_hal_hdr_q + 1; /* Returns whether the currently set 'rcv_hdr'/head is a readable entry */ #define next_hdrq_is_ready() (! empty ) if (psmi_hal_cl_q_empty(state->hdrq_head, psm_hal_hdr_q, recvq->context->psm_hw_ctxt)) return PSM2_OK; PSM2_LOG_MSG("entering"); done = !next_hdrq_is_ready(); rcv_ev.psm_hal_hdr_q = psm_hal_hdr_q; while (!done) { psmi_hal_get_receive_event(state->hdrq_head, recvq->context->psm_hw_ctxt, &rcv_ev); rcv_ev.has_cksum = ((recvq->proto->flags & IPS_PROTO_FLAG_CKSUM) && (rcv_ev.p_hdr->flags & IPS_SEND_FLAG_PKTCKSUM)); _HFI_VDBG ("new packet: rcv_hdr %p, rhf %" PRIx64 "\n", rcv_ev.p_hdr, rcv_ev.psm_hal_rhf.raw_rhf); #ifdef PSM_DEBUG if_pf(_check_headers(&rcv_ev, psm_hal_hdr_q)) goto skip_packet; #endif dest_subcontext = _get_proto_subcontext(rcv_ev.p_hdr); /* If the destination is not our subcontext, process * message as subcontext message (shared contexts) */ if (dest_subcontext != recvq->subcontext) { rcv_ev.ipsaddr = NULL; ret = recvq->recvq_callbacks.callback_subcontext (&rcv_ev, dest_subcontext); if (ret == IPS_RECVHDRQ_REVISIT) { PSM2_LOG_MSG("leaving"); /* When PSM_PERF is enabled, the following line causes the PMU to stop a stop watch to measure instruction cycles of the RX speedpath of PSM. The stop watch was started above. */ GENERIC_PERF_END(PSM_RX_SPEEDPATH_CTR); return PSM2_OK_NO_PROGRESS; } goto skip_packet; } if_pf(psmi_hal_rhf_get_all_err_flags(rcv_ev.psm_hal_rhf)) { _update_error_stats(recvq->proto, psmi_hal_rhf_get_all_err_flags(rcv_ev.psm_hal_rhf)); recvq->recvq_callbacks.callback_error(&rcv_ev); if ((psmi_hal_rhf_get_rx_type(rcv_ev.psm_hal_rhf) != PSM_HAL_RHF_RX_TYPE_EAGER) || (!(psmi_hal_rhf_get_all_err_flags(rcv_ev.psm_hal_rhf) & PSMI_HAL_RHF_ERR_TID))) goto skip_packet; /* no pending eager update, header * is not currently under tracing. 
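			 * The hdr_countdown logic below implements
			 * false-eager-full detection: when a TID error
			 * arrives while the eager queue appears full, we
			 * record how many header-queue entries lie between
			 * our head and the current tail.  If that many
			 * headers are then retired without any of them
			 * consuming an eager buffer, the eager queue really
			 * did overflow and its tail is flushed (see the
			 * hdr_countdown handling further down).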
*/ if (state->hdr_countdown == 0 && state->rcv_egr_index_head == NO_EAGER_UPDATE) { uint32_t egr_cnt = psmi_hal_get_rx_egr_tid_cnt(recvq->context->psm_hw_ctxt); psmi_hal_cl_idx etail=0, ehead=0; ehead = psmi_hal_get_cl_q_head_index( psm_hal_egr_q, rcv_ev.recvq->context->psm_hw_ctxt); etail = psmi_hal_get_cl_q_tail_index( psm_hal_egr_q, rcv_ev.recvq->context->psm_hw_ctxt); if (ehead == ((etail + 1) % egr_cnt)) { /* eager is full, * trace existing header entries */ uint32_t hdr_size = recvq->hdrq_elemlast + hdrq_elemsz; psmi_hal_cl_idx htail=0; htail = psmi_hal_get_cl_q_tail_index( psm_hal_hdr_q, rcv_ev.recvq->context->psm_hw_ctxt); const uint32_t hhead = state->hdrq_head; state->hdr_countdown = (htail > hhead) ? (htail - hhead) : (htail + hdr_size - hhead); } } /* Eager packet and tiderr. * Don't consider updating egr head, unless we're in * the congested state. If we're congested, we should * try to keep the eager buffers free. */ if (!rcv_ev.is_congested) goto skip_packet_no_egr_update; else goto skip_packet; } /* If checksum is enabled, verify that it is valid */ if_pf(rcv_ev.has_cksum && !do_pkt_cksum(&rcv_ev)) goto skip_packet; if (_HFI_VDBG_ON) { psmi_hal_cl_idx egr_buff_q_head, egr_buff_q_tail; egr_buff_q_head = psmi_hal_get_cl_q_head_index( psm_hal_egr_q, rcv_ev.recvq->context->psm_hw_ctxt); egr_buff_q_tail = psmi_hal_get_cl_q_tail_index( psm_hal_egr_q, rcv_ev.recvq->context->psm_hw_ctxt); _HFI_VDBG_ALWAYS( "hdrq_head %d, p_hdr: %p, opcode %x, payload %p paylen %d; " "egrhead %x egrtail %x; " "useegrbit %x egrindex %x, egroffset %x, egrindexhead %x\n", state->hdrq_head, rcv_ev.p_hdr, _get_proto_hfi_opcode(rcv_ev.p_hdr), ips_recvhdrq_event_payload(&rcv_ev), ips_recvhdrq_event_paylen(&rcv_ev), egr_buff_q_head,egr_buff_q_tail, psmi_hal_rhf_get_use_egr_buff(rcv_ev.psm_hal_rhf), psmi_hal_rhf_get_egr_buff_index(rcv_ev.psm_hal_rhf), psmi_hal_rhf_get_egr_buff_offset(rcv_ev.psm_hal_rhf), state->rcv_egr_index_head); } PSM2_LOG_PKT_STRM(PSM2_LOG_RX,rcv_ev.p_hdr,&rcv_ev.psm_hal_rhf.raw_rhf, "PKT_STRM:"); /* Classify packet from a known or unknown endpoint */ epstaddr = ips_epstate_lookup(recvq->epstate, rcv_ev.p_hdr->connidx); if_pf((epstaddr == NULL) || (epstaddr->ipsaddr == NULL)) { rcv_ev.ipsaddr = NULL; recvq->recvq_callbacks. callback_packet_unknown(&rcv_ev); } else { rcv_ev.ipsaddr = epstaddr->ipsaddr; ret = ips_proto_process_packet(&rcv_ev); if (ret == IPS_RECVHDRQ_REVISIT) { PSM2_LOG_MSG("leaving"); /* When PSM_PERF is enabled, the following line causes the PMU to stop a stop watch to measure instruction cycles of the RX speedpath of PSM. The stop watch was started above. */ GENERIC_PERF_END(PSM_RX_SPEEDPATH_CTR); return PSM2_OK_NO_PROGRESS; } } skip_packet: /* * if eager buffer is used, record the index. */ if (psmi_hal_rhf_get_use_egr_buff(rcv_ev.psm_hal_rhf)) { /* set only when a new entry is used */ if (psmi_hal_rhf_get_egr_buff_offset(rcv_ev.psm_hal_rhf) == 0) { state->rcv_egr_index_head = psmi_hal_rhf_get_egr_buff_index(rcv_ev.psm_hal_rhf); state->num_egrq_done++; } /* a header entry is using an eager entry, stop tracing. */ state->hdr_countdown = 0; } skip_packet_no_egr_update: /* Note that state->hdrq_head is sampled speculatively by the code * in ips_ptl_shared_poll() when context sharing, so it is not safe * for this shared variable to temporarily exceed the last element. 
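		 * (For that reason psmi_hal_retire_hdr_q_entry() below is
		 * expected to advance and wrap hdrq_head in a single step
		 * rather than letting it pass hdrq_elemlast.)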
*/ _HFI_VDBG ("head %d, elemsz %d elemlast %d\n", state->hdrq_head, hdrq_elemsz, recvq->hdrq_elemlast); psmi_hal_retire_hdr_q_entry(&state->hdrq_head, psm_hal_hdr_q, recvq->context->psm_hw_ctxt, hdrq_elemsz, recvq->hdrq_elemlast, &empty); state->num_hdrq_done++; num_hdrq_done++; done = (!next_hdrq_is_ready() || (ret == IPS_RECVHDRQ_BREAK) || (num_hdrq_done == num_hdrq_todo)); do_hdr_update = (state->head_update_interval ? (state->num_hdrq_done == state->head_update_interval) : done); if (do_hdr_update) { psmi_hal_set_cl_q_head_index( state->hdrq_head, psm_hal_hdr_q, rcv_ev.recvq->context->psm_hw_ctxt); /* Reset header queue entries processed */ state->num_hdrq_done = 0; } if (state->num_egrq_done >= state->egrq_update_interval) { /* Lazy update of egrq */ if (state->rcv_egr_index_head != NO_EAGER_UPDATE) { psmi_hal_set_cl_q_head_index( state->rcv_egr_index_head, psm_hal_egr_q, recvq->context->psm_hw_ctxt); state->rcv_egr_index_head = NO_EAGER_UPDATE; state->num_egrq_done = 0; } } if (state->hdr_countdown > 0) { /* a header entry is consumed. */ state->hdr_countdown -= hdrq_elemsz; if (state->hdr_countdown == 0) { /* header entry count reaches zero. */ psmi_hal_cl_idx tail=0; tail = psmi_hal_get_cl_q_tail_index( psm_hal_egr_q, recvq->context->psm_hw_ctxt); psmi_hal_cl_idx head=0; head = psmi_hal_get_cl_q_head_index( psm_hal_egr_q, recvq->context->psm_hw_ctxt); uint32_t egr_cnt = psmi_hal_get_rx_egr_tid_cnt(recvq->context->psm_hw_ctxt); /* Checks eager-full again. This is a real false-egr-full */ if (head == ((tail + 1) % egr_cnt)) { psmi_hal_set_cl_q_tail_index( tail, psm_hal_egr_q, recvq->context->psm_hw_ctxt); _HFI_DBG ("eager array full after overflow, flushing " "(head %llx, tail %llx)\n", (long long)head, (long long)tail); recvq->proto->stats.egr_overflow++; } else _HFI_ERROR ("PSM BUG: EgrOverflow: eager queue is not full\n"); } } } /* while (hdrq_entries_to_read) */ /* Process any pending acks before exiting */ process_pending_acks(recvq); PSM2_LOG_MSG("leaving"); /* When PSM_PERF is enabled, the following line causes the PMU to stop a stop watch to measure instruction cycles of the RX speedpath of PSM. The stop watch was started above. */ GENERIC_PERF_END(PSM_RX_SPEEDPATH_CTR); return num_hdrq_done ? PSM2_OK : PSM2_OK_NO_PROGRESS; } /* This function is designed to implement RAPID CCA. It iterates through the recvq, checking each element for set FECN or BECN bits. In the case of finding one, the proper response is executed, and the bits are cleared. 
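 * The scan is resumable: hdrq_cachedlastscan records how far a previous
 * call got, so no entry is scanned for congestion marks twice.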
*/ psm2_error_t ips_recvhdrq_scan_cca (struct ips_recvhdrq *recvq) { /* Looks at hdr and determines if it is the last item in the queue */ #define is_last_hdr(idx) \ psmi_hal_cl_q_empty(idx, psm_hal_hdr_q, recvq->context->psm_hw_ctxt) struct ips_recvhdrq_state *state = recvq->state; PSMI_CACHEALIGN struct ips_recvhdrq_event rcv_ev = {.proto = recvq->proto, .recvq = recvq }; uint32_t num_hdrq_done = state->hdrq_cachedlastscan / psmi_hal_get_rx_hdr_q_ent_size(recvq->context->psm_hw_ctxt) >> BYTE2DWORD_SHIFT; const int num_hdrq_todo = psmi_hal_get_rx_hdr_q_cnt(recvq->context->psm_hw_ctxt); const uint32_t hdrq_elemsz = psmi_hal_get_rx_hdr_q_ent_size(recvq->context->psm_hw_ctxt) >> BYTE2DWORD_SHIFT; int done; uint32_t scan_head = state->hdrq_head + state->hdrq_cachedlastscan; const psmi_hal_cl_q psm_hal_hdr_q = recvq->psm_hal_cl_hdrq; /* Skip the first element, since we're going to process it soon anyway */ if ( state->hdrq_cachedlastscan == 0 ) { scan_head += hdrq_elemsz; num_hdrq_done++; } PSM2_LOG_MSG("entering"); done = !is_last_hdr(scan_head); rcv_ev.psm_hal_hdr_q = psm_hal_hdr_q; while (!done) { psmi_hal_get_receive_event(scan_head, recvq->context->psm_hw_ctxt, &rcv_ev); rcv_ev.has_cksum = ((recvq->proto->flags & IPS_PROTO_FLAG_CKSUM) && (rcv_ev.p_hdr->flags & IPS_SEND_FLAG_PKTCKSUM)); _HFI_VDBG ("scanning new packet for CCA: rcv_hdr %p, rhf %" PRIx64 "\n", rcv_ev.p_hdr, rcv_ev.psm_hal_rhf.raw_rhf); if_pt ( _is_cca_fecn_set(rcv_ev.p_hdr) & IPS_RECV_EVENT_FECN ) { struct ips_epstate_entry *epstaddr = ips_epstate_lookup(recvq->epstate, rcv_ev.p_hdr->connidx); if (epstaddr != NULL && epstaddr->ipsaddr != NULL) { rcv_ev.ipsaddr = epstaddr->ipsaddr; /* Send BECN back */ ips_epaddr_t *ipsaddr = rcv_ev.ipsaddr; struct ips_message_header *p_hdr = rcv_ev.p_hdr; ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); struct ips_flow *flow; ips_scb_t ctrlscb; psmi_assert(flowid < EP_FLOW_LAST); flow = &ipsaddr->flows[flowid]; ctrlscb.scb_flags = 0; ctrlscb.ips_lrh.data[0].u32w0 = flow->cca_ooo_pkts; rcv_ev.proto->epaddr_stats.congestion_pkts++; /* Clear FECN event */ rcv_ev.is_congested &= ~IPS_RECV_EVENT_FECN; ips_proto_send_ctrl_message(flow, OPCODE_BECN, &flow->ipsaddr-> ctrl_msg_queued, &ctrlscb, ctrlscb.cksum, 0); } } else if_pt (0 != (_is_cca_becn_set(rcv_ev.p_hdr) << (IPS_RECV_EVENT_BECN - 1))) { struct ips_epstate_entry *epstaddr = ips_epstate_lookup(recvq->epstate, rcv_ev.p_hdr->connidx); if (epstaddr != NULL && epstaddr->ipsaddr != NULL) { rcv_ev.ipsaddr = epstaddr->ipsaddr; /* Adjust flow */ struct ips_proto *proto = rcv_ev.proto; struct ips_message_header *p_hdr = rcv_ev.p_hdr; ips_epaddr_t *ipsaddr = rcv_ev.ipsaddr; struct ips_flow *flow; ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); psmi_assert(flowid < EP_FLOW_LAST); flow = &ipsaddr->flows[flowid]; if ((flow->path->pr_ccti + proto->cace[flow->path->pr_sl].ccti_increase) <= proto->ccti_limit) { ips_cca_adjust_rate(flow->path, proto->cace[flow->path->pr_sl].ccti_increase); /* Clear congestion event */ rcv_ev.is_congested &= ~IPS_RECV_EVENT_BECN; } } } num_hdrq_done++; scan_head += hdrq_elemsz; state->hdrq_cachedlastscan += hdrq_elemsz; done = (num_hdrq_done == num_hdrq_todo && !is_last_hdr(scan_head) ); } /* while (hdrq_entries_to_read) */ PSM2_LOG_MSG("leaving"); return PSM2_OK; } opa-psm2-PSM2_11.2.185/ptl_ips/ips_recvhdrq.h000066400000000000000000000165271370564314600205500ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. 
GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. 
*/ #include "psm_user.h" #include "ips_proto_params.h" #include "ips_proto_header.h" #ifndef _IPS_RECVHDRQ_H #define _IPS_RECVHDRQ_H struct ips_recvhdrq; struct ips_recvhdrq_state; struct ips_epstate; /* process current packet, continue on next packet */ #define IPS_RECVHDRQ_CONTINUE 0 /* process current packet, break and return to caller */ #define IPS_RECVHDRQ_BREAK 1 /* keep current packet, revisit the same packet next time */ #define IPS_RECVHDRQ_REVISIT 2 /* CCA related receive events */ #define IPS_RECV_EVENT_FECN 0x1 #define IPS_RECV_EVENT_BECN 0x2 struct ips_recvhdrq_event { struct ips_proto *proto; const struct ips_recvhdrq *recvq; /* where message received */ psmi_hal_rhf_t psm_hal_rhf; struct ips_message_header *p_hdr; /* protocol header in rcv_hdr */ struct ips_epaddr *ipsaddr; /* peer ipsaddr, if available */ uint8_t has_cksum; /* payload has cksum */ uint8_t is_congested; /* Packet faced congestion */ psmi_hal_cl_q psm_hal_hdr_q; }; struct ips_recvhdrq_callbacks { int (*callback_packet_unknown) (const struct ips_recvhdrq_event *); int (*callback_subcontext) (struct ips_recvhdrq_event *, uint32_t subcontext); int (*callback_error) (struct ips_recvhdrq_event *); }; psm2_error_t ips_recvhdrq_init(const psmi_context_t *context, const struct ips_epstate *epstate, const struct ips_proto *proto, const struct ips_recvhdrq_callbacks *callbacks, uint32_t subcontext, struct ips_recvhdrq *recvq, struct ips_recvhdrq_state *recvq_state, psmi_hal_cl_q cl_q); psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq); /* This function is designed to implement RAPID CCA. It iterates * through the recvq, checking each element for set FECN or BECN bits. * In the case of finding one, the proper response is executed, and the bits * are cleared. */ psm2_error_t ips_recvhdrq_scan_cca(struct ips_recvhdrq *recvq); /* * Structure containing state for recvhdrq reading. This is logically * part of ips_recvhdrq but needs to be separated out for context * sharing so that it can be put in a shared memory page and hence * be available to all processes sharing the context. Generally, do not * put pointers in here since the address map of each process can be * different. */ #define NO_EAGER_UPDATE ~0U struct ips_recvhdrq_state { psmi_hal_cl_idx hdrq_head; /* software copy of head */ psmi_hal_cl_idx rcv_egr_index_head; /* software copy of eager index head */ uint32_t head_update_interval; /* Header update interval */ uint32_t num_hdrq_done; /* Num header queue done */ uint32_t egrq_update_interval; /* Eager buffer update interval */ uint32_t num_egrq_done; /* num eager buffer done */ uint32_t hdr_countdown; /* for false-egr-full tracing */ uint32_t hdrq_cachedlastscan; /* last element to be prescanned */ }; /* * Structure to read from recvhdrq */ struct ips_recvhdrq { struct ips_proto *proto; const psmi_context_t *context; /* error handling, epid id, etc. 
*/
struct ips_recvhdrq_state *state;
uint32_t subcontext; /* messages that don't match subcontext call
* recv_callback_subcontext */
psmi_hal_cl_q psm_hal_cl_hdrq;
/* Header queue handling */
pthread_spinlock_t hdrq_lock; /* Lock for thread-safe polling */
uint32_t hdrq_elemlast; /* last element precomputed */
/* Lookup endpoints epid -> ptladdr (rank) */
const struct ips_epstate *epstate;
/* Callbacks to handle recvq events */
struct ips_recvhdrq_callbacks recvq_callbacks;
/* List of flows with pending acks for receive queue */
SLIST_HEAD(pending_flows, ips_flow) pending_acks;
volatile __u64 *spi_status;
};
PSMI_INLINE(int ips_recvhdrq_isempty(const struct ips_recvhdrq *recvq))
{
return psmi_hal_cl_q_empty(recvq->state->hdrq_head, recvq->psm_hal_cl_hdrq, recvq->context->psm_hw_ctxt);
}
PSMI_INLINE( void * ips_recvhdrq_event_payload(const struct ips_recvhdrq_event *rcv_ev))
{
if (psmi_hal_rhf_get_use_egr_buff(rcv_ev->psm_hal_rhf))
return (char*)(psmi_hal_get_egr_buff( psmi_hal_rhf_get_egr_buff_index(rcv_ev->psm_hal_rhf), (psmi_hal_cl_q)(rcv_ev->psm_hal_hdr_q + 1) /* The circular list q (cl_q) for the egr buff for any rx hdrq event is always one more than the hdrq cl q */, rcv_ev->recvq->context->psm_hw_ctxt))+ (psmi_hal_rhf_get_egr_buff_offset(rcv_ev->psm_hal_rhf)*64);
else
return NULL;
}
PSMI_INLINE( uint32_t ips_recvhdrq_event_paylen(const struct ips_recvhdrq_event *rcv_ev))
{
uint32_t cksum_len = rcv_ev->has_cksum ? PSM_CRC_SIZE_IN_BYTES : 0;
return psmi_hal_rhf_get_packet_length(rcv_ev->psm_hal_rhf) - (sizeof(struct ips_message_header) + HFI_CRC_SIZE_IN_BYTES + cksum_len);
/* PSM does not use bth[0].PadCnt, it figures out the real datalen another way */
}
PSMI_INLINE(int ips_recvhdrq_trylock(struct ips_recvhdrq *recvq))
{
int ret = pthread_spin_trylock(&recvq->hdrq_lock);
return !ret;
}
PSMI_INLINE(int ips_recvhdrq_lock(struct ips_recvhdrq *recvq))
{
int ret = pthread_spin_lock(&recvq->hdrq_lock);
return !ret;
}
PSMI_INLINE(int ips_recvhdrq_unlock(struct ips_recvhdrq *recvq))
{
int ret = pthread_spin_unlock(&recvq->hdrq_lock);
return !ret;
}
#endif /* _IPS_RECVHDRQ_H */ opa-psm2-PSM2_11.2.185/ptl_ips/ips_recvq.c000066400000000000000000000065041370564314600200370ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include "psm_user.h" #include "ips_recvq.h" /* We return a table of pointer indexes. * * From the point of view of the returned pointer, index -1 always points to * the address to call psmi_free on (since we force page-alignment). */ void **ips_recvq_egrbuf_table_alloc(psm2_ep_t ep, void *baseptr, uint32_t bufnum, uint32_t bufsize) { unsigned i; void *ptr_alloc; uintptr_t *buft; uintptr_t base = (uintptr_t) baseptr; ptr_alloc = psmi_malloc(ep, UNDEFINED, PSMI_PAGESIZE + sizeof(uintptr_t) * (bufnum + 1)); if (ptr_alloc == NULL) return NULL; /* First pointer is to the actual allocated address, so we can free it but * buft[1] is first on the page boundary */ buft = (uintptr_t *) PSMI_ALIGNUP(ptr_alloc + 1, PSMI_PAGESIZE); buft[-1] = (uintptr_t) ptr_alloc; for (i = 0; i < bufnum; i++) buft[i] = (uintptr_t) ((char *)base + i * bufsize); return (void **)buft; } void ips_recvq_egrbuf_table_free(void **buftable) { uintptr_t *buft = (uintptr_t *) buftable; void *ptr_alloc = (void *)buft[-1]; psmi_free(ptr_alloc); } opa-psm2-PSM2_11.2.185/ptl_ips/ips_recvq.h000066400000000000000000000053541370564314600200460ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef _IPS_RECVQ_H #define _IPS_RECVQ_H #include "psm_user.h" /* * Tables to map eager indexes into their buffer addresses * * If function returns NULL, no memory has been allocated and the error handler * has been executed on 'ep' and hence assume status PSM2_NO_MEMORY. */ void **ips_recvq_egrbuf_table_alloc(psm2_ep_t ep, void *base, uint32_t bufnum, uint32_t bufsize); void ips_recvq_egrbuf_table_free(void **buftable); #endif /* _IPS_RECVQ_H */ opa-psm2-PSM2_11.2.185/ptl_ips/ips_scb.c000066400000000000000000000230221370564314600174600ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include "psm_user.h" #include "psm2_hal.h" #include "ips_proto.h" #include "ips_scb.h" #include "ips_proto_internal.h" psm2_error_t ips_scbctrl_init(const psmi_context_t *context, uint32_t numscb, uint32_t numbufs, uint32_t imm_size, uint32_t bufsize, ips_scbctrl_avail_callback_fn_t scb_avail_callback, void *scb_avail_context, struct ips_scbctrl *scbc) { int i; struct ips_scb *scb; size_t scb_size; size_t alloc_sz; uintptr_t base, imm_base; psm2_ep_t ep = context->ep; /* scbc->context = context; */ psm2_error_t err = PSM2_OK; psmi_assert_always(numscb > 0); scbc->sbuf_num = scbc->sbuf_num_cur = numbufs; SLIST_INIT(&scbc->sbuf_free); scbc->sbuf_buf_size = bufsize; scbc->sbuf_buf_base = NULL; scbc->sbuf_buf_alloc = NULL; scbc->sbuf_buf_last = NULL; /* send buffers are not mandatory but when allocating them, make sure they * are on a page boundary */ if (numbufs > 0) { struct ips_scbbuf *sbuf; bufsize = PSMI_ALIGNUP(bufsize, 64); alloc_sz = numbufs * bufsize + PSMI_PAGESIZE; scbc->sbuf_buf_alloc = psmi_calloc(ep, NETWORK_BUFFERS, 1, alloc_sz); if (scbc->sbuf_buf_alloc == NULL) { err = PSM2_NO_MEMORY; goto fail; } base = (uintptr_t) scbc->sbuf_buf_alloc; base = PSMI_ALIGNUP(base, PSMI_PAGESIZE); scbc->sbuf_buf_base = (void *)base; scbc->sbuf_buf_last = (void *)(base + bufsize * (numbufs - 1)); _HFI_VDBG ("sendbufs=%d, (size=%d),base=[%p..%p)\n", numbufs, bufsize, (void *)scbc->sbuf_buf_base, (void *)scbc->sbuf_buf_last); for (i = 0; i < numbufs; i++) { sbuf = (struct ips_scbbuf *)(base + bufsize * i); SLIST_NEXT(sbuf, next) = NULL; SLIST_INSERT_HEAD(&scbc->sbuf_free, sbuf, next); } } imm_base = 0; scbc->scb_imm_size = imm_size; if (scbc->scb_imm_size) { scbc->scb_imm_size = PSMI_ALIGNUP(imm_size, 64); alloc_sz = numscb * scbc->scb_imm_size + 64; scbc->scb_imm_buf = psmi_memalign(ep, NETWORK_BUFFERS, 64, alloc_sz); if (scbc->scb_imm_buf == NULL) { err = PSM2_NO_MEMORY; goto fail; } memset(scbc->scb_imm_buf, 0, alloc_sz); imm_base = PSMI_ALIGNUP(scbc->scb_imm_buf, 64); } else scbc->scb_imm_buf = NULL; scbc->scb_num = scbc->scb_num_cur = numscb; SLIST_INIT(&scbc->scb_free); scb_size = PSMI_ALIGNUP(sizeof(*scb), 64); alloc_sz = numscb * scb_size; scbc->scb_base = psmi_memalign(ep, NETWORK_BUFFERS, 64, alloc_sz); if (scbc->scb_base == NULL) { err = PSM2_NO_MEMORY; goto fail; } memset(scbc->scb_base, 0, alloc_sz); base = (uintptr_t) scbc->scb_base; /* * Allocate ack/send timer for each scb object. */ scbc->timers = (struct psmi_timer *) psmi_calloc(ep, UNDEFINED, 2*numscb, sizeof(struct psmi_timer)); if (scbc->timers == NULL) { err = PSM2_NO_MEMORY; goto fail; } for (i = 0; i < numscb; i++) { scb = (struct ips_scb *)(base + i * scb_size); scb->scbc = scbc; if (scbc->scb_imm_buf) scb->imm_payload = (void *)(imm_base + (i * scbc->scb_imm_size)); else scb->imm_payload = NULL; SLIST_INSERT_HEAD(&scbc->scb_free, scb, next); /* * Initialize timers. * Associate the timers to each scb, the association is * not fixed because later PSM may exchange the timers * between scb, the reason for exchanging is that the * timer is currently using by flow, but the scb is to * be freed. 
see ack/nak processing in file ips_proto_recv.c */
scb->timer_ack = &scbc->timers[2*i];
psmi_timer_entry_init(scb->timer_ack, ips_proto_timer_ack_callback, scb);
scb->timer_send = &scbc->timers[2*i+1];
psmi_timer_entry_init(scb->timer_send, ips_proto_timer_send_callback, scb);
}
scbc->scb_avail_callback = scb_avail_callback;
scbc->scb_avail_context = scb_avail_context;
fail:
return err;
}
psm2_error_t ips_scbctrl_fini(struct ips_scbctrl *scbc)
{
if (scbc->scb_base != NULL) {
psmi_free(scbc->scb_base);
}
if (scbc->sbuf_buf_alloc) {
psmi_free(scbc->sbuf_buf_alloc);
}
if (scbc->timers != NULL) {
psmi_free(scbc->timers);
}
if (scbc->scb_imm_buf) {
psmi_free(scbc->scb_imm_buf);
}
return PSM2_OK;
}
int ips_scbctrl_bufalloc(ips_scb_t *scb)
{
struct ips_scbctrl *scbc = scb->scbc;
psmi_assert(scbc->sbuf_num > 0);
psmi_assert(!((ips_scb_buffer(scb) >= scbc->sbuf_buf_base) && (ips_scb_buffer(scb) <= scbc->sbuf_buf_last)));
psmi_assert(scb->payload_size <= scbc->sbuf_buf_size);
if (scb->payload_size <= scbc->scb_imm_size) {
/* Attach immediate buffer */
ips_scb_buffer(scb) = scb->imm_payload;
return 1;
}
if (SLIST_EMPTY(&scbc->sbuf_free))
return 0;
else {
psmi_assert(scbc->sbuf_num_cur);
ips_scb_buffer(scb) = SLIST_FIRST(&scbc->sbuf_free);
scbc->sbuf_num_cur--;
/* If under memory pressure, request an ACK for the packet to reclaim
* credits. */
if (scbc->sbuf_num_cur < (scbc->sbuf_num >> 1))
scb->scb_flags |= IPS_SEND_FLAG_ACKREQ;
SLIST_REMOVE_HEAD(&scbc->sbuf_free, next);
return 1;
}
}
int ips_scbctrl_avail(struct ips_scbctrl *scbc)
{
return (!SLIST_EMPTY(&scbc->scb_free) && scbc->sbuf_num_cur > 0);
}
ips_scb_t *MOCKABLE(ips_scbctrl_alloc)(struct ips_scbctrl *scbc, int scbnum, int len, uint32_t flags)
{
ips_scb_t *scb, *scb_head = NULL;
psmi_assert(flags & IPS_SCB_FLAG_ADD_BUFFER ?
(scbc->sbuf_num > 0) : 1); psmi_assert(scbc->sbuf_buf_size >= len); while (scbnum--) { if (SLIST_EMPTY(&scbc->scb_free)) break; scb = SLIST_FIRST(&scbc->scb_free); /* Need to set this here as bufalloc may request * an ACK under memory pressure */ scb->scb_flags = 0; if (flags & IPS_SCB_FLAG_ADD_BUFFER) { scb->payload_size = len; if (!ips_scbctrl_bufalloc(scb)) break; } else { ips_scb_buffer(scb) = NULL; scb->payload_size = 0; } scb->tidsendc = NULL; scb->callback = NULL; scb->tidctrl = 0; scb->nfrag = 1; scb->frag_size = 0; #ifdef PSM_CUDA scb->mq_req = NULL; #endif scbc->scb_num_cur--; if (scbc->scb_num_cur < (scbc->scb_num >> 1)) scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; SLIST_REMOVE_HEAD(&scbc->scb_free, next); SLIST_NEXT(scb, next) = scb_head; scb_head = scb; } return scb_head; } MOCK_DEF_EPILOGUE(ips_scbctrl_alloc); void ips_scbctrl_free(ips_scb_t *scb) { struct ips_scbctrl *scbc = scb->scbc; if (scbc->sbuf_num && (ips_scb_buffer(scb) >= scbc->sbuf_buf_base) && (ips_scb_buffer(scb) <= scbc->sbuf_buf_last)) { scbc->sbuf_num_cur++; SLIST_INSERT_HEAD(&scbc->sbuf_free, scb->sbuf, next); } ips_scb_buffer(scb) = NULL; scb->tidsendc = NULL; scb->payload_size = 0; scbc->scb_num_cur++; if (SLIST_EMPTY(&scbc->scb_free)) { SLIST_INSERT_HEAD(&scbc->scb_free, scb, next); if (scbc->scb_avail_callback != NULL) scbc->scb_avail_callback(scbc, scbc->scb_avail_context); } else SLIST_INSERT_HEAD(&scbc->scb_free, scb, next); return; } ips_scb_t *MOCKABLE(ips_scbctrl_alloc_tiny)(struct ips_scbctrl *scbc) { ips_scb_t *scb; if (SLIST_EMPTY(&scbc->scb_free)) return NULL; scb = SLIST_FIRST(&scbc->scb_free); SLIST_REMOVE_HEAD(&scbc->scb_free, next); SLIST_NEXT(scb, next) = NULL; ips_scb_buffer(scb) = NULL; scb->payload_size = 0; scb->scb_flags = 0; scb->tidsendc = NULL; scb->callback = NULL; scb->tidctrl = 0; scb->nfrag = 1; scb->frag_size = 0; #ifdef PSM_CUDA scb->mq_req = NULL; #endif scbc->scb_num_cur--; if (scbc->scb_num_cur < (scbc->scb_num >> 1)) scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; return scb; } MOCK_DEF_EPILOGUE(ips_scbctrl_alloc_tiny); opa-psm2-PSM2_11.2.185/ptl_ips/ips_scb.h000066400000000000000000000156641370564314600175020ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef _IPS_SCB_H #define _IPS_SCB_H #include "psm2_mock_testing.h" #include "psm_user.h" #include "ips_proto_header.h" /* ips_alloc_scb flags */ #define IPS_SCB_FLAG_NONE 0x0 #define IPS_SCB_FLAG_ADD_BUFFER 0x1 /* macros to update scb */ #define ips_scb_opcode(scb) scb->opcode #define ips_scb_buffer(scb) scb->payload #define ips_scb_length(scb) scb->payload_size #define ips_scb_flags(scb) scb->scb_flags #define ips_scb_dma_cntr(scb) scb->dma_cntr #define ips_scb_epaddr(scb) scb->epaddr #define ips_scb_cb(scb) scb->callback #define ips_scb_cb_param(scb) scb->cb_param #define ips_scb_copy_tag(dst, src) \ (dst)[0] = (src)[0]; \ (dst)[1] = (src)[1]; \ (dst)[2] = (src)[2]; struct ips_scbbuf; struct ips_scb; struct ips_scbctrl; struct ips_tid_send_desc; typedef void (*ips_scbctrl_avail_callback_fn_t) (struct ips_scbctrl *, void *context); STAILQ_HEAD(ips_scb_stailq, ips_scb); SLIST_HEAD(ips_scb_slist, ips_scb); struct ips_scbctrl { /* const psmi_context_t *context; */ /* Send control blocks for each send */ uint32_t scb_num; uint32_t scb_num_cur; SLIST_HEAD(scb_free, ips_scb) scb_free; void *scb_base; ips_scbctrl_avail_callback_fn_t scb_avail_callback; void *scb_avail_context; /* Immediate data for send buffers */ uint32_t scb_imm_size; void *scb_imm_buf; psmi_timer *timers; /* ack/send timers */ /* * Send buffers (or bounce buffers) to keep user data if we need to * retransmit. */ uint32_t sbuf_num; uint32_t sbuf_num_cur; SLIST_HEAD(sbuf_free, ips_scbbuf) sbuf_free; void *sbuf_buf_alloc; uint32_t sbuf_buf_size; void *sbuf_buf_base; void *sbuf_buf_last; }; struct ips_scbbuf { SLIST_ENTRY(ips_scbbuf) next; }; typedef struct ips_scb ips_scb_t; struct ips_scb { union { SLIST_ENTRY(ips_scb) next; STAILQ_ENTRY(ips_scb) nextq; }; union { void *payload; struct ips_scbbuf *sbuf; }; uint64_t ack_timeout; /* in cycles */ uint64_t abs_timeout; /* in cycles */ psmi_timer *timer_send; /* for sending packets */ psmi_timer *timer_ack; /* for acking packets */ /* Used when composing packet */ psmi_seqnum_t seq_num; uint32_t cksum[2]; uint32_t scb_flags; uint32_t payload_size; /* remaining first packet size */ uint32_t chunk_size; /* total buffer size if nfrag > 1 */ /* initially chunk_size_remaining = chunk_size. 
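* (Hypothetical illustration, values not taken from this source: a
* 12288-byte chunk sent as nfrag = 3 fragments of frag_size = 4096
* starts out with chunk_size = chunk_size_remaining = 12288 and
* nfrag = nfrag_remaining = 3; the *_remaining fields track only what
* is still left to re-transmit.)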
*/ uint32_t chunk_size_remaining; /* buffer size to re-transmit */ uint16_t nfrag; /* total packets in sequence */ /* initially nfrag_remaining = nfrag */ uint16_t nfrag_remaining; /* number packets to re-transmit */ uint16_t dma_complete; uint16_t tidctrl; uint16_t frag_size; /* max packet size in sequence */ uint16_t opcode; uint16_t tsess_length; uint32_t *tsess; struct ips_flow *flow; struct ips_tid_send_desc *tidsendc; struct ips_scbctrl *scbc; void *imm_payload; union { int (*callback) (void *, uint32_t); psm2_am_completion_fn_t completion_am; }; void *cb_param; #ifdef PSM_CUDA psm2_mq_req_t mq_req; /* back pointer to original request */ #endif /* sdma header place holder, PSM2 code should access * the psm_hal_sdma_req_info only using the psmi_get_sdma_req_info() * accessor function. */ /* * The size of struct psm_hal_sdma_req_info is variable. (10 bytes for * GPU-direct and 8 bytes for non GPU-Direct) * When GPU-Direct feature is used, all 10 bytes of the space is used. * Otherwise, we only use upto 8 bytes. The usage is controlled by * psmi_get_sdma_req_info() in ips_proto.h */ struct psm_hal_sdma_req_info _DO_NOT_USE_; struct { struct psm_hal_pbc pbc; struct ips_message_header ips_lrh; } PSMI_CACHEALIGN; }; /* Make sure pbc is at the right place before the message header */ COMPILE_TIME_ASSERT(PBC_ABUTS_IPS_MSG_HDR,(sizeof(struct psm_hal_pbc) == (size_t) (offsetof(struct ips_scb, ips_lrh) - offsetof(struct ips_scb, pbc)))); #ifdef PSM_CUDA #define IS_TRANSFER_BUF_GPU_MEM(scb) (ips_scb_flags(scb) & IPS_SEND_FLAG_PAYLOAD_BUF_GPU) #endif void ips_scbctrl_free(ips_scb_t *scb); int ips_scbctrl_bufalloc(ips_scb_t *scb); int ips_scbctrl_avail(struct ips_scbctrl *scbc); ips_scb_t *MOCKABLE(ips_scbctrl_alloc)(struct ips_scbctrl *scbc, int scbnum, int len, uint32_t flags); MOCK_DCL_EPILOGUE(ips_scbctrl_alloc); ips_scb_t *MOCKABLE(ips_scbctrl_alloc_tiny)(struct ips_scbctrl *scbc); MOCK_DCL_EPILOGUE(ips_scbctrl_alloc_tiny); psm2_error_t ips_scbctrl_init(const psmi_context_t *context, uint32_t numscb, uint32_t numbufs, uint32_t imm_size, uint32_t bufsize, ips_scbctrl_avail_callback_fn_t, void *avail_context, struct ips_scbctrl *); psm2_error_t ips_scbctrl_fini(struct ips_scbctrl *); psm2_error_t ips_scbctrl_writev(struct ips_scb_slist *slist, int fd); #endif /* _IPS_SCB_H */ opa-psm2-PSM2_11.2.185/ptl_ips/ips_stats.h000066400000000000000000000055501370564314600200620ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef _IPS_STATS_H #define _IPS_STATS_H struct psm2_epaddr; /* for non-PSM clients */ /* Old stats */ typedef struct { uint64_t err_chk_send; uint64_t err_chk_recv; uint64_t send_failed; uint64_t recv_dropped; union { uint64_t recv_copied; /* obsolete */ uint64_t nak_sent; }; uint64_t nak_recv; uint64_t total_send_eager; uint64_t total_send_exp; uint64_t acks_sent; uint64_t retransmits; uint64_t recv_matched; uint64_t recv_unmatched; uint64_t scb_alloc_yields; } ips_sess_stat; int ips_get_stat(struct psm2_epaddr *epaddr, ips_sess_stat *stats); #endif /* _IPS_STATS_H */ opa-psm2-PSM2_11.2.185/ptl_ips/ips_subcontext.h000066400000000000000000000061501370564314600211170ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef __IPS_SUBCONTEXT_H #define __IPS_SUBCONTEXT_H #include "psm_user.h" #include "ips_recvhdrq.h" #include "ips_writehdrq.h" /* This data structure is allocated in ureg page of each subcontext process */ struct ips_subcontext_ureg { /* head/eager head/tail register storage, one per cacheline (member is unused by PSM, but needed here to match driver structures). */ uint64_t subcontext_uregbase[40 /* i.e. ur_maxreg * 8 */]; struct ips_writehdrq_state writeq_state; /* used in all ureg pages */ } __attribute__ ((aligned(64))); struct ips_hwcontext_ctrl { pthread_spinlock_t context_lock; /* lock shared by all subctxts */ struct ips_recvhdrq_state recvq_state; /* state shared by all subctxts */ uint32_t rx_hdrq_rhf_seq; /* rhf seq for the hw hdrq shared by all subctxts */ } __attribute__ ((aligned(64))); #endif opa-psm2-PSM2_11.2.185/ptl_ips/ips_tid.c000066400000000000000000000174021370564314600174760ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include "psm_user.h" #include "psm2_hal.h" #include "ips_tid.h" #include "ips_proto.h" #include "ips_expected_proto.h" psm2_error_t ips_tid_init(const psmi_context_t *context, struct ips_protoexp *protoexp, ips_tid_avail_cb_fn_t cb, void *cb_context) { struct ips_tid *tidc = &protoexp->tidc; struct psmi_stats_entry entries[] = { PSMI_STATS_DECL("tid update count", MPSPAWN_STATS_REDUCTION_ALL, NULL, &tidc->tid_num_total), }; tidc->context = context; tidc->protoexp = protoexp; tidc->tid_num_total = 0; tidc->tid_num_inuse = 0; tidc->tid_avail_cb = cb; tidc->tid_avail_context = cb_context; tidc->tid_array = NULL; /* * PSM uses tid registration caching only if driver has enabled it. */ if (!psmi_hal_has_cap(PSM_HAL_CAP_TID_UNMAP)) { int i; cl_qmap_t *p_map; cl_map_item_t *root,*nil_item; tidc->tid_array = (uint32_t *) psmi_calloc(context->ep, UNDEFINED, psmi_hal_get_tid_exp_cnt(context->psm_hw_ctxt), sizeof(uint32_t)); if (tidc->tid_array == NULL) return PSM2_NO_MEMORY; /* * first is root node, last is terminator node. */ p_map = &tidc->tid_cachemap; root = (cl_map_item_t *) psmi_calloc(context->ep, UNDEFINED, psmi_hal_get_tid_exp_cnt(context->psm_hw_ctxt) + 2, sizeof(cl_map_item_t)); if (root == NULL) return PSM2_NO_MEMORY; nil_item = &root [psmi_hal_get_tid_exp_cnt(context->psm_hw_ctxt) + 1]; ips_tidcache_map_init(p_map,root,nil_item); NTID = 0; NIDLE = 0; IPREV(IHEAD) = INEXT(IHEAD) = IHEAD; for (i = 1; i <= psmi_hal_get_tid_exp_cnt(context->psm_hw_ctxt); i++) { INVALIDATE(i) = 1; } /* * if not shared context, all tids are used by the same * process. Otherwise, subcontext process can only cache * its own portion. Driver makes the same tid number * assignment to subcontext processes. */ tidc->tid_cachesize = psmi_hal_get_tid_exp_cnt(context->psm_hw_ctxt); if (psmi_hal_get_subctxt_cnt(context->psm_hw_ctxt) > 0) { uint16_t remainder = tidc->tid_cachesize % psmi_hal_get_subctxt_cnt(context->psm_hw_ctxt); tidc->tid_cachesize /= psmi_hal_get_subctxt_cnt(context->psm_hw_ctxt); if (psmi_hal_get_subctxt(context->psm_hw_ctxt) < remainder) tidc->tid_cachesize++; } } /* * Setup shared control structure. */ tidc->tid_ctrl = (struct ips_tid_ctrl *)context->tid_ctrl; if (!tidc->tid_ctrl) { tidc->tid_ctrl = (struct ips_tid_ctrl *) psmi_calloc(context->ep, UNDEFINED, 1, sizeof(struct ips_tid_ctrl)); if (tidc->tid_ctrl == NULL) { return PSM2_NO_MEMORY; } } /* * Only the master process can initialize. 
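* (Aside, illustrating the tid_cachesize split computed above with
* hypothetical numbers: for tid_exp_cnt = 2048 shared by 3 subcontext
* processes, 2048 / 3 = 682 with remainder 2, so subcontexts 0 and 1
* each cache 683 entries and subcontext 2 caches 682, which sums back
* to 2048.)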
*/ if (psmi_hal_get_subctxt(context->psm_hw_ctxt) == 0) { pthread_spin_init(&tidc->tid_ctrl->tid_ctrl_lock, PTHREAD_PROCESS_SHARED); tidc->tid_ctrl->tid_num_max = psmi_hal_get_tid_exp_cnt(context->psm_hw_ctxt); tidc->tid_ctrl->tid_num_avail = tidc->tid_ctrl->tid_num_max; } return psmi_stats_register_type(PSMI_STATS_NO_HEADING, PSMI_STATSTYPE_TIDS, entries, PSMI_STATS_HOWMANY(entries), tidc); } psm2_error_t ips_tid_fini(struct ips_tid *tidc) { if (tidc->tid_array) ips_tidcache_cleanup(tidc); if (!tidc->context->tid_ctrl) psmi_free(tidc->tid_ctrl); return PSM2_OK; } psm2_error_t ips_tid_acquire(struct ips_tid *tidc, const void *buf, uint32_t *length, uint32_t *tid_array, uint32_t *tidcnt #ifdef PSM_CUDA , uint8_t is_cuda_ptr #endif ) { struct ips_tid_ctrl *ctrl = tidc->tid_ctrl; psm2_error_t err = PSM2_OK; uint16_t flags = 0; int rc; psmi_assert(((uintptr_t) buf & 0xFFF) == 0); psmi_assert(((*length) & 0xFFF) == 0); if (tidc->context->tid_ctrl) pthread_spin_lock(&ctrl->tid_ctrl_lock); if (!ctrl->tid_num_avail) { err = PSM2_EP_NO_RESOURCES; goto fail; } /* Clip length if it exceeds worst case tid allocation, where each entry in the tid array can accommodate only 1 page. */ if (*length > 4096*tidc->tid_ctrl->tid_num_max) { *length = 4096*tidc->tid_ctrl->tid_num_max; } #ifdef PSM_CUDA if (is_cuda_ptr) flags = PSM_HAL_BUF_GPU_MEM; #endif rc = psmi_hal_update_tid(tidc->context->psm_hw_ctxt, (uint64_t) (uintptr_t) buf, length, (uint64_t) (uintptr_t) tid_array, tidcnt, flags); if (rc < 0) { /* Unable to pin pages? retry later */ err = PSM2_EP_DEVICE_FAILURE; goto fail; } psmi_assert_always((*tidcnt) > 0); psmi_assert(ctrl->tid_num_avail >= (*tidcnt)); ctrl->tid_num_avail -= (*tidcnt); tidc->tid_num_total += (*tidcnt); tidc->tid_num_inuse += (*tidcnt); fail: if (tidc->context->tid_ctrl) pthread_spin_unlock(&ctrl->tid_ctrl_lock); return err; } psm2_error_t ips_tid_release(struct ips_tid *tidc, uint32_t *tid_array, uint32_t tidcnt) { struct ips_tid_ctrl *ctrl = tidc->tid_ctrl; psm2_error_t err = PSM2_OK; psmi_assert(tidcnt > 0); if (tidc->context->tid_ctrl) pthread_spin_lock(&ctrl->tid_ctrl_lock); if (psmi_hal_free_tid(tidc->context->psm_hw_ctxt, (uint64_t) (uintptr_t) tid_array, tidcnt) < 0) { if (tidc->context->tid_ctrl) pthread_spin_unlock(&ctrl->tid_ctrl_lock); /* If failed to unpin pages, it's fatal error */ err = psmi_handle_error(tidc->context->ep, PSM2_EP_DEVICE_FAILURE, "Failed to tid free %d tids", tidcnt); goto fail; } ctrl->tid_num_avail += tidcnt; if (tidc->context->tid_ctrl) pthread_spin_unlock(&ctrl->tid_ctrl_lock); tidc->tid_num_inuse -= tidcnt; /* If an available callback is registered invoke it */ if (((tidc->tid_num_inuse + tidcnt) == ctrl->tid_num_max) && tidc->tid_avail_cb) tidc->tid_avail_cb(tidc, tidc->tid_avail_context); fail: return err; } opa-psm2-PSM2_11.2.185/ptl_ips/ips_tid.h000066400000000000000000000120531370564314600175000ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ /* included header files */ #ifndef _IPS_TID_H #define _IPS_TID_H #include "psm_user.h" #include "ips_tidcache.h" struct ips_tid; typedef void (*ips_tid_avail_cb_fn_t) (struct ips_tid *, void *context); struct ips_tid_ctrl { pthread_spinlock_t tid_ctrl_lock; uint32_t tid_num_max; uint32_t tid_num_avail; } __attribute__ ((aligned(64))); struct ips_tid { const psmi_context_t *context; struct ips_protoexp *protoexp; void *tid_avail_context; struct ips_tid_ctrl *tid_ctrl; ips_tid_avail_cb_fn_t tid_avail_cb; uint64_t tid_num_total; uint32_t tid_num_inuse; uint32_t tid_cachesize; /* items can be cached */ cl_qmap_t tid_cachemap; /* RB tree implementation */ /* * tids storage. * This is used in tid registration caching case for * tid invalidation, acquire, replace and release, * entries should be the assigned tid number. */ uint32_t *tid_array; }; psm2_error_t ips_tid_init(const psmi_context_t *context, struct ips_protoexp *protoexp, ips_tid_avail_cb_fn_t cb, void *cb_context); psm2_error_t ips_tid_fini(struct ips_tid *tidc); /* Acquiring tids. 
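* The two requirements below mirror the asserts at the top of
* ips_tid_acquire; e.g., with hypothetical values, a buffer with
* buf = 0x2000 and *length = 16384 passes both
* ((uintptr_t) buf & 0xFFF) == 0 and ((*length) & 0xFFF) == 0.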
* Buffer base has to be aligned on page boundary * Buffer length has to be multiple pages */ psm2_error_t ips_tidcache_acquire(struct ips_tid *tidc, const void *buf, /* input buffer, aligned to page boundary */ uint32_t *length, /* buffer length, aligned to page size */ uint32_t *tid_array, /* output tidarray, */ uint32_t *tidcnt, /* output of tid count */ uint32_t *pageoff /* output of offset in first tid */ #ifdef PSM_CUDA , uint8_t is_cuda_ptr #endif ); psm2_error_t ips_tidcache_release(struct ips_tid *tidc, uint32_t *tid_array, /* input tidarray, */ uint32_t tidcnt); /* input of tid count */ psm2_error_t ips_tidcache_cleanup(struct ips_tid *tidc); psm2_error_t ips_tidcache_invalidation(struct ips_tid *tidc); psm2_error_t ips_tid_acquire(struct ips_tid *tidc, const void *buf, /* input buffer, aligned to page boundary */ uint32_t *length, /* buffer length, aligned to page size */ uint32_t *tid_array, /* output tidarray, */ uint32_t *tidcnt #ifdef PSM_CUDA , uint8_t is_cuda_ptr #endif ); /* output of tid count */ psm2_error_t ips_tid_release(struct ips_tid *tidc, uint32_t *tid_array, /* input tidarray, */ uint32_t tidcnt); /* input of tid count */ PSMI_INLINE(int ips_tid_num_available(struct ips_tid *tidc)) { if (tidc->tid_ctrl->tid_num_avail == 0) { if (tidc->tid_ctrl->tid_num_max == tidc->tid_num_inuse) return -1; else return 0; } return tidc->tid_ctrl->tid_num_avail; } /* Note that the caller is responsible for making sure that NIDLE is non-zero before calling ips_tidcache_evict. If NIDLE is 0 at the time of call, ips_tidcache_evict is unstable. */ uint64_t ips_tidcache_evict(struct ips_tid *tidc, uint64_t length); #endif /* _IPS_TID_H */ opa-psm2-PSM2_11.2.185/ptl_ips/ips_tidcache.c000066400000000000000000000414711370564314600204650ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "psm_user.h"
#include "psm2_hal.h"
#include "ips_proto.h"
#include "ips_expected_proto.h"
#define RBTREE_GET_LEFTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start)
#define RBTREE_GET_RIGHTMOST(PAYLOAD_PTR) ((PAYLOAD_PTR)->start+((PAYLOAD_PTR)->length<<12))
#define RBTREE_ASSERT psmi_assert
#define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->ntid)
#include "rbtree.c"
void ips_tidcache_map_init(cl_qmap_t *p_map, cl_map_item_t* const root, cl_map_item_t* const nil_item)
{
ips_cl_qmap_init(p_map,root,nil_item);
}
/*
*
* Forcibly remove a tid, and check for invalidation events afterwards.
*/
static psm2_error_t ips_tidcache_remove(struct ips_tid *tidc, uint32_t tidcnt)
{
cl_qmap_t *p_map = &tidc->tid_cachemap;
uint32_t idx;
uint64_t events_mask;
psm2_error_t err;
/*
* call driver to free the tids.
*/
if (psmi_hal_free_tid(tidc->context->psm_hw_ctxt, (uint64_t) (uintptr_t) tidc->tid_array, tidcnt) < 0) {
/* If we failed to unpin pages, it's a fatal error */
err = psmi_handle_error(tidc->context->ep, PSM2_EP_DEVICE_FAILURE, "Failed to tid free %d tids", tidcnt);
return err;
}
while (tidcnt) {
tidcnt--;
idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[tidcnt]) + IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[tidcnt]);
/*
* sanity check.
*/
psmi_assert(idx != 0);
psmi_assert(idx <= tidc->tid_ctrl->tid_num_max);
psmi_assert(INVALIDATE(idx) == 0);
psmi_assert(REFCNT(idx) == 0);
/*
* mark the tid invalidated.
*/
INVALIDATE(idx) = 1;
/*
* remove the tid from the RB tree.
*/
IDLE_REMOVE(idx);
ips_cl_qmap_remove_item(p_map, &p_map->root[idx]);
}
/*
* Because the freed tid did not come from the invalidation list,
* it is possible that the kernel has just invalidated it, so we
* need to check for and process the invalidation before we can
* re-use this tid. Doing this in the reverse order would wrongly
* invalidate the tid again.
*/
err = psmi_hal_get_hfi_event_bits(&events_mask,tidc->context->psm_hw_ctxt);
if_pf (err)
return PSM2_INTERNAL_ERR;
if (events_mask & PSM_HAL_HFI_EVENT_TID_MMU_NOTIFY) {
err = ips_tidcache_invalidation(tidc);
if (err)
return err;
}
return PSM2_OK;
}
/*
* Register a new buffer with the driver, and cache the tidinfo.
*/
static psm2_error_t ips_tidcache_register(struct ips_tid *tidc, unsigned long start, uint32_t length, uint32_t *firstidx
#ifdef PSM_CUDA
, uint8_t is_cuda_ptr
#endif
)
{
cl_qmap_t *p_map = &tidc->tid_cachemap;
uint32_t tidoff, tidlen;
uint32_t idx, tidcnt;
uint16_t flags = 0;
psm2_error_t err;
/*
* make sure we have at least one free tid to
* register the new buffer.
*/
if (NTID == tidc->tid_cachesize) {
/* all tids are in active use, error? */
if (NIDLE == 0)
return PSM2_OK_NO_PROGRESS;
/*
* free the first tid in the idle queue.
*/
idx = IPREV(IHEAD);
tidc->tid_array[0] = p_map->root[idx].payload.tidinfo;
err = ips_tidcache_remove(tidc, 1);
if (err)
return err;
}
psmi_assert(NTID < tidc->tid_cachesize);
/* Clip length if it exceeds the worst-case tid allocation, where each entry in the tid array can accommodate only 1 page.
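For instance, with a hypothetical tid_num_max of 2048, any request
longer than 4096 * 2048 bytes (8 MB) is clipped to 8 MB before being
handed to the driver.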
*/
if (length > 4096*tidc->tid_ctrl->tid_num_max) {
length = 4096*tidc->tid_ctrl->tid_num_max;
}
/*
* register the new buffer.
*/
retry:
tidcnt = 0;
#ifdef PSM_CUDA
if (is_cuda_ptr)
flags = PSM_HAL_BUF_GPU_MEM;
#endif
if (psmi_hal_update_tid(tidc->context->psm_hw_ctxt, (uint64_t) start, &length, (uint64_t) tidc->tid_array, &tidcnt, flags) < 0) {
/* if the driver reaches the lockable memory limit */
if ((errno == ENOMEM
#ifdef PSM_CUDA
/* This additional check is in place for just the CUDA
* version. It is a temporary workaround for a known
* issue where the nvidia driver returns EINVAL instead of
* ENOMEM when there is no BAR1 space left to pin pages.
* PSM frees tidcache entries when the driver sends
* EINVAL, thereby unpinning pages and freeing some
* BAR1 space.*/
|| (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM((void*)start) && errno == EINVAL)
#endif
) && NIDLE) {
uint64_t lengthEvicted = ips_tidcache_evict(tidc,length);
if (lengthEvicted >= length)
goto retry;
} else if (errno == EFAULT)
psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, " Unhandled error in TID Update: %s\n", strerror(errno));
#ifdef PSM_CUDA
else if (PSMI_IS_CUDA_ENABLED && errno == ENOTSUP)
psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, " Nvidia driver apis mismatch: %s\n", strerror(errno));
#endif
/* Unable to pin pages? retry later */
return PSM2_EP_DEVICE_FAILURE;
}
psmi_assert_always(tidcnt > 0);
psmi_assert((tidcnt+NTID) <= tidc->tid_cachesize);
/*
* backward processing because we want to return
* the first RB index in the array.
*/
idx = 0;
tidoff = length;
while (tidcnt) {
/*
* Driver only returns tidctrl=1 or tidctrl=2.
*/
tidcnt--;
idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[tidcnt]) + IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[tidcnt]);
tidlen = IPS_TIDINFO_GET_LENGTH(tidc->tid_array[tidcnt]);
/*
* sanity check.
*/
psmi_assert(idx != 0);
psmi_assert(idx <= tidc->tid_ctrl->tid_num_max);
psmi_assert(INVALIDATE(idx) != 0);
psmi_assert(REFCNT(idx) == 0);
/*
* clear the tid invalidated flag.
*/
INVALIDATE(idx) = 0;
/*
* put the tid into an RB node.
*/
tidoff -= tidlen << 12;
START(idx) = start + tidoff;
LENGTH(idx) = tidlen;
p_map->root[idx].payload.tidinfo = tidc->tid_array[tidcnt];
/*
* put the node into the RB tree and at the idle queue head.
*/
IDLE_INSERT(idx);
ips_cl_qmap_insert_item(p_map, &p_map->root[idx]);
}
psmi_assert(idx != 0);
psmi_assert(tidoff == 0);
*firstidx = idx;
return PSM2_OK;
}
/*
* Get mmu notifier invalidation info and update PSM's caching.
*/
psm2_error_t ips_tidcache_invalidation(struct ips_tid *tidc)
{
cl_qmap_t *p_map = &tidc->tid_cachemap;
uint32_t i, j, idx, tidcnt;
psm2_error_t err;
/*
* get a list of invalidated tids from the driver;
* the driver will clear the event bit before returning.
*/
tidcnt = 0;
if (psmi_hal_get_tidcache_invalidation(tidc->context->psm_hw_ctxt, (uint64_t) (uintptr_t) tidc->tid_array, &tidcnt) < 0) {
/* If we failed to get the invalidation info, it's a fatal error */
err = psmi_handle_error(tidc->context->ep, PSM2_EP_DEVICE_FAILURE, "Failed to get invalidation info");
return err;
}
psmi_assert(tidcnt > 0 && tidcnt <= tidc->tid_ctrl->tid_num_max);
j = 0;
for (i = 0; i < tidcnt; i++) {
/*
* Driver only returns tidctrl=1 or tidctrl=2.
*/
idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[i]) + IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[i]);
psmi_assert(idx != 0);
psmi_assert(idx <= tidc->tid_ctrl->tid_num_max);
/*
* sanity check.
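* (Worked example of the idx mapping above, with hypothetical values:
* a tidinfo with tid = 7 and tidctrl = 2 yields idx = 2*7 + 2 = 16;
* since the driver only returns tidctrl 1 or 2, distinct tid/tidctrl
* pairs always map to distinct nonzero indexes.)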
*/ psmi_assert(p_map->root[idx].payload.tidinfo == tidc->tid_array[i]); psmi_assert(LENGTH(idx) == IPS_TIDINFO_GET_LENGTH(tidc->tid_array[i])); /* * if the tid is already invalidated, ignore it, * but do sanity check. */ if (INVALIDATE(idx) != 0) { psmi_assert(REFCNT(idx) == 0); continue; } /* * mark the tid invalidated. */ INVALIDATE(idx) = 1; /* * if the tid is idle, remove the tid from RB tree * and idle queue, put on free list. */ if (REFCNT(idx) == 0) { IDLE_REMOVE(idx); ips_cl_qmap_remove_item(p_map, &p_map->root[idx]); if (i != j) tidc->tid_array[j] = tidc->tid_array[i]; j++; } } if (j > 0) { /* * call driver to free the tids. */ if (psmi_hal_free_tid(tidc->context->psm_hw_ctxt, (uint64_t) (uintptr_t) tidc->tid_array, j) < 0) { /* If failed to unpin pages, it's fatal error */ err = psmi_handle_error(tidc->context->ep, PSM2_EP_DEVICE_FAILURE, "Failed to tid free %d tids", j); return err; } } return PSM2_OK; } psm2_error_t ips_tidcache_acquire(struct ips_tid *tidc, const void *buf, uint32_t *length, uint32_t *tid_array, uint32_t *tidcnt, uint32_t *tidoff #ifdef PSM_CUDA , uint8_t is_cuda_ptr #endif ) { cl_qmap_t *p_map = &tidc->tid_cachemap; cl_map_item_t *p_item; unsigned long start = (unsigned long)buf; unsigned long end = start + (*length); uint32_t idx, nbytes; uint64_t event_mask; psm2_error_t err; /* * Before every tid caching search, we need to update the * tid caching if there is invalidation event, otherwise, * the cached address may be invalidated and we might have * wrong matching. */ err = psmi_hal_get_hfi_event_bits(&event_mask,tidc->context->psm_hw_ctxt); if_pf (err) return PSM2_INTERNAL_ERR; if (event_mask & PSM_HAL_HFI_EVENT_TID_MMU_NOTIFY) { err = ips_tidcache_invalidation(tidc); if (err) return err; } /* * Now we can do matching from the caching, because obsolete * address in caching has been removed or identified. */ retry: p_item = ips_cl_qmap_search(p_map, start, end); idx = 2*IPS_TIDINFO_GET_TID(p_item->payload.tidinfo) + IPS_TIDINFO_GET_TIDCTRL(p_item->payload.tidinfo); /* * There is tid matching. */ if (idx) { /* * if there is a caching match, but the tid has been * invalidated, we can't match this tid, and we also * can't register this address, we need to wait this * tid to be freed. */ if (INVALIDATE(idx) != 0) return PSM2_OK_NO_PROGRESS; /* * if the page offset within the tid is not less than * 128K, the address offset within the page is not 64B * multiple, PSM can't handle this tid with any offset * mode. We need to free this tid and re-register with * the asked page address. */ if (((start - START(idx)) >= 131072) && ((*tidoff) & 63)) { /* * If the tid is currently used, retry later. */ if (REFCNT(idx) != 0) return PSM2_OK_NO_PROGRESS; /* * free this tid. */ tidc->tid_array[0] = p_map->root[idx].payload.tidinfo; err = ips_tidcache_remove(tidc, 1); if (err) return err; /* try to match a node again */ goto retry; } } /* * If there is no match node, or 'start' falls out of node range, * whole or partial buffer from 'start' is not registered yet. */ if (!idx || START(idx) > start) { if (!idx) nbytes = end - start; else nbytes = START(idx) - start; /* * Because we don't have any match tid yet, if * there is an error, we return from here, PSM * will try later. */ err = ips_tidcache_register(tidc, start, nbytes, &idx #ifdef PSM_CUDA , is_cuda_ptr #endif ); if (err) return err; } /* * sanity check. 
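* (Illustration of the bookkeeping below, with hypothetical addresses:
* if the matched tid covers START(idx) = 0x10000000 and the caller's
* buffer begins at start = 0x10001000, then *tidoff grows by 0x1000,
* REFCNT(idx) is incremented, and the entry is pulled off the idle
* queue when the count reaches 1.)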
*/ psmi_assert(START(idx) <= start); psmi_assert(INVALIDATE(idx) == 0); *tidoff += start - START(idx); *tidcnt = 1; tid_array[0] = p_map->root[idx].payload.tidinfo; REFCNT(idx)++; if (REFCNT(idx) == 1) IDLE_REMOVE(idx); start = END(idx); while (start < end) { p_item = ips_cl_qmap_successor(p_map, &p_map->root[idx]); idx = 2*IPS_TIDINFO_GET_TID(p_item->payload.tidinfo) + IPS_TIDINFO_GET_TIDCTRL(p_item->payload.tidinfo); if (!idx || START(idx) != start) { if (!idx) nbytes = end - start; else nbytes = (START(idx) > end) ? (end - start) : (START(idx) - start); /* * Because we already have at least one matched tid, * if an error occurs while registering new pages, we * break here and return the tids we already have. */ err = ips_tidcache_register(tidc, start, nbytes, &idx #ifdef PSM_CUDA , is_cuda_ptr #endif ); if (err) break; } else if (INVALIDATE(idx) != 0) { /* * the tid has been invalidated; it is still in the * cache because it is still in use, but any new usage * is not allowed. We ignore it and return the tids * we already have. */ psmi_assert(REFCNT(idx) != 0); break; } /* * sanity check. */ psmi_assert(START(idx) == start); psmi_assert(INVALIDATE(idx) == 0); tid_array[(*tidcnt)++] = p_map->root[idx].payload.tidinfo; REFCNT(idx)++; if (REFCNT(idx) == 1) IDLE_REMOVE(idx); start = END(idx); } if (start < end) *length = start - (unsigned long)buf; /* otherwise, all pages are registered */ psmi_assert((*tidcnt) > 0); return PSM2_OK; } psm2_error_t ips_tidcache_release(struct ips_tid *tidc, uint32_t *tid_array, uint32_t tidcnt) { cl_qmap_t *p_map = &tidc->tid_cachemap; uint32_t i, j, idx; psm2_error_t err; psmi_assert(tidcnt > 0); j = 0; for (i = 0; i < tidcnt; i++) { /* * Driver only returns tidctrl=1 or tidctrl=2. */ idx = 2*IPS_TIDINFO_GET_TID(tid_array[i]) + IPS_TIDINFO_GET_TIDCTRL(tid_array[i]); psmi_assert(idx != 0); psmi_assert(idx <= tidc->tid_ctrl->tid_num_max); psmi_assert(REFCNT(idx) != 0); REFCNT(idx)--; if (REFCNT(idx) == 0) { if (INVALIDATE(idx) != 0) { ips_cl_qmap_remove_item(p_map, &p_map->root[idx]); tidc->tid_array[j] = tid_array[i]; j++; } else { IDLE_INSERT(idx); } } } if (j > 0) { /* * call driver to free the tids. */ if (psmi_hal_free_tid(tidc->context->psm_hw_ctxt, (uint64_t) (uintptr_t) tidc->tid_array, j) < 0) { /* If we failed to unpin pages, it's a fatal error */ err = psmi_handle_error(tidc->context->ep, PSM2_EP_DEVICE_FAILURE, "Failed to tid free %d tids", j); return err; } } return PSM2_OK; } /* * * Call driver to free all cached tids. */ psm2_error_t ips_tidcache_cleanup(struct ips_tid *tidc) { cl_qmap_t *p_map = &tidc->tid_cachemap; psm2_error_t err; int i, j; j = 0; for (i = 1; i <= tidc->tid_ctrl->tid_num_max; i++) { psmi_assert(REFCNT(i) == 0); if (INVALIDATE(i) == 0) { tidc->tid_array[j++] = p_map->root[i].payload.tidinfo; } } if (j > 0) { /* * call driver to free the tids. */ if (psmi_hal_free_tid(tidc->context->psm_hw_ctxt, (uint64_t) (uintptr_t) tidc->tid_array, j) < 0) { /* If we failed to unpin pages, it's a fatal error */ err = psmi_handle_error(tidc->context->ep, PSM2_EP_DEVICE_FAILURE, "Failed to tid free %d tids", j); return err; } } psmi_free(tidc->tid_array); psmi_free(tidc->tid_cachemap.root); return PSM2_OK; } /* Note that the caller is responsible for making sure that NIDLE is non-zero before calling ips_tidcache_evict. If NIDLE is 0 at the time of the call, the behavior of ips_tidcache_evict is undefined.
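A guarded call therefore mirrors the registration path above (sketch;
assumes a local p_map in scope, as the NIDLE macro expects):

    if (NIDLE) {
            uint64_t evicted = ips_tidcache_evict(tidc, length);
            if (evicted >= length)
                    goto retry;     // enough idle tids were freed
    }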
*/ uint64_t ips_tidcache_evict(struct ips_tid *tidc,uint64_t length) { cl_qmap_t *p_map = &tidc->tid_cachemap; uint32_t idx = IHEAD, tidcnt = 0, tidlen = 0; /* * try to free the required * pages from idle queue tids */ do { idx = IPREV(idx); psmi_assert(idx != 0); tidc->tid_array[tidcnt] = p_map->root[idx].payload.tidinfo; tidcnt++; tidlen += IPS_TIDINFO_GET_LENGTH (p_map->root[idx].payload.tidinfo)<<12; } while (tidcnt < NIDLE && tidlen < length); /* * free the selected tids on successfully finding some:. */ if (tidcnt > 0 && ips_tidcache_remove(tidc, tidcnt)) return 0; return tidlen; } opa-psm2-PSM2_11.2.185/ptl_ips/ips_tidcache.h000066400000000000000000000131301370564314600204610ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _IPS_TIDCACHE_H #define _IPS_TIDCACHE_H #include #include #include #include /* * Design notes. * * PSM needs to call into driver to program receiving buffer pages to * HFI gen1 hardware, each tid can be programmed with physically contiguous * power-of-two pages from 1 pages to 512 pages. This procedure takes * time. * * Lots of applications tend to re-use the same receiving buffer, caching * such programmed tids in user space process will save time and improve * application performance. * * This PSM tid registration caching design requires cooperation between * PSM and driver. Here is what happen between PSM and driver. * * 1. 
PSM call into driver with a chunk of buffer with virtual address * and length. * 2. driver pins the buffer pages, program hardware with the physical * pages, get a list of tids. * 3. driver caches the tids with the corresponding virtual address in * user space for each tid, and return the list of tids back to PSM. * 4. PSM also caches the list of tids with the corresponding virtual * address for each tid, and use the list of tids for transmission. * 5. when process frees a buffer, kernel VM will catch the event and * calls the callback in driver to notify that the virtual address * range is gone in the process. * 6. driver will search its cache system and find the tids with the * removed virtual address, put these tid in an invalidation queue * and notify PSM the event. * 7. PSM will pick the event and remove the tids from its own cache * as well. * 8. PSM must check such invalidation event every time before searching * its caching system to match tids for a 'new' buffer chunk. * 9, when the caching system is full, and a new buffer chunk is asked * to register, PSM picks a victim to remove. */ typedef struct { unsigned long start; /* start virtual address */ uint32_t tidinfo; /* tid encoding */ uint16_t length; /* length in pages */ uint16_t invalidate; /* invalidate flag */ uint16_t refcount; /* usage reference count */ uint16_t i_prev; /* idle queue previous */ uint16_t i_next; /* idle queue next */ } rbtree_tidcache_mapitem_pl_t; typedef struct { uint32_t ntid; /* tids are cached */ uint32_t nidle; /* tids are idle */ } rbtree_tidcache_map_pl_t; #define RBTREE_MI_PL rbtree_tidcache_mapitem_pl_t #define RBTREE_MAP_PL rbtree_tidcache_map_pl_t #include "rbtree.h" /* * Macro definition for easy programming. */ #define NTID p_map->payload.ntid #define REFCNT(x) p_map->root[x].payload.refcount #define INVALIDATE(x) p_map->root[x].payload.invalidate #define LENGTH(x) p_map->root[x].payload.length #define START(x) p_map->root[x].payload.start #define END(x) (START(x) + (LENGTH(x)<<12)) /* * Macro for idle tid queue management. */ #define NIDLE p_map->payload.nidle #define IHEAD 0 #define INEXT(x) p_map->root[x].payload.i_next #define IPREV(x) p_map->root[x].payload.i_prev #define IDLE_REMOVE(x) do { \ INEXT(IPREV(x)) = INEXT(x); \ IPREV(INEXT(x)) = IPREV(x); \ NIDLE--; \ } while (0) #define IDLE_INSERT(x) do { \ INEXT(x) = INEXT(IHEAD); \ IPREV(x) = IHEAD; \ IPREV(INEXT(IHEAD)) = x; \ INEXT(IHEAD) = x; \ NIDLE++; \ } while (0) extern void ips_tidcache_map_init(cl_qmap_t *p_map, cl_map_item_t* const root, cl_map_item_t* const nil_item); #endif opa-psm2-PSM2_11.2.185/ptl_ips/ips_tidflow.c000066400000000000000000000174021370564314600203660ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ #include "psm_user.h" #include "psm2_hal.h" #include "ips_proto.h" #include "ips_expected_proto.h" #include "ips_tidflow.h" psm2_error_t ips_tf_init(struct ips_protoexp *protoexp, const psmi_context_t *context, struct ips_tf *tfc, ips_tf_avail_cb_fn_t cb) { int tf_idx; #if TF_ADD struct psmi_stats_entry entries[] = { PSMI_STATS_DECL("tidflow update count", MPSPAWN_STATS_REDUCTION_ALL, NULL, &tfc->tf_num_total), }; #endif tfc->context = context; tfc->tf_num_total = 0; tfc->tf_num_inuse = 0; tfc->tf_avail_cb = cb; tfc->tf_avail_context = (void *)protoexp; if (psmi_hal_has_cap(PSM_HAL_CAP_EXTENDED_PSN)) { tfc->tf_gen_mask = 0xFFFFF; } else { tfc->tf_gen_mask = 0x1FFF; } /* Allocate and Initialize tidrecvc array. */ tfc->tidrecvc = (struct ips_tid_recv_desc *) psmi_calloc(context->ep, UNDEFINED, 1, sizeof(struct ips_tid_recv_desc)*HFI_TF_NFLOWS); if (tfc->tidrecvc == NULL) return PSM2_NO_MEMORY; for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) { tfc->tidrecvc[tf_idx].context = context; tfc->tidrecvc[tf_idx].protoexp = protoexp; tfc->tidrecvc[tf_idx].rdescid._desc_idx = tf_idx; tfc->tidrecvc[tf_idx].rdescid._desc_genc = tf_idx; tfc->tidrecvc[tf_idx].tidflow.flowid = EP_FLOW_TIDFLOW; tfc->tidrecvc[tf_idx].tidflow.frag_size = protoexp->proto->epinfo.ep_mtu; } /* Shared control structure, it will be in shared memory * for context sharing, otherwise calloc() it */ tfc->tf_ctrl = (struct ips_tf_ctrl *)context->tf_ctrl; if (!tfc->tf_ctrl) { tfc->tf_ctrl = (struct ips_tf_ctrl *) psmi_calloc(context->ep, UNDEFINED, 1, sizeof(struct ips_tf_ctrl)); if (tfc->tf_ctrl == NULL) { return PSM2_NO_MEMORY; } } /* * Only the master process can initialize. 
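 *
 * The loop below threads all HFI_TF_NFLOWS entries into a singly
 * linked free list: tf[i].next_free = i + 1 with tf_head = 0.
 * ips_tf_allocate() then pops from the head and ips_tf_deallocate()
 * pushes the freed index back, e.g. (sketch of the pop, taken under
 * the shared spin lock):
 *
 *   entry = &ctrl->tf[ctrl->tf_head];   // grab first free flow
 *   ctrl->tf_head = entry->next_free;   // unlink it
 *   ctrl->tf_num_avail--;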
*/ if (psmi_hal_get_subctxt(context->psm_hw_ctxt) == 0) { pthread_spin_init(&tfc->tf_ctrl->tf_ctrl_lock, PTHREAD_PROCESS_SHARED); tfc->tf_ctrl->tf_num_max = HFI_TF_NFLOWS; tfc->tf_ctrl->tf_num_avail = HFI_TF_NFLOWS; for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) { /* Update flow state */ tfc->tf_ctrl->tf[tf_idx].state = TF_STATE_DEALLOCATED; tfc->tf_ctrl->tf[tf_idx].tf_idx = tf_idx; tfc->tf_ctrl->tf[tf_idx].next_gen = 0; tfc->tf_ctrl->tf[tf_idx].next_free = tf_idx + 1; psmi_hal_tidflow_reset(tfc->context->psm_hw_ctxt, tf_idx, tfc->tf_gen_mask, 0x7FF); } tfc->tf_ctrl->tf_head = 0; } #if TF_ADD /* TF_ADD: Add a new stats type for tid flows in psm_stats.h */ return psmi_stats_register_type(PSMI_STATS_NO_HEADING, PSMI_STATSTYPE_TIDS, entries, PSMI_STATS_HOWMANY(entries), tfc); #else return PSM2_OK; #endif } psm2_error_t ips_tf_fini(struct ips_tf *tfc) { if (!tfc->context->tf_ctrl) psmi_free(tfc->tf_ctrl); psmi_free(tfc->tidrecvc); return PSM2_OK; } /* Allocate a tidflow */ psm2_error_t ips_tf_allocate(struct ips_tf *tfc, struct ips_tid_recv_desc **tidrecvc) { struct ips_tf_ctrl *ctrl = tfc->tf_ctrl; struct ips_tf_entry *entry; if (tfc->context->tf_ctrl) pthread_spin_lock(&ctrl->tf_ctrl_lock); if (!ctrl->tf_num_avail) { psmi_assert(ctrl->tf_head == HFI_TF_NFLOWS); *tidrecvc = NULL; if (tfc->context->tf_ctrl) pthread_spin_unlock(&ctrl->tf_ctrl_lock); return PSM2_EP_NO_RESOURCES; } entry = &ctrl->tf[ctrl->tf_head]; ctrl->tf_head = entry->next_free; ctrl->tf_num_avail--; if (tfc->context->tf_ctrl) pthread_spin_unlock(&ctrl->tf_ctrl_lock); tfc->tf_num_total++; tfc->tf_num_inuse++; psmi_assert(entry->state == TF_STATE_DEALLOCATED); entry->state = TF_STATE_ALLOCATED; *tidrecvc = &(tfc->tidrecvc[entry->tf_idx]); /* initial tidflow generation */ (*tidrecvc)->tidflow_active_gen = entry->next_gen; psmi_assert((*tidrecvc)->rdescid._desc_idx == entry->tf_idx); psmi_assert_always(entry->next_gen < tfc->tf_gen_mask); entry->next_gen++; if (entry->next_gen == tfc->tf_gen_mask) entry->next_gen = 0; return PSM2_OK; } /* Deallocate a tidflow */ psm2_error_t ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx) { struct ips_tf_ctrl *ctrl = tfc->tf_ctrl; struct ips_tf_entry *entry; psmi_assert(tf_idx < HFI_TF_NFLOWS); psmi_assert(tf_idx >= 0); entry = &ctrl->tf[tf_idx]; psmi_assert(entry->state == TF_STATE_ALLOCATED); entry->state = TF_STATE_DEALLOCATED; /* * The wire protocol only uses 16 bits of tidrecvc generation * count in the exptid packet; this should be big enough. * u16w3 is the lower 16 bits of _desc_genc */ tfc->tidrecvc[tf_idx].rdescid.u16w3++; /* Mark invalid generation for flow (stale packets will be dropped) */ psmi_hal_tidflow_reset(tfc->context->psm_hw_ctxt, tf_idx, tfc->tf_gen_mask, 0x7FF); if (tfc->context->tf_ctrl) pthread_spin_lock(&ctrl->tf_ctrl_lock); entry->next_free = ctrl->tf_head; ctrl->tf_head = tf_idx; ctrl->tf_num_avail++; if (tfc->context->tf_ctrl) pthread_spin_unlock(&ctrl->tf_ctrl_lock); tfc->tf_num_inuse--; /* If an available callback is registered, invoke it */ if (((tfc->tf_num_inuse + 1) == ctrl->tf_num_max) && tfc->tf_avail_cb) tfc->tf_avail_cb(tfc, tfc->tf_avail_context); return PSM2_OK; } /* Allocate a generation for a flow */ psm2_error_t ips_tfgen_allocate(struct ips_tf *tfc, uint32_t tf_idx, uint32_t *tfgen) { struct ips_tf_entry *entry; int ret = PSM2_OK; psmi_assert(tf_idx < HFI_TF_NFLOWS); psmi_assert(tf_idx >= 0); entry = &tfc->tf_ctrl->tf[tf_idx]; psmi_assert(entry->state == TF_STATE_ALLOCATED); *tfgen = entry->next_gen; entry->next_gen++; if (entry->next_gen ==
tfc->tf_gen_mask) entry->next_gen = 0; psmi_assert_always(*tfgen < tfc->tf_gen_mask); return ret; } opa-psm2-PSM2_11.2.185/ptl_ips/ips_tidflow.h000066400000000000000000000075741370564314600204040ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2016 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2016 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/ #ifndef _IPS_TIDFLOW_H #define _IPS_TIDFLOW_H #include "psm_user.h" struct ips_tf; struct ips_protoexp; typedef void (*ips_tf_avail_cb_fn_t) (struct ips_tf *, void *context); typedef enum { TF_STATE_INVALID = 0, TF_STATE_ALLOCATED = 1, TF_STATE_DEALLOCATED = 2 } tf_state_t; struct ips_tf_entry { tf_state_t state; uint32_t tf_idx; uint32_t next_gen; uint32_t next_free; }; struct ips_tf_ctrl { pthread_spinlock_t tf_ctrl_lock; uint32_t tf_num_max; uint32_t tf_num_avail; uint32_t tf_head; struct ips_tf_entry tf[HFI_TF_NFLOWS]; } __attribute__ ((aligned(64))); struct ips_tf { const psmi_context_t *context; ips_tf_avail_cb_fn_t tf_avail_cb; void *tf_avail_context; struct ips_tf_ctrl *tf_ctrl; uint64_t tf_num_total; uint32_t tf_num_inuse; uint32_t tf_gen_mask; /* Pointer to array of size HFI_TF_NFLOWS */ struct ips_tid_recv_desc *tidrecvc; }; PSMI_ALWAYS_INLINE(int ips_tf_available(struct ips_tf *tf)) { if (tf->tf_ctrl->tf_num_avail == 0) { if (tf->tf_ctrl->tf_num_max == tf->tf_num_inuse) return -1; else return 0; } return tf->tf_ctrl->tf_num_avail; } psm2_error_t ips_tf_init(struct ips_protoexp *protoexp, const psmi_context_t *context, struct ips_tf *tfc, ips_tf_avail_cb_fn_t cb); psm2_error_t ips_tf_fini(struct ips_tf *tfc); /* Allocate a tidflow */ psm2_error_t ips_tf_allocate(struct ips_tf *tfc, struct ips_tid_recv_desc **tidrecvc); /* Deallocate a tidflow */ psm2_error_t ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx); /* Allocate a generation for a flow */ psm2_error_t ips_tfgen_allocate(struct ips_tf *tfc, uint32_t tf_idx, uint32_t *tfgen); #endif opa-psm2-PSM2_11.2.185/ptl_ips/ips_writehdrq.c000066400000000000000000000055651370564314600207360ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include "psm_user.h" #include "psm2_hal.h" #include "ips_writehdrq.h" #include "ips_proto_params.h" psm2_error_t ips_writehdrq_init(const psmi_context_t *context, struct ips_writehdrq *writeq, struct ips_writehdrq_state *state, uint32_t subcontext) { uint32_t elemsz = psmi_hal_get_rx_hdr_q_ent_size(context->psm_hw_ctxt), elemcnt = psmi_hal_get_rx_hdr_q_cnt(context->psm_hw_ctxt); memset(writeq, 0, sizeof(*writeq)); writeq->context = context; writeq->state = state; writeq->hdrq_elemlast = (elemcnt - 1) * (elemsz >> BYTE2DWORD_SHIFT); writeq->state->enabled = 1; return PSM2_OK; } opa-psm2-PSM2_11.2.185/ptl_ips/ips_writehdrq.h000066400000000000000000000062751370564314600207420ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ #ifndef _IPS_WRITEHDRQ_H #define _IPS_WRITEHDRQ_H #include "psm_user.h" #include "ips_recvq.h" /* * Structure containing state for writehdrq writing. This is logically * part of ips_writehdrq but needs to be separated out for context * sharing so that it can be put in a shared memory page and hence * be available to all processes sharing the port. Generally, do not * put pointers in here since the address map of each process can be * different. */ struct ips_writehdrq_state { uint32_t hdrq_rhf_seq; /* last seq */ uint32_t egrq_offset; /* in bytes unit, not 64B */ uint32_t enabled; /* enables writing */ }; struct ips_writehdrq { const psmi_context_t *context; struct ips_writehdrq_state *state; uint32_t hdrq_elemlast; }; psm2_error_t ips_writehdrq_init(const psmi_context_t *context, struct ips_writehdrq *writeq, struct ips_writehdrq_state *state, uint32_t subcontext); #endif /* _IPS_WRITEHDRQ_H */ opa-psm2-PSM2_11.2.185/ptl_ips/ptl.c000066400000000000000000000613571370564314600166520ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ /* This file implements the PSM PTL for ips */ #include "psm_user.h" #include "psm2_hal.h" #include "ptl_ips.h" #include "psm_mq_internal.h" int ips_ptl_recvq_isempty(const struct ptl *ptl); static int ips_subcontext_ignore(struct ips_recvhdrq_event *rcv_ev, uint32_t subcontext) { return IPS_RECVHDRQ_CONTINUE; } static int ips_subcontext_process(struct ips_recvhdrq_event *rcv_ev, uint32_t subcontext) { struct ptl_shared *recvshc = ((struct ptl_ips *)(rcv_ev->proto->ptl))->recvshc; if_pt(subcontext != recvshc->subcontext && subcontext < recvshc->subcontext_cnt) { return psmi_hal_forward_packet_to_subcontext(&recvshc->writeq[subcontext], rcv_ev, subcontext, rcv_ev->recvq->context->psm_hw_ctxt); } else { _HFI_VDBG ("Drop pkt for subcontext %d out of %d (I am %d) : errors 0x%x\n", (int)subcontext, (int)recvshc->subcontext_cnt, (int)recvshc->subcontext, psmi_hal_rhf_get_all_err_flags(rcv_ev->psm_hal_rhf)); return IPS_RECVHDRQ_BREAK; } } static psm2_error_t shrecvq_init(ptl_t *ptl, const psmi_context_t *context); static psm2_error_t shrecvq_fini(ptl_t *ptl); static size_t ips_ptl_sizeof(void) { return sizeof(struct ptl_ips); } static int ips_ptl_epaddr_stats_num(void) { return sizeof(struct ips_proto_epaddr_stats) / sizeof(uint64_t); } static int ips_ptl_epaddr_stats_init(char **desc, uint16_t *flags) { int num_stats = sizeof(struct ips_proto_epaddr_stats) / sizeof(uint64_t); int i; /* All stats are uint64_t */ for (i = 0; i < num_stats; i++) flags[i] = MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO; desc[0] = "errchecks sent"; desc[1] = "errchecks recv"; desc[2] = "naks sent"; desc[3] = "naks recv"; desc[4] = "connect reqs sent"; desc[5] = "disconnect reqs sent"; desc[6] = "tid grants sent"; desc[7] = "tid grants recv"; desc[8] = "send rexmit"; desc[9] = "congestion packets"; return num_stats; } int ips_ptl_epaddr_stats_get(psm2_epaddr_t epaddr, uint64_t *stats_o) { int i, num_stats = sizeof(struct ips_proto_epaddr_stats) / sizeof(uint64_t); uint64_t *stats_i = (uint64_t *) &epaddr->proto->epaddr_stats; for (i = 0; i < num_stats; i++) stats_o[i] = stats_i[i]; return num_stats; } static psm2_error_t psmi_context_check_status_callback(struct psmi_timer *t, uint64_t current) { struct ptl_ips *ptl = (struct ptl_ips *)t->context; const uint64_t current_count = get_cycles(); psm2_error_t err; err = psmi_context_check_status(ptl->context); if (err == PSM2_OK || err == PSM2_OK_NO_PROGRESS) { int rc = psmi_hal_spio_process_events((struct ptl *)ptl); err = rc >= 0 ? 
PSM2_OK : PSM2_INTERNAL_ERR; } psmi_timer_request_always(&ptl->timerq, &ptl->status_timer, current_count + ptl->status_cyc_timeout); return err; } static psm2_error_t ips_ptl_init(const psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) { struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; psm2_error_t err = PSM2_OK; uint32_t num_of_send_bufs = ep->hfi_num_sendbufs; uint32_t num_of_send_desc = ep->hfi_num_descriptors; uint32_t imm_size = ep->hfi_imm_size; const psmi_context_t *context = &ep->context; const int enable_shcontexts = (psmi_hal_get_subctxt_cnt(context->psm_hw_ctxt) > 0); const uint64_t current_count = get_cycles(); /* Preconditions */ psmi_assert_always(ep != NULL); psmi_assert_always(ep->epaddr != NULL); psmi_assert_always(ep->epid != 0); psmi_assert_always(ep->hfi_num_sendbufs > 0); memset(ptl, 0, sizeof(struct ptl_ips)); ptl->ep = ep; /* back pointer */ ptl->epid = ep->epid; /* cache epid */ ptl->epaddr = ep->epaddr; /* cache a copy */ ptl->ctl = ctl; ptl->context = context; memset(ctl, 0, sizeof(*ctl)); /* Fill in the control structure */ ctl->ep = ep; ctl->ptl = ptl_gen; ctl->ep_poll = enable_shcontexts ? ips_ptl_shared_poll : ips_ptl_poll; ctl->ep_connect = ips_ptl_connect; ctl->ep_disconnect = ips_ptl_disconnect; ctl->mq_send = ips_proto_mq_send; ctl->mq_isend = ips_proto_mq_isend; ctl->am_get_parameters = ips_am_get_parameters; ctl->am_short_request = ips_am_short_request; ctl->am_short_reply = ips_am_short_reply; ctl->epaddr_stats_num = ips_ptl_epaddr_stats_num; ctl->epaddr_stats_init = ips_ptl_epaddr_stats_init; ctl->epaddr_stats_get = ips_ptl_epaddr_stats_get; ctl->msg_size_thresh_query = ips_proto_msg_size_thresh_query; /* * Runtime flags in 'ptl' are different from runtime flags in 'context'. * In 'context', runtime flags reflect what the driver is capable of. * In 'ptl', runtime flags reflect the features we can or want to use in * the driver's supported runtime flags. */ /* * This timer is to be used to check the context's status at every * PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS. This is useful to detect when * the link transitions from the DOWN state to the UP state. We can thus * stop aggregating link failure messages once we detect that the link is * up. */ psmi_timer_entry_init(&ptl->status_timer, psmi_context_check_status_callback, ptl); /* cache the context's status timeout in cycles */ ptl->status_cyc_timeout = ms_2_cycles(PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS); /* * Retransmissions and pending operations are kept in a timer structure * (queue). The timerq is shared to various internal IPS interfaces so * that they too may schedule events on the timer queue. The timerq is * drained in the progress function. */ if ((err = psmi_timer_init(&ptl->timerq))) goto fail; /* start the context's status timer */ psmi_timer_request_always(&ptl->timerq, &ptl->status_timer, current_count + ptl->status_cyc_timeout); /* * Epstate maps endpoint ids (epid integers) to ipsaddr (structs). Mappings * are added/removed by the connect portion of the ips protocol and lookup * is made by the receive queue processing component. */ if ((err = ips_epstate_init(&ptl->epstate, context))) goto fail; /* * Context sharing, setup subcontext ureg page. 
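 *
 * When several processes share one hardware context, each is assigned
 * a subcontext id in [0, subcontext_cnt); the code below caches those
 * ids, and subcontext 0 alone initializes the process-shared spin
 * lock that later serializes draining of the hardware receive queue
 * (see shrecvq_init() and ips_ptl_shared_poll()).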
*/ if (enable_shcontexts) { struct ptl_shared *recvshc; recvshc = (struct ptl_shared *) psmi_calloc(ep, UNDEFINED, 1, sizeof(struct ptl_shared)); if (recvshc == NULL) { err = PSM2_NO_MEMORY; goto fail; } ptl->recvshc = recvshc; recvshc->ptl = ptl_gen; /* Initialize recvshc fields */ recvshc->context = psmi_hal_get_context(context->psm_hw_ctxt); recvshc->subcontext = psmi_hal_get_subctxt(context->psm_hw_ctxt); recvshc->subcontext_cnt = psmi_hal_get_subctxt_cnt(context->psm_hw_ctxt); psmi_assert_always(recvshc->subcontext_cnt <= PSM_HAL_MAX_SHARED_CTXTS); psmi_assert_always(recvshc->subcontext < recvshc->subcontext_cnt); /* * Using ep->context to avoid const modifier since this function * will modify the content in ep->context. */ if ((err = psmi_hal_subcontext_ureg_get(ptl_gen, recvshc->subcontext_ureg, context->psm_hw_ctxt))) goto fail; /* Note that the GEN1 HAL instance initializes struct ips_subcontext_ureg during context open. */ recvshc->context_lock = &recvshc->hwcontext_ctrl->context_lock; if (recvshc->subcontext == 0) { if (pthread_spin_init(recvshc->context_lock, PTHREAD_PROCESS_SHARED) != 0) { err = psmi_handle_error(ptl->ep, PSM2_EP_DEVICE_FAILURE, "Couldn't initialize process-shared spin lock"); goto fail; } } } /* * Hardware send pio used by eager and control messages. */ if ((err = psmi_hal_spio_init(context, ptl_gen, &ptl->spioc))) goto fail; /* * Actual ips protocol handling. */ if ((err = ips_proto_init(context, ptl_gen, num_of_send_bufs, num_of_send_desc, imm_size, &ptl->timerq, &ptl->epstate, ptl->spioc, &ptl->proto))) goto fail; /* * Hardware receive hdr/egr queue, services incoming packets and issues * callbacks for protocol handling in proto_recv. It uses the epstate * interface to determine if a packet is known or unknown. */ if (!enable_shcontexts) { struct ips_recvhdrq_callbacks recvq_callbacks; recvq_callbacks.callback_packet_unknown = ips_proto_process_unknown; recvq_callbacks.callback_subcontext = ips_subcontext_ignore; recvq_callbacks.callback_error = ips_proto_process_packet_error; if ((err = ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto, &recvq_callbacks, 0, &ptl->recvq, &ptl->recvq_state, PSM_HAL_CL_Q_RX_HDR_Q))) goto fail; } /* * Software receive hdr/egr queue, used in shared contexts. */ else if ((err = shrecvq_init(ptl_gen, context))) goto fail; /* * Receive thread, always initialized but not necessary creates a * pthread. 
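 * The pthread itself is only spawned when the HAL reports
 * PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD and no receive thread has been
 * started yet; see ips_ptl_rcvthread_init() in ptl_rcvthread.c.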
*/ if ((err = ips_ptl_rcvthread_init(ptl_gen, &ptl->recvq))) goto fail; fail: return err; } static psm2_error_t ips_ptl_fini(ptl_t *ptl_gen, int force, uint64_t timeout_in) { struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; psm2_error_t err = PSM2_OK; const int enable_shcontexts = (psmi_hal_get_subctxt_cnt(ptl->context->psm_hw_ctxt) > 0); if ((err = ips_proto_fini(&ptl->proto, force, timeout_in))) goto fail; /* We have to cancel the thread after terminating the protocol because * connect/disconnect packets use interrupts and the kernel doesn't * like to have no pollers waiting */ if ((err = ips_ptl_rcvthread_fini(ptl_gen))) goto fail; if ((err = ips_epstate_fini(&ptl->epstate))) goto fail; if ((err = psmi_hal_spio_fini(&ptl->spioc, ptl->context->psm_hw_ctxt))) goto fail; if ((err = psmi_timer_fini(&ptl->timerq))) goto fail; if (enable_shcontexts && (err = shrecvq_fini(ptl_gen))) goto fail; fail: return err; } static psm2_error_t ips_ptl_optctl(const void *core_obj, int optname, void *optval, uint64_t *optlen, int get) { psm2_error_t err = PSM2_OK; switch (optname) { case PSM2_IB_OPT_EP_SL: { /* Core object is psm2_epaddr */ psm2_epaddr_t epaddr = (psm2_epaddr_t) core_obj; ips_epaddr_t *ipsaddr = (ips_epaddr_t *) epaddr; /* If endpoint does not use IB ignore for set, complain for get */ if (epaddr->ptlctl->ep_connect != ips_ptl_connect) { if (get) err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_PARAM_ERR, "Invalid EP transport"); goto exit_fn; } /* Sanity check option length */ if (*optlen < sizeof(uint8_t)) { err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_PARAM_ERR, "Option value length error"); *optlen = sizeof(unsigned); goto exit_fn; } if (get) { /* Get returns the SL for the PIO flow */ *((uint8_t *) optval) = (uint8_t) ipsaddr-> flows[EP_FLOW_GO_BACK_N_PIO].path->pr_sl; } else { uint16_t new_sl; /* Sanity check if SL is within range */ new_sl = (uint16_t) *(uint8_t *) optval; if (new_sl > PSMI_SL_MAX) { err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_PARAM_ERR, "Invalid SL value %u. %d<= SL <=%d.", new_sl, PSMI_SL_MIN, PSMI_SL_MAX); goto exit_fn; } /* Set new SL for all flows */ ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].path-> pr_sl = new_sl; ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA].path-> pr_sl = new_sl; } } break; case PSM2_IB_OPT_DF_SL: { /* Set default SL to be used by an endpoint for all communication */ /* Core object is psm2_epaddr */ psm2_ep_t ep = (psm2_ep_t) core_obj; /* Make sure ep is specified */ if (!ep) { err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_PARAM_ERR, "Invalid PSM Endpoint"); goto exit_fn; } /* Sanity check option length */ if (*optlen < sizeof(uint8_t)) { err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_PARAM_ERR, "Option value length error"); *optlen = sizeof(uint8_t); goto exit_fn; } if (get) { *((uint8_t *) optval) = ((struct ptl_ips *)(ep->ptl_ips.ptl))->proto.epinfo.ep_sl; } else { uint16_t new_sl; /* Sanity check if SL is within range */ new_sl = (uint16_t) *(uint8_t *) optval; if (new_sl > PSMI_SL_MAX) { err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_PARAM_ERR, "Invalid SL value %u. 
%d<= SL <=%d.", new_sl, PSMI_SL_MIN, PSMI_SL_MAX); goto exit_fn; } ((struct ptl_ips *)(ep->ptl_ips.ptl))->proto.epinfo.ep_sl = (uint8_t) new_sl; } } break; default: err = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown PSM2_IB option %u.", optname); } exit_fn: return err; } static psm2_error_t ips_ptl_setopt(const void *component_obj, int optname, const void *optval, uint64_t optlen) { return ips_ptl_optctl(component_obj, optname, (void *)optval, &optlen, 0); } static psm2_error_t ips_ptl_getopt(const void *component_obj, int optname, void *optval, uint64_t *optlen) { return ips_ptl_optctl(component_obj, optname, optval, optlen, 1); } static uint32_t ips_ptl_rcvthread_is_enabled(const ptl_t *ptl) { return psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); } psm2_error_t ips_ptl_poll(ptl_t *ptl_gen, int _ignored) { struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; const uint64_t current_count = get_cycles(); const int do_lock = PSMI_LOCK_DISABLED && psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); psm2_error_t err = PSM2_OK_NO_PROGRESS; psm2_error_t err2; if (!ips_recvhdrq_isempty(&ptl->recvq)) { if (do_lock && !ips_recvhdrq_trylock(&ptl->recvq)) return err; if (ptl->recvq.proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) { ips_recvhdrq_scan_cca(&ptl->recvq); } err = ips_recvhdrq_progress(&ptl->recvq); if (do_lock) ips_recvhdrq_unlock(&ptl->recvq); if_pf(err > PSM2_OK_NO_PROGRESS) return err; err2 = psmi_timer_process_if_expired(&(ptl->timerq), current_count); if (err2 != PSM2_OK_NO_PROGRESS) return err2; else return err; } /* * Process timer expirations after servicing receive queues (some packets * may have been acked, some requests-to-send may have been queued). * * It's safe to look at the timer without holding the lock because it's not * incorrect to be wrong some of the time. */ if (psmi_timer_is_expired(&(ptl->timerq), current_count)) { if (do_lock) ips_recvhdrq_lock(&ptl->recvq); err = psmi_timer_process_expired(&(ptl->timerq), current_count); if (do_lock) ips_recvhdrq_unlock(&ptl->recvq); } return err; } PSMI_INLINE(int ips_try_lock_shared_context(struct ptl_shared *recvshc)) { return pthread_spin_trylock(recvshc->context_lock); } /* Unused PSMI_INLINE(void ips_lock_shared_context(struct ptl_shared *recvshc)) { pthread_spin_lock(recvshc->context_lock); } */ PSMI_INLINE(void ips_unlock_shared_context(struct ptl_shared *recvshc)) { pthread_spin_unlock(recvshc->context_lock); } psm2_error_t ips_ptl_shared_poll(ptl_t *ptl_gen, int _ignored) { struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; const uint64_t current_count = get_cycles(); psm2_error_t err = PSM2_OK_NO_PROGRESS; psm2_error_t err2; struct ptl_shared *recvshc = ptl->recvshc; psmi_assert(recvshc != NULL); /* The following header queue checks are speculative (but safe) * until this process has acquired the lock. The idea is to * minimize lock contention due to processes spinning on the * shared context. */ if (ips_recvhdrq_isempty(&recvshc->recvq)) { if (!ips_recvhdrq_isempty(&ptl->recvq) && ips_try_lock_shared_context(recvshc) == 0) { /* check that subcontext is empty while under lock to avoid * re-ordering of incoming packets (since packets from * hardware context will be processed immediately). 
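 *
 * The overall pattern is a speculative double-check (sketch; the
 * names here are illustrative, not upstream identifiers):
 *
 *   if (my_swq_empty && hwq_has_work && trylock(ctx_lock) == 0) {
 *       if (my_swq_still_empty)   // re-check under the lock
 *           drain_hardware_queue();
 *       unlock(ctx_lock);
 *   }
 *
 * so a process only contends for the shared lock when the hardware
 * queue has work, and never drains it ahead of packets already
 * forwarded into its own software queue.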
*/ if_pt(ips_recvhdrq_isempty(&recvshc->recvq)) { if (ptl->recvq.proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) { ips_recvhdrq_scan_cca(&ptl->recvq); } err = ips_recvhdrq_progress(&ptl->recvq); } ips_unlock_shared_context(recvshc); } } if_pf(err > PSM2_OK_NO_PROGRESS) return err; if (!ips_recvhdrq_isempty(&recvshc->recvq)) { if (recvshc->recvq.proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) { ips_recvhdrq_scan_cca(&recvshc->recvq); } err2 = ips_recvhdrq_progress(&recvshc->recvq); if (err2 != PSM2_OK_NO_PROGRESS) { err = err2; } } if_pf(err > PSM2_OK_NO_PROGRESS) return err; /* * Process timer expirations after servicing receive queues (some packets * may have been acked, some requests-to-send may have been queued). */ err2 = psmi_timer_process_if_expired(&(ptl->timerq), current_count); if (err2 != PSM2_OK_NO_PROGRESS) err = err2; return err; } int ips_ptl_recvq_isempty(const ptl_t *ptl_gen) { struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; struct ptl_shared *recvshc = ptl->recvshc; if (recvshc != NULL && !ips_recvhdrq_isempty(&recvshc->recvq)) return 0; return ips_recvhdrq_isempty(&ptl->recvq); } /* * Legacy ips_get_stat -- do nothing. */ int ips_get_stat(psm2_epaddr_t epaddr, ips_sess_stat *stats) { memset(stats, 0, sizeof(ips_sess_stat)); return 0; } static psm2_error_t shrecvq_init(ptl_t *ptl_gen, const psmi_context_t *context) { struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; struct ptl_shared *recvshc = ptl->recvshc; struct ips_recvhdrq_callbacks recvq_callbacks; psm2_error_t err = PSM2_OK; int i; /* Initialize (shared) hardware context recvq (ptl->recvq) */ /* NOTE: uses recvq in ptl structure for shared h/w context */ recvq_callbacks.callback_packet_unknown = ips_proto_process_unknown; recvq_callbacks.callback_subcontext = ips_subcontext_process; recvq_callbacks.callback_error = ips_proto_process_packet_error; if ((err = ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto, &recvq_callbacks, recvshc->subcontext, &ptl->recvq, &recvshc->hwcontext_ctrl->recvq_state, PSM_HAL_CL_Q_RX_HDR_Q))) { goto fail; } /* Initialize software subcontext (recvshc->recvq). Subcontexts do */ /* not require the rcvhdr copy feature. 
*/ recvq_callbacks.callback_subcontext = ips_subcontext_ignore; if ((err = ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto, &recvq_callbacks, recvshc->subcontext, &recvshc->recvq, &recvshc->recvq_state, PSM_HAL_GET_SC_CL_Q_RX_HDR_Q(recvshc->subcontext)))) { goto fail; } /* Initialize each recvshc->writeq for shared contexts */ for (i = 0; i < recvshc->subcontext_cnt; i++) { if ((err = ips_writehdrq_init(context, &recvshc->writeq[i], &recvshc->subcontext_ureg[i]-> writeq_state, i))) { goto fail; } } if (err == PSM2_OK) _HFI_DBG ("Context sharing in use: lid %d, context %d, sub-context %d\n", (int)psm2_epid_nid(ptl->epid), recvshc->context, recvshc->subcontext); fail: return err; } static psm2_error_t shrecvq_fini(ptl_t *ptl_gen) { struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; psm2_error_t err = PSM2_OK; int i; /* disable my write header queue before deallocation */ i = ptl->recvshc->subcontext; ptl->recvshc->subcontext_ureg[i]->writeq_state.enabled = 0; psmi_free(ptl->recvshc); return err; } psm2_error_t ips_ptl_connect(ptl_t *ptl_gen, int numep, const psm2_epid_t *array_of_epid, const int *array_of_epid_mask, psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr, uint64_t timeout_in) { struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; psm2_error_t err; psm2_ep_t ep; psm2_epid_t *epid_array = NULL; psm2_error_t *error_array = NULL; psm2_epaddr_t *epaddr_array = NULL; ips_epaddr_t *ipsaddr_master, *ipsaddr; int *mask_array = NULL; int i; PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock); err = ips_proto_connect(&ptl->proto, numep, array_of_epid, array_of_epid_mask, array_of_errors, array_of_epaddr, timeout_in); if (err) return err; psmi_assert_always(ptl->ep->mctxt_master == ptl->ep); if (ptl->ep->mctxt_next == ptl->ep) return err; /* make the additional mutil-context connections. */ epid_array = (psm2_epid_t *) psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm2_epid_t) * numep); mask_array = (int *) psmi_malloc(ptl->ep, UNDEFINED, sizeof(int) * numep); error_array = (psm2_error_t *) psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm2_error_t) * numep); epaddr_array = (psm2_epaddr_t *) psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm2_epaddr_t) * numep); if (!epid_array || !mask_array || !error_array || !epaddr_array) { goto fail; } ep = ptl->ep->mctxt_next; while (ep != ep->mctxt_master) { /* Setup the mask array and epid array. */ for (i = 0; i < numep; i++) { if (array_of_epid_mask[i] && array_of_errors[i] == PSM2_OK) { ipsaddr_master = (ips_epaddr_t *) array_of_epaddr[i]; ipsaddr = ipsaddr_master->next; mask_array[i] = 0; while (ipsaddr != ipsaddr_master) { if (((psm2_epaddr_t) ipsaddr)->proto-> ep == ep) { mask_array[i] = 1; epid_array[i] = ((psm2_epaddr_t) ipsaddr)-> epid; break; } ipsaddr = ipsaddr->next; } } else { mask_array[i] = 0; } } /* Make the real protocol connections. 
*/ err = ips_proto_connect(&((struct ptl_ips *)(ep->ptl_ips.ptl))->proto, numep, epid_array, mask_array, error_array, epaddr_array, timeout_in); if (err) goto fail; ep = ep->mctxt_next; } fail: if (epid_array) psmi_free(epid_array); if (mask_array) psmi_free(mask_array); if (error_array) psmi_free(error_array); if (epaddr_array) psmi_free(epaddr_array); return err; } psm2_error_t ips_ptl_disconnect(ptl_t *ptl_gen, int force, int numep, psm2_epaddr_t array_of_epaddr[], const int array_of_epaddr_mask[], psm2_error_t array_of_errors[], uint64_t timeout_in) { struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; int *array_of_epaddr_mask_internal, i; psm2_error_t err; /* * Copy true values from array_of_epaddr_mask, provided that their * respective epaddr is an ips one. * Newly created mask will be used for the protocol disconnect call * instead. */ PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock); array_of_epaddr_mask_internal = psmi_calloc(ptl->ep, UNDEFINED, sizeof(int), numep); if (!array_of_epaddr_mask_internal) return PSM2_NO_MEMORY; for (i = 0; i < numep; ++i) { if (array_of_epaddr_mask[i] && array_of_epaddr[i] && array_of_epaddr[i]->ptlctl->ptl == ptl_gen) { array_of_epaddr_mask_internal[i] = 1; } } err = ips_proto_disconnect(&ptl->proto, force, numep, array_of_epaddr, array_of_epaddr_mask_internal, array_of_errors, timeout_in); psmi_free(array_of_epaddr_mask_internal); return err; } /* Only symbol we expose out of here */ struct ptl_ctl_init psmi_ptl_ips = { ips_ptl_sizeof, ips_ptl_init, ips_ptl_fini, ips_ptl_setopt, ips_ptl_getopt }; struct ptl_ctl_rcvthread psmi_ptl_ips_rcvthread = { ips_ptl_rcvthread_is_enabled, ips_ptl_rcvthread_transfer_ownership, }; opa-psm2-PSM2_11.2.185/ptl_ips/ptl_fwd.h000066400000000000000000000050661370564314600175120ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef _PTL_FWD_IPS_H #define _PTL_FWD_IPS_H #include "ptl.h" typedef struct ips_epaddr ips_epaddr_t; typedef struct ips_msgctl ips_msgctl_t; /* Symbol in ips ptl */ extern struct ptl_ctl_init psmi_ptl_ips; extern struct ptl_ctl_rcvthread psmi_ptl_ips_rcvthread; #endif /* _PTL_FWD_IPS_H */ opa-psm2-PSM2_11.2.185/ptl_ips/ptl_ips.h000066400000000000000000000144041370564314600175210ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef _IPS_PTL_H #define _IPS_PTL_H #include "psm_user.h" #include "ips_proto.h" #include "ips_stats.h" #include "ips_subcontext.h" struct ptl_shared; /* * PTL at the ips level (for OPA) * * This PTL structure glues all the ips components together. 
* * * ips timer, shared by various components, allows each component to * schedule time-based expiration callbacks on the timerq. * * HW receive queue * * send control block to handle eager messages * * instantiation of the ips protocol * * endpoint state, to map endpoint indexes into structures * * Receive-side * * ----[ proto ] * / ^ ^ * | | | * | packet packet * | known unknown * add_endpt \ / * | | * `----> [epstate] * ^ * | * lookup_endpt * | * [recvq] * | * poll * */ /* Updates to this struct must be reflected in PTL_IPS_SIZE in ptl_fwd.h */ /* IPS knows it functions as a PTL whenever ptl->ep is non-NULL */ struct ptl_ips { psm2_ep_t ep; /* back ptr */ psm2_epid_t epid; /* cached from ep */ psm2_epaddr_t epaddr; /* cached from ep */ ips_epaddr_t *ipsaddr; /* cached from epaddr */ ptl_ctl_t *ctl; /* cached from init */ const psmi_context_t *context; /* cached from init */ void *spioc; /* PIO send control (opaque ptr) */ struct ips_proto proto; /* protocol instance: timerq, epstate, spio */ struct psmi_timer_ctrl timerq; struct ips_epstate epstate; /* map incoming packets */ struct ips_recvhdrq_state recvq_state; struct ips_recvhdrq recvq; /* HW recvq: epstate, proto */ /* timer to check the context's status */ struct psmi_timer status_timer; /* context's status check timeout in cycles -- cached */ uint64_t status_cyc_timeout; /* Shared contexts context */ struct ptl_shared *recvshc; /* Rcv thread context */ struct ptl_rcvthread *rcvthread; } #ifndef PACK_STRUCT_STL #define PACK_STRUCT_STL /* nothing */ #endif __attribute__ ((PACK_STRUCT_STL aligned(16))); /* * Sample implementation of shared contexts context. * * In shared mode, the hardware queue is serviced by more than one process. * Each process also mirrors the hardware queue in software (represented by an * ips_recvhdrq). For packets we service in the hardware queue that are not * destined for us, we write them in other processes's receive queues * (represented by an ips_writehdrq). 
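 *
 * Forwarding sketch (cf. ips_subcontext_process() in ptl.c): the
 * process currently draining the hardware queue looks at each
 * packet's destination subcontext and, when the packet is not its
 * own, copies it into that peer's write queue:
 *
 *   if (subcontext != recvshc->subcontext &&
 *       subcontext < recvshc->subcontext_cnt)
 *       psmi_hal_forward_packet_to_subcontext(
 *               &recvshc->writeq[subcontext], rcv_ev,
 *               subcontext, ...);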
* */ struct ptl_shared { ptl_t *ptl; /* backptr to main ptl */ uint32_t context; uint32_t subcontext; uint32_t subcontext_cnt; pthread_spinlock_t *context_lock; struct ips_subcontext_ureg *subcontext_ureg[PSM_HAL_MAX_SHARED_CTXTS]; struct ips_hwcontext_ctrl *hwcontext_ctrl; struct ips_recvhdrq recvq; /* subcontext receive queue */ struct ips_recvhdrq_state recvq_state; /* subcontext receive queue state */ struct ips_writehdrq writeq[PSM_HAL_MAX_SHARED_CTXTS]; /* peer subcontexts */ }; /* * Connect/disconnect are wrappers around psm proto's connect/disconnect, * mostly to abstract away PSM-specific stuff from ips internal structures */ psm2_error_t ips_ptl_connect(ptl_t *ptl, int numep, const psm2_epid_t *array_of_epid, const int *array_of_epid_mask, psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr, uint64_t timeout_in); psm2_error_t ips_ptl_disconnect(ptl_t *ptl, int force, int numep, psm2_epaddr_t array_of_epaddr[], const int array_of_epaddr_mask[], psm2_error_t array_of_errors[], uint64_t timeout_in); /* * Generic Poll function for ips-level ptl */ psm2_error_t ips_ptl_poll(ptl_t *ptl, int _ignored); psm2_error_t ips_ptl_shared_poll(ptl_t *ptl, int _ignored); /* * Support for receive thread */ psm2_error_t ips_ptl_rcvthread_init(ptl_t *ptl, struct ips_recvhdrq *recvq); psm2_error_t ips_ptl_rcvthread_fini(ptl_t *ptl); void ips_ptl_rcvthread_transfer_ownership(ptl_t *from_ptl, ptl_t *to_ptl); #endif /* _IPS_PTL_H */ opa-psm2-PSM2_11.2.185/ptl_ips/ptl_rcvthread.c000066400000000000000000000362701370564314600207100ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #include <sys/poll.h> #include "psm_user.h" #include "psm2_hal.h" #include "psm_mq_internal.h" #include "ptl_ips.h" #include "ips_proto.h" struct ptl_rcvthread; static void *ips_ptl_pollintr(void *recvthreadc); static psm2_error_t rcvthread_initstats(ptl_t *ptl); static psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc); struct ptl_rcvthread { const psmi_context_t *context; const ptl_t *ptl; struct ips_recvhdrq *recvq; pthread_t hdrq_threadid; uint64_t t_start_cyc; int pipefd[2]; /* stats and some for scheduling */ uint64_t pollcnt; uint64_t pollcnt_to; uint64_t pollcyc; uint64_t pollok; /* For scheduling interrupt thread */ int timeout_period_min; int timeout_period_max; int timeout_shift; uint64_t pollok_last; uint64_t pollcnt_last; uint32_t last_timeout; }; #ifdef PSM_CUDA /* This is a global cuda context (extern declaration in psm_user.h) * stored to provide hints during a cuda failure * due to a null cuda context. */ CUcontext ctxt; #endif /* * The receive thread knows about the ptl interface, so it can muck with it * directly. */ psm2_error_t ips_ptl_rcvthread_init(ptl_t *ptl_gen, struct ips_recvhdrq *recvq) { struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; psm2_error_t err = PSM2_OK; struct ptl_rcvthread *rcvc; ptl->rcvthread = psmi_calloc(ptl->ep, UNDEFINED, 1, sizeof(struct ptl_rcvthread)); if (ptl->rcvthread == NULL) { err = PSM2_NO_MEMORY; goto fail; } rcvc = ptl->rcvthread; rcvc->recvq = recvq; rcvc->ptl = ptl_gen; rcvc->context = ptl->context; rcvc->t_start_cyc = get_cycles(); #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED) PSMI_CUDA_CALL(cuCtxGetCurrent, &ctxt); #endif if (psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD) && (!psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED))){ if ((err = rcvthread_initsched(rcvc))) goto fail; /* Create a pipe so we can synchronously terminate the thread */ if (pipe(rcvc->pipefd) != 0) { err = psmi_handle_error(ptl->ep, PSM2_EP_DEVICE_FAILURE, "Cannot create a pipe for receive thread: %s\n", strerror(errno)); goto fail; } psmi_hal_add_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); if (pthread_create(&rcvc->hdrq_threadid, NULL, ips_ptl_pollintr, ptl->rcvthread)) { close(rcvc->pipefd[0]); close(rcvc->pipefd[1]); err = psmi_handle_error(ptl->ep, PSM2_EP_DEVICE_FAILURE, "Cannot start receive thread: %s\n", strerror(errno)); goto fail; } } if ((err = rcvthread_initstats(ptl_gen))) goto fail; fail: return err; } psm2_error_t ips_ptl_rcvthread_fini(ptl_t *ptl_gen) { struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)ptl->rcvthread; uint64_t t_now; psm2_error_t err = PSM2_OK; PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock); if (ptl->rcvthread == NULL) return err; if (rcvc->hdrq_threadid && psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED)) { t_now = get_cycles(); /* Disable interrupts then kill the receive thread */ if (psmi_context_interrupt_isenabled ((psmi_context_t *) ptl->context)) if ((err
= psmi_context_interrupt_set((psmi_context_t *) ptl-> context, 0))) goto fail; /* Close the pipe so we can have the thread synchronously exit. On Linux just closing the pipe does not wake up the receive thread. */ if (write(rcvc->pipefd[1], (const void *)&t_now, sizeof(uint64_t)) == -1 || close(rcvc->pipefd[1]) == -1) { _HFI_VDBG ("unable to close pipe to receive thread cleanly\n"); } pthread_join(rcvc->hdrq_threadid, NULL); psmi_hal_sub_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); rcvc->hdrq_threadid = 0; if (_HFI_PRDBG_ON) { _HFI_PRDBG_ALWAYS ("rcvthread poll success %lld/%lld times, " "thread cancelled in %.3f us\n", (long long)rcvc->pollok, (long long)rcvc->pollcnt, (double)cycles_to_nanosecs(get_cycles() - t_now) / 1e3); } } psmi_free(ptl->rcvthread); ptl->rcvthread = NULL; fail: return err; } void ips_ptl_rcvthread_transfer_ownership(ptl_t *from_ptl_gen, ptl_t *to_ptl_gen) { struct ptl_rcvthread *rcvc; psmi_hal_sub_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); struct ptl_ips *from_ptl = (struct ptl_ips *)from_ptl_gen; struct ptl_ips *to_ptl = (struct ptl_ips *)to_ptl_gen; to_ptl->rcvthread = from_ptl->rcvthread; from_ptl->rcvthread = NULL; rcvc = to_ptl->rcvthread; rcvc->recvq = &to_ptl->recvq; rcvc->context = to_ptl->context; rcvc->ptl = to_ptl_gen; } psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc) { union psmi_envvar_val env_to; char buf[192]; char *rcv_freq = buf; int no_timeout = 0; int tvals[3] = { RCVTHREAD_TO_MIN_FREQ, RCVTHREAD_TO_MAX_FREQ, RCVTHREAD_TO_SHIFT }; snprintf(buf, sizeof(buf) - 1, "%d:%d:%d", RCVTHREAD_TO_MIN_FREQ, RCVTHREAD_TO_MAX_FREQ, RCVTHREAD_TO_SHIFT); buf[sizeof(buf) - 1] = '\0'; if (!psmi_getenv("PSM2_RCVTHREAD_FREQ", "Thread timeouts (per sec) ", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)rcv_freq, &env_to)) { /* not using default values */ int nparsed = psmi_parse_str_tuples(env_to.e_str, 3, tvals); int invalid = 0; if (nparsed < 1 || (nparsed > 0 && tvals[0] == 0) || (nparsed > 1 && tvals[1] == 0)) { no_timeout = 1; } else { if (nparsed > 0 && tvals[0] > 1000) invalid = 1; if (nparsed > 1 && (tvals[1] > 1000 || tvals[1] < tvals[0])) invalid = 1; if (nparsed > 2 && tvals[2] > 10) invalid = 1; } if (invalid) { _HFI_INFO ("Overriding invalid request for RcvThread frequency" " settings of %s to be <%d:%d:%d>\n", env_to.e_str, RCVTHREAD_TO_MIN_FREQ, RCVTHREAD_TO_MAX_FREQ, RCVTHREAD_TO_SHIFT); tvals[0] = RCVTHREAD_TO_MIN_FREQ; tvals[1] = RCVTHREAD_TO_MAX_FREQ; tvals[2] = RCVTHREAD_TO_SHIFT; } } if (no_timeout) { rcvc->last_timeout = -1; _HFI_PRDBG("PSM2_RCVTHREAD_FREQ set to only interrupt " "(no timeouts)\n"); } else { /* Convert freq to period in milliseconds (for poll()) */ rcvc->timeout_period_max = 1000 / tvals[0]; rcvc->timeout_period_min = 1000 / tvals[1]; rcvc->timeout_shift = tvals[2]; /* Start in the middle of min and max */ rcvc->last_timeout = (rcvc->timeout_period_min + rcvc->timeout_period_max) / 2; _HFI_PRDBG("PSM2_RCVTHREAD_FREQ converted to period " "min=%dms,max=%dms,shift=%d\n", rcvc->timeout_period_min, rcvc->timeout_period_max, rcvc->timeout_shift); } return PSM2_OK; } static int rcvthread_next_timeout(struct ptl_rcvthread *rcvc) { uint64_t pollok_diff = rcvc->pollok - rcvc->pollok_last; if (pollok_diff > 0) { if (rcvc->last_timeout > rcvc->timeout_period_min) /* By default, be less aggressive, but there's a more aggressive * alternative if need be */ #if 1 rcvc->last_timeout >>= rcvc->timeout_shift; #else rcvc->last_timeout = rcvc->timeout_period_min; #endif } else { /* we had less
progress */ if (rcvc->last_timeout < rcvc->timeout_period_max) rcvc->last_timeout <<= rcvc->timeout_shift; } rcvc->pollok_last = rcvc->pollok; rcvc->pollcnt_last = rcvc->pollcnt; return (int)rcvc->last_timeout; } extern int ips_in_rcvthread; /* * Receiver thread support. * * By default, polling in the driver asks the chip to generate an interrupt on * every packet. When the driver supports POLLURG we can switch the poll mode * to one that requests interrupts only for packets that contain an urgent bit * (and optionally enable interrupts for hdrq overflow events). When poll * returns an event, we *try* to make progress on the receive queue but simply * go back to sleep if we notice that the main thread is already making * progress. */ static void *ips_ptl_pollintr(void *rcvthreadc) { struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)rcvthreadc; struct ips_recvhdrq *recvq = rcvc->recvq; int fd_pipe = rcvc->pipefd[0]; psm2_ep_t ep; struct pollfd pfd[2]; int ret; int next_timeout = rcvc->last_timeout; uint64_t t_cyc; psm2_error_t err; #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED && ctxt != NULL) PSMI_CUDA_CALL(cuCtxSetCurrent, ctxt); #endif PSM2_LOG_MSG("entering"); /* No reason to have many of these, keep this as a backup in case the * recvhdrq init function is misused */ psmi_assert_always(psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED)); /* Switch driver to a mode where it can interrupt on urgent packets */ if (psmi_context_interrupt_set((psmi_context_t *) rcvc->context, 1) == PSM2_EP_NO_RESOURCES) { _HFI_PRDBG ("hfi_poll_type feature not present in driver, turning " "off internal progress thread\n"); return NULL; } _HFI_PRDBG("Enabled communication thread on URG packets\n"); while (1) { pfd[0].fd = psmi_hal_get_fd(rcvc->context->psm_hw_ctxt); pfd[0].events = POLLIN; pfd[0].revents = 0; pfd[1].fd = fd_pipe; pfd[1].events = POLLIN; pfd[1].revents = 0; ret = poll(pfd, 2, next_timeout); t_cyc = get_cycles(); if_pf(ret < 0) { if (errno == EINTR) _HFI_DBG("got signal, keep polling\n"); else psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Receive thread poll() error: %s", strerror(errno)); } else if (pfd[1].revents) { /* Any type of event on this fd means exit, should be POLLHUP */ _HFI_DBG("close thread: revents=0x%x\n", pfd[1].revents); close(fd_pipe); break; } else { rcvc->pollcnt++; if (!PSMI_LOCK_TRY(psmi_creation_lock)) { if (ret == 0 || pfd[0].revents & (POLLIN | POLLERR)) { if (PSMI_LOCK_DISABLED) { /* We do this check without acquiring the lock, no sense in * adding the overhead and it doesn't matter if we're * wrong. */ if (ips_recvhdrq_isempty(recvq)) continue; if(recvq->proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN) { ips_recvhdrq_scan_cca(recvq); } if (!ips_recvhdrq_trylock(recvq)) continue; err = ips_recvhdrq_progress(recvq); if (err == PSM2_OK) rcvc->pollok++; else rcvc->pollcyc += get_cycles() - t_cyc; ips_recvhdrq_unlock(recvq); } else { ep = psmi_opened_endpoint; if (!PSMI_LOCK_TRY(ep->mq->progress_lock)) { if(recvq->proto->flags & IPS_PROTO_FLAG_CCA_PRESCAN ) { ips_recvhdrq_scan_cca(recvq); } PSMI_UNLOCK(ep->mq->progress_lock); } /* Go through all master endpoints. */ do{ if (!PSMI_LOCK_TRY(ep->mq->progress_lock)) { /* If we time out, we service shm and hfi. If not, we * assume we have received an hfi interrupt and service * only hfi. */ err = psmi_poll_internal(ep, ret == 0 ?
PSMI_TRUE : PSMI_FALSE); if (err == PSM2_OK) rcvc->pollok++; else rcvc->pollcyc += get_cycles() - t_cyc; PSMI_UNLOCK(ep->mq->progress_lock); } /* get next endpoint from multi endpoint list */ ep = ep->user_ep_next; } while(NULL != ep); } } PSMI_UNLOCK(psmi_creation_lock); } if (ret == 0) { /* change timeout only on timed out poll */ rcvc->pollcnt_to++; next_timeout = rcvthread_next_timeout(rcvc); } } } PSM2_LOG_MSG("leaving"); return NULL; } static uint64_t rcvthread_stats_pollok(void *context) { struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)context; double ratio = 0.0; uint64_t ratio_u; if (rcvc->pollcnt > 0) ratio = (double)rcvc->pollok * 100.0 / rcvc->pollcnt; memcpy(&ratio_u, &ratio, sizeof(uint64_t)); return ratio_u; } static uint64_t rcvthread_stats_pollcyc(void *context) { struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)context; /* log in milliseconds */ return (uint64_t) ((double)cycles_to_nanosecs(rcvc->pollcyc) / 1.0e6); } static psm2_error_t rcvthread_initstats(ptl_t *ptl_gen) { struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)ptl->rcvthread; struct psmi_stats_entry entries[] = { PSMI_STATS_DECL("intrthread schedule count", MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO, NULL, &rcvc->pollcnt), PSMI_STATS_DECL("intrthread schedule success (%)", MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_TYPE_DOUBLE, rcvthread_stats_pollok, NULL), PSMI_STATS_DECL("intrthread timeout count", MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO, NULL, &rcvc->pollcnt_to), PSMI_STATS_DECL("intrthread wasted time (ms)", MPSPAWN_STATS_REDUCTION_ALL, rcvthread_stats_pollcyc, NULL) }; /* If we don't want a thread, make sure we still initialize the counters * but set them to NaN instead */ if (!psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED)) { int i; static uint64_t ctr_nan = MPSPAWN_NAN; for (i = 0; i < (int)PSMI_STATS_HOWMANY(entries); i++) { entries[i].getfn = NULL; entries[i].u.val = &ctr_nan; } } return psmi_stats_register_type(PSMI_STATS_NO_HEADING, PSMI_STATSTYPE_RCVTHREAD, entries, PSMI_STATS_HOWMANY(entries), rcvc); } opa-psm2-PSM2_11.2.185/ptl_self/000077500000000000000000000000001370564314600160315ustar00rootroot00000000000000opa-psm2-PSM2_11.2.185/ptl_self/Makefile000066400000000000000000000062541370564314600175000ustar00rootroot00000000000000# # This file is provided under a dual BSD/GPLv2 license. When using or # redistributing this file, you may do so under either license. # # GPL LICENSE SUMMARY # # Copyright(c) 2015 Intel Corporation. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # Contact Information: # Intel Corporation, www.intel.com # # BSD LICENSE # # Copyright(c) 2015 Intel Corporation. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. 
# * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # * Neither the name of Intel Corporation nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # Copyright (c) 2003-2014 Intel Corporation. All rights reserved. # OUTDIR = . this_srcdir = $(shell readlink -m .) top_srcdir := $(this_srcdir)/.. INCLUDES += -I$(top_srcdir) ${TARGLIB}-objs := ptl.o ${TARGLIB}-objs := $(patsubst %.o, $(OUTDIR)/%.o, ${${TARGLIB}-objs}) DEPS := $(${TARGLIB}-objs:.o=.d) .PHONY: all clean IGNORE_DEP_TARGETS = clean all .DEFAULT: ${${TARGLIB}-objs} $(OUTDIR)/%.d: $(this_srcdir)/%.c $(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) $< -MM -MF $@ -MQ $(@:.d=.o) $(OUTDIR)/%.o: $(this_srcdir)/%.c | ${DEPS} $(CC) $(CFLAGS) $(BASECFLAGS) $(INCLUDES) -c $< -o $@ clean: @if [ -d $(OUTDIR) ]; then \ cd $(OUTDIR); \ rm -f *.o *.d *.gcda *.gcno; \ cd -; \ fi #ifeq prevents the deps from being included during clean #-include line is required to pull in auto-dependencies during 2nd pass ifeq ($(filter $(IGNORE_DEP_TARGETS), $(MAKECMDGOALS)),) -include ${DEPS} endif install: @echo "Nothing to do for install." opa-psm2-PSM2_11.2.185/ptl_self/ptl.c000066400000000000000000000267741370564314600170110ustar00rootroot00000000000000/* This file is provided under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ /* * This file implements the PSM PTL for self (loopback) */ #include "psm_user.h" #include "psm_mq_internal.h" #include "psm_am_internal.h" struct ptl_self { psm2_ep_t ep; psm2_epid_t epid; psm2_epaddr_t epaddr; ptl_ctl_t *ctl; } __attribute__((aligned(16))); static psm2_error_t ptl_handle_rtsmatch(psm2_mq_req_t recv_req, int was_posted) { psm2_mq_req_t send_req = (psm2_mq_req_t) recv_req->ptl_req_ptr; if (recv_req->req_data.recv_msglen > 0) { psmi_mq_mtucpy(recv_req->req_data.buf, send_req->req_data.buf, recv_req->req_data.recv_msglen); } psmi_mq_handle_rts_complete(recv_req); /* If the send is already marked complete, that's because it was internally * buffered. */ if (send_req->state == MQ_STATE_COMPLETE) { psmi_mq_stats_rts_account(send_req); if (send_req->req_data.buf != NULL && send_req->req_data.send_msglen > 0) psmi_mq_sysbuf_free(send_req->mq, send_req->req_data.buf); /* req was left "live" even though the sender was told that the * send was done */ psmi_mq_req_free(send_req); } else psmi_mq_handle_rts_complete(send_req); _HFI_VDBG("[self][complete][b=%p][sreq=%p][rreq=%p]\n", recv_req->req_data.buf, send_req, recv_req); return PSM2_OK; } static psm2_error_t self_mq_send_testwait(psm2_mq_req_t *ireq) { uint8_t *ubuf; psm2_mq_req_t req = *ireq; PSMI_LOCK_ASSERT(req->mq->progress_lock); /* We're waiting on a send request, and the matching receive has not been * posted yet. This is a deadlock condition in MPI but we accommodate it * here in the "self ptl" by using system-allocated memory. */ req->testwait_callback = NULL; /* no more calls here */ ubuf = req->req_data.buf; if (ubuf != NULL && req->req_data.send_msglen > 0) { req->req_data.buf = psmi_mq_sysbuf_alloc(req->mq, req->req_data.send_msglen); if (req->req_data.buf == NULL) return PSM2_NO_MEMORY; psmi_mq_mtucpy(req->req_data.buf, ubuf, req->req_data.send_msglen); } /* Mark it complete but don't free the req, it's freed when the receiver * does the match */ req->state = MQ_STATE_COMPLETE; *ireq = PSM2_MQ_REQINVALID; return PSM2_OK; } /* Self is different. We do everything as rendezvous. 
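 *
 * A minimal caller-side sketch (hypothetical usage, not part of this file;
 * assumes mq, buf and len exist and self_epaddr was returned by
 * psm2_ep_connect() for our own epid):
 *
 *   psm2_mq_tag_t tag = { .tag = {1, 2, 3} };
 *   psm2_mq_req_t req;
 *   psm2_mq_isend2(mq, self_epaddr, 0, &tag, buf, len, NULL, &req);
 *   // dispatches to self_mq_isend() below: an RTS is synthesized and,
 *   // if the matching receive is already posted, completed on the spot
 *   // by ptl_handle_rtsmatch(); otherwise completion is deferred.
 *   psm2_mq_wait2(&req, NULL);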
*/ static psm2_error_t self_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags_user, uint32_t flags_internal, psm2_mq_tag_t *tag, const void *ubuf, uint32_t len, void *context, psm2_mq_req_t *req_o) { psm2_mq_req_t send_req; psm2_mq_req_t recv_req; int rc; send_req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); if_pf(send_req == NULL) return PSM2_NO_MEMORY; #ifdef PSM_CUDA if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf)) { psmi_cuda_set_attr_sync_memops(ubuf); send_req->is_buf_gpu_mem = 1; } else send_req->is_buf_gpu_mem = 0; #endif rc = psmi_mq_handle_rts(mq, epaddr, tag, len, NULL, 0, 1, ptl_handle_rtsmatch, &recv_req); send_req->req_data.tag = *tag; send_req->req_data.buf = (void *)ubuf; send_req->req_data.send_msglen = len; send_req->req_data.context = context; recv_req->ptl_req_ptr = (void *)send_req; recv_req->rts_sbuf = (uintptr_t) ubuf; recv_req->rts_peer = epaddr; if (rc == MQ_RET_MATCH_OK) ptl_handle_rtsmatch(recv_req, 1); else send_req->testwait_callback = self_mq_send_testwait; _HFI_VDBG("[self][b=%p][m=%d][t=%08x.%08x.%08x][match=%s][req=%p]\n", ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2], rc == MQ_RET_MATCH_OK ? "YES" : "NO", send_req); *req_o = send_req; return PSM2_OK; } static psm2_error_t self_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags, psm2_mq_tag_t *tag, const void *ubuf, uint32_t len) { psm2_error_t err; psm2_mq_req_t req; err = self_mq_isend(mq, epaddr, flags, PSMI_REQ_FLAG_NORMAL, tag, ubuf, len, NULL, &req); psmi_mq_wait_internal(&req); return err; } /* Fill in AM capabilities parameters */ static psm2_error_t self_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters) { if (parameters == NULL) { return PSM2_PARAM_ERR; } /* Self is just a loop-back and has no restrictions. */ parameters->max_handlers = INT_MAX; parameters->max_nargs = INT_MAX; parameters->max_request_short = INT_MAX; parameters->max_reply_short = INT_MAX; return PSM2_OK; } static psm2_error_t self_am_short_request(psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, int flags, psm2_am_completion_fn_t completion_fn, void *completion_ctxt) { struct psm2_ep_am_handle_entry *hentry; psm2_ep_t ep = ((struct ptl_self *)(epaddr->ptlctl->ptl))->ep; struct psmi_am_token tok; tok.epaddr_incoming = epaddr; hentry = psm_am_get_handler_function(ep, handler); /* Note a guard here for hentry != NULL is not needed because at * initialization, a psmi_assert_always() assures the entry will be * non-NULL. */ if (likely(hentry->version == PSM2_AM_HANDLER_V2)) { psm2_am_handler_2_fn_t hfn2 = (psm2_am_handler_2_fn_t)hentry->hfn; hfn2(&tok, args, nargs, src, len, hentry->hctx); } else { psm2_am_handler_fn_t hfn1 = (psm2_am_handler_fn_t)hentry->hfn; hfn1(&tok, args, nargs, src, len); } if (completion_fn) { completion_fn(completion_ctxt); } return PSM2_OK; } static psm2_error_t self_am_short_reply(psm2_am_token_t token, psm2_handler_t handler, psm2_amarg_t *args, int nargs, void *src, size_t len, int flags, psm2_am_completion_fn_t completion_fn, void *completion_ctxt) { struct psm2_ep_am_handle_entry *hentry; struct psmi_am_token *tok = token; struct ptl_self *ptl = (struct ptl_self *)tok->epaddr_incoming->ptlctl->ptl; psm2_ep_t ep = ptl->ep; hentry = psm_am_get_handler_function(ep, handler); /* Note a guard here for hentry != NULL is not needed because at * initialization, a psmi_assert_always() assures the entry will be * non-NULL.
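 *
 * As in self_am_short_request() above, dispatch is keyed on the handler's
 * registered version: a PSM2_AM_HANDLER_V2 entry receives its per-handler
 * context pointer (hentry->hctx) as a trailing argument, while a V1 entry
 * is invoked without it.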
*/ if (likely(hentry->version == PSM2_AM_HANDLER_V2)) { psm2_am_handler_2_fn_t hfn2 = (psm2_am_handler_2_fn_t)hentry->hfn; hfn2(token, args, nargs, src, len, hentry->hctx); } else { psm2_am_handler_fn_t hfn1 = (psm2_am_handler_fn_t)hentry->hfn; hfn1(token, args, nargs, src, len); } if (completion_fn) { completion_fn(completion_ctxt); } return PSM2_OK; } static psm2_error_t self_connect(ptl_t *ptl_gen, int numep, const psm2_epid_t array_of_epid[], const int array_of_epid_mask[], psm2_error_t array_of_errors[], psm2_epaddr_t array_of_epaddr[], uint64_t timeout_ns) { struct ptl_self *ptl = (struct ptl_self *)ptl_gen; psmi_assert_always(ptl->epaddr != NULL); psm2_error_t err = PSM2_OK; int i; PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock); for (i = 0; i < numep; i++) { if (!array_of_epid_mask[i]) continue; if (array_of_epid[i] == ptl->epid) { array_of_epaddr[i] = ptl->epaddr; array_of_epaddr[i]->ptlctl = ptl->ctl; array_of_epaddr[i]->epid = ptl->epid; if (psmi_epid_set_hostname(psm2_epid_nid(ptl->epid), psmi_gethostname(), 0)) { err = PSM2_NO_MEMORY; goto fail; } psmi_epid_add(ptl->ep, ptl->epid, ptl->epaddr); array_of_errors[i] = PSM2_OK; } else { array_of_epaddr[i] = NULL; array_of_errors[i] = PSM2_EPID_UNREACHABLE; } } fail: return err; } static psm2_error_t self_disconnect(ptl_t *ptl_gen, int force, int numep, psm2_epaddr_t array_of_epaddr[], const int array_of_epaddr_mask[], psm2_error_t array_of_errors[], uint64_t timeout_in) { struct ptl_self *ptl = (struct ptl_self *)ptl_gen; int i; for (i = 0; i < numep; i++) { if (array_of_epaddr_mask[i] == 0) continue; if (array_of_epaddr[i] == ptl->epaddr) { psmi_epid_remove(ptl->ep, ptl->epid); array_of_errors[i] = PSM2_OK; } } return PSM2_OK; } static size_t self_ptl_sizeof(void) { return sizeof(struct ptl_self); } static psm2_error_t self_ptl_init(const psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) { struct ptl_self *ptl = (struct ptl_self *)ptl_gen; psmi_assert_always(ep != NULL); psmi_assert_always(ep->epaddr != NULL); psmi_assert_always(ep->epid != 0); ptl->ep = ep; ptl->epid = ep->epid; ptl->epaddr = ep->epaddr; ptl->ctl = ctl; memset(ctl, 0, sizeof(*ctl)); /* Fill in the control structure */ ctl->ptl = ptl_gen; ctl->ep = ep; ctl->ep_poll = NULL; ctl->ep_connect = self_connect; ctl->ep_disconnect = self_disconnect; ctl->mq_send = self_mq_send; ctl->mq_isend = self_mq_isend; ctl->am_get_parameters = self_am_get_parameters; ctl->am_short_request = self_am_short_request; ctl->am_short_reply = self_am_short_reply; /* No stats in self */ ctl->epaddr_stats_num = NULL; ctl->epaddr_stats_init = NULL; ctl->epaddr_stats_get = NULL; return PSM2_OK; } static psm2_error_t self_ptl_fini(ptl_t *ptl, int force, uint64_t timeout_ns) { return PSM2_OK; /* nothing to do */ } static psm2_error_t self_ptl_setopt(const void *component_obj, int optname, const void *optval, uint64_t optlen) { /* No options for SELF PTL at the moment */ return psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown SELF ptl option %u.", optname); } static psm2_error_t self_ptl_getopt(const void *component_obj, int optname, void *optval, uint64_t *optlen) { /* No options for SELF PTL at the moment */ return psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown SELF ptl option %u.", optname); } /* Only symbol we expose out of here */ struct ptl_ctl_init psmi_ptl_self = { self_ptl_sizeof, self_ptl_init, self_ptl_fini, self_ptl_setopt, self_ptl_getopt }; opa-psm2-PSM2_11.2.185/ptl_self/ptl_fwd.h000066400000000000000000000046111370564314600176430ustar00rootroot00000000000000/* This file is provided
under a dual BSD/GPLv2 license. When using or redistributing this file, you may do so under either license. GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Contact Information: Intel Corporation, www.intel.com BSD LICENSE Copyright(c) 2015 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ #ifndef _PTL_FWD_SELF_H #define _PTL_FWD_SELF_H /* Symbol in am ptl */ extern struct ptl_ctl_init psmi_ptl_self; #endif opa-psm2-PSM2_11.2.185/rpm_release_extension000066400000000000000000000000041370564314600205300ustar00rootroot00000000000000185