likwid-3.1.3/perl/Template/Plugin/Scalar.pm000644 137545 027340 00000007271 12336605216 021042 0ustar00unrz254unrz000000 000000 #============================================================= -*-Perl-*- # # Template::Plugin::Scalar # # DESCRIPTION # Template Toolkit plugin module which allows you to call object methods # in scalar context. # # AUTHOR # Andy Wardley # # COPYRIGHT # Copyright (C) 2008 Andy Wardley. All Rights Reserved. # # This module is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # #============================================================================ package Template::Plugin::Scalar; use base 'Template::Plugin'; use strict; use warnings; use Template::Exception; use Scalar::Util qw(); our $VERSION = 1.00; our $MONAD = 'Template::Monad::Scalar'; our $EXCEPTION = 'Template::Exception'; our $AUTOLOAD; sub load { my $class = shift; my $context = shift; # define .scalar vmethods for hash and list objects $context->define_vmethod( hash => scalar => \&scalar_monad ); $context->define_vmethod( list => scalar => \&scalar_monad ); return $class; } sub scalar_monad { # create a .scalar monad which wraps the hash- or list-based object # and delegates any method calls back to it, calling them in scalar # context, e.g. foo.scalar.bar becomes $MONAD->new($foo)->bar and # the monad calls $foo->bar in scalar context $MONAD->new(shift); } sub new { my ($class, $context, @args) = @_; # create a scalar plugin object which will lookup a variable subroutine # and call it. e.g. scalar.foo results in a call to foo() in scalar context my $self = bless { _CONTEXT => $context, }, $class; return $self; } sub AUTOLOAD { my $self = shift; my $item = $AUTOLOAD; $item =~ s/.*:://; return if $item eq 'DESTROY'; # lookup the named values my $stash = $self->{ _CONTEXT }->stash; my $value = $stash->{ $item }; if (! 
defined $value) { die $EXCEPTION->new( scalar => "undefined value for scalar call: $item" ); } elsif (ref $value eq 'CODE') { $value = $value->(@_); } return $value; } package Template::Monad::Scalar; our $EXCEPTION = 'Template::Exception'; our $AUTOLOAD; sub new { my ($class, $this) = @_; bless \$this, $class; } sub AUTOLOAD { my $self = shift; my $this = $$self; my $item = $AUTOLOAD; $item =~ s/.*:://; return if $item eq 'DESTROY'; my $method; if (Scalar::Util::blessed($this)) { # lookup the method... $method = $this->can($item); } else { die $EXCEPTION->new( scalar => "invalid object method: $item" ); } # ...and call it in scalar context my $result = $method->($this, @_); return $result; } 1; __END__ =head1 NAME Template::Plugin::Scalar - call object methods in scalar context =head1 SYNOPSIS [% USE scalar %] # TT2 calls object methods in array context by default [% object.method %] # force it to use scalar context [% object.scalar.method %] # also works with subroutine references [% scalar.my_sub_ref %] =head1 DESCRIPTION The Template Toolkit calls user-defined subroutines and object methods using Perl's array context by default. This plugin module provides a way for you to call subroutines and methods in scalar context. =head1 AUTHOR Andy Wardley Eabw@wardley.orgE L =head1 COPYRIGHT Copyright (C) 2008 Andy Wardley. All Rights Reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. 
=head1 SEE ALSO L =cut # Local Variables: # mode: perl # perl-indent-level: 4 # indent-tabs-mode: nil # End: # # vim: expandtab shiftwidth=4: likwid-3.1.3/src/includes/barrier.h000644 137545 027340 00000003630 12426160352 017510 0ustar00unrz254unrz000000 000000 /* * ======================================================================================= * * Filename: barrier.h * * Description: Header File barrier Module * * Version: 3.1.3 * Released: 4.11.2014 * * Author: Jan Treibig (jt), jan.treibig@gmail.com * Project: likwid * * Copyright (C) 2014 Jan Treibig * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see . 
* * ======================================================================================= */ #ifndef BARRIER_H #define BARRIER_H #include /** * @brief Initialize the barrier module * @param numberOfThreads The total number of threads in the barrier */ extern void barrier_init(int numberOfGroups); /** * @brief Destroy data structures of the barrier module */ extern void barrier_destroy(void); /** * @brief Register a thread for a barrier * @param threadId The id of the thread to register */ extern int barrier_registerGroup(int numThreads); extern void barrier_registerThread(BarrierData* barr, int groupsId, int threadId); /** * @brief Synchronize threads * @param threadId The id of the calling thread * @param numberOfThreads Total number of threads in the barrier */ extern void barrier_synchronize(BarrierData* barr); #endif /*BARRIER_H*/ likwid-3.1.3/bench/x86-64/update_avx.ptt000644 137545 027340 00000000554 12416714770 020152 0ustar00unrz254unrz000000 000000 STREAMS 1 TYPE DOUBLE FLOPS 0 BYTES 16 LOOP 16 vmovaps ymm1, [STR0 + GPR1 * 8] vmovaps [STR0 + GPR1 * 8] , ymm1 vmovaps ymm2, [STR0 + GPR1 * 8 + 32] vmovaps ymm3, [STR0 + GPR1 * 8 + 64] vmovaps ymm4, [STR0 + GPR1 * 8 + 96] vmovaps [STR0 + GPR1 * 8 + 32], ymm2 vmovaps [STR0 + GPR1 * 8 + 64], ymm3 vmovaps [STR0 + GPR1 * 8 + 96], ymm4 likwid-3.1.3/groups/k8/000755 137545 027340 00000000000 12426160161 015151 5ustar00unrz254unrz000000 000000 likwid-3.1.3/groups/interlagos/MEM.txt000644 137545 027340 00000001311 12336605216 020156 0ustar00unrz254unrz000000 000000 SHORT Main memory bandwidth in MBytes/s EVENTSET UPMC0 UNC_DRAM_ACCESSES_DCT0_ALL UPMC1 UNC_DRAM_ACCESSES_DCT1_ALL METRICS Runtime (RDTSC) [s] time Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0 LONG Formulas: Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_ACCESSES_DCTO_ALL+DRAM_ACCESSES_DCT1_ALL)*64/time Memory data volume [GBytes] = 1.0E-09*(DRAM_ACCESSES_DCTO_ALL+DRAM_ACCESSES_DCT1_ALL)*64 
- Profiling group to measure memory bandwidth drawn by all cores of a socket. Note: As this group measures the accesses from all cores it only makes sense to measure with one core per socket, similiar as with the Intel Nehalem Uncore events. likwid-3.1.3/perl/Template/Plugin/Image.pm000644 137545 027340 00000026715 12336605216 020663 0ustar00unrz254unrz000000 000000 #============================================================= -*-Perl-*- # # Template::Plugin::Image # # DESCRIPTION # Plugin for encapsulating information about an image. # # AUTHOR # Andy Wardley # # COPYRIGHT # This module is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # #============================================================================ package Template::Plugin::Image; use strict; use warnings; use base 'Template::Plugin'; use Template::Exception; use File::Spec; our $VERSION = 1.21; our $AUTOLOAD; BEGIN { if (eval { require Image::Info; }) { *img_info = \&Image::Info::image_info; } elsif (eval { require Image::Size; }) { *img_info = sub { my $file = shift; my @stuff = Image::Size::imgsize($file); return { "width" => $stuff[0], "height" => $stuff[1], "error" => # imgsize returns either a three letter file type # or an error message as third value (defined($stuff[2]) && length($stuff[2]) > 3 ? $stuff[2] : undef), }; } } else { die(Template::Exception->new("image", "Couldn't load Image::Info or Image::Size: $@")); } } #------------------------------------------------------------------------ # new($context, $name, \%config) # # Create a new Image object. Takes the pathname of the file as # the argument following the context and an optional # hash reference of configuration parameters. #------------------------------------------------------------------------ sub new { my $config = ref($_[-1]) eq 'HASH' ? 
pop(@_) : { }; my ($class, $context, $name) = @_; my ($root, $file, $type); # name can be a positional or named argument $name = $config->{ name } unless defined $name; return $class->throw('no image file specified') unless defined $name and length $name; # name can be specified as an absolute path or relative # to a root directory if ($root = $config->{ root }) { $file = File::Spec->catfile($root, $name); } else { $file = defined $config->{file} ? $config->{file} : $name; } # Make a note of whether we are using Image::Size or # Image::Info -- at least for the test suite $type = $INC{"Image/Size.pm"} ? "Image::Size" : "Image::Info"; # set a default (empty) alt attribute for tag() $config->{ alt } = '' unless defined $config->{ alt }; # do we want to check to see if file exists? bless { %$config, name => $name, file => $file, root => $root, type => $type, }, $class; } #------------------------------------------------------------------------ # init() # # Calls image_info on $self->{ file } #------------------------------------------------------------------------ sub init { my $self = shift; return $self if $self->{ size }; my $image = img_info($self->{ file }); return $self->throw($image->{ error }) if defined $image->{ error }; @$self{ keys %$image } = values %$image; $self->{ size } = [ $image->{ width }, $image->{ height } ]; $self->{ modtime } = (stat $self->{ file })[10]; return $self; } #------------------------------------------------------------------------ # attr() # # Return the width and height as HTML/XML attributes. 
#------------------------------------------------------------------------ sub attr { my $self = shift; my $size = $self->size(); return "width=\"$size->[0]\" height=\"$size->[1]\""; } #------------------------------------------------------------------------ # modtime() # # Return last modification time as a time_t: # # [% date.format(image.modtime, "%Y/%m/%d") %] #------------------------------------------------------------------------ sub modtime { my $self = shift; $self->init; return $self->{ modtime }; } #------------------------------------------------------------------------ # tag(\%options) # # Return an XHTML img tag. #------------------------------------------------------------------------ sub tag { my $self = shift; my $options = ref $_[0] eq 'HASH' ? shift : { @_ }; my $tag = 'attr(); # XHTML spec says that the alt attribute is mandatory, so who # are we to argue? $options->{ alt } = $self->{ alt } unless defined $options->{ alt }; if (%$options) { while (my ($key, $val) = each %$options) { my $escaped = escape( $val ); $tag .= qq[ $key="$escaped"]; } } $tag .= ' />'; return $tag; } sub escape { my ($text) = @_; for ($text) { s/&/&/g; s//>/g; s/"/"/g; } $text; } sub throw { my ($self, $error) = @_; die (Template::Exception->new('Image', $error)); } sub AUTOLOAD { my $self = shift; (my $a = $AUTOLOAD) =~ s/.*:://; $self->init; return $self->{ $a }; } 1; __END__ =head1 NAME Template::Plugin::Image - Plugin access to image sizes =head1 SYNOPSIS [% USE Image(filename) %] [% Image.width %] [% Image.height %] [% Image.size.join(', ') %] [% Image.attr %] [% Image.tag %] =head1 DESCRIPTION This plugin provides an interface to the L or L modules for determining the size of image files. You can specify the plugin name as either 'C' or 'C'. The plugin object created will then have the same name. The file name of the image should be specified as a positional or named argument. 
[% # all these are valid, take your pick %] [% USE Image('foo.gif') %] [% USE image('bar.gif') %] [% USE Image 'ping.gif' %] [% USE image(name='baz.gif') %] [% USE Image name='pong.gif' %] A C parameter can be used to specify the location of the image file: [% USE Image(root='/path/to/root', name='images/home.png') %] # image path: /path/to/root/images/home.png # img src: images/home.png In cases where the image path and image url do not match up, specify the file name directly: [% USE Image(file='/path/to/home.png', name='/images/home.png') %] The C parameter can be used to specify an alternate name for the image, for use in constructing an XHTML element (see the C method below). [% USE Image('home.png', alt="Home") %] You can also provide an alternate name for an C plugin object. [% USE img1 = image 'foo.gif' %] [% USE img2 = image 'bar.gif' %] The C method returns the image file name. [% img1.name %] # foo.gif The C and C methods return the width and height of the image, respectively. The C method returns a reference to a 2 element list containing the width and height. [% USE image 'foo.gif' %] width: [% image.width %] height: [% image.height %] size: [% image.size.join(', ') %] The C method returns the modification time of the file in question, suitable for use with the L plugin, for example: [% USE image 'foo.gif' %] [% USE date %] [% date.format(image.modtime, "%B, %e %Y") %] The C method returns the height and width as HTML/XML attributes. [% USE image 'foo.gif' %] [% image.attr %] Typical output: width="60" height="20" The C method returns a complete XHTML tag referencing the image. [% USE image 'foo.gif' %] [% image.tag %] Typical output: You can provide any additional attributes that should be added to the XHTML tag. 
[% USE image 'foo.gif' %] [% image.tag(class="logo" alt="Logo") %] Typical output: Note that the C attribute is mandatory in a strict XHTML C element (even if it's empty) so it is always added even if you don't explicitly provide a value for it. You can do so as an argument to the C method, as shown in the previous example, or as an argument [% USE image('foo.gif', alt='Logo') %] =head1 CATCHING ERRORS If the image file cannot be found then the above methods will throw an C error. You can enclose calls to these methods in a C block to catch any potential errors. [% TRY; image.width; CATCH; error; # print error END %] =head1 USING Image::Info At run time, the plugin tries to load L in preference to L. If L is found, then some additional methods are available, in addition to C, C, C, C, and C. These additional methods are named after the elements that L retrieves from the image itself. The types of methods available depend on the type of image (see L for more details). These additional methods will always include the following: =head2 file_media_type This is the MIME type that is appropriate for the given file format. The corresponding value is a string like: "C" or "C". =head2 file_ext The is the suggested file name extention for a file of the given file format. The value is a 3 letter, lowercase string like "C", "C". =head2 color_type The value is a short string describing what kind of values the pixels encode. The value can be one of the following: Gray GrayA RGB RGBA CMYK YCbCr CIELab These names can also be prefixed by "C" if the image is composed of indexes into a palette. Of these, only "C" is likely to occur. (It is similar to the TIFF field PhotometricInterpretation, but this name was found to be too long, so we used the PNG inpired term instead.) =head2 resolution The value of this field normally gives the physical size of the image on screen or paper. When the unit specifier is missing then this field denotes the squareness of pixels in the image. 
The syntax of this field is: "/" "/" The CresE>, CxresE> and CyresE> fields are numbers. The CunitE> is a string like C, C or C (denoting "dots per inch/cm/meter). =head2 SamplesPerPixel This says how many channels there are in the image. For some image formats this number might be higher than the number implied from the C. =head2 BitsPerSample This says how many bits are used to encode each of samples. The value is a reference to an array containing numbers. The number of elements in the array should be the same as C. =head2 Comment Textual comments found in the file. The value is a reference to an array if there are multiple comments found. =head2 Interlace If the image is interlaced, then this returns the interlace type. =head2 Compression This returns the name of the compression algorithm is used. =head2 Gamma A number indicating the gamma curve of the image (e.g. 2.2) =head1 AUTHOR Andy Wardley Eabw@wardley.orgE L =head1 COPYRIGHT Copyright (C) 1996-2007 Andy Wardley. All Rights Reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. 
=head1 SEE ALSO L, L =cut # Local Variables: # mode: perl # perl-indent-level: 4 # indent-tabs-mode: nil # End: # # vim: expandtab shiftwidth=4: likwid-3.1.3/groups/ivybridge/000755 137545 027340 00000000000 12426160161 016613 5ustar00unrz254unrz000000 000000 likwid-3.1.3/bench/x86-64/striad_plain.ptt000644 137545 027340 00000001044 12416714770 020456 0ustar00unrz254unrz000000 000000 STREAMS 3 TYPE DOUBLE FLOPS 2 BYTES 24 movss FPR5, [SCALAR] LOOP 4 movsd FPR1, [STR1 + GPR1*8] movsd FPR2, [STR1 + GPR1*8+8] movsd FPR3, [STR1 + GPR1*8+16] movsd FPR4, [STR1 + GPR1*8+24] mulsd FPR1, FPR5 addsd FPR1, [STR2 + GPR1*8] mulsd FPR2, FPR5 addsd FPR2, [STR2 + GPR1*8+8] mulsd FPR3, FPR5 addsd FPR3, [STR2 + GPR1*8+16] mulsd FPR4, FPR5 addsd FPR4, [STR2 + GPR1*8+24] movsd [STR0 + GPR1*8] , FPR1 movsd [STR0 + GPR1*8+8] , FPR2 movsd [STR0 + GPR1*8+16], FPR3 movsd [STR0 + GPR1*8+24], FPR4 likwid-3.1.3/src/applications/likwid-powermeter.c000644 137545 027340 00000042214 12426160352 022410 0ustar00unrz254unrz000000 000000 /* * ======================================================================================= * * Filename: likwid-powermeter.c * * Description: An application to get information about power * consumption on architectures implementing the RAPL interface. * * Version: 3.1.3 * Released: 4.11.2014 * * Author: Jan Treibig (jt), jan.treibig@gmail.com * Project: likwid * * Copyright (C) 2014 Jan Treibig * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License along with * this program. If not, see . * * ======================================================================================= */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */ #define HELP_MSG \ fprintf(stdout, "\nlikwid-powermeter -- Version %d.%d \n\n",VERSION,RELEASE); \ fprintf(stdout, "A tool to print Power and Clocking information on Intel SandyBridge CPUS.\n"); \ fprintf(stdout, "Options:\n"); \ fprintf(stdout, "-h\t\t Help message\n"); \ fprintf(stdout, "-v\t\t Version information\n"); \ fprintf(stdout, "-M <0|1>\t set how MSR registers are accessed: 0=direct, 1=msrd \n"); \ fprintf(stdout, "-c \t specify sockets to measure\n"); \ fprintf(stdout, "-i\t\t print information from MSR_PKG_POWER_INFO register and Turbo Mode\n"); \ fprintf(stdout, "-s \t set measure duration in sec. 
(default 2s) \n"); \ fprintf(stdout, "-p\t\t print dynamic clocking and CPI values (requires executable)\n\n"); \ fprintf(stdout, "Usage: likwid-powermeter -s 4 -c 1 \n"); \ fprintf(stdout, "Alternative as wrapper: likwid-powermeter -c 1 ./a.out\n"); \ fflush(stdout); #define VERSION_MSG \ fprintf(stdout, "likwid-powermeter %d.%d \n\n",VERSION,RELEASE); \ fflush(stdout); int main (int argc, char** argv) { int socket_fd = -1; int optInfo = 0; int optClock = 0; int optStethoscope = 0; int optSockets = 0; int optTemp = 0; double runtime; int hasDRAM = 0; int hasPP0 = 0; int hasPP1 = 0; int c, i; bstring argString; bstring eventString = bfromcstr("CLOCK"); int numSockets=1; int numThreads=0; int threadsSockets[MAX_NUM_NODES*2]; int threads[MAX_NUM_THREADS]; const AffinityDomain* socketDomains[MAX_NUM_NODES*2]; threadsSockets[0] = 0; if (argc == 1) { HELP_MSG; exit (EXIT_SUCCESS); } while ((c = getopt (argc, argv, "+c:hiM:ps:vt")) != -1) { switch (c) { case 'c': CHECK_OPTION_STRING; numSockets = bstr_to_cpuset_physical((uint32_t*) threadsSockets, argString); bdestroy(argString); optSockets = 1; break; case 'h': HELP_MSG; exit (EXIT_SUCCESS); case 'i': optInfo = 1; break; case 'M': /* Set MSR Access mode */ CHECK_OPTION_STRING; accessClient_setaccessmode(str2int((char*) argString->data)); bdestroy(argString); break; case 'p': optClock = 1; break; case 's': CHECK_OPTION_STRING; optStethoscope = str2int((char*) argString->data); bdestroy(argString); break; case 'v': VERSION_MSG; exit (EXIT_SUCCESS); case 't': optTemp = 1; break; case '?': if (optopt == 's' || optopt == 'M' || optopt == 'c') { HELP_MSG; } else if (isprint (optopt)) { fprintf (stderr, "Unknown option `-%c'.\n", optopt); } else { fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt); } exit( EXIT_FAILURE); default: HELP_MSG; exit (EXIT_SUCCESS); } } if (!lock_check()) { fprintf(stderr,"Access to performance counters is locked.\n"); exit(EXIT_FAILURE); } if (optClock && optind == argc) { 
fprintf(stderr,"Commandline option -p requires an executable.\n"); exit(EXIT_FAILURE); } if (optSockets && !optStethoscope && optind == argc) { fprintf(stderr,"Commandline option -c requires an executable if not used in combination with -s.\n"); exit(EXIT_FAILURE); } if (optStethoscope == 0 && optind == argc && !optInfo) { fprintf(stderr,"Either -s or executable must be given on commandline.\n"); exit(EXIT_FAILURE); } if (cpuid_init() == EXIT_FAILURE) { fprintf(stderr, "CPU not supported\n"); exit(EXIT_FAILURE); } if (numSockets > cpuid_topology.numSockets) { fprintf(stderr, "System has only %d sockets but %d are given on commandline.\n", cpuid_topology.numSockets, numSockets); exit(EXIT_FAILURE); } numa_init(); affinity_init(); for (c = 0; c < numSockets; c++) { if (threadsSockets[c] >= cpuid_topology.numSockets) { fprintf(stderr, "System has no socket %d\n", threadsSockets[c]); exit(EXIT_FAILURE); } bstring socketStr = bformat("S%d",threadsSockets[c]); socketDomains[threadsSockets[c]] = affinity_getDomain(socketStr); } accessClient_init(&socket_fd); msr_init(socket_fd); timer_init(); /* check for supported processors */ if ((cpuid_info.model == SANDYBRIDGE_EP) || (cpuid_info.model == SANDYBRIDGE) || (cpuid_info.model == IVYBRIDGE) || (cpuid_info.model == IVYBRIDGE_EP) || (cpuid_info.model == HASWELL) || (cpuid_info.model == HASWELL_EX) || (cpuid_info.model == NEHALEM_BLOOMFIELD) || (cpuid_info.model == NEHALEM_LYNNFIELD) || (cpuid_info.model == NEHALEM_WESTMERE) || (cpuid_info.model == ATOM_SILVERMONT_C) || (cpuid_info.model == ATOM_SILVERMONT_E) || (cpuid_info.model == ATOM_SILVERMONT_F1) || (cpuid_info.model == ATOM_SILVERMONT_F2) || (cpuid_info.model == ATOM_SILVERMONT_F3)) { if (numSockets == 0) { numSockets = numa_info.numberOfNodes; } for(int i=0; iprocessorList[0]); } } else { fprintf (stderr, "Query Turbo Mode only supported on Intel Nehalem/Westmere/SandyBridge/IvyBridge/Haswell/Silvermont processors!\n"); exit(EXIT_FAILURE); } double clock = (double) 
timer_getCpuClock(); fprintf(stdout, HLINE); fprintf(stdout, "CPU name:\t%s \n",cpuid_info.name); fprintf(stdout, "CPU clock:\t%3.2f GHz \n", (float) clock * 1.E-09); fprintf(stdout, HLINE); fflush(stdout); if (optInfo) { if (power_info.turbo.numSteps != 0) { fprintf(stdout, "Base clock:\t%.2f MHz \n", power_info.baseFrequency ); fprintf(stdout, "Minimal clock:\t%.2f MHz \n", power_info.minFrequency ); fprintf(stdout, "Turbo Boost Steps:\n"); for (int i=0; i < power_info.turbo.numSteps; i++ ) { fprintf(stdout, "C%d %.2f MHz \n",i+1, power_info.turbo.steps[i] ); } } fprintf(stdout, HLINE); fflush(stdout); } if ((cpuid_info.model == SANDYBRIDGE_EP) || (cpuid_info.model == IVYBRIDGE_EP) || (cpuid_info.model == HASWELL_EX) || (cpuid_info.model == HASWELL)) { hasDRAM = 1; } if ((cpuid_info.model == SANDYBRIDGE_EP) || (cpuid_info.model == SANDYBRIDGE) || (cpuid_info.model == IVYBRIDGE_EP) || (cpuid_info.model == IVYBRIDGE) || (cpuid_info.model == HASWELL) || (cpuid_info.model == ATOM_SILVERMONT_E) || (cpuid_info.model == ATOM_SILVERMONT_F1) || (cpuid_info.model == ATOM_SILVERMONT_F2) || (cpuid_info.model == ATOM_SILVERMONT_F3)) { hasPP0 = 1; } if ((cpuid_info.model == HASWELL) || (cpuid_info.model == SANDYBRIDGE) || (cpuid_info.model == IVYBRIDGE)) { hasPP1 = 1; } if ((cpuid_info.model != SANDYBRIDGE) && (cpuid_info.model != SANDYBRIDGE_EP) && (cpuid_info.model != IVYBRIDGE) && (cpuid_info.model != IVYBRIDGE_EP) && (cpuid_info.model != HASWELL) && (cpuid_info.model != HASWELL_M1) && (cpuid_info.model != HASWELL_M2) && (cpuid_info.model != HASWELL_EX) && (cpuid_info.model != ATOM_SILVERMONT_C) && (cpuid_info.model != ATOM_SILVERMONT_E) && (cpuid_info.model != ATOM_SILVERMONT_F1) && (cpuid_info.model != ATOM_SILVERMONT_F2) && (cpuid_info.model != ATOM_SILVERMONT_F3)) { fprintf (stderr, "RAPL not supported on this processor!\n"); exit(EXIT_FAILURE); } if (optInfo) { fprintf(stdout, "Thermal Spec Power: %g Watts \n", power_info.tdp ); fprintf(stdout, "Minimum Power: %g Watts 
\n", power_info.minPower); fprintf(stdout, "Maximum Power: %g Watts \n", power_info.maxPower); fprintf(stdout, "Maximum Time Window: %g micro sec \n", power_info.maxTimeWindow); fprintf(stdout, HLINE); fflush(stdout); exit(EXIT_SUCCESS); } if (optClock) { affinity_init(); argString = bformat("S%u:0-%u", threadsSockets[0], socketDomains[threadsSockets[0]]->numberOfProcessors-1); for (int i=1; inumberOfProcessors-1); bconcat(argString, tExpr); } numThreads = bstr_to_cpuset(threads, argString); bdestroy(argString); perfmon_init(numThreads, threads, stdout); perfmon_setupEventSet(eventString, NULL); } { PowerData pDataPkg[MAX_NUM_NODES*2]; PowerData pDataDram[MAX_NUM_NODES*2]; PowerData pDataPP0[MAX_NUM_NODES*2]; PowerData pDataPP1[MAX_NUM_NODES*2]; fprintf(stdout, "Measure on sockets: %d", threadsSockets[0]); for (int i=1; iprocessorList[0]; if (hasDRAM) power_start(&(pDataDram[i]), cpuId, DRAM); if (hasPP0) power_start(&(pDataPP0[i]), cpuId, PP0); if (hasPP1) power_start(&(pDataPP1[i]), cpuId, PP1); power_start(&(pDataPkg[i]), cpuId, PKG); } } sleep(optStethoscope); if (optClock) { perfmon_stopCounters(); perfmon_printCounterResults(); perfmon_finalize(); } else { for (int i=0; iprocessorList[0]; power_stop(&(pDataPkg[i]), cpuId, PKG); if (hasPP1) power_stop(&(pDataPP1[i]), cpuId, PP1); if (hasPP0) power_stop(&(pDataPP0[i]), cpuId, PP0); if (hasDRAM) power_stop(&(pDataDram[i]), cpuId, DRAM); } } runtime = (double) optStethoscope; } else { TimerData time; argv += optind; bstring exeString = bfromcstr(argv[0]); for (int i=1; i<(argc-optind); i++) { bconchar(exeString, ' '); bcatcstr(exeString, argv[i]); } fprintf(stdout, "Executing: %s\n",bdata(exeString)); fflush(stdout); if (optClock) { perfmon_startCounters(); } else { for (int i=0; iprocessorList[0]; if (hasDRAM) power_start(&(pDataDram[i]), cpuId, DRAM); if (hasPP0) power_start(&(pDataPP0[i]), cpuId, PP0); if (hasPP1) power_start(&(pDataPP1[i]), cpuId, PP1); power_start(&(pDataPkg[i]), cpuId, PKG); } 
timer_start(&time); } if (system(bdata(exeString)) == EOF) { fprintf(stderr, "Failed to execute %s!\n", bdata(exeString)); exit(EXIT_FAILURE); } if (optClock) { perfmon_stopCounters(); perfmon_printCounterResults(); perfmon_finalize(); } else { timer_stop(&time); for (int i=0; iprocessorList[0]; power_stop(&(pDataPkg[i]), cpuId, PKG); if (hasDRAM) power_stop(&(pDataDram[i]), cpuId, DRAM); if (hasPP0) power_stop(&(pDataPP0[i]), cpuId, PP0); if (hasPP1) power_stop(&(pDataPP1[i]), cpuId, PP1); } runtime = timer_print(&time); } } if (!optClock) { fprintf(stdout, "Runtime: %g second \n",runtime); fprintf(stdout, HLINE); for (int i=0; iprocessorList[0]); fprintf(stdout, "Domain: PKG \n"); fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataPkg[i]))); fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataPkg[i])) / runtime ); if (hasDRAM) { fprintf(stdout, "Domain: DRAM \n"); fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataDram[i]))); fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataDram[i])) / runtime ); } if (hasPP0) { fprintf(stdout, "Domain: PP0 \n"); fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataPP0[i]))); fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataPP0[i])) / runtime ); } if (hasPP1) { fprintf(stdout, "Domain: PP1 \n"); fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataPP1[i]))); fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataPP1[i])) / runtime ); } fprintf(stdout, "\n"); } fflush(stdout); } } if ( optTemp && cpuid_hasFeature(TM2)) { printf("Current core temperatures:\n"); for (i = 0; i < numSockets; i++) { printf("Socket %d\n",threadsSockets[i]); for (c = 0; c < socketDomains[threadsSockets[i]]->numberOfProcessors; c++ ) { thermal_init(i); printf("Core %d: %u C\n", socketDomains[threadsSockets[i]]->processorList[c], 
thermal_read(socketDomains[threadsSockets[i]]->processorList[c])); } } } msr_finalize(); return EXIT_SUCCESS; } likwid-3.1.3/perl/templates/000755 137545 027340 00000000000 12426160162 016251 5ustar00unrz254unrz000000 000000 likwid-3.1.3/groups/nehalem/000755 137545 027340 00000000000 12426160161 016240 5ustar00unrz254unrz000000 000000 likwid-3.1.3/groups/core2/TLB.txt000644 137545 027340 00000001621 12424452417 017031 0ustar00unrz254unrz000000 000000 SHORT TLB miss rate/ratio EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF PMC0 DTLB_MISSES_ANY PMC1 L1D_ALL_CACHE_REF METRICS Runtime (RDTSC) [s] time Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock CPI FIXC1/FIXC0 L1 DTLB request rate PMC1/FIXC0 DTLB miss rate PMC0/FIXC0 L1 DTLB miss ratio PMC0/PMC1 LONG Formulas: L1 DTLB request rate = L1D_ALL_CACHE_REF / INSTR_RETIRED_ANY DTLB miss rate = DTLB_MISSES_ANY / INSTR_RETIRED_ANY L1 DTLB miss ratio = DTLB_MISSES_ANY / L1D_ALL_CACHE_REF - L1 DTLB request rate tells you how data intensive your code is or how many Data accesses you have in average per instruction. The DTLB miss rate gives a measure how often a TLB miss occured per instruction. And finally L1 DTLB miss ratio tells you how many of your memory references required caused a TLB miss in average. likwid-3.1.3/filters/csv000755 137545 027340 00000006307 12416746770 015526 0ustar00unrz254unrz000000 000000 #!/usr/bin/perl -w use strict; use warnings; my $FILTERTYPE = 'csv'; my $SEP = ','; my $NL = "\n"; if ($#ARGV < 1) { die "Filter failed! 
Please report bug.\n"; } my $filename = $ARGV[0]; my $fileType = $ARGV[1]; my $infile = $filename; open INFILE,"< $filename"; $filename =~ s/\.tmp/\.$FILTERTYPE/; open OUTFILE,"> $filename"; if ($fileType eq 'topology') { my $region = 'topo'; print OUTFILE 'THREADS'.$NL; while () { if (/Cache Topology/) { $region = 'cache'; print OUTFILE 'CACHES'.$NL; } elsif (/NUMA Topology/) { $region = 'numa'; print OUTFILE 'NUMA'.$NL; } if ($region eq 'topo') { if (/(CPU type):\t(.*)/) { print OUTFILE $1.$SEP.$2.$NL; } elsif (/([A-Za-z ]*):\t([0-9]*)/) { print OUTFILE $1.$SEP.$2.$NL; } elsif (/(HWThread)\t(Thread)\t\t(Core)\t\t(Socket)/) { print OUTFILE $1.$SEP.$2.$SEP.$3.$SEP.$4.$NL; } elsif (/([0-9]*)\t\t([0-9]*)\t\t([0-9]*)\t\t([0-9]*)/) { print OUTFILE $1.$SEP.$2.$SEP.$3.$SEP.$4.$NL; } } elsif ($region eq 'cache') { if (/(Size):\t([0-9]*) ([kMB]*)/) { my $size = $2; if ($3 eq 'MB') { $size *= 1024; } print OUTFILE $1.'[kB]'.$SEP.$size.$NL; } elsif (/(Cache groups):\t*(.*)/) { my @groups = split('\) \(',$2); my $grpId = 0; foreach (@groups) { /([0-9 ]+)/; print OUTFILE 'Cache group '.$grpId.$SEP.$1.$NL; $grpId++; } } elsif (/(.*):\t*(.*)/) { print OUTFILE $1.$SEP.$2.$NL; } } elsif ($region eq 'numa') { if (/Domain ([0-9]*)/) { print OUTFILE 'Domain ID'.$SEP.$1.$NL; } elsif (/Memory: ([0-9.]+) MB free of total ([0-9.]+) MB/) { print OUTFILE 'Free Memory [MB]'.$SEP.$1.$NL; print OUTFILE 'Total Memory [MB]'.$SEP.$2.$NL; } elsif (/(.*):\t*[ ]*(.*)/) { print OUTFILE $1.$SEP.$2.$NL; } } } } elsif ($fileType eq 'perfctr') { my $header = 0; while () { if (/Event[ ]*\|[ ]*(core.*)\|/) { if (not $header) { my @col = split('\|',$1); my $numcol = $#col+1; print OUTFILE 'NumColumns'.$SEP.$numcol.$NL; print OUTFILE 'Event/Metric'; foreach (@col) { s/[ ]//g; print OUTFILE $SEP.$_; } print OUTFILE $NL; $header = 1; } }elsif (/STAT/) { }elsif (/\|[ ]+([A-Z0-9_]+)[ ]+\|[ ]*(.*)\|/) { my @col = split('\|',$2); print OUTFILE $1; foreach (@col) { s/[ ]//g; print OUTFILE $SEP.$_; } print OUTFILE 
$NL; } } } else { die "Filter failed! Unknown application type $fileType!\n"; } unlink($infile); close INFILE; close OUTFILE; likwid-3.1.3/perl/feedGnuplot000755 137545 027340 00000072525 12336605216 016474 0ustar00unrz254unrz000000 000000 #!/usr/bin/perl use strict; use warnings; use Getopt::Long; use Time::HiRes qw( usleep ); use IO::Handle; use List::Util qw( first ); use Text::ParseWords; use threads; use threads::shared; use Thread::Queue; use Pod::Usage; our $VERSION = '1.11'; my %options; interpretCommandline(\%options); my $gnuplotVersion = getGnuplotVersion(); # list containing the plot data. Each element is a reference to a list, representing the data for # one curve. The first 'point' is a hash describing various curve parameters. The rest are all # references to lists of (x,y) tuples my @curves = (); # list mapping curve names to their indices in the @curves list my %curveIndices = (); # now start the data acquisition and plotting threads my $dataQueue; my $xwindow; my $streamingFinished : shared = undef; if($options{stream}) { if( $options{hardcopy}) { $options{stream} = undef; } $dataQueue = Thread::Queue->new(); my $addThr = threads->create(\&mainThread); my $plotThr = threads->create(\&plotThread); while(<>) { chomp; # place every line of input to the queue, so that the plotting thread can process it. if we are # using an implicit domain (x = line number), then we send it on the data queue also, since # $. is not meaningful in the plotting thread if(!$options{domain}) { $_ .= " $."; } $dataQueue->enqueue($_); } $streamingFinished = 1; $plotThr->join(); $addThr->join(); } else { mainThread(); } sub interpretCommandline { # if I'm using a self-plotting data file with a #! line, then $ARGV[0] will contain ALL of the # options and $ARGV[1] will contain the data file to plot. In this case I need to split $ARGV[0] so # that GetOptions() can parse it correctly. On the other hand, if I'm plotting normally (not with # #!) 
a file with spaces in the filename, I don't want to split the filename. Hopefully this logic # takes care of both those cases. if (exists $ARGV[0] && !-r $ARGV[0]) { unshift @ARGV, shellwords shift @ARGV; } my $options = shift; # everything off by default: # do not stream in the data by default # point plotting by default. # no monotonicity checks by default $options{ maxcurves } = 100; GetOptions($options, 'stream!', 'domain!', 'dataid!', '3d!', 'colormap!', 'lines!', 'points!', 'circles', 'legend=s%', 'autolegend!', 'xlabel=s', 'ylabel=s', 'y2label=s', 'zlabel=s', 'title=s', 'xlen=f', 'ymin=f', 'ymax=f', 'xmin=f', 'xmax=f', 'y2min=f', 'y2max=f', 'zmin=f', 'zmax=f', 'y2=s@', 'curvestyle=s%', 'curvestyleall=s', 'extracmds=s@', 'size=s', 'square!', 'square_xy!', 'hardcopy=s', 'maxcurves=i', 'monotonic!', 'extraValuesPerPoint=i', 'help', 'dump') or pod2usage(1); # handle various cmdline-option errors if ( $options->{help} ) { pod2usage(0); } $options->{curvestyleall} = '' unless defined $options->{curvestyleall}; if ($options->{colormap}) { # colormap styles all curves with palette. 
Seems like there should be a way to do this with a # global setting, but I can't get that to work $options->{curvestyleall} .= ' palette'; } if ( $options->{'3d'} ) { if ( !$options->{domain} ) { print STDERR "--3d only makes sense with --domain\n"; exit -1; } if ( defined $options->{y2min} || defined $options->{y2max} || defined $options->{y2} ) { print STDERR "--3d does not make sense with --y2...\n"; exit -1; } if ( defined $options->{xlen} ) { print STDERR "--3d does not make sense with --xlen\n"; exit -1; } if ( defined $options->{monotonic} ) { print STDERR "--3d does not make sense with --monotonic\n"; exit -1; } } else { if(!$options->{colormap}) { if ( defined $options->{zmin} || defined $options->{zmax} || defined $options->{zlabel} ) { print STDERR "--zmin/zmax/zlabel only makes sense with --3d or --colormap\n"; exit -1; } } if ( defined $options->{square_xy} ) { print STDERR "--square_xy only makes sense with --3d\n"; exit -1; } } if(defined $options{xlen} && !defined $options{stream} ) { print STDERR "--xlen does not make sense without --stream\n"; exit -1; } # --xlen implies an order to the data, so I force monotonicity $options{monotonic} = defined $options{xlen}; } sub getGnuplotVersion { open(GNUPLOT_VERSION, 'gnuplot --version |') or die "Couldn't run gnuplot"; my ($gnuplotVersion) = =~ /gnuplot\s*(\d*\.\d*)/; if (!$gnuplotVersion) { print STDERR "Couldn't find the version of gnuplot. Does it work? Trying anyway...\n"; $gnuplotVersion = 0; } close(GNUPLOT_VERSION); return $gnuplotVersion; } sub plotThread { while(! 
$streamingFinished) { sleep(1); $dataQueue->enqueue('Plot now'); } $dataQueue->enqueue(undef); } sub mainThread { my $valuesPerPoint = 1; if($options{extraValuesPerPoint}) { $valuesPerPoint += $options{extraValuesPerPoint}; } if($options{colormap}) { $valuesPerPoint++; } if($options{circles} ) { $valuesPerPoint++; } local *PIPE; my $dopersist = ''; if($gnuplotVersion >= 4.3) { $dopersist = '--persist' if(!$options{stream}); } if(exists $options{dump}) { *PIPE = *STDOUT; } else { open PIPE, "|gnuplot $dopersist" or die "Can't initialize gnuplot\n"; } autoflush PIPE 1; my $outputfile; my $outputfileType; if( $options{hardcopy}) { $outputfile = $options{hardcopy}; ($outputfileType) = $outputfile =~ /\.(eps|ps|pdf|png)$/; if(!$outputfileType) { die("Only .eps, .ps, .pdf and .png supported\n"); } my %terminalOpts = ( eps => 'postscript solid color enhanced eps', ps => 'postscript solid color landscape 10', pdf => 'pdfcairo solid color font ",10" size 11in,8.5in', png => 'png size 1280,1024' ); print PIPE "set terminal $terminalOpts{$outputfileType}\n"; print PIPE "set output \"$outputfile\"\n"; } else { print PIPE "set terminal x11\n"; } # If a bound isn't given I want to set it to the empty string, so I can communicate it simply to # gnuplot $options{xmin} = '' unless defined $options{xmin}; $options{xmax} = '' unless defined $options{xmax}; $options{ymin} = '' unless defined $options{ymin}; $options{ymax} = '' unless defined $options{ymax}; $options{y2min} = '' unless defined $options{y2min}; $options{y2max} = '' unless defined $options{y2max}; $options{zmin} = '' unless defined $options{zmin}; $options{zmax} = '' unless defined $options{zmax}; print PIPE "set xtics\n"; if($options{y2}) { print PIPE "set ytics nomirror\n"; print PIPE "set y2tics\n"; # if any of the ranges are given, set the range print PIPE "set y2range [". $options{y2min} . ":" . $options{y2max} ."]\n" if length( $options{y2min} . 
$options{y2max} ); } # set up plotting style my $style = ''; if($options{lines}) { $style .= 'lines';} if($options{points}) { $style .= 'points';} if($options{circles}) { $options{curvestyleall} = "with circles $options{curvestyleall}"; } # if any of the ranges are given, set the range print PIPE "set xrange [". $options{xmin} . ":" . $options{xmax} ."]\n" if length( $options{xmin} . $options{xmax} ); print PIPE "set yrange [". $options{ymin} . ":" . $options{ymax} ."]\n" if length( $options{ymin} . $options{ymax} ); print PIPE "set zrange [". $options{zmin} . ":" . $options{zmax} ."]\n" if length( $options{zmin} . $options{zmax} ); print PIPE "set style data $style\n" if $style; print PIPE "set grid\n"; print(PIPE "set xlabel \"" . $options{xlabel } . "\"\n") if defined $options{xlabel}; print(PIPE "set ylabel \"" . $options{ylabel } . "\"\n") if defined $options{ylabel}; print(PIPE "set zlabel \"" . $options{zlabel } . "\"\n") if defined $options{zlabel}; print(PIPE "set y2label \"" . $options{y2label} . "\"\n") if defined $options{y2label}; print(PIPE "set title \"" . $options{title } . "\"\n") if defined $options{title}; if($options{square}) { # set a square aspect ratio. Gnuplot does this differently for 2D and 3D plots if(! $options{'3d'}) { $options{size} = '' unless defined $options{size}; $options{size} .= ' ratio -1'; } else { print(PIPE "set view equal xyz\n"); } } print(PIPE "set size $options{size}\n") if defined $options{size}; if($options{square_xy}) { print(PIPE "set view equal xy\n"); } if($options{colormap}) { print PIPE "set cbrange [". $options{zmin} . ":" . $options{zmax} ."]\n" if length( $options{zmin} . 
$options{zmax} ); } # For the specified values, set the legend entries to 'title "blah blah"' if($options{legend}) { foreach my $id (keys %{$options{legend}}) { setCurveLabel($id, $options{legend}{$id}); } } # add the extra curve options if($options{curvestyle}) { foreach my $id (keys %{$options{curvestyle}}) { addCurveOption($id, $options{curvestyle}{$id}); } } # For the values requested to be printed on the y2 axis, set that foreach (@{$options{y2}}) { addCurveOption($_, 'axes x1y2 linewidth 3'); } # add the extra global options if($options{extracmds}) { foreach (@{$options{extracmds}}) { print(PIPE "$_\n"); } } # regexp for a possibly floating point, possibly scientific notation number my $numRE = '-?\d*\.?\d+(?:[Ee][-+]?\d+)?'; # a point may be preceded by an id my $pointRE = $options{dataid} ? '(\w+)\s+' : '()'; $pointRE .= '(' . join('\s+', ($numRE) x $valuesPerPoint) . ')'; $pointRE = qr/$pointRE/; my @domain; my $haveNewData; # I should be using the // operator, but I'd like to be compatible with perl 5.8 while( $_ = (defined $dataQueue ? $dataQueue->dequeue() : <>)) { next if /^#/o; if($_ ne 'Plot now') { # parse the incoming data lines. The format is # x id0 dat0 id1 dat1 .... # where idX is the ID of the curve that datX corresponds to # # $options{domain} indicates whether the initial 'x' is given or not (if not, the line # number is used) # $options{dataid} indicates whether idX is given or not (if not, the point order in the # line is used) # 3d plots require $options{domain}, and dictate "x y" for the domain instead of just "x" if($options{domain}) { /($numRE)/go or next; $domain[0] = $1; if($options{'3d'}) { /($numRE)/go or next; $domain[1] = $1; } } else { # since $. is not meaningful in the plotting thread if we're using the data queue, we pass # $. 
on the data queue in that case if(defined $dataQueue) { s/ ([\d]+)$//o; $domain[0] = $1; } else { $domain[0] = $.; } } my $id = -1; while (/$pointRE/go) { if($1 ne '') {$id = $1;} else {$id++; } $haveNewData = 1; pushPoint(getCurve($id), [@domain, split( /\s+/, $2)]); } } elsif($options{stream}) { # only redraw a streaming plot if there's new data to plot next unless $haveNewData; $haveNewData = undef; if( $options{xlen} ) { pruneOldData($domain[0] - $options{xlen}); plotStoredData($domain[0] - $options{xlen}, $domain[0]); } else { plotStoredData(); } } } # finished reading in all. Plot what we have plotStoredData(); if ( $options{hardcopy}) { print PIPE "set output\n"; # sleep until the plot file exists, and it is closed. Sometimes the output is # still being written at this point usleep(100_000) until -e $outputfile; usleep(100_000) until(system("fuser -s \"$outputfile\"")); print "Wrote output to $outputfile\n"; return; } # we persist gnuplot, so we shouldn't need this sleep. However, once # gnuplot exits, but the persistent window sticks around, you can no # longer interactively zoom the plot. So we still sleep sleep(100000); } sub pruneOldData { my ($oldestx) = @_; foreach my $xy (@curves) { if( @$xy > 1 ) { if( my $firstInWindow = first {$xy->[$_][0] >= $oldestx} 1..$#$xy ) { splice( @$xy, 1, $firstInWindow-1 ); } else { splice( @$xy, 1); } } } } sub plotStoredData { my ($xmin, $xmax) = @_; print PIPE "set xrange [$xmin:$xmax]\n" if defined $xmin; # get the options for those curves that have any data my @nonemptyCurves = grep {@$_ > 1} @curves; my @extraopts = map {$_->[0]{options}} @nonemptyCurves; my $body = join(', ' , map({ '"-"' . $_} @extraopts) ); if($options{'3d'}) { print PIPE "splot $body\n"; } else { print PIPE "plot $body\n"; } foreach my $buf (@nonemptyCurves) { # send each point to gnuplot. 
Ignore the first "point" since it's the # curve options for my $elem (@{$buf}[1..$#$buf]) { print PIPE "@$elem\n"; } print PIPE "e\n"; } } sub updateCurveOptions { # generates the 'options' string for a curve, based on its legend title and its other options # These could be integrated into a single string, but that raises an issue in the no-title # case. When no title is specified, gnuplot will still add a legend entry with an unhelpful '-' # label. Thus I explicitly do 'notitle' for that case my ($curveoptions, $id) = @_; # use the given title, unless we're generating a legend automatically. Given titles # override autolegend my $title; if(defined $curveoptions->{title}) { $title = $curveoptions->{title}; } elsif( $options{autolegend} ) { $title = $id; } my $titleoption = defined $title ? "title \"$title\"" : "notitle"; my $extraoption = defined $options{curvestyleall} ? $options{curvestyleall} : ''; $curveoptions->{options} = "$titleoption $curveoptions->{extraoptions} $extraoption"; } sub getCurve { # This function returns the curve corresponding to a particular label, creating a new curve if # necessary if(scalar @curves >= $options{maxcurves}) { print STDERR "Tried to exceed the --maxcurves setting.\n"; print STDERR "Invoke with a higher --maxcurves limit if you really want to do this.\n"; exit; } my ($id) = @_; if( !exists $curveIndices{$id} ) { push @curves, [{extraoptions => ' '}]; # push a curve with no data and no options $curveIndices{$id} = $#curves; updateCurveOptions($curves[$#curves][0], $id); } return $curves[$curveIndices{$id}]; } sub addCurveOption { my ($id, $str) = @_; my $curve = getCurve($id); $curve->[0]{extraoptions} .= "$str "; updateCurveOptions($curve->[0], $id); } sub setCurveLabel { my ($id, $str) = @_; my $curve = getCurve($id); $curve->[0]{title} = $str; updateCurveOptions($curve->[0], $id); } # function to add a point to the plot. 
# The curve itself (not an index) is handed in, so the caller must already have
# obtained it. Slot 0 of a curve holds its option hash; the stored points follow.
#
#   $curve - array reference of the curve to extend
#   $xy    - array reference holding the new point; its first element is the
#            value compared against the newest stored point under --monotonic
sub pushPoint
{
  my ($curve, $xy) = @_;

  # With monotonicity enforced, a new point whose x-coordinate lies before the
  # newest stored one means the data went back in time: throw away every stored
  # point of this curve (the option hash in slot 0 is kept) and start anew.
  my $wentBackwards =
    $options{monotonic}
    && @$curve > 1
    && $xy->[0] < $curve->[-1][0];

  splice( @$curve, 1 ) if $wentBackwards;

  push @$curve, $xy;
}

__END__

=head1 NAME

feedGnuplot - A pipe-oriented frontend to Gnuplot

=head1 SYNOPSIS

Simple plotting of stored data:

 $ seq 5 | awk '{print 2*$1, $1*$1}'
 2 1
 4 4
 6 9
 8 16
 10 25

 $ seq 5 | awk '{print 2*$1, $1*$1}' |
   feedGnuplot --lines --points --legend 0="data 0" --title "Test plot" --y2 1

Simple real-time plotting example: plot how much data is received on the wlan0
network interface in bytes/second (uses bash, awk and Linux):

 $ while true; do sleep 1; cat /proc/net/dev; done |
   awk '/wlan0/ {if(b) {print $2-b; fflush()} b=$2}' |
   feedGnuplot --lines --stream --xlen 10 --ylabel 'Bytes/sec' --xlabel seconds

=head1 DESCRIPTION

This is a flexible, command-line-oriented frontend to Gnuplot. It creates
plots from data coming in on STDIN or given in a filename passed on the
commandline. Various data representations are supported, as is hardcopy
output and streaming display of live data. A simple example:

 $ seq 5 | awk '{print 2*$1, $1*$1}' | feedGnuplot

You should see a plot with two curves. The C<awk> command generates some data
to plot and the C<feedGnuplot> reads it in from STDIN and generates the plot.
The C<awk> invocation is just an example; more interesting things would be
plotted in normal usage. No commandline-options are required for the most
basic plotting. Input parsing is flexible; every line need not have the same
number of points. New curves will be created as needed.

The most commonly used functionality of gnuplot is supported directly by the
script. Anything not directly supported can still be done with the
C<--extracmds> and C<--curvestyle> options. Arbitrary gnuplot commands can be
passed in with C<--extracmds>.
For example, to turn off the grid, pass in C<--extracmds 'unset grid'>. As many of these options as needed can be passed in. To add arbitrary curve styles, use C<--curvestyle curveID=extrastyle>. Pass these more than once to affect more than one curve. To apply an extra style to I<all> the curves, pass in C<--curvestyleall extrastyle>. =head2 Data formats By default, each value present in the incoming data represents a distinct data point, as demonstrated in the original example above (we had 10 numbers in the input and 10 points in the plot). If requested, the script supports more sophisticated interpretation of input data. =head3 Domain selection If C<--domain> is passed in, the first value on each line of input is interpreted as the I<X>-value for the rest of the data on that line. Without C<--domain> the I<X>-value is the line number, and the first value on a line is a plain data point like the others. Default is C<--nodomain>. Thus the original example above produces 2 curves, with B<1,2,3,4,5> as the I<X>-values. If we run the same command with --domain: $ seq 5 | awk '{print 2*$1, $1*$1}' | feedGnuplot --domain we get only 1 curve, with B<2,4,6,8,10> as the I<X>-values. As many points as desired can appear on a single line, but all points on a line are associated with the I<X>-value at the start of that line. =head3 Curve indexing By default, each column represents a separate curve. This is fine unless sparse data is to be plotted. With the C<--dataid> option, each point is represented by 2 values: a string identifying the curve, and the value itself. If we add C<--dataid> to the original example: $ seq 5 | awk '{print 2*$1, $1*$1}' | feedGnuplot --dataid --autolegend we get 5 different curves with one point in each. The first column, as produced by C<awk>, is B<2,4,6,8,10>. These are interpreted as the IDs of the curves to be plotted. The C<--autolegend> option adds a legend using the given IDs to label the curves. The IDs need not be numbers; generic strings are accepted.
As many points as desired can appear on a single line. C<--domain> can be used in conjunction with C<--dataid>. =head3 Multi-value style support Depending on how gnuplot is plotting the data, more than one value may be needed to represent a single point. For example, the script has support to plot all the data with C<--circles>. This requires a radius to be specified for each point in addition to the position of the point. Thus, when plotting with C<--circles>, 2 numbers are read for each data point instead of 1. A similar situation exists with C<--colormap> where each point contains the position I<and> the color. There are other gnuplot styles that require more data (such as error bars), but none of these are directly supported by the script. They can still be used, though, by specifying the specific style with C<--curvestyle>, and specifying how many extra values are needed for each point with C<--extraValuesPerPoint extra>. C<--extraValuesPerPoint> is ONLY needed for the styles not explicitly supported; supported styles set that variable automatically. =head3 3D data To plot 3D data, pass in C<--3d>. C<--domain> MUST be given when plotting 3D data to avoid domain ambiguity. If 3D data is being plotted, there are by definition 2 domain values instead of one (I<Z> as a function of I<X> and I<Y> instead of I<Y> as a function of I<X>). Thus the first 2 values on each line are interpreted as the domain instead of just 1. The rest of the processing happens the same way as before. =head2 Real-time streaming data To plot real-time data, pass in the C<--stream> option. Data will then be plotted as it is received, with the refresh rate limited to 1Hz (currently hard-coded). To plot only the most recent data (instead of I<all> the data), C<--xlen windowsize> can be given. This will create a constantly-updating, scrolling view of the recent past. C<windowsize> should be replaced by the desired length of the domain window to plot, in domain units (passed-in values if C<--domain> or line numbers otherwise).
=head2 Hardcopy output The script is able to produce hardcopy output with C<--hardcopy outputfile>. The output type is inferred from the filename with B<.ps>, B<.eps>, B<.pdf> and B<.png> currently supported. =head2 Self-plotting data files This script can be used to enable self-plotting data files. There are 2 ways of doing this: with a shebang (#!) or with inline perl data. =head3 Self-plotting data with a #! A self-plotting, executable data file C<data> is formatted as $ cat data #!/usr/bin/feedGnuplot --lines --points 2 1 4 4 6 9 8 16 10 25 12 36 14 49 16 64 18 81 20 100 22 121 24 144 26 169 28 196 30 225 This is the shebang (#!) line followed by the data, formatted as before. The data file can be plotted simply with $ ./data The caveats here are that on Linux the whole #! line is limited to 127 characters and that the full path to feedGnuplot must be given. The 127 character limit is a serious limitation, but this can likely be resolved with a kernel patch. I have only tried on Linux 2.6. =head3 Self-plotting data with perl inline data Perl supports storing data and code in the same file. This can also be used to create self-plotting files: $ cat plotdata.pl #!/usr/bin/perl use strict; use warnings; open PLOT, "| feedGnuplot --lines --points" or die "Couldn't open plotting pipe"; while( <DATA> ) { my @xy = split; print PLOT "@xy\n"; } __DATA__ 2 1 4 4 6 9 8 16 10 25 12 36 14 49 16 64 18 81 20 100 22 121 24 144 26 169 28 196 30 225 This is especially useful if the logged data is not in a format directly supported by feedGnuplot. Raw data can be stored after the __DATA__ directive, with a small perl script to manipulate the data into a useable format and send it to the plotter. =head1 ARGUMENTS --[no]domain If enabled, the first element of each line is the domain variable. If not, the point index is used --[no]dataid If enabled, each data point is preceded by the ID of the data set that point corresponds to. This ID is interpreted as a string, NOT as just a number.
If not enabled, the order of the point is used. As an example, if line 3 of the input is "0 9 1 20" '--nodomain --nodataid' would parse the 4 numbers as points in 4 different curves at x=3 '--domain --nodataid' would parse the 4 numbers as points in 3 different curves at x=0. Here, 0 is the x-variable and 9,1,20 are the data values '--nodomain --dataid' would parse the 4 numbers as points in 2 different curves at x=3. Here 0 and 1 are the data IDs and 9 and 20 are the data values '--domain --dataid' would parse the 4 numbers as a single point at x=0. Here 9 is the data ID and 1 is the data value. 20 is an extra value, so it is ignored. If another value followed 20, we'd get another point in curve ID 20 --[no]3d Do [not] plot in 3D. This only makes sense with --domain. Each domain here is an (x,y) tuple --colormap Show a colormapped xy plot. Requires extra data for the color. zmin/zmax can be used to set the extents of the colors. Automatically increments extraValuesPerPoint --[no]stream Do [not] display the data a point at a time, as it comes in --[no]lines Do [not] draw lines to connect consecutive points --[no]points Do [not] draw points --circles Plot with circles. This requires a radius be specified for each point. Automatically increments extraValuesPerPoint --xlabel xxx Set x-axis label --ylabel xxx Set y-axis label --y2label xxx Set y2-axis label. Does not apply to 3d plots --zlabel xxx Set z-axis label. Only applies to 3d plots --title xxx Set the title of the plot --legend curveID=legend Set the label for a curve plot. Use this option multiple times for multiple curves. With --dataid, curveID is the ID. Otherwise, it's the index of the curve, starting at 0 --autolegend Use the curve IDs for the legend. Titles given with --legend override these --xlen xxx When using --stream, sets the size of the x-window to plot. Omit this or set it to 0 to plot ALL the data. Does not make sense with 3d plots. Implies --monotonic --xmin xxx Set the range for the x axis.
These are ignored in a streaming plot --xmax xxx Set the range for the x axis. These are ignored in a streaming plot --ymin xxx Set the range for the y axis. --ymax xxx Set the range for the y axis. --y2min xxx Set the range for the y2 axis. Does not apply to 3d plots. --y2max xxx Set the range for the y2 axis. Does not apply to 3d plots. --zmin xxx Set the range for the z axis. Only applies to 3d plots or colormaps. --zmax xxx Set the range for the z axis. Only applies to 3d plots or colormaps. --y2 xxx Plot the data specified by this curve ID on the y2 axis. Without --dataid, the ID is just an ordered 0-based index. Does not apply to 3d plots. --curvestyle curveID=style Additional styles per curve. With --dataid, curveID is the ID. Otherwise, it's the index of the curve, starting at 0. Use this option multiple times for multiple curves --curvestyleall xxx Additional styles for ALL curves. --extracmds xxx Additional commands. These could contain extra global styles for instance --size xxx Gnuplot size option --square Plot data with aspect ratio 1. For 3D plots, this controls the aspect ratio for all 3 axes --square_xy For 3D plots, set square aspect ratio for ONLY the x,y axes --hardcopy xxx If not streaming, output to a file specified here. Format inferred from filename --maxcurves xxx The maximum allowed number of curves. This is 100 by default, but can be reset with this option. This exists purely to prevent perl from allocating all of the system's memory when reading bogus data --monotonic If --domain is given, checks to make sure that the x- coordinate in the input data is monotonically increasing. If a given x-variable is in the past, all data currently cached for this curve is purged. Without --monotonic, all data is kept. Does not make sense with 3d plots. No --monotonic by default. --extraValuesPerPoint xxx How many extra values are given for each data point. 
Normally this is 0, and does not need to be specified, but sometimes we want extra data, like for colors or point sizes or error bars, etc. feedGnuplot options that require this (colormap, circles) automatically set it. This option is ONLY needed if unknown styles are used, with --curvestyleall for instance --dump Instead of printing to gnuplot, print to STDOUT. For debugging. =head1 ACKNOWLEDGEMENT This program is originally based on the driveGnuPlots.pl script from Thanassis Tsiodras. It is available from his site at L =head1 REPOSITORY L =head1 AUTHOR Dima Kogan, C<< >> =head1 LICENSE AND COPYRIGHT Copyright 2011 Dima Kogan. This program is free software; you can redistribute it and/or modify it under the terms of either: the GNU General Public License as published by the Free Software Foundation; or the Artistic License. See http://dev.perl.org/licenses/ for more information. =cut likwid-3.1.3/src/includes/perfmon_types.h000644 137545 027340 00000010327 12426160352 020755 0ustar00unrz254unrz000000 000000 /* * ======================================================================================= * * Filename: perfmon_types.h * * Description: Header File of perfmon module. * Configures and reads out performance counters * on x86 based architectures. Supports multi threading. * * Version: 3.1.3 * Released: 4.11.2014 * * Author: Jan Treibig (jt), jan.treibig@gmail.com * Project: likwid * * Copyright (C) 2014 Jan Treibig * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License along with * this program. If not, see . * * ======================================================================================= */ #ifndef PERFMON_TYPES_H #define PERFMON_TYPES_H #include #include /* ##### EXPORTED TYPE DEFINITIONS #################################### */ typedef enum { PMC0 = 0, PMC1, PMC2, PMC3, PMC4, PMC5, PMC6, PMC7, PMC8, PMC9, PMC10, PMC11, PMC12, PMC13, PMC14, PMC15, PMC16, PMC17, PMC18, PMC19, PMC20, PMC21, PMC22, PMC23, PMC24, PMC25, PMC26, PMC27, PMC28, PMC29, PMC30, PMC31, PMC32, PMC33, PMC34, PMC35, PMC36, PMC37, PMC38, PMC39, PMC40, PMC41, PMC42, PMC43, PMC44, PMC45, PMC46, PMC47, PMC48, PMC49, PMC50, PMC51, PMC52, PMC53, PMC54, PMC55, PMC56, PMC57, PMC58, PMC59, PMC60, PMC61, PMC62, PMC63, PMC64, PMC65, PMC66, PMC67, PMC68, PMC69, PMC70, PMC71, PMC72, PMC73, PMC74, PMC75, PMC76, PMC77, PMC78, PMC79, PMC80, PMC81, PMC82, PMC83, PMC84, PMC85, PMC86, PMC87, PMC88, PMC89, PMC90, PMC91, PMC92, PMC93, PMC94, PMC95, PMC96, PMC97, PMC98, PMC99, PMC100, PMC101, PMC102, PMC103, PMC104, PMC105, PMC106, PMC107, PMC108, NUM_PMC} PerfmonCounterIndex; typedef enum { PMC = 0, FIXED, THERMAL, UNCORE, MBOX0, MBOX1, MBOX2, MBOX3, MBOXFIX, BBOX0, BBOX1, RBOX0, RBOX1, WBOX, SBOX0, SBOX1, SBOX2, CBOX0, CBOX1, CBOX2, CBOX3, CBOX4, CBOX5, CBOX6, CBOX7, CBOX8, CBOX9, CBOX10, CBOX11, CBOX12, CBOX13, CBOX14, PBOX, POWER, UBOX, NUM_UNITS} PerfmonType; typedef struct { char* key; PerfmonCounterIndex index; PerfmonType type; uint64_t configRegister; uint64_t counterRegister; uint64_t counterRegister2; PciDeviceIndex device; } PerfmonCounterMap; typedef struct { const char* key; PerfmonGroup index; int isUncore; const char* info; const char* config; int derivedCounters; const char ** derivedCounterNames; } PerfmonGroupMap; typedef struct { char* key; char* msg; } PerfmonGroupHelp; /* only used in westmereEX at the moment */ typedef struct { uint32_t ctrlRegister; uint32_t statusRegister; 
uint32_t ovflRegister; } PerfmonUnit; typedef struct { int init; int id; /* TODO id is only used for EX type processors */ double counterData; } PerfmonCounter; typedef struct { int processorId; PerfmonCounter counters[NUM_PMC]; } PerfmonThread; typedef struct { const char* name; const char* limit; uint16_t eventId; uint8_t umask; uint8_t cfgBits; uint8_t cmask; } PerfmonEvent; typedef struct { PerfmonEvent event; PerfmonCounterIndex index; double* result; } PerfmonEventSetEntry; typedef struct { int numberOfEvents; PerfmonEventSetEntry* events; } PerfmonEventSet; typedef struct { bstring label; double* value; } PerfmonResult; typedef struct { bstrList* header; int numRows; int numColumns; PerfmonResult* rows; } PerfmonResultTable; #endif /*PERFMON_TYPES_H*/ likwid-3.1.3/groups/sandybridge/000755 137545 027340 00000000000 12426160161 017122 5ustar00unrz254unrz000000 000000 likwid-3.1.3/groups/nehalem/BRANCH.txt000644 137545 027340 00000002056 12336605216 017746 0ustar00unrz254unrz000000 000000 SHORT Branch prediction miss rate/ratio EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF PMC0 BR_INST_RETIRED_ALL_BRANCHES PMC1 BR_MISP_RETIRED_ALL_BRANCHES METRICS Runtime (RDTSC) [s] time Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock CPI FIXC1/FIXC0 Branch rate PMC0/FIXC0 Branch misprediction rate PMC1/FIXC0 Branch misprediction ratio PMC1/PMC0 Instructions per branch FIXC0/PMC0 LONG Formulas: Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES - The rates state how often in average a branch or a mispredicted branch occured per instruction retired in total. 
The Branch misprediction ratio sets directly into relation what ratio of all branch instructions were mispredicted. Instructions per branch is 1/Branch rate. likwid-3.1.3/groups/nehalem/L3CACHE.txt000644 137545 027340 00000002256 12336605216 020015 0ustar00unrz254unrz000000 000000 SHORT L3 cache miss rate/ratio EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF UPMC0 UNC_L3_HITS_ANY UPMC1 UNC_L3_MISS_ANY UPMC2 UNC_L3_LINES_IN_ANY UPMC3 UNC_L3_LINES_OUT_ANY METRICS Runtime (RDTSC) [s] time Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock CPI FIXC1/FIXC0 L3 request rate UPMC0/FIXC0 L3 miss rate UPMC1/FIXC0 L3 miss ratio UPMC1/UPMC0 LONG Formulas: L3 request rate = UNC_L3_HITS_ANY / INSTR_RETIRED_ANY L3 miss rate = UNC_L3_MISS_ANY / INSTR_RETIRED_ANY L3 miss ratio = UNC_L3_MISS_ANY / UNC_L3_HITS_ANY - This group measures the locality of your data accesses with regard to the L3 Cache. L3 request rate tells you how data intensive your code is or how many Data accesses you have on average per instruction. The L3 miss rate gives a measure how often it was necessary to get cachelines from memory. And finally L3 miss ratio tells you how many of your memory references required a cacheline to be loaded from a higher level. While the L3 miss rate might be given by your algorithm you should try to get the L3 miss ratio as low as possible by increasing your cache reuse.
likwid-3.1.3/src/loadData.s000644 137545 027340 00000000436 12417470124 016002 0ustar00unrz254unrz000000 000000 .intel_syntax noprefix .text .globl _loadData .type _loadData, @function _loadData : xor rax, rax .align 16 1: mov r8, [rsi + rax] mov r9, [rsi + rax + 64] mov r10, [rsi + rax + 128] mov r11, [rsi + rax + 192] add rax, 256 cmp rax, rdi jb 1b ret .size _loadData, .-_loadData likwid-3.1.3/src/threads.c000644 137545 027340 00000012320 12426160352 015675 0ustar00unrz254unrz000000 000000 /* * ======================================================================================= * * Filename: threads.c * * Description: High level interface to pthreads * * Version: 3.1.3 * Released: 4.11.2014 * * Author: Jan Treibig (jt), jan.treibig@gmail.com * Project: likwid * * Copyright (C) 2014 Jan Treibig * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see . 
* * ======================================================================================= */ /* ##### HEADER FILE INCLUDES ######################################### */ #include #include #include #include #include /* ##### EXPORTED VARIABLES ########################################### */ pthread_barrier_t threads_barrier; ThreadData* threads_data; ThreadGroup* threads_groups; /* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */ static pthread_t* threads = NULL; static pthread_attr_t attr; static int numThreads = 0; /* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */ void threads_init(FILE* OUTSTREAM, int numberOfThreads) { int i; numThreads = numberOfThreads; threads = (pthread_t*) malloc(numThreads * sizeof(pthread_t)); threads_data = (ThreadData*) malloc(numThreads * sizeof(ThreadData)); for(i = 0; i < numThreads; i++) { threads_data[i].numberOfThreads = numThreads; threads_data[i].globalNumberOfThreads = numThreads; threads_data[i].globalThreadId = i; threads_data[i].threadId = i; threads_data[i].output = OUTSTREAM; } pthread_barrier_init(&threads_barrier, NULL, numThreads); pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); } void threads_create(void *(*startRoutine)(void*)) { int i; for(i = 0; i < numThreads; i++) { pthread_create(&threads[i], &attr, startRoutine, (void*) &threads_data[i]); } } void threads_createGroups(int numberOfGroups) { int i; int j; int numThreadsPerGroup; int globalId = 0; if (numThreads % numberOfGroups) { ERROR_PRINT(Not enough threads %d to create %d groups,numThreads,numberOfGroups); } else { numThreadsPerGroup = numThreads / numberOfGroups; } threads_groups = (ThreadGroup*) malloc(numberOfGroups * sizeof(ThreadGroup)); for (i = 0; i < numberOfGroups; i++) { threads_groups[i].numberOfThreads = numThreadsPerGroup; threads_groups[i].threadIds = (int*) malloc(numThreadsPerGroup * sizeof(int)); for (j = 0; j < numThreadsPerGroup; j++) { 
threads_data[globalId].threadId = j; threads_data[globalId].groupId = i; threads_data[globalId].numberOfGroups = numberOfGroups; threads_data[globalId].numberOfThreads = numThreadsPerGroup; threads_groups[i].threadIds[j] = globalId++; } } } void threads_registerDataAll(ThreadUserData* data, threads_copyDataFunc func) { int i; if (func == NULL) { for(i = 0; i < numThreads; i++) { threads_data[i].data = (*data); } } else { for(i = 0; i < numThreads; i++) { func( data, &threads_data[i].data); } } } void threads_registerDataThread(int threadId, ThreadUserData* data, threads_copyDataFunc func) { if (func == NULL) { threads_data[threadId].data = (*data); } else { func( data, &threads_data[threadId].data); } } void threads_registerDataGroup(int groupId, ThreadUserData* data, threads_copyDataFunc func) { int i; if (func == NULL) { for (i = 0; i < threads_groups[groupId].numberOfThreads; i++) { threads_data[threads_groups[groupId].threadIds[i]].data = (*data); } } else { for (i = 0; i < threads_groups[groupId].numberOfThreads; i++) { func( data, &threads_data[threads_groups[groupId].threadIds[i]].data); } } } void threads_join(void) { int i; for(i=0; i < numThreads; i++) { pthread_join(threads[i], NULL); } pthread_attr_destroy(&attr); pthread_barrier_destroy(&threads_barrier); } void threads_destroy(int numberOfGroups) { int i; free(threads_data); for(i=0;i. 
# # ======================================================================================= EVENT_DISPATCHED_FP_OP 0x00 PMC UMASK_DISPATCHED_FP_OP_ADD_PIPE 0x01 UMASK_DISPATCHED_FP_OP_MULTIPLY_PIPE 0x02 UMASK_DISPATCHED_FP_OP_STORE_PIPE 0x04 UMASK_DISPATCHED_FP_OP_ADD_PIPE_LOAD 0x08 UMASK_DISPATCHED_FP_OP_MULTIPLY_PIPE_LOAD 0x10 UMASK_DISPATCHED_FP_OP_STORE_PIPE_LOAD 0x20 EVENT_FPU_EMPTY 0x01 PMC UMASK_FPU_EMPTY 0x00 EVENT_DISPATCHED_FAST_FPU 0x02 PMC UMASK_DISPATCHED_FAST_FPU 0x00 EVENT_SSE_RETIRED 0x03 PMC UMASK_SSE_RETIRED_ADD_SINGLE_UOPS 0x01 UMASK_SSE_RETIRED_MULT_SINGLE_UOPS 0x02 UMASK_SSE_RETIRED_DIV_SINGLE_UOPS 0x04 UMASK_SSE_RETIRED_ADD_DOUBLE_UOPS 0x08 UMASK_SSE_RETIRED_MULT_DOUBLE_UOPS 0x10 UMASK_SSE_RETIRED_DIV_DOUBLE_UOPS 0x20 UMASK_SSE_RETIRED_ADD_SINGLE_FLOPS 0x41 UMASK_SSE_RETIRED_MULT_SINGLE_FLOPS 0x42 UMASK_SSE_RETIRED_DIV_SINGLE_FLOPS 0x44 UMASK_SSE_RETIRED_ADD_DOUBLE_FLOPS 0x48 UMASK_SSE_RETIRED_MULT_DOUBLE_FLOPS 0x50 UMASK_SSE_RETIRED_DIV_DOUBLE_FLOPS 0x60 EVENT_MOVE_RETIRED 0x04 PMC UMASK_MOVE_RETIRED_LOW_MERGE 0x01 UMASK_MOVE_RETIRED_HIGH_MERGE 0x02 UMASK_MOVE_RETIRED_MERGE 0x04 UMASK_MOVE_RETIRED_ALL 0x08 EVENT_SERIAL_RETIRED 0x05 PMC UMASK_SERIAL_RETIRED_SSE_BOTTOM_EXE 0x01 UMASK_SERIAL_RETIRED_SSE_BOTTOM_SERIAL 0x02 UMASK_SERIAL_RETIRED_X87_BOTTOM_EXE 0x04 UMASK_SERIAL_RETIRED_X87_BOTTOM_SERIAL 0x08 EVENT_SERIAL_CYCLES_FP 0x06 PMC UMASK_SERIAL_CYCLES_FP_EXE 0x01 UMASK_SERIAL_CYCLES_FP_SERIAL 0x02 EVENT_PIPELINE_RESTART_SELF_MOD 0x21 PMC UMASK_PIPELINE_RESTART_SELF_MOD 0x00 EVENT_PIPELINE_RESTART_PROBE_HIT 0x22 PMC UMASK_PIPELINE_RESTART_PROBE_HIT 0x00 EVENT_LS_BUFFER_FULL 0x23 PMC UMASK_LS_BUFFER_FULL 0x00 EVENT_LOCKED_OPERATION 0x24 PMC UMASK_LOCKED_OPERATION_INSTRUCTIONS 0x01 UMASK_LOCKED_OPERATION_SPECULATIVE_CYC 0x02 UMASK_LOCKED_OPERATION_NON_SPECULATIVE_CYC 0x04 UMASK_LOCKED_OPERATION_WAIT_CACHE_HIT_CYC 0x08 EVENT_RETIRED_CLFLUSH 0x26 PMC UMASK_RETIRED_CLFLUSH 0x00 EVENT_RETIRED_CPUID 0x27 PMC UMASK_RETIRED_CPUID 0x00 
EVENT_STORE_TO_LOAD_FORWARD_CANCEL_ADDRESS_MISMATCH 0x2A PMC UMASK_STORE_TO_LOAD_FORWARD_CANCEL_ADDRESS_MISMATCH 0x01 UMASK_STORE_TO_LOAD_FORWARD_CANCEL_SIZE_MISMATCH 0x02 UMASK_STORE_TO_LOAD_FORWARD_CANCEL_MISALIGNED 0x04 EVENT_NUM_SMI 0x2B PMC UMASK_NUM_SMI 0x00 EVENT_DATA_CACHE_ACCESSES 0x40 PMC UMASK_DATA_CACHE_ACCESSES 0x00 EVENT_DATA_CACHE_MISSES 0x41 PMC UMASK_DATA_CACHE_MISSES 0x00 EVENT_DATA_CACHE_REFILLS 0x42 PMC UMASK_DATA_CACHE_REFILLS_NORTHBRIDGE 0x01 UMASK_DATA_CACHE_REFILLS_L2_SHARED 0x02 UMASK_DATA_CACHE_REFILLS_L2_EXCLUSIVE 0x04 UMASK_DATA_CACHE_REFILLS_L2_OWNED 0x08 UMASK_DATA_CACHE_REFILLS_L2_MODIFIED 0x10 UMASK_DATA_CACHE_REFILLS_L2_ALL 0x1E EVENT_DATA_CACHE_REFILLS_NORTHBRIDGE 0x43 PMC UMASK_DATA_CACHE_REFILLS_NORTHBRIDGE_INVALID 0x01 UMASK_DATA_CACHE_REFILLS_NORTHBRIDGE_SHARED 0x02 UMASK_DATA_CACHE_REFILLS_NORTHBRIDGE_EXCLUSIVE 0x04 UMASK_DATA_CACHE_REFILLS_NORTHBRIDGE_OWNED 0x08 UMASK_DATA_CACHE_REFILLS_NORTHBRIDGE_MODIFIED 0x10 UMASK_DATA_CACHE_REFILLS_NORTHBRIDGE_ALL 0x1E EVENT_DATA_CACHE_EVICTED 0x44 PMC UMASK_DATA_CACHE_EVICTED_INVALID 0x01 UMASK_DATA_CACHE_EVICTED_SHARED 0x02 UMASK_DATA_CACHE_EVICTED_EXCLUSIVE 0x04 UMASK_DATA_CACHE_EVICTED_OWNED 0x08 UMASK_DATA_CACHE_EVICTED_MODIFIED 0x10 UMASK_DATA_CACHE_EVICTED_PREFETCH_NTA 0x20 UMASK_DATA_CACHE_EVICTED_NOT_PREFETCH_NTA 0x40 UMASK_DATA_CACHE_EVICTED_ALL 0x1F EVENT_DTLB_L2_HIT 0x45 PMC UMASK_DTLB_L2_HIT_4K 0x01 UMASK_DTLB_L2_HIT_2M 0x02 UMASK_DTLB_L2_HIT_1G 0x04 UMASK_DTLB_L2_HIT_ALL 0x07 EVENT_DTLB_L2_MISS 0x46 PMC UMASK_DTLB_L2_MISS_4K 0x01 UMASK_DTLB_L2_MISS_2M 0x02 UMASK_DTLB_L2_MISS_1G 0x04 UMASK_DTLB_L2_MISS_ALL 0x07 EVENT_MISALIGNED_ACCESS 0x47 PMC UMASK_MISALIGNED_ACCESS 0x00 EVENT_LATE_CANCEL_ACCESS 0x48 PMC UMASK_LATE_CANCEL_ACCESS 0x00 EVENT_EARLY_CANCEL_ACCESS 0x49 PMC UMASK_EARLY_CANCEL_ACCESS 0x00 EVENT_SINGLE_BIT_ERRORS 0x4A PMC UMASK_SINGLE_BIT_ERRORS_SCRUBBER 0x01 UMASK_SINGLE_BIT_ERRORS_PIGGYBACK 0x02 UMASK_SINGLE_BIT_ERRORS_LOAD_PIPE 0x04 
UMASK_SINGLE_BIT_ERRORS_STORE_PIPE 0x08 EVENT_PREFETCH_INSTRUCTION_DISPATCHED 0x4B PMC UMASK_PREFETCH_INSTRUCTION_DISPATCHED_LOAD 0x01 UMASK_PREFETCH_INSTRUCTION_DISPATCHED_STORE 0x02 UMASK_PREFETCH_INSTRUCTION_DISPATCHED_NTA 0x04 EVENT_DCACHE_LOCK_MISS 0x4C PMC UMASK_DCACHE_LOCK_MISS 0x02 EVENT_DTLB_L1_HIT 0x4D PMC UMASK_DTLB_L1_HIT_4K 0x01 UMASK_DTLB_L1_HIT_2M 0x02 UMASK_DTLB_L1_HIT_1G 0x04 EVENT_SW_PREFETCH_HIT 0x52 PMC UMASK_SW_PREFETCH_HIT_L1 0x01 UMASK_SW_PREFETCH_HIT_L2 0x08 EVENT_GLOBAL_TLB_FLUSH 0x54 PMC UMASK_GLOBAL_TLB_FLUSH 0x00 EVENT_MEMORY_REQUEST 0x65 PMC UMASK_MEMORY_REQUEST_NON_CACHEABLE 0x01 UMASK_MEMORY_REQUEST_WRITE_COMBINED 0x02 UMASK_MEMORY_REQUEST_STREAMING_STORE 0x80 EVENT_DATA_PREFETCHER 0x67 PMC UMASK_DATA_PREFETCHER_CANCELED 0x01 UMASK_DATA_PREFETCHER_ATTEMPTS 0x02 EVENT_NORTHBRIDGE_READ_RESPONSE 0x6C PMC UMASK_NORTHBRIDGE_READ_RESPONSE_EXCLUSIVE 0x01 UMASK_NORTHBRIDGE_READ_RESPONSE_MODIFIED 0x02 UMASK_NORTHBRIDGE_READ_RESPONSE_SHARED 0x04 UMASK_NORTHBRIDGE_READ_RESPONSE_ALL 0x07 UMASK_NORTHBRIDGE_READ_RESPONSE_OWNED 0x08 UMASK_NORTHBRIDGE_READ_RESPONSE_DATA_ERROR 0x10 EVENT_OCTWORDS_WRITE_TRANSFERS 0x6D PMC UMASK_OCTWORDS_WRITE_TRANSFERS 0x01 EVENT_CPU_CLOCKS_UNHALTED 0x76 PMC UMASK_CPU_CLOCKS_UNHALTED 0x00 EVENT_L2_REQUESTS 0x7D PMC UMASK_L2_REQUESTS_ICACHE_FILL 0x01 UMASK_L2_REQUESTS_DCACHE_FILL 0x02 UMASK_L2_REQUESTS_TLBCACHE_FILL 0x04 UMASK_L2_REQUESTS_ALL 0x07 UMASK_L2_REQUESTS_TAG_SNOOP_REQUEST 0x08 UMASK_L2_REQUESTS_CANCELLED_REQUEST 0x10 UMASK_L2_REQUESTS_HARDWARE_PREFETCH 0x20 EVENT_L2_MISSES 0x7E PMC UMASK_L2_MISSES_ICACHE_FILL 0x01 UMASK_L2_MISSES_DCACHE_FILL 0x02 UMASK_L2_MISSES_TLB_WALK 0x04 UMASK_L2_MISSES_ALL 0x07 UMASK_L2_MISSES_HARDWARE_PREFETCH 0x08 EVENT_L2_FILL 0x7F PMC UMASK_L2_FILL_VICTIMS 0x01 UMASK_L2_FILL_WRITEBACKS 0x02 UMASK_L2_FILL_ALL 0x03 EVENT_ICACHE_FETCHES 0x80 PMC UMASK_ICACHE_FETCHES 0x00 EVENT_ICACHE_MISSES 0x81 PMC UMASK_ICACHE_MISSES 0x00 EVENT_ICACHE_REFILLS_L2 0x82 PMC UMASK_ICACHE_REFILLS_L2 0x00 
EVENT_ICACHE_REFILLS_MEM 0x83 PMC UMASK_ICACHE_REFILLS_MEM 0x00 EVENT_ITLB_L2_HIT 0x84 PMC UMASK_ITLB_L2_HIT 0x00 EVENT_ITLB_L2_MISS 0x85 PMC UMASK_ITLB_L2_MISS_4K 0x01 UMASK_ITLB_L2_MISS_2M 0x02 EVENT_PIPELINE_RESTART_STREAM_PROBE 0x86 PMC UMASK_PIPELINE_RESTART_STREAM_PROBE 0x00 EVENT_INSTRUCTION_FETCH_STALL 0x87 PMC UMASK_INSTRUCTION_FETCH_STALL 0x00 EVENT_RETURN_STACK_HITS 0x88 PMC UMASK_RETURN_STACK_HITS 0x00 EVENT_RETURN_STACK_OVERFLOW 0x89 PMC UMASK_RETURN_STACK_OVERFLOW 0x00 EVENT_ICACHE_VICTIMS 0x8B PMC UMASK_ICACHE_VICTIMS 0x00 EVENT_ICACHE_LINES_INVALIDATED 0x8C PMC UMASK_ICACHE_LINES_INVALIDATED_NOHIT 0x01 UMASK_ICACHE_LINES_INVALIDATED_HIT 0x02 EVENT_ITLB_RELOADS 0x99 PMC UMASK_ITLB_RELOADS 0x00 EVENT_ITLB_RELOADS_ABORTED 0x9A PMC UMASK_ITLB_RELOADS_ABORTED 0x00 EVENT_INSTRUCTIONS_RETIRED 0xC0 PMC UMASK_INSTRUCTIONS_RETIRED 0x00 EVENT_UOPS_RETIRED 0xC1 PMC UMASK_UOPS_RETIRED 0x00 EVENT_BRANCH_RETIRED 0xC2 PMC UMASK_BRANCH_RETIRED 0x00 EVENT_BRANCH_MISPREDICT_RETIRED 0xC3 PMC UMASK_BRANCH_MISPREDICT_RETIRED 0x00 EVENT_BRANCH_TAKEN_RETIRED 0xC4 PMC UMASK_BRANCH_TAKEN_RETIRED 0x00 EVENT_BRANCH_TAKEN_MISPREDICTED_RETIRED 0xC5 PMC UMASK_BRANCH_TAKEN_MISPREDICTED_RETIRED 0x00 EVENT_TRANSFER_FAR_CONTROL_RETIRED 0xC6 PMC UMASK_TRANSFER_FAR_CONTROL_RETIRED 0x00 EVENT_BRANCH_RESYNCS_RETIRED 0xC7 PMC UMASK_BRANCH_RESYNCS_RETIRED 0x00 EVENT_NEAR_RETURNS_RETIRED 0xC8 PMC UMASK_NEAR_RETURNS_RETIRED 0x00 EVENT_NEAR_RETURNS_MISPREDICTED_RETIRED 0xC9 PMC UMASK_NEAR_RETURNS_MISPREDICTED_RETIRED 0x00 EVENT_BRANCH_INDIRECT_MISPREDICT_RETIRED 0xCA PMC UMASK_BRANCH_INDIRECT_MISPREDICT_RETIRED 0x00 EVENT_FP_INSTRUCTIONS_RETIRED 0xCB PMC UMASK_FP_INSTRUCTIONS_RETIRED_X87 0x01 UMASK_FP_INSTRUCTIONS_RETIRED_MMX 0x02 UMASK_FP_INSTRUCTIONS_RETIRED_SSE 0x04 UMASK_FP_INSTRUCTIONS_RETIRED_ALL 0x07 EVENT_FASTPATH_RETIRED 0xCC PMC UMASK_FASTPATH_RETIRED_0 0x01 UMASK_FASTPATH_RETIRED_1 0x02 UMASK_FASTPATH_RETIRED_2 0x04 EVENT_INTERRUPTS_MASKED_CYCLES 0xCE PMC 
UMASK_INTERRUPTS_MASKED_CYCLES 0x00 EVENT_INTERRUPTS_TAKEN 0xCF PMC UMASK_INTERRUPTS_TAKEN 0x00 EVENT_DECODER_EMPTY_CYCLES 0xD0 PMC UMASK_DECODER_EMPTY_CYCLES 0x00 EVENT_DISPATCH_STALLS 0xD1 PMC UMASK_DISPATCH_STALLS 0x00 EVENT_DISPATCH_STALLS_BRANCH 0xD2 PMC UMASK_DISPATCH_STALLS_BRANCH 0x00 EVENT_DISPATCH_STALLS_SERIAL 0xD3 PMC UMASK_DISPATCH_STALLS_SERIAL 0x00 EVENT_DISPATCH_STALLS_SEGMENT_LOAD 0xD4 PMC UMASK_DISPATCH_STALLS_SEGMENT_LOAD 0x00 EVENT_DISPATCH_STALLS_ROB_FULL 0xD5 PMC UMASK_DISPATCH_STALLS_ROB_FULL 0x00 EVENT_DISPATCH_STALLS_RES_FULL 0xD6 PMC UMASK_DISPATCH_STALLS_RES_FULL 0x00 EVENT_DISPATCH_STALLS_FPU_FULL 0xD7 PMC UMASK_DISPATCH_STALLS_FPU_FULL 0x00 EVENT_DISPATCH_STALLS_LS_FULL 0xD8 PMC UMASK_DISPATCH_STALLS_LS_FULL 0x00 EVENT_DISPATCH_STALLS_ALL_QUIT 0xD9 PMC UMASK_DISPATCH_STALLS_ALL_QUIT 0x00 EVENT_DISPATCH_STALLS_DRAIN 0xDA PMC UMASK_DISPATCH_STALLS_DRAIN 0x00 EVENT_FPU_EXCEPTIONS 0xDB PMC UMASK_FPU_EXCEPTIONS_X87_RECLASS 0x01 UMASK_FPU_EXCEPTIONS_SSE_RETYPE 0x02 UMASK_FPU_EXCEPTIONS_SSE_RECLASS 0x04 UMASK_FPU_EXCEPTIONS_MICROTRAPS 0x08 UMASK_FPU_EXCEPTIONS_ALL 0x0F EVENT_X87_FLOPS_RETIRED 0x1C0 PMC UMASK_X87_FLOPS_RETIRED_ADD 0x01 UMASK_X87_FLOPS_RETIRED_MULT 0x02 UMASK_X87_FLOPS_RETIRED_DIV 0x04 EVENT_LFENCE_RETIRED 0x1D3 PMC UMASK_LFENCE_RETIRED 0x00 EVENT_SFENCE_RETIRED 0x1D4 PMC UMASK_SFENCE_RETIRED 0x00 EVENT_MFENCE_RETIRED 0x1D5 PMC UMASK_MFENCE_RETIRED 0x00 EVENT_DRAM_ACCESSES 0xE0 PMC UMASK_DRAM_ACCESSES_DCT0_HIT 0x01 UMASK_DRAM_ACCESSES_DCTO_MISS 0x02 UMASK_DRAM_ACCESSES_DCTO_CONFLICT 0x04 UMASK_DRAM_ACCESSES_DCTO_ALL 0x07 UMASK_DRAM_ACCESSES_DCT1_HIT 0x08 UMASK_DRAM_ACCESSES_DCT1_MISS 0x10 UMASK_DRAM_ACCESSES_DCT1_CONFLICT 0x20 UMASK_DRAM_ACCESSES_DCT1_ALL 0x38 EVENT_DRAM_PAGE_TABLE_OVERFLOW 0xE1 PMC UMASK_DRAM_PAGE_TABLE_OVERFLOW_ALL 0x00 UMASK_DRAM_PAGE_TABLE_OVERFLOW_DCT0 0x01 UMASK_DRAM_PAGE_TABLE_OVERFLOW_DCT1 0x02 EVENT_DRAM_COMMAND_SLOTS_MISSED 0xE2 PMC UMASK_DRAM_COMMAND_SLOTS_MISSED_ALL 0x00 
UMASK_DRAM_COMMAND_SLOTS_MISSED_DCT0 0x01 UMASK_DRAM_COMMAND_SLOTS_MISSED_DCT1 0x02 EVENT_DRAM_CONTROLLER_TURNAROUNDS 0xE3 PMC UMASK_DRAM_CONTROLLER_TURNAROUNDS_DCT0_CHIP_SELECT 0x01 UMASK_DRAM_CONTROLLER_TURNAROUNDS_DCT0_READ_TO_WRITE 0x02 UMASK_DRAM_CONTROLLER_TURNAROUNDS_DCT0_WRITE_TO_READ 0x04 UMASK_DRAM_CONTROLLER_TURNAROUNDS_DCT1_CHIP_SELECT 0x08 UMASK_DRAM_CONTROLLER_TURNAROUNDS_DCT1_READ_TO_WRITE 0x10 UMASK_DRAM_CONTROLLER_TURNAROUNDS_DCT1_WRITE_TO_READ 0x20 EVENT_DRAM_CONTROLLER_BYPASS_SATURATION 0xE4 PMC UMASK_DRAM_CONTROLLER_BYPASS_SATURATION_HIGH 0x01 UMASK_DRAM_CONTROLLER_BYPASS_SATURATION_MEDIUM 0x02 UMASK_DRAM_CONTROLLER_BYPASS_SATURATION_DCT0 0x04 UMASK_DRAM_CONTROLLER_BYPASS_SATURATION_DCT1 0x08 EVENT_CACHE_BLOCK_COMMANDS 0xEA PMC UMASK_CACHE_BLOCK_COMMANDS_VICTIM 0x01 UMASK_CACHE_BLOCK_COMMANDS_READ 0x04 UMASK_CACHE_BLOCK_COMMANDS_READ_SHARED 0x08 UMASK_CACHE_BLOCK_COMMANDS_READ_MODIFIED 0x10 UMASK_CACHE_BLOCK_COMMANDS_DIRTY 0x20 EVENT_MEMORY_REQUESTS 0x1F0 PMC UMASK_MEMORY_REQUESTS_WRITE 0x01 UMASK_MEMORY_REQUESTS_READ 0x02 UMASK_MEMORY_REQUESTS_PREFETCH 0x04 UMASK_MEMORY_REQUESTS_WRITE_32 0x08 UMASK_MEMORY_REQUESTS_WRITE_64 0x10 UMASK_MEMORY_REQUESTS_READ_32 0x20 UMASK_MEMORY_REQUESTS_READ_64 0x40 UMASK_MEMORY_REQUESTS_READ_WHILE_WRITE 0x80 EVENT_L3_READ_REQUESTS 0x4E0 PMC UMASK_L3_READ_REQUEST_D_EXCLUSIVE_CORE_0 0x11 UMASK_L3_READ_REQUEST_I_SHARED_CORE_0 0x12 UMASK_L3_READ_REQUEST_MODIFY_CORE_0 0x14 UMASK_L3_READ_REQUEST_D_EXCLUSIVE_CORE_1 0x21 UMASK_L3_READ_REQUEST_I_SHARED_CORE_1 0x22 UMASK_L3_READ_REQUEST_MODIFY_CORE_1 0x24 UMASK_L3_READ_REQUEST_D_EXCLUSIVE_CORE_2 0x41 UMASK_L3_READ_REQUEST_I_SHARED_CORE_2 0x42 UMASK_L3_READ_REQUEST_MODIFY_CORE_2 0x44 UMASK_L3_READ_REQUEST_D_EXCLUSIVE_CORE_3 0x81 UMASK_L3_READ_REQUEST_I_SHARED_CORE_3 0x82 UMASK_L3_READ_REQUEST_MODIFY_CORE_3 0x84 UMASK_L3_READ_REQUEST_ALL_CORE_0 0x17 UMASK_L3_READ_REQUEST_ALL_CORE_1 0x27 UMASK_L3_READ_REQUEST_ALL_CORE_2 0x47 UMASK_L3_READ_REQUEST_ALL_CORE_3 0x87 
UMASK_L3_READ_REQUEST_ALL_ALL_CORES 0xF7 EVENT_L3_MISSES 0x4E1 PMC UMASK_L3_MISSES_D_EXCLUSIVE_CORE_0 0x11 UMASK_L3_MISSES_I_SHARED_CORE_0 0x12 UMASK_L3_MISSES_MODIFY_CORE_0 0x14 UMASK_L3_MISSES_D_EXCLUSIVE_CORE_1 0x21 UMASK_L3_MISSES_I_SHARED_CORE_1 0x22 UMASK_L3_MISSES_MODIFY_CORE_1 0x24 UMASK_L3_MISSES_D_EXCLUSIVE_CORE_2 0x41 UMASK_L3_MISSES_I_SHARED_CORE_2 0x42 UMASK_L3_MISSES_MODIFY_CORE_2 0x44 UMASK_L3_MISSES_D_EXCLUSIVE_CORE_3 0x81 UMASK_L3_MISSES_I_SHARED_CORE_3 0x82 UMASK_L3_MISSES_MODIFY_CORE_3 0x84 UMASK_L3_MISSES_ALL_CORE_0 0x17 UMASK_L3_MISSES_ALL_CORE_1 0x27 UMASK_L3_MISSES_ALL_CORE_2 0x47 UMASK_L3_MISSES_ALL_CORE_3 0x87 UMASK_L3_MISSES_ALL_ALL_CORES 0xF7 EVENT_L3_FILLS 0x4E2 PMC UMASK_L3_FILLS_SHARED_CORE_0 0x11 UMASK_L3_FILLS_EXCLUSIVE_CORE_0 0x12 UMASK_L3_FILLS_OWNED_CORE_0 0x14 UMASK_L3_FILLS_MODIFY_CORE_0 0x18 UMASK_L3_FILLS_SHARED_CORE_1 0x21 UMASK_L3_FILLS_EXCLUSIVE_CORE_1 0x22 UMASK_L3_FILLS_OWNED_CORE_1 0x24 UMASK_L3_FILLS_MODIFY_CORE_1 0x28 UMASK_L3_FILLS_SHARED_CORE_2 0x41 UMASK_L3_FILLS_EXCLUSIVE_CORE_2 0x42 UMASK_L3_FILLS_OWNED_CORE_2 0x44 UMASK_L3_FILLS_MODIFY_CORE_2 0x48 UMASK_L3_FILLS_SHARED_CORE_3 0x81 UMASK_L3_FILLS_EXCLUSIVE_CORE_3 0x82 UMASK_L3_FILLS_OWNED_CORE_3 0x84 UMASK_L3_FILLS_MODIFY_CORE_3 0x88 UMASK_L3_FILLS_SHARED_ALL_CORES 0xF1 UMASK_L3_FILLS_EXCLUSIVE_ALL_CORES 0xF2 UMASK_L3_FILLS_OWNED_ALL_CORES 0xF4 UMASK_L3_FILLS_MODIFY_ALL_CORES 0xF8 UMASK_L3_FILLS_ALL_ALL_CORES 0xFF EVENT_L3_EVICTS 0x4E3 PMC UMASK_L3_EVICTS_SHARED 0x01 UMASK_L3_EVICTS_EXCLUSIVE 0x02 UMASK_L3_EVICTS_OWNED 0x04 UMASK_L3_EVICTS_MODIFY 0x08 EVENT_CPU_TO_DRAM 0x1E0 PMC UMASK_CPU_TO_DRAM_LOCAL_TO_0 0x01 UMASK_CPU_TO_DRAM_LOCAL_TO_1 0x02 UMASK_CPU_TO_DRAM_LOCAL_TO_2 0x04 UMASK_CPU_TO_DRAM_LOCAL_TO_3 0x08 UMASK_CPU_TO_DRAM_LOCAL_TO_4 0x10 UMASK_CPU_TO_DRAM_LOCAL_TO_5 0x20 UMASK_CPU_TO_DRAM_LOCAL_TO_6 0x40 UMASK_CPU_TO_DRAM_LOCAL_TO_7 0x80 EVENT_IO_TO_DRAM 0x1E1 PMC UMASK_IO_TO_DRAM_LOCAL_TO_0 0x01 UMASK_IO_TO_DRAM_LOCAL_TO_1 0x02 UMASK_IO_TO_DRAM_LOCAL_TO_2 
0x04 UMASK_IO_TO_DRAM_LOCAL_TO_3 0x08 UMASK_IO_TO_DRAM_LOCAL_TO_4 0x10 UMASK_IO_TO_DRAM_LOCAL_TO_5 0x20 UMASK_IO_TO_DRAM_LOCAL_TO_6 0x40 UMASK_IO_TO_DRAM_LOCAL_TO_7 0x80 EVENT_CPU_READ_COMMAND_LATENCY 0x1E2 PMC UMASK_CPU_READ_COMMAND_LATENCY_READ_BLOCK 0x01 UMASK_CPU_READ_COMMAND_LATENCY_READ_BLOCK_SHARED 0x02 UMASK_CPU_READ_COMMAND_LATENCY_READ_BLOCK_MODIFIED 0x04 UMASK_CPU_READ_COMMAND_LATENCY_CHANGE_TO_DIRTY 0x08 UMASK_CPU_READ_COMMAND_LATENCY_LOCAL_TO_0 0x10 UMASK_CPU_READ_COMMAND_LATENCY_LOCAL_TO_1 0x20 UMASK_CPU_READ_COMMAND_LATENCY_LOCAL_TO_2 0x40 UMASK_CPU_READ_COMMAND_LATENCY_LOCAL_TO_3 0x80 EVENT_HYPERTRANSPORT_LINK0 0xF6 PMC UMASK_HYPERTRANSPORT_LINK0_COMMAND_DWORD_SENT 0x01 UMASK_HYPERTRANSPORT_LINK0_DATA_DWORD_SENT 0x02 UMASK_HYPERTRANSPORT_LINK0_BUFFER_DWORD_SENT 0x04 UMASK_HYPERTRANSPORT_LINK0_NOP_DWORD_SENT 0x08 UMASK_HYPERTRANSPORT_LINK0_ADDRESS_DWORD_SENT 0x10 UMASK_HYPERTRANSPORT_LINK0_PER_PACKET_CRC_SENT 0x20 UMASK_HYPERTRANSPORT_LINK0_SUBLINK_MASK 0x80 UMASK_HYPERTRANSPORT_LINK0_ALL_SENT 0x37 EVENT_HYPERTRANSPORT_LINK1 0xF7 PMC UMASK_HYPERTRANSPORT_LINK1_COMMAND_DWORD_SENT 0x01 UMASK_HYPERTRANSPORT_LINK1_DATA_DWORD_SENT 0x02 UMASK_HYPERTRANSPORT_LINK1_BUFFER_DWORD_SENT 0x04 UMASK_HYPERTRANSPORT_LINK1_NOP_DWORD_SENT 0x08 UMASK_HYPERTRANSPORT_LINK1_ADDRESS_DWORD_SENT 0x10 UMASK_HYPERTRANSPORT_LINK1_PER_PACKET_CRC_SENT 0x20 UMASK_HYPERTRANSPORT_LINK1_SUBLINK_MASK 0x80 UMASK_HYPERTRANSPORT_LINK1_ALL_SENT 0x37 EVENT_HYPERTRANSPORT_LINK2 0xF8 PMC UMASK_HYPERTRANSPORT_LINK2_COMMAND_DWORD_SENT 0x01 UMASK_HYPERTRANSPORT_LINK2_DATA_DWORD_SENT 0x02 UMASK_HYPERTRANSPORT_LINK2_BUFFER_DWORD_SENT 0x04 UMASK_HYPERTRANSPORT_LINK2_NOP_DWORD_SENT 0x08 UMASK_HYPERTRANSPORT_LINK2_ADDRESS_DWORD_SENT 0x10 UMASK_HYPERTRANSPORT_LINK2_PER_PACKET_CRC_SENT 0x20 UMASK_HYPERTRANSPORT_LINK2_SUBLINK_MASK 0x80 UMASK_HYPERTRANSPORT_LINK2_ALL_SENT 0x37 EVENT_HYPERTRANSPORT_LINK3 0x1F9 PMC UMASK_HYPERTRANSPORT_LINK3_COMMAND_DWORD_SENT 0x01 
UMASK_HYPERTRANSPORT_LINK3_DATA_DWORD_SENT 0x02 UMASK_HYPERTRANSPORT_LINK3_BUFFER_DWORD_SENT 0x04 UMASK_HYPERTRANSPORT_LINK3_NOP_DWORD_SENT 0x08 UMASK_HYPERTRANSPORT_LINK3_ADDRESS_DWORD_SENT 0x10 UMASK_HYPERTRANSPORT_LINK3_PER_PACKET_CRC_SENT 0x20 UMASK_HYPERTRANSPORT_LINK3_SUBLINK_MASK 0x80 UMASK_HYPERTRANSPORT_LINK3_ALL_SENT 0x37 likwid-3.1.3/groups/interlagos/L3.txt000644 137545 027340 00000001377 12336605216 020032 0ustar00unrz254unrz000000 000000 SHORT L3 cache bandwidth in MBytes/s EVENTSET PMC0 L2_FILL_WB_FILL PMC1 L2_FILL_WB_WB PMC2 CPU_CLOCKS_UNHALTED METRICS Runtime (RDTSC) [s] time L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 L3 refill bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time L3 evict [MBytes/s] 1.0E-06*PMC1*64.0/time LONG Formulas: L3 bandwidth [MBytes/s] 1.0E-06*(L2_FILL_WB_FILL+L2_FILL_WB_WB)*64/time L3 data volume [GBytes] 1.0E-09*(L2_FILL_WB_FILL+L2_FILL_WB_WB)*64 L3 refill bandwidth [MBytes/s] 1.0E-06*L2_FILL_WB_FILL*64/time L3 evict [MBytes/s] 1.0E-06*L2_FILL_WB_WB*64/time - Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the number of cachelines loaded from L3 to L2 and the number of modified cachelines evicted from the L2. likwid-3.1.3/src/includes/tree_types.h000644 137545 027340 00000002605 12426160352 020246 0ustar00unrz254unrz000000 000000 /* * ======================================================================================= * * Filename: tree_types.h * * Description: Types file for tree module. * * Version: 3.1.3 * Released: 4.11.2014 * * Author: Jan Treibig (jt), jan.treibig@gmail.com * Project: likwid * * Copyright (C) 2014 Jan Treibig * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. 
* * This program is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see . * * ======================================================================================= */ #ifndef TREE_TYPES_H #define TREE_TYPES_H /* For arbitrary trees llink are the children and * rlink are the neighbours */ typedef struct treeNode { int id; struct treeNode* llink; struct treeNode* rlink; } TreeNode; #endif /*TREE_TYPES_H*/ likwid-3.1.3/src/asciiTable.c000644 137545 027340 00000013371 12426160352 016312 0ustar00unrz254unrz000000 000000 /* * ======================================================================================= * * Filename: asciiTable.c * * Description: Module implementing output of ascii table. * * Version: 3.1.3 * Released: 4.11.2014 * * Author: Jan Treibig (jt), jan.treibig@gmail.com * Project: likwid * * Copyright (C) 2014 Jan Treibig * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see . 
* * ======================================================================================= */ /* ##### HEADER FILE INCLUDES ######################################### */ #include #include #include #include #include #include #include #include /* ##### LOCAL VARIABLES ########################################### */ static FILE* OUTPUT; /* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */ void asciiTable_setOutput(FILE* stream) { OUTPUT = stream; } TableContainer* asciiTable_allocate(int numRows,int numColumns, bstrList* headerLabels) { int i; TableContainer* container; OUTPUT = stdout; container = (TableContainer*) malloc(sizeof(TableContainer)); container->numRows = numRows; container->numColumns = numColumns; container->currentRow = 0; container->printed = 0; if (numColumns != headerLabels->qty) { ERROR_PRINT(Number of columns %d not equal to number of header labels %d,numColumns,headerLabels->qty); } container->header = bstrListCreate(); bstrListAlloc (container->header, numColumns); for(i=0; iheader->entry[i] = bstrcpy(headerLabels->entry[i]); } container->rows = (bstrList**) malloc( numRows * sizeof(bstrList*)); for(i=0; irows[i] = bstrListCreate(); bstrListAlloc (container->rows[i], numColumns); } return container; } void asciiTable_free(TableContainer* container) { int i; if(container == NULL) { ERROR_PLAIN_PRINT(Cannot free NULL reference); } bstrListDestroy(container->header); for(i=0; inumRows; i++) { bstrListDestroy(container->rows[i]); } free(container->rows); } void asciiTable_insertRow(TableContainer* container, int row, bstrList* fields) { int i; if (container->numColumns != fields->qty) { ERROR_PRINT(Number of colummns %d not equal to number of field labels %d,container->numColumns,fields->qty); } if (row >= container->numRows) { ERROR_PRINT(Number of Rows %d smaller than requested row index %d, container->numRows,row); } for(i=0; inumColumns; i++) { container->rows[row]->entry[i] = bstrcpy(fields->entry[i]); 
container->rows[row]->qty++; } } void asciiTable_appendRow(TableContainer* container, bstrList* fields) { asciiTable_insertRow(container, container->currentRow++, fields); } void asciiTable_setCurrentRow(TableContainer* container, int row) { container->currentRow = row; } void asciiTable_print(TableContainer* container) { int i; int j; int* boxwidth; boxwidth = (int*) malloc(container->numColumns * sizeof(int)); for (j=0; jnumColumns; j++) boxwidth[j] = 0; for (j=0; jnumColumns; j++) { boxwidth[j] = MAX(boxwidth[j],blength(container->header->entry[j])); } /* determine maximum label width in each column */ for (i=0; inumRows; i++) { for (j=0; jnumColumns; j++) { // btrimws(container->rows[i]->entry[j]); boxwidth[j] = MAX(boxwidth[j],blength(container->rows[i]->entry[j])); } } if (! container->printed) { /* Increase boxwidth with two spaces */ for (j=0; jnumColumns; j++) boxwidth[j] +=2; } /* print header */ for (j=0; jnumColumns; j++) { fprintf(OUTPUT,"+"); for (i=0;inumColumns; j++) { fprintf(OUTPUT,"|"); bJustifyCenter(container->header->entry[j],boxwidth[j]); fprintf(OUTPUT,"%s",bdata(container->header->entry[j])); } fprintf(OUTPUT,"|\n"); for (j=0; jnumColumns; j++) { fprintf(OUTPUT,"+"); for (i=0;inumRows; i++) { for (j=0; jnumColumns; j++) { fprintf(OUTPUT,"|"); bJustifyCenter(container->rows[i]->entry[j],boxwidth[j]); fprintf(OUTPUT,"%s",bdata(container->rows[i]->entry[j])); } fprintf(OUTPUT,"|\n"); } for (j=0; jnumColumns; j++) { fprintf(OUTPUT,"+"); for (i=0;iprinted = 1; free(boxwidth); } likwid-3.1.3/bench/x86-64/copy.ptt000644 137545 027340 00000000543 12336605216 016755 0ustar00unrz254unrz000000 000000 STREAMS 2 TYPE DOUBLE FLOPS 0 BYTES 16 LOOP 8 movaps FPR1, [STR0 + GPR1 * 8] movaps FPR2, [STR0 + GPR1 * 8 + 16] movaps FPR3, [STR0 + GPR1 * 8 + 32] movaps FPR4, [STR0 + GPR1 * 8 + 48] movaps [STR1 + GPR1 * 8] , FPR1 movaps [STR1 + GPR1 * 8 + 16], FPR2 movaps [STR1 + GPR1 * 8 + 32], FPR3 movaps [STR1 + GPR1 * 8 + 48], FPR4 
likwid-3.1.3/perl/Template/VMethods.pm000644 137545 027340 00000031742 12336605216 020130 0ustar00unrz254unrz000000 000000 #============================================================= -*-Perl-*- # # Template::VMethods # # DESCRIPTION # Module defining virtual methods for the Template Toolkit # # AUTHOR # Andy Wardley # # COPYRIGHT # Copyright (C) 1996-2006 Andy Wardley. All Rights Reserved. # # This module is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # # REVISION # $Id: VMethods.pm 1245 2009-07-04 17:02:52Z abw $ # #============================================================================ package Template::VMethods; use strict; use warnings; use Scalar::Util 'blessed'; require Template::Stash; our $VERSION = 2.16; our $DEBUG = 0 unless defined $DEBUG; our $PRIVATE = $Template::Stash::PRIVATE; our $ROOT_VMETHODS = { inc => \&root_inc, dec => \&root_dec, }; our $TEXT_VMETHODS = { item => \&text_item, list => \&text_list, hash => \&text_hash, length => \&text_length, size => \&text_size, defined => \&text_defined, match => \&text_match, search => \&text_search, repeat => \&text_repeat, replace => \&text_replace, remove => \&text_remove, split => \&text_split, chunk => \&text_chunk, substr => \&text_substr, }; our $HASH_VMETHODS = { item => \&hash_item, hash => \&hash_hash, size => \&hash_size, each => \&hash_each, keys => \&hash_keys, values => \&hash_values, items => \&hash_items, pairs => \&hash_pairs, list => \&hash_list, exists => \&hash_exists, defined => \&hash_defined, delete => \&hash_delete, import => \&hash_import, sort => \&hash_sort, nsort => \&hash_nsort, }; our $LIST_VMETHODS = { item => \&list_item, list => \&list_list, hash => \&list_hash, push => \&list_push, pop => \&list_pop, unshift => \&list_unshift, shift => \&list_shift, max => \&list_max, size => \&list_size, defined => \&list_defined, first => \&list_first, last => \&list_last, reverse => \&list_reverse, grep => \&list_grep, join => 
\&list_join,
    sort    => \&list_sort,
    nsort   => \&list_nsort,
    unique  => \&list_unique,
    import  => \&list_import,
    merge   => \&list_merge,
    slice   => \&list_slice,
    splice  => \&list_splice,
};


#========================================================================
# root virtual methods
#========================================================================

# Return the argument incremented by one.  The increment is applied to
# a local copy, so the caller's variable is not modified.
sub root_inc {
    no warnings;
    my $item = shift;
    ++$item;
}

# Return the argument decremented by one (applied to a local copy).
sub root_dec {
    no warnings;
    my $item = shift;
    --$item;
}


#========================================================================
# text virtual methods
#========================================================================

# Return the text item unchanged.
sub text_item {
    $_[0];
}

# Return a reference to a single-element list containing the text.
sub text_list {
    [ $_[0] ];
}

# Return a reference to a hash holding the text under the key 'value'.
sub text_hash {
    { value => $_[0] };
}

# Return the length of the text.
sub text_length {
    length $_[0];
}

# A text item always has a size of 1.
sub text_size {
    return 1;
}

# A text item that reaches this method is always reported as defined.
sub text_defined {
    return 1;
}

# Match $str against the pattern $search.  Returns a reference to the
# list of match results, or '' when nothing matched.  When $global is
# true the match is repeated with /g.  If either $str or $search is
# undefined, $str is returned unchanged.
sub text_match {
    my ($str, $search, $global) = @_;
    return $str unless defined $str and defined $search;
    my @matches = $global ? ($str =~ /$search/g)
                          : ($str =~ /$search/);
    return @matches ? \@matches : '';
}

# Return the result of matching $str against $pattern.  If either
# argument is undefined, $str is returned unchanged.
sub text_search {
    my ($str, $pattern) = @_;
    return $str unless defined $str and defined $pattern;
    return $str =~ /$pattern/;
}

# Return $str repeated $count times.  An undefined $str is treated as
# the empty string; a false $count (undef, 0, '') yields ''.
sub text_repeat {
    my ($str, $count) = @_;
    $str = '' unless defined $str;
    # $count is guaranteed to be true past this point, so no default
    # value is needed for it.
    return '' unless $count;
    return $str x $count;
}

# Replace matches of $pattern in $text with $replace and return the
# result.  All matches are replaced unless $global is defined and false.
# Undefined $text, $pattern and $replace default to the empty string.
# The replacement may contain $1, $2, ... back-references, and \\ or \$
# to produce a literal backslash or dollar sign.
sub text_replace {
    my ($text, $pattern, $replace, $global) = @_;
    $text    = '' unless defined $text;
    $pattern = '' unless defined $pattern;
    $replace = '' unless defined $replace;
    $global  = 1  unless defined $global;

    if ($replace =~ /\$\d+/) {
        # replacement string may contain backrefs
        my $expand = sub {
            my ($chunk, $start, $end) = @_;
            # $start and $end are copies of @- and @+ taken at the
            # time of the match; they give the offsets of each capture
            # group within $text.
            $chunk =~ s{ \\(\\|\$) | \$ (\d+) }{
                $1 ? $1
                    : ($2 > $#$start || $2 == 0) ? ''
                    : substr($text, $start->[$2], $end->[$2] - $start->[$2]);
            }exg;
            $chunk;
        };
        if ($global) {
            $text =~ s{$pattern}{ &$expand($replace, [@-], [@+]) }eg;
        }
        else {
            $text =~ s{$pattern}{ &$expand($replace, [@-], [@+]) }e;
        }
    }
    else {
        if ($global) {
            $text =~ s/$pattern/$replace/g;
        }
        else {
            $text =~ s/$pattern/$replace/;
        }
    }
    return $text;
}

# Remove every match of $search from $str and return the result.  If
# either argument is undefined, $str is returned unchanged.
sub text_remove {
    my ($str, $search) = @_;
    return $str unless defined $str and defined $search;
    $str =~ s/$search//g;
    return $str;
}

# Split $str on $split (default: ' ') and return a reference to the
# list of fields.  An optional $limit is passed through to split().
sub text_split {
    my ($str, $split, $limit) = @_;
    $str = '' unless defined $str;

    # we have to be very careful about spelling out each possible
    # combination of arguments because split() is very sensitive
    # to them, for example C<split(' ', ...)> behaves differently
    # to C<$space=' '; split($space, ...)>

    if (defined $limit) {
        return [ defined $split
                 ? split($split, $str, $limit)
                 : split(' ', $str, $limit) ];
    }
    else {
        return [ defined $split
                 ? split($split, $str)
                 : split(' ', $str) ];
    }
}

# Break $string into chunks of $size characters (default 1) and return
# a reference to the list of chunks.  A negative $size chunks from the
# end of the string, so any short chunk comes first instead of last.
sub text_chunk {
    my ($string, $size) = @_;
    my @list;
    $size ||= 1;
    if ($size < 0) {
        # sexeger!  It's faster to reverse the string, search
        # it from the front and then reverse the output than to
        # search it from the end, believe it or not!
        $string = reverse $string;
        $size = -$size;
        unshift(@list, scalar reverse $1)
            while ($string =~ /((.{$size})|(.+))/g);
    }
    else {
        push(@list, $1) while ($string =~ /((.{$size})|(.+))/g);
    }
    return \@list;
}

# Wrapper around substr().  With only $offset (default 0) the rest of
# the text is returned; with $length the selected substring is returned;
# with $replacement the substring is replaced in a local copy of the
# text and the modified text is returned.
sub text_substr {
    my ($text, $offset, $length, $replacement) = @_;
    $offset ||= 0;

    if(defined $length) {
        if (defined $replacement) {
            substr( $text, $offset, $length, $replacement );
            return $text;
        }
        else {
            return substr( $text, $offset, $length );
        }
    }
    else {
        return substr( $text, $offset );
    }
}


#========================================================================
# hash virtual methods
#========================================================================

# Return the hash value stored under $item (default key: '').  Nothing
# is returned when $PRIVATE is set and $item matches it.
sub hash_item {
    my ($hash, $item) = @_;
    $item = '' unless defined $item;
    return if $PRIVATE && $item =~ /$PRIVATE/;
    $hash->{ $item };
}

# Return the hash reference itself.
sub hash_hash {
    $_[0];
}

# Return the number of keys in the hash.
sub hash_size {
    scalar keys %{$_[0]};
}

# Return a reference to a flat list of key, value, key, value, ...
sub hash_each {
    # this will be changed in TT3 to do what hash_pairs() does
    [ %{ $_[0] } ];
}

# Return a reference to the list of keys.
sub hash_keys {
    [ keys   %{ $_[0] } ];
}

# Return a reference to the list of values.
sub hash_values {
    [ values %{ $_[0] } ];
}

# Return a reference to a flat list of key, value, key, value, ...
sub hash_items {
    [ %{ $_[0] } ];
}

# Return a reference to a list of { key => ..., value => ... } hashes,
# one per entry, ordered by sorted key.
sub hash_pairs {
    [ map {
        { key => $_ , value => $_[0]->{ $_ } }
      }
      sort keys %{ $_[0] }
    ];
}

# Return a list view of the hash selected by $what: 'keys', 'values',
# 'each' (flat key/value list), or by default the same as hash_pairs().
sub hash_list {
    my ($hash, $what) = @_;
    $what ||= '';
    return ($what eq 'keys')   ? [   keys %$hash ]
        :  ($what eq 'values') ? [ values %$hash ]
        :  ($what eq 'each')   ? [        %$hash ]
        :  # for now we do what pairs does but this will be changed
           # in TT3 to return [ $hash ] by default
        [ map { { key => $_ , value => $hash->{ $_ } } }
          sort keys %$hash
          ];
}

# Return true if the key given as second argument exists in the hash.
sub hash_exists {
    exists $_[0]->{ $_[1] };
}

sub hash_defined {
    # return the item requested, or 1 if no argument
    # to indicate that the hash itself is defined
    my $hash = shift;
    return @_ ? defined $hash->{ $_[0] } : 1;
}

# Delete each of the named keys from the hash.
sub hash_delete {
    my $hash = shift;
    delete $hash->{ $_ } for @_;
}

# Copy all entries of the hash reference $imp into $hash, overwriting
# existing keys.  Anything other than a plain hash reference is treated
# as an empty hash.  Always returns ''.
sub hash_import {
    my ($hash, $imp) = @_;
    $imp = {} unless ref $imp eq 'HASH';
    @$hash{ keys %$imp } = values %$imp;
    return '';
}

# Return a reference to the list of keys, ordered by a case-insensitive
# string comparison of their values.
sub hash_sort {
    my ($hash) = @_;
    [ sort { lc $hash->{$a} cmp lc $hash->{$b} } (keys %$hash) ];
}

# Return a reference to the list of keys, ordered by a numerical
# comparison of their values.
sub hash_nsort {
    my ($hash) = @_;
    [ sort { $hash->{$a} <=> $hash->{$b} } (keys %$hash) ];
}


#========================================================================
# list virtual methods
#========================================================================

# Return the list element at the given index (default 0).
sub list_item {
    $_[0]->[ $_[1] || 0 ];
}

# Return the list reference itself.
sub list_list {
    $_[0];
}

# Convert the list to a hash reference.  With an argument, the elements
# become values keyed by a counter starting at that number (default 0).
# Without an argument, the list is read as key, value, key, value, ...
sub list_hash {
    my $list = shift;
    if (@_) {
        my $n = shift || 0;
        return { map { ($n++, $_) } @$list };
    }
    no warnings;
    return { @$list };
}

# Append the arguments to the list.  Always returns ''.
sub list_push {
    my $list = shift;
    push(@$list, @_);
    return '';
}

# Remove and return the last element of the list.
sub list_pop {
    my $list = shift;
    pop(@$list);
}

# Prepend the arguments to the list.  Always returns ''.
sub list_unshift {
    my $list = shift;
    unshift(@$list, @_);
    return '';
}

# Remove and return the first element of the list.
sub list_shift {
    my $list = shift;
    shift(@$list);
}

# Return the index of the last element (-1 for an empty list).
sub list_max {
    no warnings;
    my $list = shift;
    $#$list;
}

# Return the number of elements in the list.
sub list_size {
    no warnings;
    my $list = shift;
    $#$list + 1;
}

sub list_defined {
    # return the item requested, or 1 if no argument to
    # indicate that the list itself is defined
    my $list = shift;
    return @_ ? defined $list->[$_[0]] : 1;
}

# With no argument, return the first element.  With a count, return a
# reference to a list holding elements 0 .. count-1.
sub list_first {
    my $list = shift;
    return $list->[0] unless @_;
    return [ @$list[0..$_[0]-1] ];
}

# With no argument, return the last element.  With a count, return a
# reference to a list holding elements -count .. -1.
sub list_last {
    my $list = shift;
    return $list->[-1] unless @_;
    return [ @$list[-$_[0]..-1] ];
}

# Return a reference to a new list with the elements in reverse order.
sub list_reverse {
    my $list = shift;
    [ reverse @$list ];
}

# Return a reference to a new list of the elements matching $pattern.
sub list_grep {
    my ($list, $pattern) = @_;
    $pattern ||= '';
    return [ grep /$pattern/, @$list ];
}

# Join the elements with $joint (default: a single space).  Undefined
# elements are joined as empty strings.
sub list_join {
    my ($list, $joint) = @_;
    join(defined $joint ? $joint : ' ',
         map { defined $_ ? $_ : '' } @$list);
}

# Build the lower-cased sort key used by list_sort() and list_nsort()
# for one item.  For a hash reference the key is made from the values of
# the named fields; for a blessed object each field is called as a
# method when the object can() it, otherwise the object itself is used;
# any other item is used as is.
sub _list_sort_make_key {
   my ($item, $fields) = @_;
   my @keys;

   if (ref($item) eq 'HASH') {
       @keys = map { $item->{ $_ } } @$fields;
   }
   elsif (blessed $item) {
       @keys = map { $item->can($_) ? $item->$_() : $item } @$fields;
   }
   else {
       @keys = $item;
   }

   # ugly hack to generate a single string using a delimiter that is
   # unlikely (but not impossible) to be found in the wild.
   return lc join('/*^UNLIKELY^*/', map { defined $_ ? $_ : '' } @keys);
}

# Return a reference to a new list sorted by case-insensitive string
# comparison.  Optional @fields name the hash keys / object methods used
# to build each item's sort key.  Lists with fewer than two elements are
# returned as is (the same reference).
sub list_sort {
    my ($list, @fields) = @_;
    return $list unless @$list > 1;         # no need to sort 1 item lists
    return [
        @fields                          # Schwartzian Transform
        ?   map  { $_->[0] }                # for case insensitivity
            sort { $a->[1] cmp $b->[1] }
            map  { [ $_, _list_sort_make_key($_, \@fields) ] }
            @$list
        :  map  { $_->[0] }
           sort { $a->[1] cmp $b->[1] }
           map  { [ $_, lc $_ ] }
           @$list,
    ];
}

# Return a reference to a new list sorted by numerical comparison.
# Optional @fields are handled as in list_sort().  Lists with fewer than
# two elements are returned as is (the same reference).
sub list_nsort {
    my ($list, @fields) = @_;
    return $list unless @$list > 1;     # no need to sort 1 item lists
    return [
        @fields                         # Schwartzian Transform
        ?  map  { $_->[0] }             # for case insensitivity
           sort { $a->[1] <=> $b->[1] }
           map  { [ $_, _list_sort_make_key($_, \@fields) ] }
           @$list
        :  map  { $_->[0] }
           sort { $a->[1] <=> $b->[1] }
           map  { [ $_, lc $_ ] }
           @$list,
    ];
}

# Return a reference to a new list holding the first occurrence of each
# distinct element, in the original order.
sub list_unique {
    my %u;
    [ grep { ++$u{$_} == 1 } @{$_[0]} ];
}

# Append the defined elements of every array reference argument to the
# list in place and return the list.  Non-array arguments are ignored.
sub list_import {
    my $list = shift;
    push(@$list, grep defined, map ref eq 'ARRAY' ? @$_ : undef, @_);
    return $list;
}

# Like list_import(), but return a reference to a new list and leave the
# original untouched.
sub list_merge {
    my $list = shift;
    return [ @$list, grep defined, map ref eq 'ARRAY' ? @$_ : undef, @_ ];
}

# Return a reference to a new list holding elements $from .. $to.
# $from defaults to 0 and $to to the last index; negative values count
# back from the end of the list.
sub list_slice {
    my ($list, $from, $to) = @_;
    $from ||= 0;
    $to    = $#$list unless defined $to;
    $from += @$list if $from < 0;
    $to   += @$list if $to   < 0;
    return [ @$list[$from..$to] ];
}

# Wrapper around splice().  Modifies the list in place and returns a
# reference to a list of the removed elements.
sub list_splice {
    my ($list, $offset, $length, @replace) = @_;
    if (@replace) {
        # @replace can contain a list of multiple replace items, or
        # be a single reference to a list
        @replace = @{ $replace[0] }
        if @replace == 1 && ref $replace[0] eq 'ARRAY';
        return [ splice @$list, $offset, $length, @replace ];
    }
    elsif (defined $length) {
        return [ splice @$list, $offset, $length ];
    }
    elsif (defined $offset) {
        return [ splice @$list, $offset ];
    }
    else {
        return [ splice(@$list) ];
    }
}

1;

__END__

=head1 NAME

Template::VMethods - Virtual methods for variables

=head1 DESCRIPTION

The C<Template::VMethods> module implements the virtual methods
that can be applied to variables.

Please see L<Template::Manual::VMethods> for further information.

=head1 AUTHOR

Andy Wardley E<lt>abw@wardley.orgE<gt> L<http://wardley.org/>

=head1 COPYRIGHT

Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.

This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=head1 SEE ALSO

L<Template::Stash>, L<Template::Manual::VMethods>

=cut

# Local Variables:
# mode: perl
# perl-indent-level: 4
# indent-tabs-mode: nil
# End:
#
# vim: expandtab shiftwidth=4:
likwid-3.1.3/src/libperfctr.c000644 137545 027340 00000053116 12426160352 016407 0ustar00unrz254unrz000000 000000 /* * ======================================================================================= * * Filename: libperfctr.c * * Description: Marker API interface of module perfmon * * Version: 3.1.3 * Released: 4.11.2014 * * Author: Jan Treibig (jt), jan.treibig@gmail.com * Project: likwid * * Copyright (C) 2014 Jan Treibig * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version.
* * This program is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see . * * ======================================================================================= */ /* ##### HEADER FILE INCLUDES ######################################### */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */ static int perfmon_numCounters=0; /* total number of counters */ static int perfmon_numCountersCore=0; /* max index of core counters */ static int perfmon_numCountersUncore=0; /* max index of conventional uncore counters */ static PerfmonCounterMap* perfmon_counter_map = NULL; static int socket_lock[MAX_NUM_NODES]; static int thread_socketFD[MAX_NUM_THREADS]; static int hasPCICounters = 0; static int likwid_init = 0; static BitMask counterMask; /* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */ #define gettid() syscall(SYS_gettid) /* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */ void str2BitMask(const char* str, BitMask* mask) { char* endptr; errno = 0; struct bstrList* tokens; bstring q = bfromcstralloc (60, str); tokens = bsplit(q,' '); for (int i=0; iqty; i++) { uint64_t val = strtoull((char*) tokens->entry[i]->data, &endptr, 16); if ((errno == ERANGE && val == LONG_MAX ) || (errno != 0 && val == 0)) { ERROR; } if (endptr == str) { ERROR_PLAIN_PRINT(No digits were found); } mask->mask[i] = val; } 
bstrListDestroy(tokens); bdestroy(q); } static int getProcessorID(cpu_set_t* cpu_set) { int processorId; for (processorId=0;processorIdcount++; /* Core specific counters */ for ( int i=0; iStartPMcounters[i] = (double) msr_tread( socket_fd, cpu_id, perfmon_counter_map[i].counterRegister); } } } /* Uncore specific counters */ if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) || lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id)) { /* Conventional Uncore counters */ for ( int i=perfmon_numCountersCore; iStartPMcounters[i] = (double) msr_tread( socket_fd, cpu_id, perfmon_counter_map[i].counterRegister); } else { results->StartPMcounters[i] = (double) power_tread( socket_fd, cpu_id, perfmon_counter_map[i].counterRegister); } } } /* PCI Uncore counters */ if ( hasPCICounters && (accessClient_mode != DAEMON_AM_DIRECT) ) { for ( int i=perfmon_numCountersUncore; iStartPMcounters[perfmon_counter_map[i].index] = (double) counter_result; } } } } timer_start(&(results->startTime)); } #define READ_END_MEM_CHANNEL(channel, reg, cid) \ counter_result = pci_tread(socket_fd, cpu_id, channel, reg##_A); \ counter_result = (counter_result<<32) + \ pci_tread(socket_fd, cpu_id, channel, reg##_B); \ results->PMcounters[cid] += (double) counter_result - results->StartPMcounters[cid] /* TODO: Readout hash at the end. Compute result at the end of the function to * keep overhead in region low */ void likwid_markerStopRegion(const char* regionTag) { if (! 
likwid_init) { return; } TimerData timestamp; timer_stop(×tamp); int cpu_id = likwid_getProcessorId(); uint64_t res; int socket_fd = thread_socketFD[cpu_id]; double PMcounters[NUM_PMC]; /* Core specific counters */ for ( int i=0; istartTime.stop = timestamp.stop; results->time += timer_print(&(results->startTime)); bdestroy(tag); /* Accumulate the results */ /* Core counters */ for ( int i=0; iPMcounters[i] += (PMcounters[i] - results->StartPMcounters[i]); } else { results->PMcounters[i] = PMcounters[i]; } } } /* Uncore counters */ if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)) { for ( int i=perfmon_numCountersCore; i= results->StartPMcounters[i]) { results->PMcounters[i] += power_info.energyUnit * (PMcounters[i] - results->StartPMcounters[i]); } else { results->PMcounters[i] += power_info.energyUnit * (((double)0xFFFFFFFF) - results->StartPMcounters[i] + PMcounters[i]); } } else { results->PMcounters[i] += (PMcounters[i] - results->StartPMcounters[i]); } } } } } int likwid_getProcessorId() { cpu_set_t cpu_set; CPU_ZERO(&cpu_set); sched_getaffinity(gettid(),sizeof(cpu_set_t), &cpu_set); return getProcessorID(&cpu_set); } #ifdef HAS_SCHEDAFFINITY int likwid_pinThread(int processorId) { int ret; cpu_set_t cpuset; pthread_t thread; thread = pthread_self(); CPU_ZERO(&cpuset); CPU_SET(processorId, &cpuset); ret = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset); if (ret != 0) { ERROR; return FALSE; } return TRUE; } #endif int likwid_pinProcess(int processorId) { int ret; cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(processorId, &cpuset); ret = sched_setaffinity(0, sizeof(cpu_set_t), &cpuset); if (ret < 0) { ERROR; return FALSE; } return TRUE; } likwid-3.1.3/perl/Template/Stash/000755 137545 027340 00000000000 12426160162 017110 5ustar00unrz254unrz000000 000000 likwid-3.1.3/groups/phi/READ_MISS_RATIO.txt000644 137545 027340 00000000244 12336605216 020461 0ustar00unrz254unrz000000 000000 SHORT Miss ratio for data read EVENTSET PMC0 DATA_READ 
PMC1 DATA_READ_MISS METRICS Runtime (RDTSC) [s] time Miss ratio PMC1/PMC0 LONG Miss ratio for data read likwid-3.1.3/bench/x86-64/store_mem.ptt000644 137545 027340 00000000446 12336605216 017777 0ustar00unrz254unrz000000 000000 STREAMS 1 TYPE DOUBLE FLOPS 0 BYTES 8 movaps FPR1, [SCALAR] movaps FPR2, [SCALAR] movaps FPR3, [SCALAR] movaps FPR4, [SCALAR] LOOP 8 movntpd [STR0 + GPR1 * 8] , FPR1 movntpd [STR0 + GPR1 * 8 + 16], FPR2 movntpd [STR0 + GPR1 * 8 + 32], FPR3 movntpd [STR0 + GPR1 * 8 + 48], FPR4 likwid-3.1.3/groups/phi/MEM6.txt000644 137545 027340 00000000307 12336605216 016661 0ustar00unrz254unrz000000 000000 SHORT L2 Read Misses EVENTSET PMC0 L2_READ_MISS METRICS Runtime (RDTSC) [s] time L2 Read Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time L2 Read Data Volume [GBytes] 1.0E-09*PMC0*64.0 LONG Bla likwid-3.1.3/test/testmarker-omp.c000644 137545 027340 00000002233 12336605216 017412 0ustar00unrz254unrz000000 000000 #include #include #include #define SIZE 1000000 double sum = 0, a[SIZE], b[SIZE], c[SIZE]; main() { double alpha = 3.14; /* Initialize */ for (int i=0; i. 
# # ======================================================================================= EVENT_FPU_PIPE_ASSIGNMENT 0x00 PMC UMASK_FPU_PIPE_ASSIGNMENT_PIPE_0 0x01 UMASK_FPU_PIPE_ASSIGNMENT_PIPE_1 0x02 EVENT_FP_SCHEDULER_EMPTY 0x01 PMC UMASK_FP_SCHEDULER_EMPTY 0x00 EVENT_DISPATCHED_FAST_FPU 0x02 PMC UMASK_DISPATCHED_FAST_FPU 0x00 EVENT_RETIRED_FLOPS 0x03 PMC UMASK_RETIRED_FLOPS_DOUBLE_DIV 0x40 UMASK_RETIRED_FLOPS_DOUBLE_MUL 0x20 UMASK_RETIRED_FLOPS_DOUBLE_ADD 0x10 UMASK_RETIRED_FLOPS_DOUBLE_ALL 0x70 UMASK_RETIRED_FLOPS_SINGLE_DIV 0x04 UMASK_RETIRED_FLOPS_SINGLE_MUL 0x02 UMASK_RETIRED_FLOPS_SINGLE_ADD 0x01 UMASK_RETIRED_FLOPS_SINGLE_ALL 0x07 EVENT_RETIRED_SERIALIZING_OPS 0x05 PMC UMASK_RETIRED_SERIALIZING_OPS_SSE_BOTTOM 0x00 UMASK_RETIRED_SERIALIZING_OPS_SSE_CONTROL 0x01 UMASK_RETIRED_SERIALIZING_OPS_X87_BOTTOM 0x02 UMASK_RETIRED_SERIALIZING_OPS_X87_CONTROL 0x04 EVENT_RETIRED_X87_FLOPS 0x11 PMC UMASK_RETIRED_X87_FLOPS_ADD 0x00 UMASK_RETIRED_X87_FLOPS_MUL 0x01 UMASK_RETIRED_X87_FLOPS_DIV 0x02 EVENT_SEGMENT_REGISTER_LOADS 0x20 PMC UMASK_SEGMENT_REGISTER_LOADS_ES 0x00 UMASK_SEGMENT_REGISTER_LOADS_CS 0x01 UMASK_SEGMENT_REGISTER_LOADS_SS 0x02 UMASK_SEGMENT_REGISTER_LOADS_DS 0x04 UMASK_SEGMENT_REGISTER_LOADS_FS 0x10 UMASK_SEGMENT_REGISTER_LOADS_GS 0x20 UMASK_SEGMENT_REGISTER_LOADS_HS 0x40 EVENT_PIPELINE_RESTART_SELFMOD_CODE 0x21 PMC UMASK_PIPELINE_RESTART_SELFMOD_CODE 0x00 EVENT_PIPELINE_RESTART_PROBE_HIT 0x22 PMC UMASK_PIPELINE_RESTART_PROBE_HIT_INV 0x00 UMASK_PIPELINE_RESTART_PROBE_HIT_FILLS 0x01 EVENT_LOCKED_OPERATION 0x24 PMC UMASK_LOCKED_OPERATION_CYCLES_TO_UNLCOK 0x04 UMASK_LOCKED_OPERATION_CYCLES_TO_BUS_LOCK 0x02 UMASK_LOCKED_OPERATION_LOCKED_INSTR 0x01 EVENT_CLFLUSH_RETIRED 0x26 PMC UMASK_CLFLUSH_RETIRED 0x00 EVENT_CPUID_RETIRED 0x27 PMC UMASK_CPUID_RETIRED 0x00 EVENT_LS_DISPATCH 0x29 PMC UMASK_LS_DISPATCH_LOADS 0x01 UMASK_LS_DISPATCH_STORES 0x02 UMASK_LS_DISPATCH_LOAD_OP_STORES 0x04 EVENT_CANCELED_STORE_FORWARD 0x2A PMC UMASK_CANCELED_STORE_FORWARD_ADDR_MISMATCH 
0x01 UMASK_CANCELED_STORE_FORWARD_SMALL_STORES 0x02 UMASK_CANCELED_STORE_FORWARD_MISALIGNED_STORES 0x04 EVENT_DATA_CACHE_ACCESSES 0x40 PMC UMASK_DATA_CACHE_ACCESSES 0x00 EVENT_DATA_CACHE_MISSES 0x41 PMC UMASK_DATA_CACHE_MISSES 0x00 EVENT_DATA_CACHE_REFILLS 0x42 PMC UMASK_DATA_CACHE_REFILLS_NONCACHED 0x01 UMASK_DATA_CACHE_REFILLS_SHARED 0x02 UMASK_DATA_CACHE_REFILLS_EXCLUSIVE 0x04 UMASK_DATA_CACHE_REFILLS_OWNED 0x08 UMASK_DATA_CACHE_REFILLS_MODIFIED 0x10 UMASK_DATA_CACHE_REFILLS_ALL 0x1E EVENT_DATA_CACHE_REFILLS_NB 0x43 PMC UMASK_DATA_CACHE_REFILLS_NB_NONCACHED 0x01 UMASK_DATA_CACHE_REFILLS_NB_SHARED 0x02 UMASK_DATA_CACHE_REFILLS_NB_EXCLUSIVE 0x04 UMASK_DATA_CACHE_REFILLS_NB_OWNED 0x08 UMASK_DATA_CACHE_REFILLS_NB_MODIFIED 0x10 UMASK_DATA_CACHE_REFILLS_NB_ALL 0x1E EVENT_DATA_CACHE_EVICTED 0x44 PMC UMASK_DATA_CACHE_EVICTED_INVALID 0x01 UMASK_DATA_CACHE_EVICTED_SHARED 0x02 UMASK_DATA_CACHE_EVICTED_EXCLUSIVE 0x04 UMASK_DATA_CACHE_EVICTED_OWNED 0x08 UMASK_DATA_CACHE_EVICTED_MODIFIED 0x10 UMASK_DATA_CACHE_EVICTED_ALL 0x1F EVENT_L2_DTLB_HIT 0x45 PMC UMASK_L2_DTLB_HIT 0x00 UMASK_L2_DTLB_HIT_ALL 0x01 EVENT_DTLB_MISS 0x46 PMC UMASK_DTLB_MISS_STORE_L1 0x01 UMASK_DTLB_MISS_LOAD_L1 0x02 UMASK_DTLB_MISS_STORE_L2 0x04 UMASK_DTLB_MISS_LOAD_L2 0x08 UMASK_DTLB_MISS_ALL 0x0F EVENT_MISALIGNED_ACCESS 0x47 PMC UMASK_MISALIGNED_ACCESS_16b 0x01 UMASK_MISALIGNED_ACCESS_4Kb 0x02 EVENT_PREFETCH_INSTR_DISPATCHED 0x4B PMC UMASK_PREFETCH_INSTR_DISPATCHED_LOAD 0x01 UMASK_PREFETCH_INSTR_DISPATCHED_STORE 0x02 UMASK_PREFETCH_INSTR_DISPATCHED_NTA 0x04 EVENT_DCACHE_LOCK_MISS 0x4C PMC UMASK_DCACHE_LOCK_MISS 0x02 EVENT_DTLB_L1_HIT 0x4D PMC UMASK_DTLB_L1_HIT_4K 0x01 UMASK_DTLB_L1_HIT_2M 0x02 EVENT_INEFFECTIVE_PREFETCHES 0x52 PMC UMASK_INEFFECTIVE_PREFETCHES_DATA_CACHE 0x01 UMASK_INEFFECTIVE_PREFETCHES_PENDING_FILL 0x02 UMASK_INEFFECTIVE_PREFETCHES_NO_MAB 0x04 UMASK_INEFFECTIVE_PREFETCHES_L2_HIT 0x08 EVENT_GLOBAL_TLB_FLUSH 0x54 PMC UMASK_GLOBAL_TLB_FLUSH 0x00 EVENT_GLOBAL_READ_BLOCK_OPS 0x62 PMC 
UMASK_GLOBAL_READ_BLOCK_OPS_READ 0x01 UMASK_GLOBAL_READ_BLOCK_OPS_RDBLK_MOD 0x02 UMASK_GLOBAL_READ_BLOCK_OPS_RDBLK_SHARED 0x04 UMASK_GLOBAL_READ_BLOCK_OPS_RDBLKSPEC 0x08 UMASK_GLOBAL_READ_BLOCK_OPS_RDBLKSPEC_MOD 0x10 UMASK_GLOBAL_READ_BLOCK_OPS_SPEC_SHARED 0x20 EVENT_MEMORY_REQUESTS 0x65 PMC UMASK_MEMORY_REQUESTS_UC 0x01 UMASK_MEMORY_REQUESTS_WC 0x02 UMASK_MEMORY_REQUESTS_SS 0x80 EVENT_DATA_PREFETCHER 0x67 PMC UMASK_DATA_PREFETCHER_ATTEMPTS 0x02 UMASK_DATA_PREFETCHER_HIT_MAB 0x08 #FIXME - Do we need to update mask details EVENT_MAB_REQ 0x68 PMC UMASK_MAB_REQ 0x00 EVENT_MAB_WAIT 0x69 PMC UMASK_MAB_WAIT 0x00 EVENT_RESPONSE_ON_CACHE_REFILLS 0x6C PMC UMASK_RESPONSE_ON_CACHE_REFILLS_EXCLUSIVE 0x01 UMASK_RESPONSE_ON_CACHE_REFILLS_MODIFIED 0x02 UMASK_RESPONSE_ON_CACHE_REFILLS_SHARED 0x04 UMASK_RESPONSE_ON_CACHE_REFILLS_OWNED 0x08 UMASK_RESPONSE_ON_CACHE_REFILLS_DATA_ERROR 0x10 UMASK_RESPONSE_ON_CACHE_REFILLS_CHANGE_TO_DIRTY 0x20 UMASK_RESPONSE_ON_CACHE_REFILLS_UNCACHEABLE 0x40 EVENT_DATA_WRITTEN 0x6D PMC UMASK_DATA_WRITTEN_LINE_EVICTION 0x01 UMASK_DATA_WRITTEN_INSN_ATTR__EVICTION 0x02 UMASK_DATA_WRITTEN_BYTE_ENABLE_UNCACHEABLE_IO_STORE 0x04 UMASK_DATA_WRITTEN_UNCACHEABLE_IO_STORE 0x08 UMASK_DATA_WRITTEN_BYTE_ENABLE_WRITE_COMBINE_FLUSH 0x10 UMASK_DATA_WRITTEN_WRITE_COMBINE_FLUSH 0x20 EVENT_CACHE_CROSS_INVALIDATES 0x75 PMC UMASK_CACHE_CROSS_INVALIDATES_DC_IC 0x01 UMASK_CACHE_CROSS_INVALIDATES_DC_DC 0x02 UMASK_CACHE_CROSS_INVALIDATES_IC_IC 0x04 UMASK_CACHE_CROSS_INVALIDATES_IC_DC 0x08 UMASK_CACHE_CROSS_INVALIDATES_IC_HITS_DC 0x10 UMASK_CACHE_CROSS_INVALIDATES_DC_PROBE_REJ_EARLY 0x20 UMASK_CACHE_CROSS_INVALIDATES_DC_PROBE_REJ_LATE 0x40 EVENT_CPU_CLOCKS_UNHALTED 0x76 PMC UMASK_CPU_CLOCKS_UNHALTED 0x00 EVENT_PDC_MISS 0x162 PMC UMASK_PDC_MISS_HOST_PDE 0x01 UMASK_PDC_MISS_HOST_PDPE 0x02 UMASK_PDC_MISS_HOST_PML4E 0x04 UMASK_PDC_MISS_GUEST_PDE 0x10 UMASK_PDC_MISS_GUEST_PDPE 0x20 UMASK_PDC_MISS_GUEST_PML4E 0x40 EVENT_INSTRUCTION_CACHE_FETCHES 0x080 PMC 
UMASK_INSTRUCTION_CACHE_FETCHES 0x00 EVENT_INSTRUCTION_CACHE_MISSES 0x081 PMC UMASK_INSTRUCTION_CACHE_MISSES 0x00 EVENT_INSTRUCTION_CACHE_L2_REFILLS 0x082 PMC UMASK_INSTRUCTION_CACHE_L2_REFILLS 0x00 EVENT_INSTRUCTION_CACHE_SYSTEM_REFILLS 0x083 PMC UMASK_INSTRUCTION_CACHE_SYSTEM_REFILLS 0x00 EVENT_ITLB_L1_MISS_L2_HIT 0x084 PMC UMASK_ITLB_L1_MISS_L2_HIT 0x00 EVENT_ITLB_L1_MISS_L2_MISS 0x085 PMC UMASK_ITLB_L1_MISS_L2_MISS_4KB 0x01 UMASK_ITLB_L1_MISS_L2_MISS_2MB 0x02 EVENT_INSTRUCTION_FETCH_STALL 0x087 PMC UMASK_INSTRUCTION_FETCH_STALL 0x00 EVENT_RETURN_STACK_HITS 0x088 PMC UMASK_RETURN_STACK_HITS 0x00 EVENT_RETURN_STACK_OVERFLOWS 0x089 PMC UMASK_RETURN_STACK_OVERFLOWS 0x00 EVENT_INSTRUCTION_CACHE_VICTIMS 0x08B PMC UMASK_INSTRUCTION_CACHE_VICTIMS 0x00 EVENT_ICACHE_LINES_INVALIDATED 0x8C PMC UMASK_ICACHE_LINES_INVALIDATED_LS_PROBE 0x01 UMASK_ICACHE_LINES_INVALIDATED_BU_PROBE 0x02 EVENT_ITLB_RELOADS 0x099 PMC UMASK_ITLB_RELOADS 0x00 EVENT_ITLB_RELOADS_ABORTED 0x09A PMC UMASK_ITLB_RELOADS_ABORTED 0x00 EVENT_RETIRED_INDIR_BRANCH 0x19A PMC UMASK_RETIRED_INDIR_BRANCH 0x01 UMASK_RETIRED_INDIR_MISPRED_BRANCH 0x02 EVENT_RETIRED_INSTRUCTIONS 0x0C0 PMC UMASK_RETIRED_INSTRUCTIONS 0x00 EVENT_RETIRED_UOPS 0x0C1 PMC UMASK_RETIRED_UOPS 0x00 EVENT_RETIRED_BRANCH_INSTR 0x0C2 PMC UMASK_RETIRED_BRANCH_INSTR 0x00 EVENT_RETIRED_MISPREDICTED_BRANCH_INSTR 0x0C3 PMC UMASK_RETIRED_MISPREDICTED_BRANCH_INSTR 0x00 EVENT_RETIRED_TAKEN_BRANCH_INSTR 0x0C4 PMC UMASK_RETIRED_TAKEN_BRANCH_INSTR 0x00 EVENT_RETIRED_TAKEN_MISPREDICTED_BRANCH_INSTR 0x0C5 PMC UMASK_RETIRED_TAKEN_MISPREDICTED_BRANCH_INSTR 0x00 EVENT_RETIRED_FAR_CONTROL_TRANSFERS 0x0C6 PMC UMASK_RETIRED_FAR_CONTROL_TRANSFERS 0x00 EVENT_RETIRED_BRANCH_RESYNCS 0x0C7 PMC UMASK_RETIRED_BRANCH_RESYNCS 0x00 EVENT_RETIRED_NEAR_RETURNS 0x0C8 PMC UMASK_RETIRED_NEAR_RETURNS 0x00 EVENT_RETIRED_NEAR_RETURNS_MISPRED 0x0C9 PMC UMASK_RETIRED_NEAR_RETURNS_MISPRED 0x00 EVENT_RETIRED_INDIRECT_BRANCHES_MISPRED 0x0CA PMC UMASK_RETIRED_INDIRECT_BRANCHES_MISPRED 
0x00 EVENT_RETIRED_FP_INSTRUCTIONS 0x0CB PMC UMASK_RETIRED_FP_INSTRUCTIONS_X87_MMX 0x01 UMASK_RETIRED_FP_INSTRUCTIONS_SSE 0x02 UMASK_RETIRED_FP_INSTRUCTIONS_ALL 0x03 EVENT_INTERRUPTS_MASKED_CYCLES 0x0CD PMC UMASK_INTERRUPTS_MASKED_CYCLES 0x00 EVENT_INTERRUPTS_MASKED_CYCLES_INTERRUPTS_PENDING 0x0CE PMC UMASK_INTERRUPTS_MASKED_CYCLES_INTERRUPTS_ 0x00 EVENT_INTERRUPTS_TAKEN 0x0CF PMC UMASK_INTERRUPTS_TAKEN 0x00 EVENT_FPU_EXCEPTION 0x0DB PMC UMASK_FPU_EXCEPTION_X87_MICROFAULTS 0x01 UMASK_FPU_EXCEPTION_SSE_RETYPE__MICROFAULTS 0x02 UMASK_FPU_EXCEPTION_SSE_RECLASS_MICROFAULTS 0x04 UMASK_FPU_EXCEPTION_SSE_X87_MICROTRAPS 0x08 UMASK_FPU_EXCEPTION_ALL 0x0F EVENT_IBS_OPS 0x1CF PMC UMASK_IBS_OPS 0x0 EVENT_IBS_RETIRED_OPS 0x1D0 PMC UMASK_IBS_RETIRED_OPS 0x0 EVENT_UNC_MEMORY_CONTROLLER_BYPASS_COUNTER_SATURATION 0x0E4 UPMC UMASK_UNC_MEMORY_CONTROLLER_BYPASS_COUNTER_SATURATION_MEM_HIGH 0x01 UMASK_UNC_MEMORY_CONTROLLER_BYPASS_COUNTER_SATURATION_MEM_MED 0x02 UMASK_UNC_MEMORY_CONTROLLER_BYPASS_COUNTER_SATURATION__DCQ 0x04 UMASK_UNC_MEMORY_CONTROLLER_BYPASS_COUNTER_SATURATION_DCQ_SATURATED 0x20 EVENT_UNC_THERMAL_STATUS 0x0E8 UPMC UMASK_UNC_THERMAL_STATUS_HTC_TRIP_POINT_CROSSED 0x04 UMASK_UNC_THERMAL_STATUS_HTCP_INACTIVE 0x20 UMASK_UNC_THERMAL_STATUS_HTCP_ACTIVE 0x40 EVENT_UNC_CPU_REQUEST_TO_MEMORY 0x0E9 UPMC UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_LOCAL_CPU_MEM 0xA8 UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_LOCAL_CPU_IO 0xA4 UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_LOCAL_IO_MEM 0xA2 UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_LOCAL_IO_IO 0xA1 UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_REMOTE_CPU_MEM 0x98 UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_REMOTE_CPU_IO 0x94 UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_REMOTE_IO_MEM 0x92 UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_REMOTE_IO_IO 0x91 UMASK_UNC_CPU_REQUEST_TO_MEMORY_REMOTE_LOCAL_CPU_IO 0x64 UMASK_UNC_CPU_REQUEST_TO_MEMORY_REMOTE_LOCAL_IO_IO 0x61 EVENT_UNC_CACHE_BLOCK_COMMANDS 0x0EA UPMC UMASK_UNC_CACHE_BLOCK_COMMANDS_VICTIM_BLOCK 0x01 
UMASK_UNC_CACHE_BLOCK_COMMANDS_READ_BLOCK 0x04 UMASK_UNC_CACHE_BLOCK_COMMANDS_READ_BLOCK_SHARED 0x08 UMASK_UNC_CACHE_BLOCK_COMMANDS_READ_BLOCK_MODIFIED 0x10 UMASK_UNC_CACHE_BLOCK_COMMANDS_CHANGE_TO_DIRTY 0x20 EVENT_UNC_SIZED_COMMANDS 0x0EB UPMC UMASK_UNC_SIZED_COMMANDS_NP_WR_BYTE 0x01 UMASK_UNC_SIZED_COMMANDS_NP_WR_DW 0x02 UMASK_UNC_SIZED_COMMANDS_P_WR_BYTE 0x04 UMASK_UNC_SIZED_COMMANDS_P_WR_DW 0x08 UMASK_UNC_SIZED_COMMANDS_RE_BYTE 0x10 UMASK_UNC_SIZED_COMMANDS_RE_DW 0x20 EVENT_UNC_PROBE_RESPONSES_UPSTREAM_REQUESTS 0x0EC UPMC UMASK_UNC_PROBE_RESPONSES_UPSTREAM_REQUESTS_PROBE_MISS 0x01 UMASK_UNC_PROBE_RESPONSES_UPSTREAM_REQUESTS_PROBE_HIT_CLEAN 0x02 UMASK_UNC_PROBE_RESPONSES_UPSTREAM_REQUESTS_PROBE_HIT_DIRTY_WITHOUT_CANCEL 0x04 UMASK_UNC_PROBE_RESPONSES_UPSTREAM_REQUESTS_PROBE_HIT_DIRTY_WITH_CANCEL 0x08 UMASK_UNC_PROBE_RESPONSES_UPSTREAM_REQUESTS_UPSTREAM_DISPLAY_READS 0x10 UMASK_UNC_PROBE_RESPONSES_UPSTREAM_REQUESTS_UPSTREAM_NON_DISPLAY_READS 0x20 UMASK_UNC_PROBE_RESPONSES_UPSTREAM_REQUESTS_UPSTREAM_ISOC_WRITED 0x40 UMASK_UNC_PROBE_RESPONSES_UPSTREAM_REQUESTS_UPSTREAM_NON_ISOC_WRITES 0x80 EVENT_UNC_CPU_TO_DRAM 0x1E0 UPMC UMASK_UNC_CPU_TO_DRAM_LOCAL_TO_0 0x01 UMASK_UNC_CPU_TO_DRAM_LOCAL_TO_1 0x02 UMASK_UNC_CPU_TO_DRAM_LOCAL_TO_2 0x04 UMASK_UNC_CPU_TO_DRAM_LOCAL_TO_3 0x08 UMASK_UNC_CPU_TO_DRAM_LOCAL_TO_4 0x10 UMASK_UNC_CPU_TO_DRAM_LOCAL_TO_5 0x20 UMASK_UNC_CPU_TO_DRAM_LOCAL_TO_6 0x40 UMASK_UNC_CPU_TO_DRAM_LOCAL_TO_7 0x80 UMASK_UNC_CPU_TO_DRAM_LOCAL_TO_ALL 0xFF EVENT_UNC_IO_TO_DRAM 0x1E1 UPMC UMASK_UNC_IO_TO_DRAM_LOCAL_TO_0 0x01 UMASK_UNC_IO_TO_DRAM_LOCAL_TO_1 0x02 UMASK_UNC_IO_TO_DRAM_LOCAL_TO_2 0x04 UMASK_UNC_IO_TO_DRAM_LOCAL_TO_3 0x08 UMASK_UNC_IO_TO_DRAM_LOCAL_TO_4 0x10 UMASK_UNC_IO_TO_DRAM_LOCAL_TO_5 0x20 UMASK_UNC_IO_TO_DRAM_LOCAL_TO_6 0x40 UMASK_UNC_IO_TO_DRAM_LOCAL_TO_7 0x80 UMASK_UNC_IO_TO_DRAM_LOCAL_TO_ALL 0xFF EVENT_UNC_CPU_READ_CMD_LATENCY_A 0x1E2 UPMC UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_TO_0 0x11 UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_MOD_TO_0 0x12 
UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_SHARED_TO_0 0x14 UMASK_UNC_CPU_READ_CMD_LATENCY_A_DIRTY_TO_0 0x18 UMASK_UNC_CPU_READ_CMD_LATENCY_A_ALL_TO_0 0x1F UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_TO_1 0x21 UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_MOD_TO_1 0x22 UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_SHARED_TO_1 0x24 UMASK_UNC_CPU_READ_CMD_LATENCY_A_DIRTY_TO_1 0x28 UMASK_UNC_CPU_READ_CMD_LATENCY_A_ALL_TO_1 0x2F UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_TO_2 0x41 UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_MOD_TO_2 0x42 UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_SHARED_TO_2 0x44 UMASK_UNC_CPU_READ_CMD_LATENCY_A_DIRTY_TO_2 0x48 UMASK_UNC_CPU_READ_CMD_LATENCY_A_ALL_TO_2 0x4F UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_TO_3 0x81 UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_MOD_TO_3 0x82 UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_SHARED_TO_3 0x84 UMASK_UNC_CPU_READ_CMD_LATENCY_A_DIRTY_TO_3 0x88 UMASK_UNC_CPU_READ_CMD_LATENCY_A_ALL_TO_3 0x8F EVENT_UNC_CPU_READ_CMD_REQUESTS_A 0x1E3 UPMC UMASK_UNC_CPU_READ_CMD_REQUESTS_A_READ_TO_0 0x11 UMASK_UNC_CPU_READ_CMD_REQUESTS_A_READ_MOD_TO_0 0x12 UMASK_UNC_CPU_READ_CMD_REQUESTS_A_READ_SHARED_TO_0 0x14 UMASK_UNC_CPU_READ_CMD_REQUESTS_A_DIRTY_TO_0 0x18 UMASK_UNC_CPU_READ_CMD_REQUESTS_A_ALL_TO_0 0x1F UMASK_UNC_CPU_READ_CMD_REQUESTS_A_READ_TO_1 0x21 UMASK_UNC_CPU_READ_CMD_REQUESTS_A_READ_MOD_TO_1 0x22 UMASK_UNC_CPU_READ_CMD_REQUESTS_A_READ_SHARED_TO_1 0x24 UMASK_UNC_CPU_READ_CMD_REQUESTS_A_DIRTY_TO_1 0x28 UMASK_UNC_CPU_READ_CMD_REQUESTS_A_ALL_TO_1 0x2F UMASK_UNC_CPU_READ_CMD_REQUESTS_A_READ_TO_2 0x41 UMASK_UNC_CPU_READ_CMD_REQUESTS_A_READ_MOD_TO_2 0x42 UMASK_UNC_CPU_READ_CMD_REQUESTS_A_READ_SHARED_TO_2 0x44 UMASK_UNC_CPU_READ_CMD_REQUESTS_A_DIRTY_TO_2 0x48 UMASK_UNC_CPU_READ_CMD_REQUESTS_A_ALL_TO_2 0x4F UMASK_UNC_CPU_READ_CMD_REQUESTS_A_READ_TO_3 0x81 UMASK_UNC_CPU_READ_CMD_REQUESTS_A_READ_MOD_TO_3 0x82 UMASK_UNC_CPU_READ_CMD_REQUESTS_A_READ_SHARED_TO_3 0x84 UMASK_UNC_CPU_READ_CMD_REQUESTS_A_DIRTY_TO_3 0x88 UMASK_UNC_CPU_READ_CMD_REQUESTS_A_ALL_TO_3 0x8F
EVENT_UNC_CPU_READ_CMD_LATENCY_B 0x1E4 UPMC UMASK_UNC_CPU_READ_CMD_LATENCY_B_READ_TO_4 0x11 UMASK_UNC_CPU_READ_CMD_LATENCY_B_READ_MOD_TO_4 0x12 UMASK_UNC_CPU_READ_CMD_LATENCY_B_READ_SHARED_TO_4 0x14 UMASK_UNC_CPU_READ_CMD_LATENCY_B_DIRTY_TO_4 0x18 UMASK_UNC_CPU_READ_CMD_LATENCY_B_ALL_TO_4 0x1F UMASK_UNC_CPU_READ_CMD_LATENCY_B_READ_TO_5 0x21 UMASK_UNC_CPU_READ_CMD_LATENCY_B_READ_MOD_TO_5 0x22 UMASK_UNC_CPU_READ_CMD_LATENCY_B_READ_SHARED_TO_5 0x24 UMASK_UNC_CPU_READ_CMD_LATENCY_B_DIRTY_TO_5 0x28 UMASK_UNC_CPU_READ_CMD_LATENCY_B_ALL_TO_5 0x2F UMASK_UNC_CPU_READ_CMD_LATENCY_B_READ_TO_6 0x41 UMASK_UNC_CPU_READ_CMD_LATENCY_B_READ_MOD_TO_6 0x42 UMASK_UNC_CPU_READ_CMD_LATENCY_B_READ_SHARED_TO_6 0x44 UMASK_UNC_CPU_READ_CMD_LATENCY_B_DIRTY_TO_6 0x48 UMASK_UNC_CPU_READ_CMD_LATENCY_B_ALL_TO_6 0x4F UMASK_UNC_CPU_READ_CMD_LATENCY_B_READ_TO_7 0x81 UMASK_UNC_CPU_READ_CMD_LATENCY_B_READ_MOD_TO_7 0x82 UMASK_UNC_CPU_READ_CMD_LATENCY_B_READ_SHARED_TO_7 0x84 UMASK_UNC_CPU_READ_CMD_LATENCY_B_DIRTY_TO_7 0x88 UMASK_UNC_CPU_READ_CMD_LATENCY_B_ALL_TO_7 0x8F EVENT_UNC_CPU_READ_CMD_REQUESTS_B 0x1E5 UPMC UMASK_UNC_CPU_READ_CMD_REQUESTS_B_READ_TO_4 0x11 UMASK_UNC_CPU_READ_CMD_REQUESTS_B_READ_MOD_TO_4 0x12 UMASK_UNC_CPU_READ_CMD_REQUESTS_B_READ_SHARED_TO_4 0x14 UMASK_UNC_CPU_READ_CMD_REQUESTS_B_DIRTY_TO_4 0x18 UMASK_UNC_CPU_READ_CMD_REQUESTS_B_ALL_TO_4 0x1F UMASK_UNC_CPU_READ_CMD_REQUESTS_B_READ_TO_5 0x21 UMASK_UNC_CPU_READ_CMD_REQUESTS_B_READ_MOD_TO_5 0x22 UMASK_UNC_CPU_READ_CMD_REQUESTS_B_READ_SHARED_TO_5 0x24 UMASK_UNC_CPU_READ_CMD_REQUESTS_B_DIRTY_TO_5 0x28 UMASK_UNC_CPU_READ_CMD_REQUESTS_B_ALL_TO_5 0x2F UMASK_UNC_CPU_READ_CMD_REQUESTS_B_READ_TO_6 0x41 UMASK_UNC_CPU_READ_CMD_REQUESTS_B_READ_MOD_TO_6 0x42 UMASK_UNC_CPU_READ_CMD_REQUESTS_B_READ_SHARED_TO_6 0x44 UMASK_UNC_CPU_READ_CMD_REQUESTS_B_DIRTY_TO_6 0x48 UMASK_UNC_CPU_READ_CMD_REQUESTS_B_ALL_TO_6 0x4F UMASK_UNC_CPU_READ_CMD_REQUESTS_B_READ_TO_7 0x81 UMASK_UNC_CPU_READ_CMD_REQUESTS_B_READ_MOD_TO_7 0x82 
UMASK_UNC_CPU_READ_CMD_REQUESTS_B_READ_SHARED_TO_7 0x84 UMASK_UNC_CPU_READ_CMD_REQUESTS_B_DIRTY_TO_7 0x88 UMASK_UNC_CPU_READ_CMD_REQUESTS_B_ALL_TO_7 0x8F EVENT_UNC_CPU_CMD_LATENCY 0x1E6 UPMC UMASK_UNC_CPU_CMD_LATENCY_READ_TO_0 0x11 UMASK_UNC_CPU_CMD_LATENCY_WRITE_TO_0 0x12 UMASK_UNC_CPU_CMD_LATENCY_VICTIM_TO_0 0x14 UMASK_UNC_CPU_CMD_LATENCY_ALL_TO_0 0x17 UMASK_UNC_CPU_CMD_LATENCY_READ_TO_4 0x19 UMASK_UNC_CPU_CMD_LATENCY_WRITE_TO_4 0x1A UMASK_UNC_CPU_CMD_LATENCY_VICTIM_TO_4 0x1C UMASK_UNC_CPU_CMD_LATENCY_ALL_TO_4 0x1F UMASK_UNC_CPU_CMD_LATENCY_READ_TO_1 0x21 UMASK_UNC_CPU_CMD_LATENCY_WRITE_TO_1 0x22 UMASK_UNC_CPU_CMD_LATENCY_VICTIM_TO_1 0x24 UMASK_UNC_CPU_CMD_LATENCY_ALL_TO_1 0x27 UMASK_UNC_CPU_CMD_LATENCY_READ_TO_5 0x29 UMASK_UNC_CPU_CMD_LATENCY_WRITE_TO_5 0x2A UMASK_UNC_CPU_CMD_LATENCY_VICTIM_TO_5 0x2C UMASK_UNC_CPU_CMD_LATENCY_ALL_TO_5 0x2F UMASK_UNC_CPU_CMD_LATENCY_READ_TO_2 0x41 UMASK_UNC_CPU_CMD_LATENCY_WRITE_TO_2 0x42 UMASK_UNC_CPU_CMD_LATENCY_VICTIM_TO_2 0x44 UMASK_UNC_CPU_CMD_LATENCY_ALL_TO_2 0x47 UMASK_UNC_CPU_CMD_LATENCY_READ_TO_6 0x49 UMASK_UNC_CPU_CMD_LATENCY_WRITE_TO_6 0x4A UMASK_UNC_CPU_CMD_LATENCY_VICTIM_TO_6 0x4C UMASK_UNC_CPU_CMD_LATENCY_ALL_TO_6 0x4F UMASK_UNC_CPU_CMD_LATENCY_READ_TO_3 0x81 UMASK_UNC_CPU_CMD_LATENCY_WRITE_TO_3 0x82 UMASK_UNC_CPU_CMD_LATENCY_VICTIM_TO_3 0x84 UMASK_UNC_CPU_CMD_LATENCY_ALL_TO_3 0x87 UMASK_UNC_CPU_CMD_LATENCY_READ_TO_7 0x89 UMASK_UNC_CPU_CMD_LATENCY_WRITE_TO_7 0x8A UMASK_UNC_CPU_CMD_LATENCY_VICTIM_TO_7 0x8C UMASK_UNC_CPU_CMD_LATENCY_ALL_TO_7 0x8F EVENT_UNC_CPU_CMD_REQUESTS 0x1E7 UPMC UMASK_UNC_CPU_CMD_REQUESTS_READ_TO_0 0x11 UMASK_UNC_CPU_CMD_REQUESTS_WRITE_TO_0 0x12 UMASK_UNC_CPU_CMD_REQUESTS_VICTIM_TO_0 0x14 UMASK_UNC_CPU_CMD_REQUESTS_ALL_TO_0 0x17 UMASK_UNC_CPU_CMD_REQUESTS_READ_TO_4 0x19 UMASK_UNC_CPU_CMD_REQUESTS_WRITE_TO_4 0x1A UMASK_UNC_CPU_CMD_REQUESTS_VICTIM_TO_4 0x1C UMASK_UNC_CPU_CMD_REQUESTS_ALL_TO_4 0x1F UMASK_UNC_CPU_CMD_REQUESTS_READ_TO_1 0x21 UMASK_UNC_CPU_CMD_REQUESTS_WRITE_TO_1 0x22 
UMASK_UNC_CPU_CMD_REQUESTS_VICTIM_TO_1 0x24 UMASK_UNC_CPU_CMD_REQUESTS_ALL_TO_1 0x27 UMASK_UNC_CPU_CMD_REQUESTS_READ_TO_5 0x29 UMASK_UNC_CPU_CMD_REQUESTS_WRITE_TO_5 0x2A UMASK_UNC_CPU_CMD_REQUESTS_VICTIM_TO_5 0x2C UMASK_UNC_CPU_CMD_REQUESTS_ALL_TO_5 0x2F UMASK_UNC_CPU_CMD_REQUESTS_READ_TO_2 0x41 UMASK_UNC_CPU_CMD_REQUESTS_WRITE_TO_2 0x42 UMASK_UNC_CPU_CMD_REQUESTS_VICTIM_TO_2 0x44 UMASK_UNC_CPU_CMD_REQUESTS_ALL_TO_2 0x47 UMASK_UNC_CPU_CMD_REQUESTS_READ_TO_6 0x49 UMASK_UNC_CPU_CMD_REQUESTS_WRITE_TO_6 0x4A UMASK_UNC_CPU_CMD_REQUESTS_VICTIM_TO_6 0x4C UMASK_UNC_CPU_CMD_REQUESTS_ALL_TO_6 0x4F UMASK_UNC_CPU_CMD_REQUESTS_READ_TO_3 0x81 UMASK_UNC_CPU_CMD_REQUESTS_WRITE_TO_3 0x82 UMASK_UNC_CPU_CMD_REQUESTS_VICTIM_TO_3 0x84 UMASK_UNC_CPU_CMD_REQUESTS_ALL_TO_3 0x87 UMASK_UNC_CPU_CMD_REQUESTS_READ_TO_7 0x89 UMASK_UNC_CPU_CMD_REQUESTS_WRITE_TO_7 0x8A UMASK_UNC_CPU_CMD_REQUESTS_VICTIM_TO_7 0x8C UMASK_UNC_CPU_CMD_REQUESTS_ALL_TO_7 0x8F EVENT_REQ_CACHE_STATUS 0x1EB UPMC UMASK_EVENT_REQ_CACHE_STATUS_HIT_S 0x01 UMASK_EVENT_REQ_CACHE_STATUS_HIT_E 0x02 UMASK_EVENT_REQ_CACHE_STATUS_HIT_MUW_O 0x04 UMASK_EVENT_REQ_CACHE_STATUS_HIT_M 0x08 UMASK_EVENT_REQ_CACHE_STATUS_HIT_MISS 0x10 UMASK_EVENT_REQ_CACHE_STATUS_HIT_DIRECT_PROBE 0x20 UMASK_EVENT_REQ_CACHE_STATUS_HIT_TRACK_CHGTODIRTY 0x40 UMASK_EVENT_REQ_CACHE_STATUS_HIT_TRACK_RDBLKM 0x80 EVENT_UNC_MEMORY_CONTROLLER_REQUESTS 0x1F0 UPMC UMASK_UNC_MEMORY_CONTROLLER_REQUESTS_WRITE_REQ_DCT 0x01 UMASK_UNC_MEMORY_CONTROLLER_REQUESTS_READ_REQ_DCT 0x02 UMASK_UNC_MEMORY_CONTROLLER_REQUESTS_PREFETCH_REQ_DCT 0x04 UMASK_UNC_MEMORY_CONTROLLER_REQUESTS_32_SIZED_WRITES 0x08 UMASK_UNC_MEMORY_CONTROLLER_REQUESTS_64_SIZED_WRITES 0x10 UMASK_UNC_MEMORY_CONTROLLER_REQUESTS_32_SIZED_READS 0x20 UMASK_UNC_MEMORY_CONTROLLER_REQUESTS_64_SIZED_READS 0x40 UMASK_UNC_MEMORY_CONTROLLER_REQUESTS_READ_WHILE_WRITE 0x80 EVENT_UNC_DRAM_ACCESS 0x3EC UPMC UMASK_UNC_DRAM_ACCESS_DCT0_PAGE_HIT 0x01 UMASK_UNC_DRAM_ACCESS_DCT0_PAGE_MISS 0x02 UMASK_UNC_DRAM_ACCESS_DCT0_PAGE_CONFLICT 
0x04 UMASK_UNC_DRAM_ACCESSES_DCT0_ALL 0x07 UMASK_UNC_DRAM_ACCESS_DCT1_PAGE_HIT 0x08 UMASK_UNC_DRAM_ACCESS_DCT2_PAGE_MISS 0x10 UMASK_UNC_DRAM_ACCESS_DCT3_PAGE_CONFLICT 0x20 UMASK_UNC_DRAM_ACCESSES_DCT1_ALL 0x38 EVENT_UNC_DRAM_CONTROLLER_PAGE_TABLE_OVERFLOW 0x3ED UPMC UMASK_UNC_DRAM_CONTROLLER_PAGE_TABLE_OVERFLOW_DCT0 0x1 UMASK_UNC_DRAM_CONTROLLER_PAGE_TABLE_OVERFLOW_DCT1 0x2 EVENT_MEMORY_CONTROLLER_DRAM_CMD_SLOTS_MISSED 0x3EE UPMC UMASK_MEMORY_CONTROLLER_DRAM_CMD_SLOTS_MISSED_DCT0 0x1 UMASK_MEMORY_CONTROLLER_DRAM_CMD_SLOTS_MISSED_DCT1 0x2 EVENT_MEMORY_CONTROLLER_TURNAROUNDS 0x3EF UPMC UMASK_MEMORY_CONTROLLER_TURNAROUNDS_DCT0_DIMM 0x01 UMASK_MEMORY_CONTROLLER_TURNAROUNDS_DCT0_READ_TO_WRITE 0x02 UMASK_MEMORY_CONTROLLER_TURNAROUNDS_DCT0_WRITE_TO_READ 0x04 UMASK_MEMORY_CONTROLLER_TURNAROUNDS_DCT1_DIMM 0x08 UMASK_MEMORY_CONTROLLER_TURNAROUNDS_DCT1_READ_TO_WRITE 0x10 UMASK_MEMORY_CONTROLLER_TURNAROUNDS_DCT1_WRITE_TO_READ 0x20 likwid-3.1.3/src/includes/numa_types.h000644 137545 027340 00000003004 12426160352 020241 0ustar00unrz254unrz000000 000000 /* * ======================================================================================= * * Filename: numa_types.h * * Description: Types file for numa module. * * Version: 3.1.3 * Released: 4.11.2014 * * Author: Jan Treibig (jt), jan.treibig@gmail.com * Project: likwid * * Copyright (C) 2014 Jan Treibig * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see . 
* * ======================================================================================= */ #ifndef NUMA_TYPES_H #define NUMA_TYPES_H typedef struct { int id; uint64_t totalMemory; uint64_t freeMemory; int numberOfProcessors; uint32_t* processors; uint32_t* processorsCompact; int numberOfDistances; uint32_t* distances; } NumaNode; typedef struct { uint32_t numberOfNodes; NumaNode* nodes; } NumaTopology; #endif /*NUMA_TYPES_H*/ likwid-3.1.3/test/executable_tests/likwid-features.txt000644 137545 027340 00000000737 12416722717 023513 0ustar00unrz254unrz000000 000000 | EXIT 0 | GREP Performance monitoring | GREP CPU core id -h | EXIT 0 | GREP Help message -v | EXIT 0 | GREP likwid-features -c | EXIT 1 | GREP option requires an argument -s | EXIT 1 | GREP option requires an argument -u | EXIT 1 | GREP option requires an argument -c 0 | EXIT 0 | GREP Performance monitoring | GREP CPU core id -s HW_PREFETCHER | EXIT 0 | GREP Performance monitoring | GREP CPU core id -u HW_PREFETCHER | EXIT 0 | GREP Performance monitoring | GREP CPU core id likwid-3.1.3/bench/phi/triad_mem.ptt000644 137545 027340 00000001722 12336605216 017650 0ustar00unrz254unrz000000 000000 STREAMS 4 TYPE DOUBLE FLOPS 2 BYTES 32 LOOP 32 vprefetch0 [STR1 + GPR1 * 8 + 1024] vprefetch0 [STR2 + GPR1 * 8 + 1024] vprefetch0 [STR3 + GPR1 * 8 + 1024] vmovaps zmm0, [STR1 + GPR1*8] vmovaps zmm1, [STR1 + GPR1*8+64] vmovaps zmm2, [STR1 + GPR1*8+128] vmovaps zmm3, [STR1 + GPR1*8+192] vmovaps zmm4, [STR2 + GPR1*8] vmovaps zmm5, [STR2 + GPR1*8+64] vmovaps zmm6, [STR2 + GPR1*8+128] vmovaps zmm7, [STR2 + GPR1*8+192] vfmadd132pd zmm0, zmm4, [STR3 + GPR1*8] vfmadd132pd zmm1, zmm5, [STR3 + GPR1*8+64] vfmadd132pd zmm2, zmm6, [STR3 + GPR1*8+128] vfmadd132pd zmm3, zmm7, [STR3 + GPR1*8+192] vprefetch0 [STR0 + GPR1 * 8 + 1024] vmovnrngoaps [STR0 + GPR1 * 8], zmm0 clevict1 [STR0 + GPR1 * 8] vmovnrngoaps [STR0 + GPR1 * 8 + 64], zmm1 clevict1 [STR0 + GPR1 * 8 + 64] vmovnrngoaps [STR0 + GPR1 * 8 + 128], zmm2 clevict1 [STR0 + GPR1 
* 8 + 128] vmovnrngoaps [STR0 + GPR1 * 8 + 192], zmm3 clevict1 [STR0 + GPR1 * 8 + 192] likwid-3.1.3/bench/phi/store.ptt000644 137545 027340 00000000520 12336605216 017036 0ustar00unrz254unrz000000 000000 STREAMS 1 TYPE DOUBLE FLOPS 0 BYTES 8 vmovaps zmm0, [SCALAR] vmovaps zmm1, [SCALAR] vmovaps zmm2, [SCALAR] vmovaps zmm3, [SCALAR] LOOP 32 vprefetch0 [STR0 + GPR1 * 8 + 1024] vmovaps [STR0 + GPR1 * 8] , zmm0 vmovaps [STR0 + GPR1 * 8 + 64], zmm1 vmovaps [STR0 + GPR1 * 8 + 128], zmm2 vmovaps [STR0 + GPR1 * 8 + 192], zmm3 likwid-3.1.3/groups/nehalem/FLOPS_X87.txt000644 137545 027340 00000000536 12336605216 020343 0ustar00unrz254unrz000000 000000 SHORT X87 MFlops/s EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF PMC0 INST_RETIRED_X87 METRICS Runtime (RDTSC) [s] time Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock CPI FIXC1/FIXC0 X87 MFlops/s 1.0E-06*PMC0/time LONG Profiling group to measure X87 flop rate. likwid-3.1.3/groups/nehalemEX/FLOPS_SP.txt000644 137545 027340 00000002020 12336605216 020522 0ustar00unrz254unrz000000 000000 SHORT Single Precision MFlops/s EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION METRICS Runtime (RDTSC) [s] time Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock CPI FIXC1/FIXC0 SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time Packed MUOPS/s 1.0E-06*PMC0/time Scalar MUOPS/s 1.0E-06*PMC1/time SP MUOPS/s 1.0E-06*PMC2/time DP MUOPS/s 1.0E-06*PMC3/time LONG Formula: SP MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime - The Nehalem has no way to measure MFlops if mixed-precision calculations are done. 
Therefore both single and double precision are measured to ensure the correctness of the measurements. You can check whether your code was vectorized by comparing the number of FP_COMP_OPS_EXE_SSE_FP_PACKED with the number of FP_COMP_OPS_EXE_SSE_FP_SCALAR. likwid-3.1.3/src/access-daemon/Makefile000644 137545 027340 00000003271 12426160352 020246 0ustar00unrz254unrz000000 000000 # ======================================================================================= # # Filename: Makefile # # Description: accessDaemon Makefile # # Version: 3.1.3 # Released: 4.11.2014 # # Author: Jan Treibig (jt), jan.treibig@gmail.com # Project: likwid # # Copyright (C) 2014 Jan Treibig # # This program is free software: you can redistribute it and/or modify it under # the terms of the GNU General Public License as published by the Free Software # Foundation, either version 3 of the License, or (at your option) any later # version. # # This program is distributed in the hope that it will be useful, but WITHOUT ANY # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A # PARTICULAR PURPOSE. See the GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along with # this program. If not, see . 
# # ======================================================================================= include ../../config.mk include ../../make/include_$(COMPILER).mk DAEMON_TARGET = likwid-accessD SETFREQ_TARGET = likwid-setFreq DEFINES = -D_GNU_SOURCE -DMAX_NUM_THREADS=$(MAX_NUM_THREADS) INCLUDES = -I../includes ifeq ($(COMPILER),GCC) CFLAGS += -pedantic -Wall -Wextra -std=c99 endif CPPFLAGS := $(DEFINES) $(INCLUDES) Q= all: $(DAEMON_TARGET) $(SETFREQ_TARGET) $(DAEMON_TARGET): accessDaemon.c $(CC) $(ANSI_CFLAGS) $(CFLAGS) $(CPPFLAGS) -o ../../$(DAEMON_TARGET) accessDaemon.c $(SETFREQ_TARGET): setFreq.c $(CC) $(ANSI_CFLAGS) $(CFLAGS) $(CPPFLAGS) -o ../../$(SETFREQ_TARGET) setFreq.c likwid-3.1.3/test/chaos.F90000644 137545 027340 00000002103 12336605216 015645 0ustar00unrz254unrz000000 000000 program testmarker use likwid implicit none integer, parameter :: n=10000000, nrep1 = 100, nrep2 = 100 integer :: i=0 real(kind=8) :: a(n), b(n), c(n), s do i = 1, n a(i) = 1.0/float(i) b(i) = 1.0 c(i) = float(i) end do call likwid_markerInit() ! dummy call likwid_markerStart("dummy") call dummy() call likwid_markerStop("dummy") ! sub call likwid_markerStart("sub") do i = 1, nrep1 call sub(n, a, b, c) end do call likwid_markerStop("sub") ! another call likwid_markerStart("another") do i = 1, nrep2 call another(n, a, s) b(i) = b(i) + s end do call likwid_markerStop("another") ! 
oncemore sub call likwid_markerStart("sub2") do i = 1, nrep1 call sub(n, a, b, c) end do call likwid_markerStop("sub2") call likwid_markerClose() print *,'job done' stop end subroutine sub(n, a, b, c) real*8 a(n), b(n), c(n) s = 0.0 do i = 1, n a(i) = sin( sqrt( exp( b(i) * c(i) - dble(i) ) ) ) end do return end subroutine another(n, a, s) real*8 a(n), s s = 0.0 do i = 1, n s = s + sin( sqrt( exp ( a(i) ) ) ) end do return end subroutine dummy() return end likwid-3.1.3/src/includes/timer.h000644 137545 027340 00000007210 12426160352 017200 0ustar00unrz254unrz000000 000000 /* * ======================================================================================= * Filename: timer.h * * Description: Measure runtime with getTimeOfday and rdtsc. * * A C module to measure runtime. There are two methods: with gettimeofday * for longer time periods (over 0.5 sec) and with rdtsc (read time stamp * counter) for shorter periods. There is a variation for measurements * with rdtsc of 100 cycles in the worst case. Therefore sensible * measurements should be over 1000 cycles. * * Version: 3.1.3 * Released: 4.11.2014 * * Author: Jan Treibig (jt), jan.treibig@gmail.com * Project: likwid * * Copyright (C) 2014 Jan Treibig * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see . 
* * ======================================================================================= */ #ifndef TIMER_H #define TIMER_H #include /* RDTSC(cpu_c): clears eax, executes cpuid and then rdtsc, and copies eax/edx into cpu_c.int32.lo / cpu_c.int32.hi. */ #define RDTSC(cpu_c) \ __asm__ volatile("xor %%eax,%%eax\n\t" \ "cpuid\n\t" \ "rdtsc\n\t" \ "movl %%eax, %0\n\t" \ "movl %%edx, %1\n\t" \ : "=r" ((cpu_c).int32.lo), "=r" ((cpu_c).int32.hi) \ : : "%eax","%ebx","%ecx","%edx") /* RDTSC_CR(cpu_c): rdtsc only (no preceding xor/cpuid), result copied into cpu_c.int32.lo / .hi. */ #define RDTSC_CR(cpu_c) \ __asm__ volatile("rdtsc\n\t" \ "movl %%eax, %0\n\t" \ "movl %%edx, %1\n\t" \ : "=r" ((cpu_c).int32.lo), "=r" ((cpu_c).int32.hi) \ : : "%eax","%ebx","%ecx","%edx") /* RDTSCP(cpu_c): executes rdtscp, copies eax/edx into cpu_c.int32.lo / .hi, then executes cpuid. */ #define RDTSCP(cpu_c) \ __asm__ volatile("rdtscp\n\t" \ "movl %%eax, %0\n\t" \ "movl %%edx, %1\n\t" \ "cpuid\n\t" \ : "=r" ((cpu_c).int32.lo), "=r" ((cpu_c).int32.hi) \ : : "%eax","%ebx","%ecx","%edx") /* RDTSC_STOP is RDTSCP when HAS_RDTSCP is defined and RDTSC_CR otherwise; note that both expansions already end with a semicolon. */ #ifdef HAS_RDTSCP #define RDTSC_STOP(cpu_c) RDTSCP(cpu_c); #else #define RDTSC_STOP(cpu_c) RDTSC_CR(cpu_c); #endif extern void timer_init( void ); extern double timer_print( TimerData* ); extern uint64_t timer_printCycles( TimerData* ); extern uint64_t timer_getCpuClock( void ); extern uint64_t timer_getBaseline( void ); static inline void timer_start( TimerData* ); static inline void timer_stop ( TimerData* ); /* Store the current time stamp in time->start. With __x86_64 defined this uses RDTSC; with _ARCH_PPC defined it reads the time base with mftbu/mftb/mftbu, repeats until both upper-half reads agree, and combines upper and lower halves into start.int64. If neither macro is defined the body is empty. */ void timer_start( TimerData* time ) { #ifdef __x86_64 RDTSC(time->start); #endif #ifdef _ARCH_PPC uint32_t tbl, tbu0, tbu1; do { __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0)); __asm__ __volatile__ ("mftb %0" : "=r"(tbl)); __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1)); } while (tbu0 != tbu1); time->start.int64 = (((uint64_t)tbu0) << 32) | tbl; #endif } /* Store the current time stamp in time->stop. Same structure as timer_start, except that the x86-64 path uses RDTSC_STOP; the call has no trailing semicolon because the macro expansion supplies one. */ void timer_stop( TimerData* time ) { #ifdef __x86_64 RDTSC_STOP(time->stop) #endif #ifdef _ARCH_PPC uint32_t tbl, tbu0, tbu1; do { __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0)); __asm__ __volatile__ ("mftb %0" : "=r"(tbl)); __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1)); } while (tbu0 != tbu1); time->stop.int64 = (((uint64_t)tbu0) << 32) | tbl; #endif } #endif /* TIMER_H */ likwid-3.1.3/bench/000755 137545 027340 00000000000 12426160162 014370 5ustar00unrz254unrz000000 
000000 likwid-3.1.3/src/includes/perfmon_kabini.h000644 137545 027340 00000020577 12426160352 021056 0ustar00unrz254unrz000000 000000 /* * ======================================================================================= * * Filename: perfmon_kabini.h * * Description: Header file of perfmon module for AMD Family16 * * Version: 3.1.3 * Released: 4.11.2014 * * Author: Jan Treibig (jt), jan.treibig@gmail.com * Project: likwid * * Copyright (C) 2014 Jan Treibig * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see . 
* * ======================================================================================= */ #include #include #include static int perfmon_numCountersKabini = NUM_COUNTERS_KABINI; static int perfmon_numGroupsKabini = NUM_GROUPS_KABINI; static int perfmon_numArchEventsKabini = NUM_ARCH_EVENTS_KABINI; /* Clear the event-select MSRs for the CPU given by thread->processorId: always zeroes MSR_AMD16_PERFEVTSEL0-3, and additionally zeroes MSR_AMD16_NB_PERFEVTSEL0-3 when the socket_lock entry of this CPU's node already equals cpu_id or lock_acquire() on that entry returns non-zero. The local 'flags' is referenced only by the commented-out code at the end. */ void perfmon_init_kabini(PerfmonThread *thread) { uint64_t flags = 0x0ULL; int cpu_id = thread->processorId; msr_write(cpu_id, MSR_AMD16_PERFEVTSEL0, 0x0ULL); msr_write(cpu_id, MSR_AMD16_PERFEVTSEL1, 0x0ULL); msr_write(cpu_id, MSR_AMD16_PERFEVTSEL2, 0x0ULL); msr_write(cpu_id, MSR_AMD16_PERFEVTSEL3, 0x0ULL); if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) || lock_acquire( (int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id) ) { msr_write(cpu_id, MSR_AMD16_NB_PERFEVTSEL0, 0x0ULL); msr_write(cpu_id, MSR_AMD16_NB_PERFEVTSEL1, 0x0ULL); msr_write(cpu_id, MSR_AMD16_NB_PERFEVTSEL2, 0x0ULL); msr_write(cpu_id, MSR_AMD16_NB_PERFEVTSEL3, 0x0ULL); } //flags |= (1<<16); /* user mode flag */ /*msr_write(cpu_id, MSR_AMD16_PERFEVTSEL0, flags); msr_write(cpu_id, MSR_AMD16_PERFEVTSEL1, flags); msr_write(cpu_id, MSR_AMD16_PERFEVTSEL2, flags); msr_write(cpu_id, MSR_AMD16_PERFEVTSEL3, flags);*/ } /* Build the event-select value for counter 'index' of thread 'thread_id' and mark that counter as initialised. UNCORE counters are skipped unless this CPU equals its node's socket_lock entry; PMC counters additionally get bit 16 set. */ void perfmon_setupCounterThread_kabini( int thread_id, PerfmonEvent* event, PerfmonCounterIndex index) { uint64_t flags = 0x0ULL; uint64_t reg = kabini_counter_map[index].configRegister; int cpu_id = perfmon_threadData[thread_id].processorId; perfmon_threadData[thread_id].counters[index].init = TRUE; /* only one thread accesses Uncore */ if ( (kabini_counter_map[index].type == UNCORE) && !(socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ) { return; } if (kabini_counter_map[index].type == PMC) { flags |= (1<<16); } /* AMD uses a 12 bit Event mask: [35:32][7:0] */ flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U)); if (perfmon_verbose) { printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n", 
cpu_id, LLU_CAST reg, LLU_CAST flags); } msr_write(cpu_id, reg , flags); } void perfmon_startCountersThread_kabini(int thread_id) { int haveLock = 0; uint64_t flags; int cpu_id = perfmon_threadData[thread_id].processorId; if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)) { haveLock = 1; } for ( int i=0; i. # # ======================================================================================= EVENT_DATA_MEM_REFS 0x43 PMC UMASK_DATA_MEM_REFS 0x00 EVENT_DCU_LINES_IN 0x45 PMC UMASK_DCU_LINES_IN 0x00 EVENT_DCU_M_LINES_IN 0x46 PMC UMASK_DCU_M_LINES_IN 0x00 EVENT_DCU_M_LINES_OUT 0x47 PMC UMASK_DCU_M_LINES_OUT 0x00 EVENT_DCU_MISS_OUTSTANDING 0x48 PMC UMASK_DCU_MISS_OUTSTANDING 0x00 EVENT_EMON_EST_TRANS 0x58 PMC UMASK_EMON_EST_TRANS 0x00 EVENT_BR_INST_EXEC 0x88 PMC UMASK_BR_INST_EXEC 0x00 EVENT_BR_MISSP_EXEC 0x89 PMC UMASK_BR_MISSP_EXEC 0x00 EVENT_BR_BAC_MISSP_EXEC 0x8A PMC UMASK_BR_BAC_MISSP_EXEC 0x00 EVENT_BR_CND_EXEC 0x8B PMC UMASK_BR_CND_EXEC 0x00 EVENT_BR_CND_MISSP_EXEC 0x8C PMC UMASK_BR_CND_MISSP_EXEC 0x00 EVENT_BR_IND_EXEC 0x8D PMC UMASK_BR_IND_EXEC 0x00 EVENT_BR_IND_MISSP_EXEC 0x8E PMC UMASK_BR_IND_MISSP_EXEC 0x00 EVENT_BR_RET_EXEC 0x8F PMC UMASK_BR_RET_EXEC 0x00 EVENT_BR_RET_MISSP_EXEC 0x90 PMC UMASK_BR_RET_MISSP_EXEC 0x00 EVENT_BR_RET_BAC_MISSP_EXEC 0x91 PMC UMASK_BR_RET_BAC_MISSP_EXEC 0x00 EVENT_BR_CALL_EXEC 0x92 PMC UMASK_BR_CALL_EXEC 0x00 EVENT_BR_CALL_MISSP_EXEC 0x93 PMC UMASK_BR_CALL_MISSP_EXEC 0x00 EVENT_BR_IND_CALL_EXEC 0x94 PMC UMASK_BR_IND_CALL_EXEC 0x00 EVENT_IFU_IFETCH 0x80 PMC UMASK_IFU_IFETCH 0x00 EVENT_IFU_IFETCH_MISS 0x81 PMC UMASK_IFU_IFETCH_MISS 0x00 EVENT_ITLB_MISS 0x85 PMC UMASK_ITLB_MISS 0x00 EVENT_IFU_MEM_STALL 0x86 PMC UMASK_IFU_MEM_STALL 0x00 EVENT_ILD_STALL 0x87 PMC UMASK_ILD_STALL 0x00 EVENT_L2_IFETCH 0x28 PMC UMASK_L2_IFETCH 0x0F EVENT_L2_LD 0x29 PMC UMASK_L2_LD_I_EXCLUDE_PRE 0x01 UMASK_L2_LD_S_EXCLUDE_PRE 0x02 UMASK_L2_LD_E_EXCLUDE_PRE 0x04 UMASK_L2_LD_M_EXCLUDE_PRE 0x08 UMASK_L2_LD_ALL_EXCLUDE_PRE 0x0F 
UMASK_L2_LD_I_PRE_ONLY 0x11 UMASK_L2_LD_S_PRE_ONLY 0x12 UMASK_L2_LD_E_PRE_ONLY 0x14 UMASK_L2_LD_M_PRE_ONLY 0x18 UMASK_L2_LD_ALL_PRE_ONLY 0x1F UMASK_L2_LD_I_ALL 0x2F UMASK_L2_LD_S_ALL 0x2F UMASK_L2_LD_E_ALL 0x2F UMASK_L2_LD_M_ALL 0x2F UMASK_L2_LD_ALL_ALL 0x2F EVENT_L2_ST 0x2A PMC UMASK_L2_ST 0x0F EVENT_L2_LINES_IN 0x24 PMC UMASK_L2_LINES_IN_I_EXCLUDE_PRE 0x01 UMASK_L2_LINES_IN_S_EXCLUDE_PRE 0x02 UMASK_L2_LINES_IN_E_EXCLUDE_PRE 0x04 UMASK_L2_LINES_IN_M_EXCLUDE_PRE 0x08 UMASK_L2_LINES_IN_ALL_EXCLUDE_PRE 0x0F UMASK_L2_LINES_IN_I_PRE_ONLY 0x11 UMASK_L2_LINES_IN_S_PRE_ONLY 0x12 UMASK_L2_LINES_IN_E_PRE_ONLY 0x14 UMASK_L2_LINES_IN_M_PRE_ONLY 0x18 UMASK_L2_LINES_IN_ALL_PRE_ONLY 0x1F UMASK_L2_LINES_IN_I_ALL 0x2F UMASK_L2_LINES_IN_S_ALL 0x2F UMASK_L2_LINES_IN_E_ALL 0x2F UMASK_L2_LINES_IN_M_ALL 0x2F UMASK_L2_LINES_IN_ALL_ALL 0x2F EVENT_L2_LINES_OUT 0x26 PMC UMASK_L2_LINES_OUT_I_EXCLUDE_PRE 0x01 UMASK_L2_LINES_OUT_S_EXCLUDE_PRE 0x02 UMASK_L2_LINES_OUT_E_EXCLUDE_PRE 0x04 UMASK_L2_LINES_OUT_M_EXCLUDE_PRE 0x08 UMASK_L2_LINES_OUT_ALL_EXCLUDE_PRE 0x0F UMASK_L2_LINES_OUT_I_PRE_ONLY 0x11 UMASK_L2_LINES_OUT_S_PRE_ONLY 0x12 UMASK_L2_LINES_OUT_E_PRE_ONLY 0x14 UMASK_L2_LINES_OUT_M_PRE_ONLY 0x18 UMASK_L2_LINES_OUT_ALL_PRE_ONLY 0x1F UMASK_L2_LINES_OUT_I_ALL 0x2F UMASK_L2_LINES_OUT_S_ALL 0x2F UMASK_L2_LINES_OUT_E_ALL 0x2F UMASK_L2_LINES_OUT_M_ALL 0x2F UMASK_L2_LINES_OUT_ALL_ALL 0x2F EVENT_L2_M_LINES_IN 0x25 PMC UMASK_L2_M_LINES_IN 0x00 EVENT_L2_M_LINES_OUT 0x27 PMC UMASK_L2_M_LINES_OUT_I_EXCLUDE_PRE 0x01 UMASK_L2_M_LINES_OUT_S_EXCLUDE_PRE 0x02 UMASK_L2_M_LINES_OUT_E_EXCLUDE_PRE 0x04 UMASK_L2_M_LINES_OUT_M_EXCLUDE_PRE 0x08 UMASK_L2_M_LINES_OUT_ALL_EXCLUDE_PRE 0x0F UMASK_L2_M_LINES_OUT_I_PRE_ONLY 0x11 UMASK_L2_M_LINES_OUT_S_PRE_ONLY 0x12 UMASK_L2_M_LINES_OUT_E_PRE_ONLY 0x14 UMASK_L2_M_LINES_OUT_M_PRE_ONLY 0x18 UMASK_L2_M_LINES_OUT_ALL_PRE_ONLY 0x1F UMASK_L2_M_LINES_OUT_I_ALL 0x2F UMASK_L2_M_LINES_OUT_S_ALL 0x2F UMASK_L2_M_LINES_OUT_E_ALL 0x2F UMASK_L2_M_LINES_OUT_M_ALL 0x2F 
UMASK_L2_M_LINES_OUT_ALL_ALL 0x2F EVENT_L2_RQSTS 0x2E PMC UMASK_L2_RQSTS 0x00 EVENT_L2_ADS 0x21 PMC UMASK_L2_ADS 0x00 EVENT_L2_DBUS_BUSY 0x22 PMC UMASK_L2_DBUS_BUSY 0x00 EVENT_L2_DBUS_BUSY_RD 0x23 PMC UMASK_L2_DBUS_BUSY_RD 0x00 EVENT_BUS_DRDY_CLOCKS 0x62 PMC UMASK_BUS_DRDY_CLOCKS_SELF 0x00 UMASK_BUS_DRDY_CLOCKS_ANY 0x20 EVENT_BUS_LOCK_CLOCKS 0x63 PMC UMASK_BUS_LOCK_CLOCKS_SELF 0x00 UMASK_BUS_LOCK_CLOCKS_ANY 0x20 EVENT_BUS_REQ_OUTSTANDING 0x60 PMC UMASK_BUS_REQ_OUTSTANDING_SELF 0x00 EVENT_BUS_TRAN_BRD 0x65 PMC UMASK_BUS_TRAN_BRD_SELF 0x00 UMASK_BUS_TRAN_BRD_ANY 0x20 EVENT_BUS_TRAN_RFO 0x66 PMC UMASK_BUS_TRAN_RFO_SELF 0x00 UMASK_BUS_TRAN_RFO_ANY 0x20 EVENT_BUS_TRAN_WB 0x67 PMC UMASK_BUS_TRAN_WB_SELF 0x00 UMASK_BUS_TRAN_WB_ANY 0x20 EVENT_BUS_TRAN_IFETCH 0x68 PMC UMASK_BUS_TRAN_IFETCH_SELF 0x00 UMASK_BUS_TRAN_IFETCH_ANY 0x20 EVENT_BUS_TRAN_INVAL 0x69 PMC UMASK_BUS_TRAN_INVAL_SELF 0x00 UMASK_BUS_TRAN_INVAL_ANY 0x20 EVENT_BUS_TRAN_PWR 0x6A PMC UMASK_BUS_TRAN_PWR_SELF 0x00 UMASK_BUS_TRAN_PWR_ANY 0x20 EVENT_BUS_TRANS_P 0x6B PMC UMASK_BUS_TRANS_P_SELF 0x00 UMASK_BUS_TRANS_P_ANY 0x20 EVENT_BUS_TRANS_IO 0x6C PMC UMASK_BUS_TRANS_IO_SELF 0x00 UMASK_BUS_TRANS_IO_ANY 0x20 EVENT_BUS_TRAN_DEF 0x6D PMC UMASK_BUS_TRAN_DEF_SELF 0x00 UMASK_BUS_TRAN_DEF_ANY 0x20 EVENT_BUS_TRAN_BURST 0x6E PMC UMASK_BUS_TRAN_BURST_SELF 0x00 UMASK_BUS_TRAN_BURST_ANY 0x20 EVENT_BUS_TRAN_ANY 0x70 PMC UMASK_BUS_TRAN_ANY_SELF 0x00 UMASK_BUS_TRAN_ANY_ANY 0x20 EVENT_BUS_TRAN_MEM 0x6F PMC UMASK_BUS_TRAN_MEM_SELF 0x00 UMASK_BUS_TRAN_MEM_ANY 0x20 EVENT_BUS_DATA_RCV 0x64 PMC UMASK_BUS_DATA_RCV_SELF 0x00 EVENT_BUS_BNR_DRV 0x61 PMC UMASK_BUS_BNR_DRV_SELF 0x00 EVENT_BUS_HIT_DRV 0x7A PMC UMASK_BUS_HIT_DRV_SELF 0x00 EVENT_BUS_HITM_DRV 0x7B PMC UMASK_BUS_HITM_DRV_SELF 0x00 EVENT_BUS_SNOOP_STALL 0x7E PMC UMASK_BUS_SNOOP_STALL_SELF 0x00 EVENT_FLOPS 0xC1 PMC UMASK_FLOPS 0x00 EVENT_FP_COMP_OPS_EXE 0x10 PMC UMASK_FP_COMP_OPS_EXE 0x00 EVENT_FP_ASSIST 0x11 PMC UMASK_FP_ASSIST 0x00 EVENT_MUL 0x12 PMC UMASK_MUL 0x00 EVENT_DIV 0x13 
PMC UMASK_DIV 0x00 EVENT_CYCLES_DIV_BUSY 0x14 PMC UMASK_CYCLES_DIV_BUSY 0x00 EVENT_LD_BLOCKS 0x03 PMC UMASK_LD_BLOCKS 0x00 EVENT_SB_DRAINS 0x04 PMC UMASK_SB_DRAINS 0x00 EVENT_MISALIGN_MEM_REF 0x05 PMC UMASK_MISALIGN_MEM_REF 0x00 EVENT_EMON_KNI_PREF_DISPATCHED 0x07 PMC UMASK_EMON_KNI_PREF_DISPATCHED_NTA 0x00 UMASK_EMON_KNI_PREF_DISPATCHED_T1 0x01 UMASK_EMON_KNI_PREF_DISPATCHED_T2 0x02 UMASK_EMON_KNI_PREF_DISPATCHED_WEAK 0x03 EVENT_EMON_KNI_PREF_MISS 0x4B PMC UMASK_EMON_KNI_PREF_MISS_NTA 0x00 UMASK_EMON_KNI_PREF_MISS_T1 0x01 UMASK_EMON_KNI_PREF_MISS_T2 0x02 UMASK_EMON_KNI_PREF_MISS_WEAK 0x03 EVENT_INST_RETIRED 0xC0 PMC UMASK_INST_RETIRED 0x00 EVENT_UOPS_RETIRED 0xC2 PMC UMASK_UOPS_RETIRED 0x00 EVENT_INST_DECODED 0xD0 PMC UMASK_INST_DECODED 0x00 EVENT_EMON_SSE_SSE2_INST_RETIRED 0xD8 PMC UMASK_EMON_SSE_SSE2_INST_RETIRED_ALL_SP 0x00 UMASK_EMON_SSE_SSE2_INST_RETIRED_SCALAR_SP 0x01 UMASK_EMON_SSE_SSE2_INST_RETIRED_PACKED_DP 0x02 UMASK_EMON_SSE_SSE2_INST_RETIRED_SCALAR_DP 0x03 EVENT_EMON_SSE_SSE2_COMP_INST_RETIRED 0xD9 PMC UMASK_EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP 0x00 UMASK_EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP 0x01 UMASK_EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP 0x02 UMASK_EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP 0x03 EVENT_HW_INT_RX 0xC8 PMC UMASK_HW_INT_RX 0x00 EVENT_CYCLES_INT_MASKED 0xC6 PMC UMASK_CYCLES_INT_MASKED 0x00 EVENT_CYCLES_INT_PENDING_AND_MASKED 0xC7 PMC UMASK_CYCLES_INT_PENDING_AND_MASKED 0x00 EVENT_BR_INST_RETIRED 0xC4 PMC UMASK_BR_INST_RETIRED 0x00 EVENT_BR_MISS_PRED_RETIRED 0xC5 PMC UMASK_BR_MISS_PRED_RETIRED 0x00 EVENT_BR_TAKEN_RETIRED 0xC9 PMC UMASK_BR_TAKEN_RETIRED 0x00 EVENT_BR_MISS_PRED_TAKEN_RET 0xCA PMC UMASK_BR_MISS_PRED_TAKEN_RET 0x00 EVENT_EMON_SIMD_INSTR_RETIRED 0xCE PMC UMASK_EMON_SIMD_INSTR_RETIRED 0x00 EVENT_EMON_SYNCH_UOPS 0xD3 PMC UMASK_EMON_SYNCH_UOPS 0x00 EVENT_EMON_ESP_UOPS 0xD7 PMC UMASK_EMON_ESP_UOPS 0x00 EVENT_EMON_FUSED_UOPS_RET 0xDA PMC UMASK_EMON_FUSED_UOPS_RET 0x00 EVENT_EMON_UNFUSION 0xDB PMC UMASK_EMON_UNFUSION 0x00 
EVENT_EMON_PREF_RQSTS_UP 0xF0 PMC UMASK_EMON_PREF_RQSTS_UP 0x00 EVENT_EMON_PREF_RQSTS_DN 0xF8 PMC UMASK_EMON_PREF_RQSTS_DN 0x00 EVENT_BR_INST_DECODED 0xE0 PMC UMASK_BR_INST_DECODED 0x00 EVENT_BTB_MISSES 0xE2 PMC UMASK_BTB_MISSES 0x00 EVENT_BR_BOGUS 0xE4 PMC UMASK_BR_BOGUS 0x00 EVENT_BACLEARS 0xE6 PMC UMASK_BACLEARS 0x00 EVENT_RESOURCE_STALLS 0xA2 PMC UMASK_RESOURCE_STALLS 0x00 EVENT_PARTIAL_RAT_STALL 0xD2 PMC UMASK_PARTIAL_RAT_STALL 0x00 EVENT_SEGMENT_REG_LOADS 0x06 PMC UMASK_SEGMENT_REG_LOADS 0x00 EVENT_CPU_CLK_UNHALTED 0x79 PMC UMASK_CPU_CLK_UNHALTED 0x00 EVENT_MMX_INSTR_EXEC 0xB0 PMC UMASK_MMX_INSTR_EXEC 0x00 EVENT_MMX_SAT_INSTR_EXEC 0xB1 PMC UMASK_MMX_SAT_INSTR_EXEC 0x00 EVENT_MMX_UOPS_EXEC 0xB2 PMC UMASK_MMX_UOPS_EXEC 0x0F EVENT_MMX_INSTR_TYPE_EXEC 0xB3 PMC UMASK_MMX_INSTR_TYPE_EXEC_PACKED_MUL 0x01 UMASK_MMX_INSTR_TYPE_EXEC_PACKED_SHIFT 0x02 UMASK_MMX_INSTR_TYPE_EXEC_PACK 0x04 UMASK_MMX_INSTR_TYPE_EXEC_UNPACK 0x08 UMASK_MMX_INSTR_TYPE_EXEC_PACKED_LOGICAL 0x10 UMASK_MMX_INSTR_TYPE_EXEC_PACKED_ARITHMETIC 0x20 EVENT_FP_MMX_TRANS 0xCC PMC UMASK_FP_MMX_TRANS_MMX_FP 0x00 UMASK_FP_MMX_TRANS_FP_MMX 0x01 EVENT_MMX_ASSIST 0xCD PMC UMASK_MMX_ASSIST 0x00 EVENT_MMX_INSTR_RET 0xCE PMC UMASK_MMX_INSTR_RET 0x00 EVENT_SEG_RENAME_STALLS 0xD4 PMC UMASK_SEG_RENAME_STALLS_ES 0x02 UMASK_SEG_RENAME_STALLS_DS 0x04 UMASK_SEG_RENAME_STALLS_FS 0x08 UMASK_SEG_RENAME_STALLS_ALL 0x0F EVENT_SEG_REG_RENAMES 0xD5 PMC UMASK_SEG_REG_RENAMES 0x02 UMASK_SEG_REG_RENAMES 0x04 UMASK_SEG_REG_RENAMES 0x08 UMASK_SEG_REG_RENAMES 0x0F EVENT_RET_SEG_RENAMES 0xD6 PMC UMASK_RET_SEG_RENAMES 0x00 likwid-3.1.3/src/includes/perfmon_nehalem.h000644 137545 027340 00000026171 12426160352 021226 0ustar00unrz254unrz000000 000000 /* * ======================================================================================= * * Filename: perfmon_nehalem.h * * Description: Header File of perfmon module for Nehalem. 
* * Version: 3.1.3 * Released: 4.11.2014 * * Author: Jan Treibig (jt), jan.treibig@gmail.com * Project: likwid * * Copyright (C) 2014 Jan Treibig * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see . * * ======================================================================================= */ #include #include #include static int perfmon_numCountersNehalem = NUM_COUNTERS_NEHALEM; static int perfmon_numGroupsNehalem = NUM_GROUPS_NEHALEM; static int perfmon_numArchEventsNehalem = NUM_ARCH_EVENTS_NEHALEM; #define OFFSET_PMC 3 #define OFFSET_UPMC 7 /* Reset the performance-monitoring MSRs of the CPU given by thread->processorId. The core part below zeroes the fixed-counter control register, PERFEVTSEL0-3, PMC0-3, fixed counters 0-2, the global control and global overflow control registers and MSR_PEBS_ENABLE; the uncore registers are reset afterwards only if this CPU holds (or can acquire) the socket_lock entry of its node. */ void perfmon_init_nehalem(PerfmonThread *thread) { uint64_t flags = 0x0ULL; int cpu_id = thread->processorId; msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL); msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL); msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL); msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL); msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL); msr_write(cpu_id, MSR_PMC0, 0x0ULL); msr_write(cpu_id, MSR_PMC1, 0x0ULL); msr_write(cpu_id, MSR_PMC2, 0x0ULL); msr_write(cpu_id, MSR_PMC3, 0x0ULL); msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL); msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL); msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL); msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL); msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL); msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL); /* initialize fixed counters * FIXED 0: Instructions retired * FIXED 1: Clocks unhalted core * FIXED 2: Clocks unhalted ref 
*/ //msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL); // flags |= (1<<22); /* enable flag */ // flags |= (1<<16); /* user mode flag */ //setBit(flags,16); /* set user mode flag */ //setBit(flags,22); /* set enable flag */ /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags); msr_write(cpu_id, MSR_PERFEVTSEL1, flags); msr_write(cpu_id, MSR_PERFEVTSEL2, flags); msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/ if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) || lock_acquire( (int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id) ) { /* UNCORE FIXED 0: Uncore cycles */ msr_write(cpu_id, MSR_UNCORE_FIXED_CTR_CTRL, 0x01ULL); msr_write(cpu_id, MSR_UNCORE_FIXED_CTR_CTRL, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_FIXED_CTR0, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL0, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL1, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL2, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL3, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL4, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL5, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL6, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL7, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_PMC0, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_PMC1, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_PMC2, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_PMC3, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_PMC4, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_PMC5, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_PMC6, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_PMC7, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, 0x0ULL); msr_write(cpu_id, MSR_UNCORE_ADDR_OPCODE_MATCH, 0x0ULL); msr_write(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL); /* Preinit of PERFEVSEL registers */ //clearBit(flags,16); /* set enable flag */ /*msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL0, flags); msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL1, flags); msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL2, flags); msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL3, 
flags); msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL4, flags); msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL5, flags); msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL6, flags); msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL7, flags);*/ } } /* Program the configuration register for one counter of one measurement thread. thread_id selects the perfmon_threadData entry (its processorId is the CPU that is written), event supplies eventId/umask/cfgBits/cmask, and index selects the nehalem_counter_map entry. PMC counters get bits 16 and 22 plus (umask<<8)+eventId; UNCORE counters get bit 22 only and are written only when this CPU equals the socket_lock entry of its node; FIXED counters OR 0x2<<(index*4) into MSR_PERF_FIXED_CTR_CTRL. When cfgBits is non-zero, bits 16-31 are replaced by (cmask<<8)+cfgBits. Returns nothing. */ void perfmon_setupCounterThread_nehalem( int thread_id, PerfmonEvent* event, PerfmonCounterIndex index) { int haveLock = 0; uint64_t flags = 0x0ULL; uint64_t reg = nehalem_counter_map[index].configRegister; int cpu_id = perfmon_threadData[thread_id].processorId; uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL); if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)) { haveLock = 1; } perfmon_threadData[thread_id].counters[index].init = TRUE; if ( nehalem_counter_map[index].type == PMC ) { flags = (1<<16)|(1<<22); /* Intel with standard 8 bit event mask: [7:0] */ flags |= (event->umask<<8) + event->eventId; if (event->cfgBits != 0) /* set custom cfg and cmask */ { flags &= ~(0xFFFFU<<16); /* clear upper 16bits */ /* widen before shifting: if cmask/cfgBits are narrow integer types (declared elsewhere, not visible here) the sum is an int, and <<16 could reach the sign bit and sign-extend into the upper half of flags */ flags |= ((uint64_t)((event->cmask<<8) + event->cfgBits))<<16; } msr_write(cpu_id, reg , flags); if (perfmon_verbose) { printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n", cpu_id, LLU_CAST reg, LLU_CAST flags); } } else if ( nehalem_counter_map[index].type == UNCORE ) { if(haveLock) { flags = (1<<22); /* Intel with standard 8 bit event mask: [7:0] */ flags |= (event->umask<<8) + event->eventId; if (event->cfgBits != 0) /* set custom cfg and cmask */ { flags &= ~(0xFFFFU<<16); /* clear upper 16bits */ /* widened before the shift for the same reason as in the PMC branch */ flags |= ((uint64_t)((event->cmask<<8) + event->cfgBits))<<16; } msr_write(cpu_id, reg , flags); if (perfmon_verbose) { printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n", cpu_id, LLU_CAST reg, LLU_CAST flags); } } } else if (nehalem_counter_map[index].type == FIXED) { fixed_flags |= (0x2 <<(index*4)); msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags); } } void perfmon_startCountersThread_nehalem(int thread_id) { int haveLock = 0; uint64_t flags = 0x0ULL; uint64_t uflags = 0x0ULL; int cpu_id 
= perfmon_threadData[thread_id].processorId; msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL); if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)) { haveLock = 1; msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL); /* Fixed Uncore counter */ uflags = 0x100000000ULL; } for ( int i=0; i. * * ======================================================================================= */ #ifndef THREADS_TYPES_H #define THREADS_TYPES_H #include #include typedef struct { int globalNumberOfThreads; int numberOfThreads; int globalThreadId; int threadId; int numberOfGroups; int groupId; double time; uint64_t cycles; FILE* output; ThreadUserData data; } ThreadData; typedef struct { int numberOfThreads; int* threadIds; } ThreadGroup; typedef void (*threads_copyDataFunc)(ThreadUserData* src,ThreadUserData* dst); #endif /*THREADS_TYPES_H*/ likwid-3.1.3/test/accuracy/TESTS/FLOPS_AVX.txt000644 137545 027340 00000000423 12336605216 021131 0ustar00unrz254unrz000000 000000 REGEX_BENCH MFlops\/s:\s+([0-9]+) REGEX_PERF \|\s+DP MFlops\/s\s+\|\s+([0-9]+) TEST stream_avx RUNS 10 VARIANT 24kB 20000 VARIANT 128kB 10000 VARIANT 2MB 5000 VARIANT 1GB 50 TEST triad_avx RUNS 10 VARIANT 24kB 20000 VARIANT 128kB 10000 VARIANT 2MB 5000 VARIANT 1GB 50 likwid-3.1.3/groups/nehalem/L2.txt000644 137545 027340 00000001764 12336605216 017273 0ustar00unrz254unrz000000 000000 SHORT L2 cache bandwidth in MBytes/s EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF PMC0 L1D_REPL PMC1 L1D_M_EVICT METRICS Runtime (RDTSC) [s] time Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock CPI FIXC1/FIXC0 L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 LONG Formulas: L2 Load [MBytes/s] = 1.0E-06*L1D_REPL*64/time L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time L2 bandwidth [MBytes/s] = 
1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64 - Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the number of cacheline allocated in the L1 and the number of modified cachelines evicted from the L1. Note that this bandwidth also includes data transfers due to a write allocate load on a store miss in L1. likwid-3.1.3/src/includes/libperfctr_types.h000644 137545 027340 00000003061 12426160352 021440 0ustar00unrz254unrz000000 000000 /* * ======================================================================================= * * Filename: libperfctr_types.h * * Description: Types file for libperfctr module. * * Version: 3.1.3 * Released: 4.11.2014 * * Author: Jan Treibig (jt), jan.treibig@gmail.com * Project: likwid * * Copyright (C) 2014 Jan Treibig * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see . 
* * ======================================================================================= */ #ifndef LIBPERFCTR_H #define LIBPERFCTR_H #include typedef struct LikwidThreadResults{ bstring label; double time; TimerData startTime; uint32_t count; double StartPMcounters[NUM_PMC]; double PMcounters[NUM_PMC]; } LikwidThreadResults; typedef struct { bstring tag; double* time; uint32_t* count; double** counters; } LikwidResults; #endif /*LIBPERFCTR_H*/ likwid-3.1.3/bench/x86-64/peak_sse.ptt000644 137545 027340 00000002057 12416714770 017604 0ustar00unrz254unrz000000 000000 STREAMS 2 TYPE DOUBLE FLOPS 2 BYTES 16 INC 8 movaps FPR1, [SCALAR] sub GPR2, 4 sub STR0, 32 sub STR1, 32 mov GPR1, GPR2 neg GPR1 .align 16 1: movaps FPR2, [STR0 + GPR1 * 8 ] addpd FPR2, FPR1 mulpd FPR2, FPR1 movaps FPR6, [STR0 + GPR1 * 8 ] addpd FPR2, FPR1 mulpd FPR2, FPR1 pshufd FPR2, FPR1, 0x1 #movaps [STR1 + GPR1 * 8], FPR2 movaps FPR3, [STR0 + GPR1 * 8 + 16] addpd FPR3, FPR1 mulpd FPR3, FPR1 movaps FPR7, [STR0 + GPR1 * 8 + 16 ] addpd FPR3, FPR1 mulpd FPR3, FPR1 pshufd FPR3, FPR1, 0x1 #movaps [STR1 + GPR1 * 8 + 16], FPR3 movaps FPR4, [STR0 + GPR1 * 8 + 32] addpd FPR4, FPR1 mulpd FPR4, FPR1 movaps FPR8, [STR0 + GPR1 * 8 + 32 ] addpd FPR4, FPR1 mulpd FPR4, FPR1 pshufd FPR4, FPR1, 0x1 #movaps [STR1 + GPR1 * 8 + 32], FPR4 movaps FPR5, [STR0 + GPR1 * 8 + 48] addpd FPR5, FPR1 mulpd FPR5, FPR1 movaps FPR9, [STR0 + GPR1 * 8 + 48 ] addpd FPR5, FPR1 mulpd FPR5, FPR1 pshufd FPR5, FPR1, 0x1 #movaps [STR1 + GPR1 * 8 + 48], FPR5 add GPR1, 8 js 1b likwid-3.1.3/doc/likwid-features.1000644 137545 027340 00000004047 12424127407 017247 0ustar00unrz254unrz000000 000000 .TH LIKWID-FEATURES 1 likwid\- .SH NAME likwid-features \- print and toggle the flags of the MSR_IA32_MISC_ENABLE model specific register .SH SYNOPSIS .B likwid-features .RB [ \-vh ] .RB [ \-c .IR ] .RB [ \-s .IR ] .RB [ \-u .IR ] .SH DESCRIPTION .B likwid-features is a command line application to print the flags in the model specific register (MSR) 
MSR_IA32_MISC_ENABLE on Intel x86 processors. On Core2 processors it can be used to toggle the hardware prefetch flags. It does not work on AMD processors. For documentation of which flags are supported on which processor, refer to the Intel Software Developer's Manual Volume 3B, Table B.2. The MSRs are set individually for every core. The following hardware prefetchers can be toggled: .IP \[bu] .B HW_PREFETCHER: Hardware prefetcher. .IP \[bu] .B CL_PREFETCHER: Adjacent cache line prefetcher. .IP \[bu] .B DCU_PREFETCHER: When the DCU prefetcher detects multiple loads from the same line done within a time limit, the DCU prefetcher assumes the next line will be required. The next line is prefetched into the L1 data cache from memory or L2. .IP \[bu] .B IP_PREFETCHER: The IP prefetcher is an L1 data cache prefetcher. The IP prefetcher looks for sequential load history to determine whether to prefetch the next expected data into the L1 cache from memory or L2. .SH OPTIONS .TP .B \-\^v prints version information to standard output, then exits. .TP .B \-\^h prints a help message to standard output, then exits. .TP .B \-\^c " " set on which processor core the MSR should be read .TP .B \-\^u " " specify which prefetcher to unset .TP .B \-\^s " " specify which prefetcher to set .SH AUTHOR Written by Jan Treibig <jan.treibig@gmail.com>. .SH BUGS Report Bugs on .
.SH "SEE ALSO" likwid-perfctr(1), likwid-pin(1), likwid-powermeter(1), likwid-topology(1), likwid-setFrequencies(1) likwid-3.1.3/groups/silvermont/L2TOMEM.txt000644 137545 027340 00000001677 12412531610 020660 0ustar00unrz254unrz000000 000000 SHORT L2 to Mem load cache bandwidth in MBytes/s EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF PMC0 MEM_UOPS_RETIRED_L2_MISS_LOADS METRICS Runtime (RDTSC) [s] time Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock CPI FIXC1/FIXC0 L2 to MEM load bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time L2 to MEM load data volume [GBytes] 1.0E-09*(PMC0)*64.0 LONG Formulas: L2 to MEM load bandwidth [MBytes/s] = 1.0E-06*(MEM_UOPS_RETIRED_L2_MISS_LOADS)*64/time L2 to MEM load data volume [GBytes] = 1.0E-09*(MEM_UOPS_RETIRED_L2_MISS_LOADS)*64 - Profiling group to measure L2 to MEM load cache bandwidth. The bandwidth is computed from the number of cache lines allocated in the L2 cache. Since there is no possibility to retrieve the evicted cache lines, this group measures only the load cache bandwidth. The group also outputs the total load data volume transferred between memory and L2. likwid-3.1.3/groups/nehalemEX/CACHE.txt000644 137545 027340 00000002333 12336605216 020047 0ustar00unrz254unrz000000 000000 SHORT Data cache miss rate/ratio EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF PMC0 L1D_REPL PMC1 L1D_ALL_REF_ANY METRICS Runtime (RDTSC) [s] time Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock CPI FIXC1/FIXC0 Data cache misses PMC0 Data cache request rate PMC1/FIXC0 Data cache miss rate PMC0/FIXC0 Data cache miss ratio PMC0/PMC1 LONG Formulas: Data cache request rate = L1D_ALL_REF_ANY / INSTR_RETIRED_ANY Data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY Data cache miss ratio = L1D_REPL / L1D_ALL_REF_ANY - This group measures the locality of your data accesses with regard to the L1 Cache.
Data cache request rate tells you how data intensive your code is or how many Data accesses you have on average per instruction. The Data cache miss rate gives a measure of how often it was necessary to get cachelines from higher levels of the memory hierarchy. And finally Data cache miss ratio tells you how many of your memory references required a cacheline to be loaded from a higher level. While the Data cache miss rate might be given by your algorithm, you should try to get the Data cache miss ratio as low as possible by increasing your cache reuse. likwid-3.1.3/groups/sandybridge/L3CACHE.txt000644 137545 027340 00000002354 12421737566 020707 0ustar00unrz254unrz000000 000000 SHORT L3 cache miss rate/ratio EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS METRICS Runtime (RDTSC) [s] time Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock CPI FIXC1/FIXC0 L3 request rate (PMC0)/FIXC0 L3 miss rate PMC1/FIXC0 L3 miss ratio PMC1/PMC0 LONG Formulas: L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL / INSTR_RETIRED_ANY L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS / INSTR_RETIRED_ANY L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS / MEM_LOAD_UOPS_RETIRED_L3_ALL - This group measures the locality of your data accesses with regard to the L3 Cache. L3 request rate tells you how data intensive your code is or how many Data accesses you have on average per instruction. The L3 miss rate gives a measure of how often it was necessary to get cachelines from memory. And finally L3 miss ratio tells you how many of your memory references required a cacheline to be loaded from a higher level. While the L3 miss rate might be given by your algorithm, you should try to get the L3 miss ratio as low as possible by increasing your cache reuse. Note: This group might need to be revised!
likwid-3.1.3/groups/phi/MEM4.txt000644 137545 027340 00000000324 12336605216 016656 0ustar00unrz254unrz000000 000000 SHORT L2 Victim requests EVENTSET PMC0 L2_VICTIM_REQ_WITH_DATA METRICS Runtime (RDTSC) [s] time Victim Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time Victim Data Volume [GBytes] 1.0E-09*PMC0*64.0 LONG Bla likwid-3.1.3/src/includes/perfmon_nehalemEX.h000644 137545 027340 00000100140 12426160352 021450 0ustar00unrz254unrz000000 000000 /* * ======================================================================================= * * Filename: perfmon_nehalemEX.h * * Description: Header File of perfmon module for Nehalem EX. * * Version: 3.1.3 * Released: 4.11.2014 * * Author: Jan Treibig (jt), jan.treibig@gmail.com * Project: likwid * * Copyright (C) 2014 Jan Treibig * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see . * * ======================================================================================= */ #include #include #define NUM_COUNTERS_NEHALEMEX 7 //static int perfmon_numCountersNehalemEX = NUM_COUNTERS_NEHALEMEX; static int perfmon_numGroupsNehalemEX = NUM_GROUPS_NEHALEMEX; static int perfmon_numArchEventsNehalemEX = NUM_ARCH_EVENTS_NEHALEMEX; /* This SUCKS: There are only subtle difference between NehalemEX * and Westmere EX Uncore. Still one of them is that one field is * 1 bit shifted. Thank you Intel for this mess!!! 
Do you want * to change the register definitions for every architecture?*/ void perfmon_init_nehalemEX(PerfmonThread *thread) { uint64_t flags = 0x0ULL; int cpu_id = thread->processorId; perfmon_verbose = 1; msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL); msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL); msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL); msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL); msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL); msr_write(cpu_id, MSR_PMC0, 0x0ULL); msr_write(cpu_id, MSR_PMC1, 0x0ULL); msr_write(cpu_id, MSR_PMC2, 0x0ULL); msr_write(cpu_id, MSR_PMC3, 0x0ULL); msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL); msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL); msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL); msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL); msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL); msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL); /* initialize fixed counters * FIXED 0: Instructions retired * FIXED 1: Clocks unhalted core * FIXED 2: Clocks unhalted ref */ //msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL); /* Preinit of PERFEVSEL registers */ //flags |= (1<<22); /* enable flag */ //flags |= (1<<16); /* user mode flag */ /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags); msr_write(cpu_id, MSR_PERFEVTSEL1, flags); msr_write(cpu_id, MSR_PERFEVTSEL2, flags); msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/ /* Initialize uncore */ /* MBOX */ thread->counters[PMC7].id = 0; thread->counters[PMC8].id = 1; thread->counters[PMC9].id = 2; thread->counters[PMC10].id = 3; thread->counters[PMC11].id = 4; thread->counters[PMC12].id = 5; westmereEX_PMunits[MBOX0].ctrlRegister = MSR_M0_PMON_BOX_CTRL; westmereEX_PMunits[MBOX0].statusRegister = MSR_M0_PMON_BOX_STATUS; westmereEX_PMunits[MBOX0].ovflRegister = MSR_M0_PMON_BOX_OVF_CTRL; thread->counters[PMC13].id = 0; thread->counters[PMC14].id = 1; thread->counters[PMC15].id = 2; thread->counters[PMC16].id = 3; thread->counters[PMC17].id = 4; thread->counters[PMC18].id = 5; 
westmereEX_PMunits[MBOX1].ctrlRegister = MSR_M1_PMON_BOX_CTRL; westmereEX_PMunits[MBOX1].statusRegister = MSR_M1_PMON_BOX_STATUS; westmereEX_PMunits[MBOX1].ovflRegister = MSR_M1_PMON_BOX_OVF_CTRL; /* BBOX */ thread->counters[PMC19].id = 0; thread->counters[PMC20].id = 1; thread->counters[PMC21].id = 2; thread->counters[PMC22].id = 3; westmereEX_PMunits[BBOX0].ctrlRegister = MSR_B0_PMON_BOX_CTRL; westmereEX_PMunits[BBOX0].statusRegister = MSR_B0_PMON_BOX_STATUS; westmereEX_PMunits[BBOX0].ovflRegister = MSR_B0_PMON_BOX_OVF_CTRL; thread->counters[PMC23].id = 0; thread->counters[PMC24].id = 1; thread->counters[PMC25].id = 2; thread->counters[PMC26].id = 3; westmereEX_PMunits[BBOX1].ctrlRegister = MSR_B1_PMON_BOX_CTRL; westmereEX_PMunits[BBOX1].statusRegister = MSR_B1_PMON_BOX_STATUS; westmereEX_PMunits[BBOX1].ovflRegister = MSR_B1_PMON_BOX_OVF_CTRL; /* RBOX */ thread->counters[PMC27].id = 0; thread->counters[PMC28].id = 1; thread->counters[PMC29].id = 2; thread->counters[PMC30].id = 3; thread->counters[PMC31].id = 4; thread->counters[PMC32].id = 5; thread->counters[PMC33].id = 6; thread->counters[PMC34].id = 7; westmereEX_PMunits[RBOX0].ctrlRegister = MSR_R0_PMON_BOX_CTRL; westmereEX_PMunits[RBOX0].statusRegister = MSR_R0_PMON_BOX_STATUS; westmereEX_PMunits[RBOX0].ovflRegister = MSR_R0_PMON_BOX_OVF_CTRL; thread->counters[PMC35].id = 0; thread->counters[PMC36].id = 1; thread->counters[PMC37].id = 2; thread->counters[PMC38].id = 3; thread->counters[PMC39].id = 4; thread->counters[PMC40].id = 5; thread->counters[PMC41].id = 6; thread->counters[PMC42].id = 7; westmereEX_PMunits[RBOX1].ctrlRegister = MSR_R1_PMON_BOX_CTRL; westmereEX_PMunits[RBOX1].statusRegister = MSR_R1_PMON_BOX_STATUS; westmereEX_PMunits[RBOX1].ovflRegister = MSR_R1_PMON_BOX_OVF_CTRL; /* WBOX */ thread->counters[PMC43].id = 0; thread->counters[PMC44].id = 1; thread->counters[PMC45].id = 2; thread->counters[PMC46].id = 3; thread->counters[PMC47].id = 31; westmereEX_PMunits[WBOX].ctrlRegister = 
MSR_W_PMON_BOX_CTRL; westmereEX_PMunits[WBOX].statusRegister = MSR_W_PMON_BOX_STATUS; westmereEX_PMunits[WBOX].ovflRegister = MSR_W_PMON_BOX_OVF_CTRL; thread->counters[PMC48].id = 0; westmereEX_PMunits[UBOX].ctrlRegister = MSR_U_PMON_GLOBAL_CTRL; westmereEX_PMunits[UBOX].statusRegister = MSR_U_PMON_GLOBAL_STATUS; westmereEX_PMunits[UBOX].ovflRegister = MSR_U_PMON_GLOBAL_OVF_CTRL; /* Set IDs for all CBOXes */ for (int i=PMC49; i<=PMC88; i+= 5) { for(int j=0; j<5; j++) { thread->counters[i].id = j; } } westmereEX_PMunits[CBOX0].ctrlRegister = MSR_C0_PMON_BOX_CTRL; westmereEX_PMunits[CBOX0].statusRegister = MSR_C0_PMON_BOX_STATUS; westmereEX_PMunits[CBOX0].ovflRegister = MSR_C0_PMON_BOX_OVF_CTRL; westmereEX_PMunits[CBOX1].ctrlRegister = MSR_C1_PMON_BOX_CTRL; westmereEX_PMunits[CBOX1].statusRegister = MSR_C1_PMON_BOX_STATUS; westmereEX_PMunits[CBOX1].ovflRegister = MSR_C1_PMON_BOX_OVF_CTRL; westmereEX_PMunits[CBOX2].ctrlRegister = MSR_C2_PMON_BOX_CTRL; westmereEX_PMunits[CBOX2].statusRegister = MSR_C2_PMON_BOX_STATUS; westmereEX_PMunits[CBOX2].ovflRegister = MSR_C2_PMON_BOX_OVF_CTRL; westmereEX_PMunits[CBOX3].ctrlRegister = MSR_C3_PMON_BOX_CTRL; westmereEX_PMunits[CBOX3].statusRegister = MSR_C3_PMON_BOX_STATUS; westmereEX_PMunits[CBOX3].ovflRegister = MSR_C3_PMON_BOX_OVF_CTRL; westmereEX_PMunits[CBOX4].ctrlRegister = MSR_C4_PMON_BOX_CTRL; westmereEX_PMunits[CBOX4].statusRegister = MSR_C4_PMON_BOX_STATUS; westmereEX_PMunits[CBOX4].ovflRegister = MSR_C4_PMON_BOX_OVF_CTRL; westmereEX_PMunits[CBOX5].ctrlRegister = MSR_C5_PMON_BOX_CTRL; westmereEX_PMunits[CBOX5].statusRegister = MSR_C5_PMON_BOX_STATUS; westmereEX_PMunits[CBOX5].ovflRegister = MSR_C5_PMON_BOX_OVF_CTRL; westmereEX_PMunits[CBOX6].ctrlRegister = MSR_C6_PMON_BOX_CTRL; westmereEX_PMunits[CBOX6].statusRegister = MSR_C6_PMON_BOX_STATUS; westmereEX_PMunits[CBOX6].ovflRegister = MSR_C6_PMON_BOX_OVF_CTRL; westmereEX_PMunits[CBOX7].ctrlRegister = MSR_C7_PMON_BOX_CTRL; westmereEX_PMunits[CBOX7].statusRegister = 
MSR_C7_PMON_BOX_STATUS; westmereEX_PMunits[CBOX7].ovflRegister = MSR_C7_PMON_BOX_OVF_CTRL; thread->counters[PMC99].id = 0; thread->counters[PMC100].id = 1; thread->counters[PMC101].id = 2; thread->counters[PMC102].id = 3; westmereEX_PMunits[SBOX0].ctrlRegister = MSR_S0_PMON_BOX_CTRL; westmereEX_PMunits[SBOX0].statusRegister = MSR_S0_PMON_BOX_STATUS; westmereEX_PMunits[SBOX0].ovflRegister = MSR_S0_PMON_BOX_OVF_CTRL; thread->counters[PMC103].id = 0; thread->counters[PMC104].id = 1; thread->counters[PMC105].id = 2; thread->counters[PMC106].id = 3; westmereEX_PMunits[SBOX1].ctrlRegister = MSR_S1_PMON_BOX_CTRL; westmereEX_PMunits[SBOX1].statusRegister = MSR_S1_PMON_BOX_STATUS; westmereEX_PMunits[SBOX1].ovflRegister = MSR_S1_PMON_BOX_OVF_CTRL; if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) || lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id)) { msr_write(cpu_id, MSR_W_PMON_BOX_CTRL, 0x0ULL); msr_write(cpu_id, MSR_W_PMON_EVNT_SEL0, 0x0ULL); msr_write(cpu_id, MSR_W_PMON_EVNT_SEL1, 0x0ULL); msr_write(cpu_id, MSR_W_PMON_EVNT_SEL2, 0x0ULL); msr_write(cpu_id, MSR_W_PMON_EVNT_SEL3, 0x0ULL); msr_write(cpu_id, MSR_W_PMON_FIXED_CTR, 0x0ULL); msr_write(cpu_id, MSR_M0_PMON_BOX_CTRL, 0x0ULL); msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL0, 0x0ULL); msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL1, 0x0ULL); msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL2, 0x0ULL); msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL3, 0x0ULL); msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL4, 0x0ULL); msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL5, 0x0ULL); msr_write(cpu_id, MSR_M1_PMON_BOX_CTRL, 0x0ULL); msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL0, 0x0ULL); msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL1, 0x0ULL); msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL2, 0x0ULL); msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL3, 0x0ULL); msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL4, 0x0ULL); msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL5, 0x0ULL); msr_write(cpu_id, MSR_B0_PMON_BOX_CTRL, 0x0ULL); msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL0, 0x0ULL); 
msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL1, 0x0ULL); msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL2, 0x0ULL); msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL3, 0x0ULL); msr_write(cpu_id, MSR_B1_PMON_BOX_CTRL, 0x0ULL); msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL0, 0x0ULL); msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL1, 0x0ULL); msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL2, 0x0ULL); msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL3, 0x0ULL); msr_write(cpu_id, MSR_R0_PMON_BOX_CTRL, 0x0ULL); msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL0, 0x0ULL); msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL1, 0x0ULL); msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL2, 0x0ULL); msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL3, 0x0ULL); msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL4, 0x0ULL); msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL5, 0x0ULL); msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL6, 0x0ULL); msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL7, 0x0ULL); msr_write(cpu_id, MSR_R1_PMON_BOX_CTRL, 0x0ULL); msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL8, 0x0ULL); msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL9, 0x0ULL); msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL10, 0x0ULL); msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL11, 0x0ULL); msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL12, 0x0ULL); msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL13, 0x0ULL); msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL14, 0x0ULL); msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL15, 0x0ULL); msr_write(cpu_id, MSR_U_PMON_GLOBAL_EVNT_SEL, 0x0ULL); msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL0, 0x0ULL); msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL1, 0x0ULL); msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL2, 0x0ULL); msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL3, 0x0ULL); msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL4, 0x0ULL); msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL0, 0x0ULL); msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL1, 0x0ULL); msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL2, 0x0ULL); msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL3, 0x0ULL); msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL4, 0x0ULL); msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL0, 0x0ULL); msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL1, 0x0ULL); msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL2, 
0x0ULL);
        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL3, 0x0ULL);
        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL4, 0x0ULL);
        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL0, 0x0ULL);
        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL1, 0x0ULL);
        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL2, 0x0ULL);
        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL3, 0x0ULL);
        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL4, 0x0ULL);
        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL0, 0x0ULL);
        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL1, 0x0ULL);
        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL2, 0x0ULL);
        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL3, 0x0ULL);
        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL4, 0x0ULL);
        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL0, 0x0ULL);
        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL1, 0x0ULL);
        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL2, 0x0ULL);
        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL3, 0x0ULL);
        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL4, 0x0ULL);
        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL0, 0x0ULL);
        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL1, 0x0ULL);
        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL2, 0x0ULL);
        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL3, 0x0ULL);
        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL4, 0x0ULL);
        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL0, 0x0ULL);
        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL1, 0x0ULL);
        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL2, 0x0ULL);
        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL3, 0x0ULL);
        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL4, 0x0ULL);
        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL0, 0x0ULL);
        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL1, 0x0ULL);
        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL2, 0x0ULL);
        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL3, 0x0ULL);
        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL0, 0x0ULL);
        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL1, 0x0ULL);
        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL2, 0x0ULL);
        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL3, 0x0ULL);
        flags = 0x0UL;
        flags |= (1<<29); /* reset all */
        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, flags );
    }
}

/* MBOX macros */
/*
 * MBOX_GATE_NEHEX(NUM): compute the control value for a counter of memory
 * box NUM and program the auxiliary MBOX registers the event needs.
 *
 * This is a statement macro, not an expression. At the expansion site it
 * expects these names in scope:
 *   flags   overwritten; starts at 0x41 and gets the event id OR-ed in
 *   event   PerfmonEvent*; cfgBits selects the case, eventId/umask/cmask
 *           supply the field values
 *   cpu_id  CPU whose MSRs are written
 * NUM is pasted into the register names (MSR_M<NUM>_PMON_...).
 *
 * The macro does not write `flags` to any register itself; the caller does
 * that afterwards. There is no default case: an unlisted cfgBits value
 * leaves flags at 0x41 and writes no auxiliary register.
 */
#define MBOX_GATE_NEHEX(NUM) \
flags = 0x41ULL; \
switch (event->cfgBits) \
{ \
    case 0x00: /* primary Event */ \
        flags |= (event->eventId<<9); \
        break; \
    case 0x01: /* secondary Events */ \
        /* TODO fvid index is missing defaults to 0 */ \
        flags |= (1<<7); /* toggle flag mode */ \
        flags |= (event->eventId<<19); \
        /* only event ids 0x00, 0x01, 0x05 and 0x06 program an extra register */ \
        switch (event->eventId) \
        { \
            case 0x00: /* CYCLES_DSP_FILL: DSP */ \
                { \
                    uint64_t dsp_flags = 0x0ULL; \
                    dsp_flags |= (event->umask<<7); \
                    msr_write(cpu_id, MSR_M##NUM##_PMON_DSP, dsp_flags); \
                } \
                break; \
            case 0x01: /* CYCLES_SCHED_MODE: ISS */ \
                { \
                    uint32_t iss_flags = 0x0UL; \
                    iss_flags |= (event->umask<<4); \
                    msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags); \
                } \
                break; \
            case 0x05: /* CYCLES_PGT_STATE: PGT */ \
                { \
                    uint32_t pgt_flags = 0x0UL; \
                    pgt_flags |= (event->umask<<6); \
                    msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags); \
                } \
                break; \
            case 0x06: /* BCMD_SCHEDQ_OCCUPANCY: MAP */ \
                { \
                    uint32_t map_flags = 0x0UL; \
                    map_flags |= (event->umask<<6); \
                    msr_write(cpu_id, MSR_M##NUM##_PMON_MAP, map_flags); \
                } \
                break; \
        } \
        break; \
    case 0x02: /* DRAM_CMD: PLD/ISS */ \
        flags |= (event->eventId<<9); \
        { \
            uint32_t pld_flags = 0x0UL; \
            uint32_t iss_flags = 0x0UL; \
            pld_flags |= (event->umask<<8); \
            if (event->cmask != 0) \
            { \
                iss_flags |= (event->cmask<<7); \
                pld_flags |= 1; /* toggle cmd flag */ \
            } \
            /* both registers are written even when cmask is 0 (ISS then gets 0) */ \
            msr_write(cpu_id, MSR_M##NUM##_PMON_PLD, pld_flags); \
            msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags); \
        } \
        break; \
    case 0x03: /* DSP_FILL: DSP */ \
        flags |= (event->eventId<<9); \
        { \
            uint64_t dsp_flags = 0x0ULL; \
            dsp_flags |= (event->umask<<7); \
            msr_write(cpu_id, MSR_M##NUM##_PMON_DSP, dsp_flags); \
        } \
        break; \
    case 0x04: /* DRAM_MISC: PLD */ \
        flags |= (event->eventId<<9); \
        { \
            uint64_t pld_flags = 0x0ULL; \
            /* cmask picks the bit position the umask is shifted to; */ \
            /* values other than 0..3 write PLD as 0 */ \
            switch (event->cmask) \
            { \
                case 0x0: \
                    pld_flags |= (1<<16); \
                    pld_flags |= (event->umask<<19); \
                    break; \
                case 0x1: \
                    pld_flags |= (event->umask<<18); \
                    break; \
                case 0x2: \
                    pld_flags |= (event->umask<<17); \
                    break; \
                case 0x3: \
                    pld_flags |= (event->umask<<7); \
                    break; \
            } \
            msr_write(cpu_id, MSR_M##NUM##_PMON_PLD, pld_flags); \
        } \
        break; \
    case 0x05: /* FRM_TYPE: ISS */ \
        flags |= (event->eventId<<9); \
        { \
            uint32_t iss_flags = 0x0UL; \
            iss_flags |= event->umask; \
            msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags); \
        } \
        break; \
    /* FVC_EV0..FVC_EV3 differ only in the umask shift (11, 14, 17, 20); */ \
    /* all four write the ZDP register, cmask goes to bit 5 when umask is */ \
    /* 0x5 and to bit 8 otherwise */ \
    case 0x06: /* FVC_EV0: FVC */ \
        flags |= (event->eventId<<9); \
        { \
            uint32_t fvc_flags = 0x0UL; \
            fvc_flags |= (event->umask<<11); \
            if (event->umask == 0x5) \
            { \
                fvc_flags |= (event->cmask<<5); \
            } \
            else \
            { \
                fvc_flags |= (event->cmask<<8); \
            } \
            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags); \
            VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV0) \
        } \
        break; \
    case 0x07: /* FVC_EV1: FVC */ \
        flags |= (event->eventId<<9); \
        { \
            uint32_t fvc_flags = 0x0UL; \
            fvc_flags |= (event->umask<<14); \
            if (event->umask == 0x5) \
            { \
                fvc_flags |= (event->cmask<<5); \
            } \
            else \
            { \
                fvc_flags |= (event->cmask<<8); \
            } \
            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags); \
            VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV1) \
        } \
        break; \
    case 0x08: /* FVC_EV2: FVC */ \
        flags |= (event->eventId<<9); \
        { \
            uint32_t fvc_flags = 0x0UL; \
            fvc_flags |= (event->umask<<17); \
            if (event->umask == 0x5) \
            { \
                fvc_flags |= (event->cmask<<5); \
            } \
            else \
            { \
                fvc_flags |= (event->cmask<<8); \
            } \
            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags); \
            VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV2) \
        } \
        break; \
    case 0x09: /* FVC_EV3: FVC(ZDP) */ \
        flags |= (event->eventId<<9); \
        { \
            uint32_t fvc_flags = 0x0UL; \
            fvc_flags |= (event->umask<<20); \
            if (event->umask == 0x5) \
            { \
                fvc_flags |= (event->cmask<<5); \
            } \
            else \
            { \
                fvc_flags |= (event->cmask<<8); \
            } \
            /* unlike FVC_EV0..2 this case has no VERBOSEPRINTREG */ \
            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags); \
        } \
        break; \
    case 0x0A: /* ISS_SCHED: ISS */ \
        flags |= (event->eventId<<9); \
        { \
            uint32_t iss_flags = 0x0UL; \
            iss_flags |= (event->umask<<10); \
            msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags); \
        } \
        break; \
    case 0x0B: /* PGT_PAGE_EV: PGT */ \
        flags |= (event->eventId<<9); \
        { \
            uint32_t pgt_flags = 0x0UL; \
            pgt_flags |= event->umask; \
            msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags); \
        } \
        break; \
    case 0x0C: /* PGT_PAGE_EV2: PGT */ \
        flags |= (event->eventId<<9); \
        { \
            uint32_t pgt_flags = 0x0UL; \
            pgt_flags |= (event->umask<<11); \
            msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags); \
        } \
        break; \
    case 0x0D: /* THERM_TRP_DN: THR */ \
        flags |= (event->eventId<<9); \
        { \
            uint32_t thr_flags = 0x0UL; \
            thr_flags |= (1<<3); \
            thr_flags |= (event->umask<<9); \
            /* NOTE(review): the case is labelled THR but the value is written */ \
            /* to the PGT register - confirm which register is intended */ \
            msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, thr_flags); \
        } \
        break; \
}

/*
 * Configure one counter for the CPU that belongs to thread_id.
 *
 *   thread_id  index into perfmon_threadData
 *   event      event description (eventId, umask, cmask, cfgBits are read)
 *   index      index into counter_map
 *
 * The counter is always marked as initialised and MSR_PERF_FIXED_CTR_CTRL
 * is always read, whatever the counter type. PMC and FIXED counters are
 * programmed unconditionally. Every box type (MBOX, BBOX, RBOX, WBOX, UBOX,
 * CBOX, SBOX) is programmed only when socket_lock for this CPU's node holds
 * this cpu_id; otherwise the call does nothing further for that counter.
 *
 * RBOX_GATE and VERBOSEPRINTREG are macros defined outside this chunk.
 */
void perfmon_setupCounterThread_nehalemEX(
        int thread_id,
        PerfmonEvent* event,
        PerfmonCounterIndex index)
{
    uint64_t flags = 0x0ULL;
    int haveLock = 0;
    uint64_t reg = counter_map[index].configRegister;
    int cpu_id = perfmon_threadData[thread_id].processorId;
    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
    perfmon_threadData[thread_id].counters[index].init = TRUE;

    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
    {
        haveLock = 1;
    }

    switch (counter_map[index].type)
    {
        case PMC:
            flags = (1<<22)|(1<<16);

            /* Intel with standard 8 bit event mask: [7:0] */
            flags |= (event->umask<<8) + event->eventId;

            if (event->cfgBits != 0) /* set custom cfg and cmask */
            {
                /* this also clears bits 16 and 22 set above; cfgBits and */
                /* cmask then supply everything from bit 16 upwards */
                flags &= ~(0xFFFFU<<16); /* clear upper 16bits */
                flags |= ((event->cmask<<8) + event->cfgBits)<<16;
            }

            msr_write(cpu_id, reg , flags);
            VERBOSEPRINTREG(cpu_id, reg, flags, PMC_EV_SEL)
            break;

        case FIXED:
            /* OR 0x2 into the 4-bit field selected by index */
            fixed_flags |= (0x2<<(index*4));
            msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
            break;

        case MBOX0:
            if (haveLock)
            {
                /* the macro sets flags and writes the auxiliary registers */
                MBOX_GATE_NEHEX(0);
                msr_write(cpu_id, reg , flags);
                VERBOSEPRINTREG(cpu_id, reg, flags, MBOX0_CTRL)
            }
            break;

        case MBOX1:
            if (haveLock)
            {
                MBOX_GATE_NEHEX(1);
                msr_write(cpu_id, reg , flags);
                VERBOSEPRINTREG(cpu_id, reg, flags, MBOX1_CTRL)
            }
            break;

        case BBOX0:
        case BBOX1:
            if (haveLock)
            {
                flags = 0x1ULL; /* set enable bit */
                flags |= (event->eventId<<1);
                msr_write(cpu_id, reg , flags);
                VERBOSEPRINTREG(cpu_id, reg, flags, BBOX_CTRL)
            }
            break;

        case RBOX0:
            if (haveLock)
            {
                RBOX_GATE(0);
                msr_write(cpu_id, reg , flags);
                VERBOSEPRINTREG(cpu_id, reg, flags, RBOX0_CTRL)
            }
            break;

        case RBOX1:
            if (haveLock)
            {
                RBOX_GATE(1);
                msr_write(cpu_id, reg , flags);
                VERBOSEPRINTREG(cpu_id, reg, flags, RBOX1_CTRL)
            }
            break;

        case WBOX:
            if (haveLock)
            {
                if (event->eventId == 0xFF) /* Fixed Counter */
                {
                    flags = 0x1ULL; /* set enable bit */
                }
                else
                {
                    /* flags is still 0 from its initialiser here */
                    flags |= (1<<22); /* set enable bit */
                    flags |= (event->umask<<8) + event->eventId;
                }
                msr_write(cpu_id, reg , flags);
                VERBOSEPRINTREG(cpu_id, reg, flags, WBOX_CTRL)
            }
            break;

        case UBOX:
            if (haveLock)
            {
                flags = 0x0ULL;
                flags |= (1<<22);
                flags |= event->eventId;
                /* printed to stderr unconditionally, not gated by verbosity */
                fprintf(stderr, "Setup UBOX with value 0x%llx in register 0x%llx, event 0x%x \n", LLU_CAST flags, LLU_CAST reg,event->eventId);
                msr_write(cpu_id, reg , flags);
                VERBOSEPRINTREG(cpu_id, reg, flags, UBOX_CTRL)
            }
            break;

        case CBOX0:
        case CBOX1:
        case CBOX2:
        case CBOX3:
        case CBOX4:
        case CBOX5:
        case CBOX6:
        case CBOX7:
            if (haveLock)
            {
                flags = 0x0ULL;
                flags |= (1<<22);
                flags |= (event->umask<<8) + event->eventId;
                /* printed to stderr unconditionally, not gated by verbosity */
                fprintf(stderr, "Setup CBOX with value 0x%llx in register 0x%llx, event 0x%x umask 0x%x \n", LLU_CAST flags, LLU_CAST reg,event->eventId, event->umask);
                msr_write(cpu_id, reg , flags);
                VERBOSEPRINTREG(cpu_id, reg, flags, CBOX_CTRL)
            }
            break;

        case SBOX0:
        case SBOX1:
            if (haveLock)
            {
                flags = 0x0ULL;
                flags |= (1<<22);
                flags |= (event->umask<<8);
                flags |= (event->eventId);
                msr_write(cpu_id, reg , flags);
                VERBOSEPRINTREG(cpu_id, reg, flags, SBOX_CTRL)
            }
            break;

        default:
            /* should never be reached */
            break;
    }
}

/* Actions for Performance Monitoring Session:
 *
 * Core Counters (counter is always enabled in PERVSEL register):
 * 1) Disable counters in global ctrl Register MSR_PERF_GLOBAL_CTRL
 * 2) Zero according counter registers
 * 3) Set enable bit in global register flag
 * 4) Write global register flag
 *
 * Uncore Counters (only one core per socket):
 * 1) Set reset flag in global U Box control register
 * 2) Zero according counter registers
 * 3) Set enable bit
in Box control register * 4) Write according uncore Box ctrl register * 3) Set enable bit in global U Box control register * */ void perfmon_startCountersThread_nehalemEX(int thread_id) { int haveLock = 0; uint64_t flags = 0x0ULL; uint32_t uflags[NUM_UNITS]; int enable_ubox = 0; int cpu_id = perfmon_threadData[thread_id].processorId; msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL); if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) { uint32_t ubflags = 0x0UL; ubflags |= (1<<29); /* reset all */ haveLock = 1; // msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags ); // VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags, UBOX_GLOBAL_CTRL) } for ( int i=0; i UNCORE) { if(haveLock) { msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL); uflags[westmereEX_counter_map[i].type] |= (1<<(perfmon_threadData[thread_id].counters[i].id)); /* enable uncore counter */ if (westmereEX_counter_map[i].type == UBOX) { enable_ubox = 1; } } } } } VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, GLOBAL_CTRL); if (haveLock) { for ( int i=0; i UNCORE) { if(haveLock) { perfmon_threadData[thread_id].counters[i].counterData = msr_read(cpu_id, westmereEX_counter_map[i].counterRegister); VERBOSEPRINTREG(cpu_id, westmereEX_counter_map[i].counterRegister, LLU_CAST perfmon_threadData[thread_id].counters[i].counterData, READ_UNCORE); } } else { perfmon_threadData[thread_id].counters[i].counterData = msr_read(cpu_id, westmereEX_counter_map[i].counterRegister); VERBOSEPRINTREG(cpu_id, westmereEX_counter_map[i].counterRegister, LLU_CAST perfmon_threadData[thread_id].counters[i].counterData, READ_CORE); } } } #if 0 flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS); printf ("Status: 0x%llX \n", LLU_CAST flags); if((flags & 0x3) || (flags & (0x3ULL<<32)) ) { printf ("Overflow occured \n"); } #endif } void perfmon_readCountersThread_nehalemEX(int thread_id) { int haveLock = 0; int cpu_id = perfmon_threadData[thread_id].processorId; if 
(socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) { haveLock = 1; } for ( int i=0; i UNCORE) { if(haveLock) { perfmon_threadData[thread_id].counters[i].counterData = msr_read(cpu_id, westmereEX_counter_map[i].counterRegister); } } else { perfmon_threadData[thread_id].counters[i].counterData = msr_read(cpu_id, westmereEX_counter_map[i].counterRegister); } } } } likwid-3.1.3/perl/templates/group.tt000644 137545 027340 00000014703 12377604752 020000 0ustar00unrz254unrz000000 000000 /* GENERATED FILE: DO NOTE EDIT */ #define NUM_GROUPS_[% arch FILTER upper %] [% numGroups %] [% FOREACH group IN groups %] static const char* group_names_[% arch FILTER ucfirst %]_[% group.name %] [] = {[% FOREACH metric IN group.metrics %] "[% metric.label %]", [% END %] NULL}; [% END %] static PerfmonGroupMap [% arch %]_group_map[NUM_GROUPS_[% arch FILTER upper %]] = { [% FOREACH group IN groups %] {"[% group.name %]",[% group.name %],[% group.isUncore %],"[% group.shortHelp %]","[% group.eventSet %]", 0 [% FOREACH metric IN group.metrics %] +1 [% END %], group_names_[% arch FILTER ucfirst %]_[% group.name %] }, [% END %] }; void perfmon_getDerivedCounterValues[% arch FILTER ucfirst %](PerfmonGroup group, float * values, float * out_max, float * out_min){ double time = rdtscTime; double inverseClock = 1.0 /(double) timer_getCpuClock(); values[0] = time; out_min[0] = time; out_max[0] = time; switch ( group ) { [% FOREACH group IN groups %] case [% group.name %]:{ int threadId; int counter = 0; double sum,min,max; [% FOREACH metric IN group.metrics %] sum = 0; min = 1e300; max = 0; for(threadId=0; threadId < perfmon_numThreads; threadId++) { double cur = [% metric.rule %]; cur = isnan(cur) ? 0.0 : cur; sum += cur; max = max > cur ? max : cur; min = min < cur ? 
min : cur; } values[counter] = (float) sum / perfmon_numThreads; out_min[counter] = (float) min; out_max[counter] = (float) max; counter++; [% END %] return; } [% END %] default: fprintf (stderr, "perfmon_getDerivedCounterValues[% arch %]: Unknown group! Exiting!\n" ); exit (EXIT_FAILURE); break; } } void perfmon_printDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup groupId) { int threadId; double time = rdtscTime; double inverseClock = 1.0 /(double) timer_getCpuClock(); PerfmonResultTable tableData; int numRows; int numColumns = perfmon_numThreads; bstring label; bstrList* fc; double** stat; double tmpValue; uint64_t cpi_instr = 0; uint64_t cpi_cyc = 0; int cpi_index = 0; switch ( groupId ) { [% FOREACH group IN groups %] case [% group.name %]: numRows = [% group.numRows %]; stat = (double**) malloc(numRows * sizeof(double*)); for (int i=0; i 1) { initStatisticTable(&tableData, fc, numRows); for (int i=0; i. * * ======================================================================================= */ /* ##### HEADER FILE INCLUDES ######################################### */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* ##### EXPORTED VARIABLES ########################################### */ CpuInfo cpuid_info; CpuTopology cpuid_topology; /* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */ static int largest_function = 0; /* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */ /* this was taken from the linux kernel */ #define CPUID \ __asm__ volatile ("cpuid" \ : "=a" (eax), \ "=b" (ebx), \ "=c" (ecx), \ "=d" (edx) \ : "0" (eax), "2" (ecx)) /* ##### VARIABLES - LOCAL TO THIS SOURCE FILE ###################### */ static char* pentium_m_b_str = "Intel Pentium M Banias processor"; static char* pentium_m_d_str = "Intel Pentium M Dothan processor"; static char* core_duo_str = "Intel Core Duo processor"; static char* core_2a_str = "Intel 
Core 2 65nm processor"; static char* core_2b_str = "Intel Core 2 45nm processor"; static char* atom_45_str = "Intel Atom 45nm processor"; static char* atom_32_str = "Intel Atom 32nm processor"; static char* atom_22_str = "Intel Atom 22nm processor"; static char* atom_silvermont_str = "Intel Atom (Silvermont) 22nm processor"; static char* atom_saltwell_str = "Intel Atom (Saltwell) 32nm processor"; static char* nehalem_bloom_str = "Intel Core Bloomfield processor"; static char* nehalem_lynn_str = "Intel Core Lynnfield processor"; static char* nehalem_west_str = "Intel Core Westmere processor"; static char* sandybridge_str = "Intel Core SandyBridge processor"; static char* ivybridge_str = "Intel Core IvyBridge processor"; static char* ivybridge_ep_str = "Intel Core IvyBridge EP processor"; static char* sandybridge_ep_str = "Intel Core SandyBridge EP processor"; static char* haswell_str = "Intel Core Haswell processor"; static char* haswell_ex_str = "Intel Core Haswell EX processor"; static char* nehalem_ex_str = "Intel Nehalem EX processor"; static char* westmere_ex_str = "Intel Westmere EX processor"; static char* xeon_mp_string = "Intel Xeon MP processor"; static char* xeon_phi_string = "Intel Xeon Phi Coprocessor"; static char* barcelona_str = "AMD Barcelona processor"; static char* shanghai_str = "AMD Shanghai processor"; static char* istanbul_str = "AMD Istanbul processor"; static char* magnycours_str = "AMD Magny Cours processor"; static char* interlagos_str = "AMD Interlagos processor"; static char* kabini_str = "AMD Family 16 model - Kabini processor"; static char* opteron_sc_str = "AMD Opteron single core 130nm processor"; static char* opteron_dc_e_str = "AMD Opteron Dual Core Rev E 90nm processor"; static char* opteron_dc_f_str = "AMD Opteron Dual Core Rev F 90nm processor"; static char* athlon64_str = "AMD Athlon64 X2 (AM2) Rev F 90nm processor"; static char* athlon64_f_str = "AMD Athlon64 (AM2) Rev F 90nm processor"; static char* athlon64_X2_g_str = "AMD 
/*
 * Translate a 4-bit AMD associativity code into a number of ways.
 *
 * cpuid_initCacheTopology() feeds this function the 4-bit field it cuts
 * out of the CPUID leaf 0x80000006 result registers (L2 and L3 info).
 *
 * Returns the way count for the known codes and 0 for everything else.
 * Codes 0x0 and 0xF are mapped to 0 as well; per AMD's CPUID
 * specification these presumably mean "disabled" and "fully
 * associative", neither of which has a finite way count -- confirm
 * against the spec before relying on the distinction.
 */
static uint32_t amdGetAssociativity(uint32_t flag)
{
    /* Indexed by the raw code; unassigned codes stay 0. */
    static const uint32_t waysOfCode[16] = {
        /* 0x0 */ 0,  /* 0x1 */ 1,  /* 0x2 */ 2,   /* 0x3 */ 0,
        /* 0x4 */ 4,  /* 0x5 */ 0,  /* 0x6 */ 8,   /* 0x7 */ 0,
        /* 0x8 */ 16, /* 0x9 */ 0,  /* 0xA */ 32,  /* 0xB */ 48,
        /* 0xC */ 64, /* 0xD */ 96, /* 0xE */ 128, /* 0xF */ 0
    };

    if (flag > 0xF)
    {
        return 0;
    }

    return waysOfCode[flag];
}
/*
 * Count the logical processors listed in /proc/cpuinfo.
 *
 * Every line that starts with the word "processor" is counted as one
 * hardware thread. cpuid_init() uses the result to cross-check the value
 * reported by sysconf(_SC_NPROCESSORS_CONF).
 *
 * Returns the number of matching lines, or 0 if /proc/cpuinfo cannot be
 * opened.
 */
static int recheck_numHWThreads(void)
{
    int cpucount = 0;
    char line[1024];
    FILE* fp = fopen("/proc/cpuinfo","r");

    if (fp == NULL)
    {
        return 0;
    }

    while (fgets(line, sizeof(line), fp) != NULL)
    {
        if (strncmp(line, "processor", 9) == 0)
        {
            cpucount++;
        }
    }

    /* The stream used to be left open, leaking one descriptor per call. */
    fclose(fp);

    return cpucount;
}
*/ if (init) return EXIT_SUCCESS; init =1; eax = 0x00; CPUID; largest_function = eax; if (ebx == 0x68747541U) { isIntel = 0; } eax = 0x01; CPUID; cpuid_info.family = ((eax>>8)&0xFU) + ((eax>>20)&0xFFU); cpuid_info.model = (((eax>>16)&0xFU)<<4) + ((eax>>4)&0xFU); cpuid_info.stepping = (eax&0xFU); switch ( cpuid_info.family ) { case P6_FAMILY: switch ( cpuid_info.model ) { case PENTIUM_M_BANIAS: cpuid_info.name = pentium_m_b_str; break; case PENTIUM_M_DOTHAN: cpuid_info.name = pentium_m_d_str; break; case CORE_DUO: cpuid_info.name = core_duo_str; break; case CORE2_65: cpuid_info.name = core_2a_str; break; case CORE2_45: cpuid_info.name = core_2b_str; break; case NEHALEM_BLOOMFIELD: cpuid_info.name = nehalem_bloom_str; break; case NEHALEM_LYNNFIELD: cpuid_info.name = nehalem_lynn_str; break; case NEHALEM_WESTMERE_M: case NEHALEM_WESTMERE: cpuid_info.name = nehalem_west_str; break; case SANDYBRIDGE: cpuid_info.name = sandybridge_str; break; case SANDYBRIDGE_EP: cpuid_info.name = sandybridge_ep_str; break; case IVYBRIDGE: cpuid_info.name = ivybridge_str; break; case IVYBRIDGE_EP: cpuid_info.name = ivybridge_ep_str; break; case HASWELL: case HASWELL_M1: case HASWELL_M2: cpuid_info.name = haswell_str; break; case HASWELL_EX: cpuid_info.name = haswell_ex_str; break; case NEHALEM_EX: cpuid_info.name = nehalem_ex_str; break; case WESTMERE_EX: cpuid_info.name = westmere_ex_str; break; case XEON_MP: cpuid_info.name = xeon_mp_string; break; case ATOM_45: case ATOM: cpuid_info.name = atom_45_str; break; case ATOM_32: cpuid_info.name = atom_32_str; break; case ATOM_22: cpuid_info.name = atom_22_str; break; case ATOM_SILVERMONT_C: case ATOM_SILVERMONT_E: case ATOM_SILVERMONT_F1: case ATOM_SILVERMONT_F2: case ATOM_SILVERMONT_F3: cpuid_info.name = atom_silvermont_str; break; default: cpuid_info.name = unknown_intel_str; break; } break; case MIC_FAMILY: switch ( cpuid_info.model ) { case XEON_PHI: cpuid_info.name = xeon_phi_string; break; } break; case K8_FAMILY: if (isIntel) { 
ERROR_PLAIN_PRINT(Netburst architecture is not supported); } switch ( cpuid_info.model ) { case OPTERON_DC_E: cpuid_info.name = opteron_dc_e_str; break; case OPTERON_DC_F: cpuid_info.name = opteron_dc_f_str; break; case ATHLON64_X2: case ATHLON64_X2_F: cpuid_info.name = athlon64_str; break; case ATHLON64_F1: case ATHLON64_F2: cpuid_info.name = athlon64_f_str; break; case ATHLON64_X2_G: cpuid_info.name = athlon64_X2_g_str; break; case ATHLON64_G1: case ATHLON64_G2: cpuid_info.name = athlon64_g_str; break; case OPTERON_SC_1MB: cpuid_info.name = opteron_sc_str; break; default: cpuid_info.name = amd_k8_str; break; } break; case K10_FAMILY: switch ( cpuid_info.model ) { case BARCELONA: cpuid_info.name = barcelona_str; break; case SHANGHAI: cpuid_info.name = shanghai_str; break; case ISTANBUL: cpuid_info.name = istanbul_str; break; case MAGNYCOURS: cpuid_info.name = magnycours_str; break; default: cpuid_info.name = unknown_amd_str; break; } break; case K15_FAMILY: cpuid_info.name = interlagos_str; break; case K16_FAMILY: cpuid_info.name = kabini_str; break; default: return EXIT_FAILURE; break; } cpuid_info.featureFlags = 0; cpuid_info.features = (char*) malloc(200*sizeof(char)); cpuid_info.features[0] = 0; if (ecx & (1<<0)) { strcat(cpuid_info.features, "SSE3 "); cpuid_info.featureFlags |= (1<>8)&0xFFU); cpuid_info.perf_width_ctr = ((eax>>16)&0xFFU); cpuid_info.perf_num_fixed_ctr = (edx&0xFU); eax = 0x06; CPUID; if (eax & (1<<1)) { cpuid_info.turbo = 1; } else { cpuid_info.turbo = 0; } } FILE *file; char *filepath = TOSTRING(CFGFILE); if ((file = fopen(filepath, "rb")) != NULL) { //printf("Read config from file\n"); initTopology(file); fclose(file); } else { cpuid_topology.numHWThreads = sysconf(_SC_NPROCESSORS_CONF); if (recheck_numHWThreads() != cpuid_topology.numHWThreads) { cpuid_topology.numHWThreads = recheck_numHWThreads(); } cpu_set_t cpuSet; CPU_ZERO(&cpuSet); sched_getaffinity(0,sizeof(cpu_set_t), &cpuSet); cpuid_initTopology(); cpuid_initCacheTopology(); /* 
restore affinity mask of process */ sched_setaffinity(0, sizeof(cpu_set_t), &cpuSet); } return EXIT_SUCCESS; } void cpuid_print (void) { printf("\nSupported Intel processors:\n"); printf("\t%s\n",core_2a_str); printf("\t%s\n",core_2b_str); printf("\t%s\n",xeon_mp_string); printf("\t%s\n",atom_45_str); printf("\t%s\n",atom_32_str); printf("\t%s\n",atom_22_str); printf("\t%s\n",nehalem_bloom_str); printf("\t%s\n",nehalem_lynn_str); printf("\t%s\n",nehalem_west_str); printf("\t%s (with Uncore support)\n",nehalem_ex_str); printf("\t%s (with Uncore support)\n",westmere_ex_str); printf("\t%s\n",sandybridge_str); printf("\t%s (with Uncore support)\n",sandybridge_ep_str); printf("\t%s\n",ivybridge_str); printf("\t%s (with Uncore support)\n",ivybridge_ep_str); printf("\t%s (with Uncore support)\n",haswell_str); printf("\t%s (no Uncore support)\n",haswell_ex_str); printf("\t%s\n",atom_silvermont_str); printf("\t%s\n",atom_saltwell_str); printf("\t%s\n\n",xeon_phi_string); printf("Supported AMD processors:\n"); printf("\t%s\n",opteron_sc_str); printf("\t%s\n",opteron_dc_e_str); printf("\t%s\n",opteron_dc_f_str); printf("\t%s\n",barcelona_str); printf("\t%s\n",shanghai_str); printf("\t%s\n",istanbul_str); printf("\t%s\n",magnycours_str); printf("\t%s\n",interlagos_str); printf("\t%s\n\n",kabini_str); } #define freeStrings \ bdestroy(filename); \ bdestroy(grepString); \ bdestroy(cpulist) int cpuid_isInCpuset(void) { int pos = 0; bstring grepString = bformat("Cpus_allowed_list:"); bstring filename = bformat("/proc/%d/status",getpid()); FILE* fp = fopen(bdata(filename),"r"); if (fp == NULL) { bdestroy(filename); bdestroy(grepString); return 0; } else { bstring cpulist; uint32_t tmpThreads[MAX_NUM_THREADS]; bstring src = bread ((bNread) fread, fp); if ((pos = binstr(src,0,grepString)) != BSTR_ERR) { int end = bstrchrp(src, 10, pos); pos = pos+blength(grepString); cpulist = bmidstr(src,pos, end-pos); btrimws(cpulist); if (bstr_to_cpuset_physical(tmpThreads, cpulist) < 
cpuid_topology.numHWThreads) { freeStrings; return 1; } else { freeStrings; return 0; } } return 0; } } void cpuid_initTopology(void) { uint32_t apicId; uint32_t bitField; int level; int prevOffset = 0; int currOffset = 0; cpu_set_t set; HWThread* hwThreadPool; int hasBLeaf = 0; int maxNumLogicalProcs; int maxNumLogicalProcsPerCore; int maxNumCores; TreeNode* currentNode; int width; /* check if 0x0B cpuid leaf is supported */ if (largest_function >= 0x0B) { eax = 0x0B; ecx = 0; CPUID; if (ebx) { hasBLeaf = 1; } } hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread)); tree_init(&cpuid_topology.topologyTree, 0); if (hasBLeaf) { for (uint32_t i=0; i < cpuid_topology.numHWThreads; i++) { CPU_ZERO(&set); CPU_SET(i,&set); sched_setaffinity(0, sizeof(cpu_set_t), &set); eax = 0x0B; ecx = 0; CPUID; apicId = edx; hwThreadPool[i].apicId = apicId; for (level=0; level < 3; level++) { eax = 0x0B; ecx = level; CPUID; currOffset = eax&0xFU; switch ( level ) { case 0: /* SMT thread */ bitField = extractBitField(apicId, currOffset, 0); hwThreadPool[i].threadId = bitField; break; case 1: /* Core */ bitField = extractBitField(apicId, currOffset-prevOffset, prevOffset); hwThreadPool[i].coreId = bitField; break; case 2: /* Package */ bitField = extractBitField(apicId, 32-prevOffset, prevOffset); hwThreadPool[i].packageId = bitField; break; } prevOffset = currOffset; } } } else { switch ( cpuid_info.family ) { case MIC_FAMILY: case P6_FAMILY: eax = 0x01; CPUID; maxNumLogicalProcs = extractBitField(ebx,8,16); /* Check number of cores per package */ eax = 0x04; ecx = 0; CPUID; maxNumCores = extractBitField(eax,6,26)+1; maxNumLogicalProcsPerCore = maxNumLogicalProcs/maxNumCores; for (uint32_t i=0; i< cpuid_topology.numHWThreads; i++) { CPU_ZERO(&set); CPU_SET(i,&set); sched_setaffinity(0, sizeof(cpu_set_t), &set); eax = 0x01; CPUID; hwThreadPool[i].apicId = extractBitField(ebx,8,24); /* ThreadId is extracted from th apicId using the bit width * of the number of 
logical processors * */ hwThreadPool[i].threadId = extractBitField(hwThreadPool[i].apicId, getBitFieldWidth(maxNumLogicalProcsPerCore),0); /* CoreId is extracted from th apicId using the bitWidth * of the number of logical processors as offset and the * bit width of the number of cores as width * */ hwThreadPool[i].coreId = extractBitField(hwThreadPool[i].apicId, getBitFieldWidth(maxNumCores), getBitFieldWidth(maxNumLogicalProcsPerCore)); hwThreadPool[i].packageId = extractBitField(hwThreadPool[i].apicId, 8-getBitFieldWidth(maxNumLogicalProcs), getBitFieldWidth(maxNumLogicalProcs)); } break; case K8_FAMILY: /* AMD Bios manual Rev. 2.28 section 3.1 * Legacy method */ /*FIXME: This is a bit of a hack */ maxNumLogicalProcsPerCore = 1; maxNumLogicalProcs = 1; eax = 0x80000008; CPUID; maxNumCores = extractBitField(ecx,8,0)+1; for (uint32_t i=0; i< cpuid_topology.numHWThreads; i++) { CPU_ZERO(&set); CPU_SET(i,&set); sched_setaffinity(0, sizeof(cpu_set_t), &set); eax = 0x01; CPUID; hwThreadPool[i].apicId = extractBitField(ebx,8,24); /* ThreadId is extracted from th apicId using the bit width * of the number of logical processors * */ hwThreadPool[i].threadId = extractBitField(hwThreadPool[i].apicId, getBitFieldWidth(maxNumLogicalProcsPerCore),0); /* CoreId is extracted from th apicId using the bitWidth * of the number of logical processors as offset and the * bit width of the number of cores as width * */ hwThreadPool[i].coreId = extractBitField(hwThreadPool[i].apicId, getBitFieldWidth(maxNumCores), 0); hwThreadPool[i].packageId = extractBitField(hwThreadPool[i].apicId, 8-getBitFieldWidth(maxNumCores), getBitFieldWidth(maxNumCores)); } break; case K16_FAMILY: case K15_FAMILY: case K10_FAMILY: /* AMD Bios manual Rev. 
2.28 section 3.2 * Extended method */ eax = 0x80000008; CPUID; width = extractBitField(ecx,4,12); if (width == 0) { width = extractBitField(ecx,8,0)+1; } eax = 0x01; CPUID; maxNumLogicalProcs = extractBitField(ebx,8,16); maxNumCores = extractBitField(ecx,8,0)+1; for (uint32_t i=0; i< cpuid_topology.numHWThreads; i++) { CPU_ZERO(&set); CPU_SET(i,&set); sched_setaffinity(0, sizeof(cpu_set_t), &set); eax = 0x01; CPUID; hwThreadPool[i].apicId = extractBitField(ebx,8,24); /* AMD only knows cores */ hwThreadPool[i].threadId = 0; hwThreadPool[i].coreId = extractBitField(hwThreadPool[i].apicId, width, 0); hwThreadPool[i].packageId = extractBitField(hwThreadPool[i].apicId, (8-width), width); } break; } } for (uint32_t i=0; i< cpuid_topology.numHWThreads; i++) { /* Add node to Topology tree */ if (!tree_nodeExists(cpuid_topology.topologyTree, hwThreadPool[i].packageId)) { tree_insertNode(cpuid_topology.topologyTree, hwThreadPool[i].packageId); } currentNode = tree_getNode(cpuid_topology.topologyTree, hwThreadPool[i].packageId); if (!tree_nodeExists(currentNode, hwThreadPool[i].coreId)) { tree_insertNode(currentNode, hwThreadPool[i].coreId); } currentNode = tree_getNode(currentNode, hwThreadPool[i].coreId); if (!tree_nodeExists(currentNode, i)) { /* printf("WARNING: Thread already exists!\n"); */ tree_insertNode(currentNode, i); } } cpuid_topology.threadPool = hwThreadPool; cpuid_topology.numSockets = tree_countChildren(cpuid_topology.topologyTree); currentNode = tree_getChildNode(cpuid_topology.topologyTree); cpuid_topology.numCoresPerSocket = tree_countChildren(currentNode); currentNode = tree_getChildNode(currentNode); cpuid_topology.numThreadsPerCore = tree_countChildren(currentNode); } void cpuid_initCacheTopology() { int maxNumLevels=0; int id=0; CacheLevel* cachePool = NULL; CacheType type = DATACACHE; switch ( cpuid_info.family ) { case MIC_FAMILY: case P6_FAMILY: if (largest_function >= 4) { maxNumLevels = intelCpuidFunc_4(&cachePool); } else { // 
intelCpuidFunc_2(&cachePool); } break; case K8_FAMILY: maxNumLevels = 2; cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel)); eax = 0x80000005; CPUID; cachePool[0].level = 1; cachePool[0].type = DATACACHE; cachePool[0].associativity = extractBitField(ecx,8,16); cachePool[0].lineSize = extractBitField(ecx,8,0); cachePool[0].size = extractBitField(ecx,8,24) * 1024; if ((cachePool[0].associativity * cachePool[0].lineSize) != 0) { cachePool[0].sets = cachePool[0].size/ (cachePool[0].associativity * cachePool[0].lineSize); } cachePool[0].threads = 1; cachePool[0].inclusive = 1; eax = 0x80000006; CPUID; cachePool[1].level = 2; cachePool[1].type = UNIFIEDCACHE; cachePool[1].associativity = amdGetAssociativity(extractBitField(ecx,4,12)); cachePool[1].lineSize = extractBitField(ecx,8,0); cachePool[1].size = extractBitField(ecx,16,16) * 1024; if ((cachePool[0].associativity * cachePool[0].lineSize) != 0) { cachePool[1].sets = cachePool[1].size/ (cachePool[1].associativity * cachePool[1].lineSize); } cachePool[1].threads = 1; cachePool[1].inclusive = 1; break; case K10_FAMILY: /* FIXME: Adds one level for the instruction cache on Intel * This fixes the level for the cores */ maxNumLevels = 3; cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel)); eax = 0x80000005; CPUID; cachePool[0].level = 1; cachePool[0].type = DATACACHE; cachePool[0].associativity = extractBitField(ecx,8,16); cachePool[0].lineSize = extractBitField(ecx,8,0); cachePool[0].size = extractBitField(ecx,8,24) * 1024; if ((cachePool[0].associativity * cachePool[0].lineSize) != 0) { cachePool[0].sets = cachePool[0].size/ (cachePool[0].associativity * cachePool[0].lineSize); } cachePool[0].threads = 1; cachePool[0].inclusive = 1; eax = 0x80000006; CPUID; cachePool[1].level = 2; cachePool[1].type = UNIFIEDCACHE; cachePool[1].associativity = amdGetAssociativity(extractBitField(ecx,4,12)); cachePool[1].lineSize = extractBitField(ecx,8,0); cachePool[1].size = extractBitField(ecx,16,16) * 
1024; if ((cachePool[0].associativity * cachePool[0].lineSize) != 0) { cachePool[1].sets = cachePool[1].size/ (cachePool[1].associativity * cachePool[1].lineSize); } cachePool[1].threads = 1; cachePool[1].inclusive = 1; cachePool[2].level = 3; cachePool[2].type = UNIFIEDCACHE; cachePool[2].associativity = amdGetAssociativity(extractBitField(edx,4,12)); cachePool[2].lineSize = extractBitField(edx,8,0); cachePool[2].size = (extractBitField(edx,14,18)+1) * 524288; if ((cachePool[0].associativity * cachePool[0].lineSize) != 0) { cachePool[2].sets = cachePool[1].size/ (cachePool[1].associativity * cachePool[1].lineSize); } if (cpuid_info.model != MAGNYCOURS) { cachePool[2].threads = cpuid_topology.numCoresPerSocket; } else { cachePool[2].threads = cpuid_topology.numCoresPerSocket/2; cachePool[2].size /= 2 ; } cachePool[2].inclusive = 1; break; case K16_FAMILY: case K15_FAMILY: maxNumLevels = 0; cachePool = (CacheLevel*) malloc(3 * sizeof(CacheLevel)); while (type) { ecx = id; eax = 0x8000001D; CPUID; type = (CacheType) extractBitField(eax,4,0); if ((type == DATACACHE) || (type == UNIFIEDCACHE)) { cachePool[maxNumLevels].level = extractBitField(eax,3,5); cachePool[maxNumLevels].type = type; cachePool[maxNumLevels].associativity = extractBitField(ebx,10,22)+1; cachePool[maxNumLevels].lineSize = extractBitField(ebx,12,0)+1; cachePool[maxNumLevels].sets = extractBitField(ecx,32,0)+1; cachePool[maxNumLevels].size = cachePool[maxNumLevels].associativity * cachePool[maxNumLevels].lineSize * cachePool[maxNumLevels].sets; cachePool[maxNumLevels].threads = extractBitField(eax,12,14)+1; cachePool[maxNumLevels].inclusive = (edx & (0x1<<1)); maxNumLevels++; } id++; } break; default: ERROR_PLAIN_PRINT(Processor is not supported); break; } cpuid_topology.numCacheLevels = maxNumLevels; cpuid_topology.cacheLevels = cachePool; } likwid-3.1.3/perl/Template/Namespace/000755 137545 027340 00000000000 12426160162 017722 5ustar00unrz254unrz000000 000000 
likwid-3.1.3/bench/x86-64/copy_plain.ptt000644 137545 027340 00000000533 12416714770 020144 0ustar00unrz254unrz000000 000000 STREAMS 2 TYPE DOUBLE FLOPS 0 BYTES 16 LOOP 4 movsd FPR1, [STR0 + GPR1 * 8] movsd FPR2, [STR0 + GPR1 * 8 + 8] movsd FPR3, [STR0 + GPR1 * 8 + 16] movsd FPR4, [STR0 + GPR1 * 8 + 24] movsd [STR1 + GPR1 * 8] , FPR1 movsd [STR1 + GPR1 * 8 + 8] , FPR2 movsd [STR1 + GPR1 * 8 + 16], FPR3 movsd [STR1 + GPR1 * 8 + 24], FPR4 likwid-3.1.3/perl/set_license.pl000755 137545 027340 00000012160 12426160336 017113 0ustar00unrz254unrz000000 000000 #!/usr/bin/perl -w use strict; use warnings; use File::Find; use File::Copy; my $mc = '#'; my $cc = ' *'; my $fc = '!'; #my $VERSION = ''; #my $DATE = ''; my $VERSION = '3.1.3'; my $DATE = '4.11.2014'; my $YEAR = '2014'; my $AUTHOR = 'Jan Treibig'; my $LICENSE = 'gpl'; sub print_copyright { my $fh = shift; my $cm = shift; if ($LICENSE eq 'gpl') { print $fh <. $cm $cm ======================================================================================= END } elsif ($LICENSE eq 'bsd') { print $fh < $filename.tmp"; print "Process $filename\n"; while( ) { if (/\/\*/ and !$enter) { $style = $cc; $enter = 1; $in_header = 1; print OUTFILE "/*\n"; print OUTFILE "$style =======================================================================================\n"; next; } elsif (/# =/ and !$enter) { $style = $mc; $enter = 1; $in_header = 1; print OUTFILE "$style =======================================================================================\n"; next; } elsif (/! 
=/ and !$enter) { $style = $fc; $enter = 1; $in_header = 1; print OUTFILE "$style =======================================================================================\n"; next; } elsif (!$enter) { print "Skip $filename: No header found!\n"; return; } if ($in_header) { if(/Filename:[ ]+([A-za-z0-9._\-]+)/) { if ($1 ne $filename) { print "File name mismatch: $filename header says $1\n"; } print OUTFILE "$_"; } elsif(/Version:/) { print OUTFILE "$style Version: $VERSION\n"; } elsif(/Released:/) { print OUTFILE "$style Released: $DATE\n"; } elsif(/Company:/) { #Skip company from header } elsif(/Copyright/) { $in_copyright = 1; # print OUTFILE "$style\n"; print_copyright(\*OUTFILE, $style); } elsif(/# =/ or /! =/) { $in_copyright = 0; $in_header = 0; } elsif (/\*\//) { $in_copyright = 0; $in_header = 0; print OUTFILE " */\n"; } elsif (/\* =/) { # Skip initial hline } else { if($in_copyright eq 0) { print OUTFILE "$_"; } } } else { print OUTFILE "$_"; } } close INFILE; close OUTFILE; unlink $filename or die "Failed to delete file $filename\n"; copy ("$filename.tmp", $filename) or die "Copy failed\n"; unlink "$filename.tmp" or die "Failed to delete file $filename\n"; } if (defined $ARGV[0]) { my $filename = $ARGV[0]; wanted($filename); exit (0); } my @directories; push @directories, 'src'; find(\&wanted, @directories); likwid-3.1.3/test/executable_tests/likwid-setFreq.txt000644 137545 027340 00000000415 12416722717 023277 0ustar00unrz254unrz000000 000000 | EXIT 1 | GREP Usage 0 | EXIT 1 | GREP Usage 0 0 | EXIT 1 | GREP Frequency must be greater than 0 0 -1 | EXIT 1 | GREP Frequency must be greater than 0 -1 -1 | EXIT 1 | GREP not a valid CPU ID. Range from 0 to 100 0 | EXIT 1 | GREP not a valid CPU ID. 
Range from 0 to likwid-3.1.3/kernel/Makefile000644 137545 027340 00000000467 12417761671 016254 0ustar00unrz254unrz000000 000000 obj-m := enable_rdpmc.o KERNELDIR ?= /lib/modules/$(shell uname -r)/build PWD := $(shell pwd) all: $(MAKE) -Wpacked -C $(KERNELDIR) M=$(PWD) modules modules_install: install -m 666 enable_rdpmc.ko /lib/modules/$(shell uname -r)/extra/ clean: rm -f *.ko *.o modules.order Module.symvers enable_rdpmc.mod.c likwid-3.1.3/groups/k10/NUMA2.txt000644 137545 027340 00000001412 12336605216 016610 0ustar00unrz254unrz000000 000000 SHORT Bandwidth on the Hypertransport links EVENTSET PMC0 CPU_TO_DRAM_LOCAL_TO_4 PMC1 CPU_TO_DRAM_LOCAL_TO_5 PMC2 CPU_TO_DRAM_LOCAL_TO_6 PMC3 CPU_TO_DRAM_LOCAL_TO_7 METRICS Runtime (RDTSC) [s] time Hyper Transport link0 bandwidth (MBytes/s) 1.0E-06*PMC0*4.0/time Hyper Transport link1 bandwidth (MBytes/s) 1.0E-06*PMC1*4.0/time Hyper Transport link2 bandwidth (MBytes/s) 1.0E-06*PMC2*4.0/time Hyper Transport link3 bandwidth (MBytes/s) 1.0E-06*PMC3*4.0/time LONG Formulas: Hyper Transport linkn bandwidth (MBytes/s) 1.0E-06*HYPERTRANSPORT_LINK0_ALL_SENT*4.0/time - Profiling group to measure the bandwidth over the Hypertransport links. Can be used to detect NUMA problems. Usually there should be only limited traffic over the Hypertransport links for optimal performance. likwid-3.1.3/perl/Template/Plugin/000755 137545 027340 00000000000 12426160162 017264 5ustar00unrz254unrz000000 000000 likwid-3.1.3/src/includes/cpuid_types.h000644 137545 027340 00000004741 12426160352 020416 0ustar00unrz254unrz000000 000000 /* * ======================================================================================= * * Filename: cpuid_types.h * * Description: Types file for cpuid module.
#ifndef CPUID_TYPES_H
#define CPUID_TYPES_H

/*
 * Kind of a cache (or TLB) entry.
 * intelCpuidFunc_4() and the K15/K16 path of cpuid_initCacheTopology()
 * cast the cache-type bit field read from CPUID directly to this enum, so
 * the numeric order of the leading values must not be changed.
 */
typedef enum {
    NOCACHE=0,
    DATACACHE,
    INSTRUCTIONCACHE,
    UNIFIEDCACHE,
    ITLB,
    DTLB} CacheType;

/* Levels of the node hierarchy -- presumably the levels of
 * CpuTopology.topologyTree; confirm against the users of this enum. */
typedef enum {
    NODE=0,
    SOCKET,
    CORE,
    THREAD} NodeLevel;

/* Processor feature identifiers -- presumably bit positions inside
 * CpuInfo.featureFlags; confirm against cpuid_init(). */
typedef enum {
    SSE3=0,
    VSX,
    MMX,
    SSE,
    SSE2,
    MONITOR,
    ACPI,
    RDTSCP,
    VMX,
    EIST,
    TM,
    TM2,
    AES,
    RDRAND,
    SSSE3,
    SSE41,
    SSE42,
    AVX,
    FMA} FeatureBit;

/* Identification data of the processor, filled in by cpuid_init(). */
typedef struct {
    uint32_t family;    /* base + extended family from CPUID leaf 1 */
    uint32_t model;     /* (extended model << 4) + model from CPUID leaf 1 */
    uint32_t stepping;  /* low 4 bits of eax of CPUID leaf 1 */
    uint64_t clock;
    int      turbo;     /* 1 if bit 1 of eax of CPUID leaf 6 is set, else 0 */
    char*  name;        /* points to a static description string */
    char*  features;    /* heap buffer (200 bytes) with space separated feature names */
    uint32_t featureFlags;
    uint32_t perf_version;
    uint32_t perf_num_ctr;
    uint32_t perf_width_ctr;
    uint32_t perf_num_fixed_ctr;
    int supportUncore;
} CpuInfo;

/* One hardware thread; the ids are bit fields cut out of the APIC id by
 * cpuid_initTopology(). */
typedef struct {
    uint32_t threadId;   /* SMT thread within its core */
    uint32_t coreId;     /* core within its package */
    uint32_t packageId;  /* package (socket) */
    uint32_t apicId;     /* APIC id as reported by CPUID */
} HWThread;

/* Description of one cache level, filled by cpuid_initCacheTopology(). */
typedef struct {
    int level;          /* cache level (1, 2, 3, ...) */
    CacheType type;
    int associativity;  /* number of ways */
    int sets;
    int lineSize;
    int size;           /* total size; sets * associativity * lineSize on the
                           CPUID leaf 4 path -- presumably bytes */
    int threads;        /* number of hardware threads sharing the cache */
    int inclusive;      /* non-zero if marked inclusive (not normalised to 1) */
} CacheLevel;

/* Thread and cache topology of the node. */
typedef struct {
    uint32_t numHWThreads;       /* number of entries in threadPool */
    uint32_t numSockets;         /* children of the topologyTree root */
    uint32_t numCoresPerSocket;  /* children of the first socket node */
    uint32_t numThreadsPerCore;  /* children of the first core node */
    uint32_t numCacheLevels;     /* number of entries in cacheLevels */
    HWThread* threadPool;
    CacheLevel*  cacheLevels;
    TreeNode* topologyTree;      /* tree: package id -> core id -> thread index */
} CpuTopology;

#endif /*CPUID_TYPES_H*/
027340 00000003330 12417761671 017363 0ustar00unrz254unrz000000 000000 /* * Read PMC in kernel mode. */ #include /* Needed by all modules */ #include /* Needed for KERN_INFO */ #define MODULE_PARAM(type, name, value, desc) \ type name = value; \ module_param(name, type, 0664); \ MODULE_PARM_DESC(name, desc) MODULE_PARAM(int, debug, 0, "Debug output"); static uint64_t printc4(void) { uint64_t output; // Read back CR4 to check the bit. __asm__("\t mov %%cr4,%0" : "=r"(output)); return output; } static void setc4b8(void * info) { // Set CR4, Bit 8 (9th bit from the right) to enable __asm__("push %rax\n\t" "mov %cr4,%rax;\n\t" "or $(1 << 8),%rax;\n\t" "mov %rax,%cr4;\n\t" "wbinvd\n\t" "pop %rax" ); if (debug) { printk(KERN_INFO "Processor %d, RDPMC_ENABLE_BIT=%llu\n", smp_processor_id(), printc4()); } } static void clearc4b8(void * info) { printc4(); __asm__("push %rax\n\t" "push %rbx\n\t" "mov %cr4,%rax;\n\t" "mov $(1 << 8), %rbx\n\t" "not %rbx\n\t" "and %rbx, %rax;\n\t" "mov %rax,%cr4;\n\t" "wbinvd\n\t" "pop %rbx\n\t" "pop %rax\n\t" ); if (debug) { printk(KERN_INFO "Processor %d, RDPMC_ENABLE_BIT=%llu\n", smp_processor_id(), printc4()); } } int start_module(void) { on_each_cpu(setc4b8, NULL, 0); return 0; } void stop_module(void) { on_each_cpu(clearc4b8, NULL, 0); } module_init(start_module); module_exit(stop_module) MODULE_AUTHOR("Thomas Roehl "); MODULE_DESCRIPTION("Enable RDPMC for userspace"); MODULE_LICENSE("GPL"); likwid-3.1.3/test/executable_tests/000755 137545 027340 00000000000 12426160162 017633 5ustar00unrz254unrz000000 000000 likwid-3.1.3/src/applications/likwid-pin.c000644 137545 027340 00000026773 12426160352 021021 0ustar00unrz254unrz000000 000000 /* * ======================================================================================= * * Filename: likwid-pin.c * * Description: An application to pin a program including threads * * Version: 3.1.3 * Released: 4.11.2014 * * Author: Jan Treibig (jt), jan.treibig@gmail.com * Project: likwid * * 
Copyright (C) 2014 Jan Treibig * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see . * * ======================================================================================= */ /* ##### HEADER FILE INCLUDES ######################################### */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COLOR #include #endif /* ##### MACROS - LOCAL TO THIS SOURCE FILE ######################### */ #define HELP_MSG \ fprintf(stdout, "likwid-pin -- Version %d.%d \n\n",VERSION,RELEASE); \ fprintf(stdout, "\n"); \ fprintf(stdout, "Supported Options:\n"); \ fprintf(stdout, "-h\t Help message\n"); \ fprintf(stdout, "-v\t Version information\n"); \ fprintf(stdout, "-i\t Set numa interleave policy with all involved numa nodes\n"); \ fprintf(stdout, "-S\t Sweep memory in involved numa nodes\n"); \ fprintf(stdout, "-c\t comma separated list of processor ids or expression\n"); \ fprintf(stdout, "-s\t bitmask with threads to skip\n"); \ fprintf(stdout, "-p\t Print available domains with mapping on physical ids\n"); \ fprintf(stdout, " \t If used together with -c option outputs a physical processor ids.\n"); \ fprintf(stdout, "-d\t Delimiter used for using -p to output physical processor list, default is comma.\n\n"); \ fprintf(stdout, "-q\t Silent without output\n\n"); \ fprintf(stdout, "There are three possibilities to provide a thread to 
processor list:\n\n"); \ fprintf(stdout, "1. Thread list with physical or logical thread numberings and physical cores first.\n"); \ fprintf(stdout, "Example usage thread list: likwid-pin -c N:0,4-6 ./myApp\n"); \ fprintf(stdout, "You can pin with the following numberings:\n"); \ fprintf(stdout, "\t1. Physical numbering of OS.\n"); \ fprintf(stdout, "\t2. Logical numbering inside node. e.g. -c N:0-3\n"); \ fprintf(stdout, "\t3. Logical numbering inside socket. e.g. -c S0:0-3\n"); \ fprintf(stdout, "\t4. Logical numbering inside last level cache group. e.g. -c C0:0-3\n"); \ fprintf(stdout, "\t5. Logical numbering inside NUMA domain. e.g. -c M0:0-3\n"); \ fprintf(stdout, "\tYou can also mix domains separated by @, e.g. -c S0:0-3@S1:0-3 \n\n"); \ fprintf(stdout, "2. Expressions based thread list generation with compact processor numbering.\n"); \ fprintf(stdout, "Example usage expression: likwid-pin -c E:N:8 ./myApp\n"); \ fprintf(stdout, "This will generate a compact list of thread to processor mapping for the node domain with eight threads.\n"); \ fprintf(stdout, "The following syntax variants are available:\n"); \ fprintf(stdout, "\t1. -c E::\n"); \ fprintf(stdout, "\t2. -c E::::\n"); \ fprintf(stdout, "\t For two SMT threads per core on a SMT 4 machine use e.g. -c E:N:122:2:4\n\n"); \ fprintf(stdout, "3. Scatter policy among thread domain type.\n"); \ fprintf(stdout, "Example usage scatter: likwid-pin -c M:scatter ./myApp\n"); \ fprintf(stdout, "This will generate a thread to processor mapping scattered among all memory domains with physical cores first.\n\n"); \ fprintf(stdout, "4. 
Logical pinning.\n"); \ fprintf(stdout, "Example usage logical pinning: likwid-pin -c L:0,3,4 ./myApp\n"); \ fprintf(stdout, "This will generate a mapping containing the processors with index 0, 3 and 4 in the currently available processor list.\n"); \ fprintf(stdout, "If you are running inside a cpuset (taskset, cgroup) the sorted list of allowed processors is taken as processor list.\n"); \ fprintf(stdout, "Example usage logical pinning inside cpuset:\n"); \ fprintf(stdout, "taskset -c 4,7,2,1,5 likwid-pin -c L:0,2,4 ./myApp\n"); \ fprintf(stdout, "This maps the application to the processors 1,4,7.\n\n"); \ fprintf(stdout, "If you ommit the -c option likwid will use all processors available on the node\n"); \ fprintf(stdout, "with physical cores first. likwid-pin will also set OMP_NUM_THREADS with as many\n"); \ fprintf(stdout, "threads as specified in your pin expression if OMP_NUM_THREADS is not present\n"); \ fprintf(stdout, "in your environment.\n\n"); \ fflush(stdout); #define VERSION_MSG \ fprintf(stdout, "likwid-pin %d.%d \n\n",VERSION,RELEASE); \ fflush(stdout); /* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */ static void pinPid(int cpuid, int silent) { int status; cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(cpuid, &cpuset); status = sched_setaffinity(0, sizeof(cpu_set_t), &cpuset); if (status == -1) { fprintf(stderr, "sched_setaffinity failed : %s \n",strerror(errno)); } else { if(!silent) { #ifdef COLOR color_on(BRIGHT, COLOR); #endif fprintf(stdout, "[likwid-pin] Main PID -> core %d - OK", cpuid); #ifdef COLOR color_reset(); #endif fprintf(stdout, "\n"); fflush(stdout); } } } /* ##### FUNCTION DEFINITIONS - EXPORTED FUNCTIONS ################## */ int main (int argc, char** argv) { int i; int c; int skipMask = -1; int optInterleaved = 0; int optMemSweep = 0; int optPrintDomains = 0; int optSilent = 0; int hasAffinity = 0; bstring pinString; bstring skipString; bstring argString; int numThreads=0; int threads[MAX_NUM_THREADS]; 
char delimiter = ','; FILE* OUTSTREAM = stdout; threads[0] = 0; if (argc == 1) { HELP_MSG; exit (EXIT_SUCCESS); } if (cpuid_init() == EXIT_SUCCESS) { numa_init(); affinity_init(); hasAffinity = 1; } while ((c = getopt (argc, argv, "+c:d:hipqs:Sv")) != -1) { switch (c) { case 'c': CHECK_OPTION_STRING; if (hasAffinity) { numThreads = bstr_to_cpuset(threads, argString); } else { numThreads = bstr_to_cpuset_physical((uint32_t*) threads, argString); } if(!numThreads) { ERROR_PLAIN_PRINT(Failed to parse cpu list.); } break; case 'd': delimiter = optarg[0]; break; case 'h': HELP_MSG; exit (EXIT_SUCCESS); case 'i': optInterleaved = 1; break; case 'p': if (!hasAffinity) { fprintf(stderr, "Option -p is not supported for unknown processor!\n"); exit(EXIT_SUCCESS); } optPrintDomains = 1; break; case 'q': optSilent = 1; OUTSTREAM = NULL; setenv("LIKWID_SILENT","true", 1); break; case 's': CHECK_OPTION_STRING; skipMask = strtoul((char*) argString->data,NULL,16); break; case 'S': if (!hasAffinity) { fprintf(stderr, "Option -S is not supported for unknown processor!\n"); exit(EXIT_SUCCESS); } optMemSweep = 1; break; case 'v': VERSION_MSG; exit (EXIT_SUCCESS); default: HELP_MSG; exit(EXIT_FAILURE); } } if (optind == argc && !optPrintDomains) { fprintf(stderr,"Executable must be given on commandline\n"); exit(EXIT_FAILURE); } if (optPrintDomains && numThreads) { if ((!optSilent) && (OUTSTREAM)) { fprintf(OUTSTREAM, "%d",threads[0]); for ( i=1; i< numThreads; i++) { fprintf(OUTSTREAM, "%c%d",delimiter,threads[i]); } fprintf(OUTSTREAM, "\n"); fflush(OUTSTREAM); } exit (EXIT_SUCCESS); } else if ( optPrintDomains ) { affinity_printDomains(OUTSTREAM); exit (EXIT_SUCCESS); } if (!numThreads) { argString = bformat("N:0-%u", cpuid_topology.numHWThreads-1); numThreads = bstr_to_cpuset(threads, argString); } /* CPU List: * pthread (default): pin main pid + all thread tids * * OpenMP: Pin OMP_NUM_THREADS * intel openmp: pin main pid + all thread tids (skip thread 1) * gcc openmp: pin main pid 
+ all thread tids (one less) */ if (optInterleaved) { if ((!optSilent) && (OUTSTREAM)) { fprintf(OUTSTREAM, "Set mem_policy to interleaved\n"); fflush(OUTSTREAM); } numa_setInterleaved(threads, numThreads); } if (optMemSweep) { if ((!optSilent) && (OUTSTREAM)) { fprintf(OUTSTREAM, "Sweeping memory\n"); fflush(OUTSTREAM); } memsweep_threadGroup(OUTSTREAM, threads, numThreads); } if ( getenv("OMP_NUM_THREADS") == NULL ) { argString = bformat("%d",numThreads); setenv("OMP_NUM_THREADS",(char*) argString->data , 0); } if (numThreads > 1) { bstring ldPreload = bfromcstr(getenv("LD_PRELOAD")); pinString = bformat("%d",threads[1]); for (i=2; i < numThreads;i++) { bformata(pinString,",%d",threads[i]); } bformata(pinString,",%d",threads[0]); if (skipMask >= 0) { skipString = bformat("%d",skipMask); setenv("LIKWID_SKIP",(char*) bdata(skipString) , 1); } setenv("KMP_AFFINITY", "disabled", 1); setenv("LIKWID_PIN",(char*) bdata(pinString) , 1); if (ldPreload == NULL) { setenv("LD_PRELOAD",TOSTRING(LIBLIKWIDPIN), 1); } else { bconchar(ldPreload, ':'); bcatcstr(ldPreload, TOSTRING(LIBLIKWIDPIN)); setenv("LD_PRELOAD", bdata(ldPreload), 1); } } pinPid(threads[0], optSilent); fflush(stdout); argv += optind; execvp(argv[0], argv); perror("execvp"); fprintf(stderr,"failed to execute %s\n", argv[0]); return EXIT_SUCCESS; } likwid-3.1.3/perl/Template/Plugin/Dumper.pm000644 137545 027340 00000007016 12336605216 021066 0ustar00unrz254unrz000000 000000 #============================================================================== # # Template::Plugin::Dumper # # DESCRIPTION # # A Template Plugin to provide a Template Interface to Data::Dumper # # AUTHOR # Simon Matthews # # COPYRIGHT # Copyright (C) 2000 Simon Matthews. All Rights Reserved # # This module is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. 
#
#==============================================================================

package Template::Plugin::Dumper;

use strict;
use warnings;
use base 'Template::Plugin';
use Data::Dumper;

our $VERSION = 2.70;
our $DEBUG   = 0 unless defined $DEBUG;
our @DUMPER_ARGS = qw( Indent Pad Varname Purity Useqq Terse Freezer
                       Toaster Deepcopy Quotekeys Bless Maxdepth );
our $AUTOLOAD;

#==============================================================================
#                      -----  CLASS METHODS -----
#==============================================================================

#------------------------------------------------------------------------
# new($context, \%params)
#
# Creates the plugin object.  Every entry of @DUMPER_ARGS found in the
# parameter hash (lower case spelling is tried first, then the exact
# spelling) is copied into the Data::Dumper package variable of that name.
# Note that these are globals: they stay set for later Data::Dumper users.
#------------------------------------------------------------------------

sub new {
    my ($class, $context, $params) = @_;
    my $val;
    $params ||= { };

    foreach my $arg (@DUMPER_ARGS) {
        no strict 'refs';
        if (defined ($val = $params->{ lc $arg })
            or defined ($val = $params->{ $arg })) {
            ${"Data\::Dumper\::$arg"} = $val;
        }
    }

    bless {
        _CONTEXT => $context,
    }, $class;
}

#------------------------------------------------------------------------
# dump(@values)
#
# Returns the Data::Dumper text for all arguments.
#------------------------------------------------------------------------

sub dump {
    my $self = shift;
    my $content = Dumper @_;
    return $content;
}

#------------------------------------------------------------------------
# dump_html(@values)
#
# As dump(), but with &, < and > converted to HTML entities and a <br>
# inserted before every newline.  '&' has to be escaped first, otherwise
# the ampersands of the entities generated afterwards would be escaped
# a second time.
#------------------------------------------------------------------------

sub dump_html {
    my $self = shift;
    my $content = Dumper @_;
    for ($content) {
        s/&/&amp;/g;
        s/</&lt;/g;
        s/>/&gt;/g;
        s/\n/<br>\n/g;
    }
    return $content;
}

1;

__END__

=head1 NAME

Template::Plugin::Dumper - Plugin interface to Data::Dumper

=head1 SYNOPSIS

    [% USE Dumper %]

    [% Dumper.dump(variable) %]
    [% Dumper.dump_html(variable) %]

=head1 DESCRIPTION

This is a very simple Template Toolkit Plugin Interface to the L<Data::Dumper>
module.  A C<Dumper> object will be instantiated via the following directive:

    [% USE Dumper %]

As a standard plugin, you can also specify its name in lower case:

    [% USE dumper %]

The C<Data::Dumper> C<Pad>, C<Indent> and C<Varname> options are supported
as constructor arguments to affect the output generated.  See L<Data::Dumper>
for further details.

    [% USE dumper(Indent=0, Pad="<br>") %]

These options can also be specified in lower case.

    [% USE dumper(indent=0, pad="<br>") %]

=head1 METHODS

There are two methods supported by the C<Dumper> object.  Each will
output into the template the contents of the variables passed to the
object method.

=head2 dump()

Generates a raw text dump of the data structure(s) passed

    [% USE Dumper %]
    [% Dumper.dump(myvar) %]
    [% Dumper.dump(myvar, yourvar) %]

=head2 dump_html()

Generates a dump of the data structures, as per L<dump()>, but with the
characters E<lt>, E<gt> and E<amp> converted to their equivalent HTML
entities and newlines converted to E<lt>brE<gt>.

    [% USE Dumper %]
    [% Dumper.dump_html(myvar) %]

=head1 AUTHOR

Simon Matthews E<lt>sam@tt2.orgE<gt>

=head1 COPYRIGHT

Copyright (C) 2000 Simon Matthews.  All Rights Reserved.

This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=head1 SEE ALSO

L<Template::Plugin>, L<Data::Dumper>

likwid-3.1.3/groups/phi/VECTOR2.txt000644 137545 027340 00000000741 12336605216 017243 0ustar00unrz254unrz000000 000000
SHORT Vector unit usage

EVENTSET
PMC0 VPU_INSTRUCTIONS_EXECUTED
PMC1 VPU_STALL_REG

METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] PMC1*inverseClock

LONG
This group measures how efficient the processor works with
regard to instruction throughput. Also important as a standalone
metric is INSTRUCTIONS_RETIRED as it tells you how many instruction
you need to execute for a task. An optimization might show very
low CPI values but execute many more instruction for it.
likwid-3.1.3/groups/nehalemEX/MEM.txt000644 137545 027340 00000002617 12336605216 017667 0ustar00unrz254unrz000000 000000 SHORT Main memory bandwidth EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF WBOX4 UNCORE_CYCLES MBOX0C0 FVC_EV0_BBOX_CMDS_READS MBOX0C1 FVC_EV0_BBOX_RSP_ACK MBOX1C0 FVC_EV0_BBOX_CMDS_READS MBOX1C1 FVC_EV0_BBOX_RSP_ACK BBOX0C1 IMT_INSERTS_WR BBOX1C1 IMT_INSERTS_WR RBOX0C0 NEW_PACKETS_RECV_PORT0_IPERF0_ANY_DRS RBOX0C1 NEW_PACKETS_RECV_PORT1_IPERF0_ANY_DRS RBOX1C0 NEW_PACKETS_RECV_PORT4_IPERF0_ANY_DRS RBOX1C1 NEW_PACKETS_RECV_PORT5_IPERF0_ANY_DRS METRICS Runtime (RDTSC) [s] time Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock Uncore Clock [MHz] 1.E-06*(WBOX4)/time CPI FIXC1/FIXC0 Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64/time Memory Write BW [MBytes/s] 1.0E-06*(BBOX0C1+BBOX1C1)*64/time Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+BBOX0C1+BBOX1C1)*64/time Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+BBOX0C1+BBOX1C1)*64 Remote write data traffic Port 0 [MBytes/s] 1.0E-06*(RBOX0C0)*64/time Remote write data traffic Port 1 [MBytes/s] 1.0E-06*(RBOX0C1)*64/time Remote write data traffic Port 4 [MBytes/s] 1.0E-06*(RBOX1C0)*64/time Remote write data traffic Port 5 [MBytes/s] 1.0E-06*(RBOX1C1)*64/time LONG Profiling group to measure memory bandwidth drawn by all cores of a socket. In addition to the bandwidth it also outputs the data volume and the remote traffic over QPI links to other sockets. likwid-3.1.3/groups/phi/L2CACHE.txt000644 137545 027340 00000001231 12336605216 017153 0ustar00unrz254unrz000000 000000 SHORT L2 Compute to Data Access Ratio EVENTSET PMC0 VPU_ELEMENTS_ACTIVE PMC1 DATA_READ_MISS_OR_WRITE_MISS METRICS Runtime (RDTSC) [s] time L2 compute intensity PMC0/PMC1 LONG This metric is a way to measure the computational density of an application, or how many computations it is performing on average for each piece of data loaded.
The L2 Compute to Data Access Ratio should be used to judge suitability of an application for running on the Intel MIC Architecture. Applications that will perform well on the Intel® MIC Architecture should be vectorized, and ideally be able to perform multiple operations on the same pieces of data (or same cachelines). likwid-3.1.3/src/pthread-overload/Makefile000644 137545 027340 00000003077 12426160352 021010 0ustar00unrz254unrz000000 000000 # ======================================================================================= # # Filename: Makefile # # Description: pthread-overload Makefile # # Version: 3.1.3 # Released: 4.11.2014 # # Author: Jan Treibig (jt), jan.treibig@gmail.com # Project: likwid # # Copyright (C) 2014 Jan Treibig # # This program is free software: you can redistribute it and/or modify it under # the terms of the GNU General Public License as published by the Free Software # Foundation, either version 3 of the License, or (at your option) any later # version. # # This program is distributed in the hope that it will be useful, but WITHOUT ANY # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A # PARTICULAR PURPOSE. See the GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along with # this program. If not, see <http://www.gnu.org/licenses/>.
#
# =======================================================================================

# Builds the preload library liblikwidpin.so from pthread-overload.c and
# places it in the top-level directory (../../).

include  ../../config.mk
include  ../../make/include_$(COMPILER).mk

TARGET   = liblikwidpin.so

# Coloured output is compiled in unless COLOR is set to NONE.
ifneq ($(COLOR),NONE)
DEFINES += -DCOLOR=$(COLOR)
endif

DEFINES  += -DMAX_NUM_THREADS=$(MAX_NUM_THREADS)
INCLUDES += -I../includes
LIBS     += -ldl
CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES)

# 'all' never names a real file.
.PHONY: all

all: $(TARGET)

# $(CPPFLAGS) already carries $(INCLUDES), so it is not passed a second time.
$(TARGET): pthread-overload.c
	$(CC) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $(SHARED_CFLAGS) $(SHARED_LFLAGS) -o ../../$(TARGET) pthread-overload.c $(LIBS)

likwid-3.1.3/groups/westmereEX/FLOPS_SP.txt000644 137545 027340 00000002000 12336605216 020742 0ustar00unrz254unrz000000 000000
SHORT Single Precision MFlops/s

EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED
PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR
PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION

METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
MFlops/s 1.0E-06*(PMC0*4.0+PMC1)/time
Packed MUOPS/s 1.0E-06*PMC0/time
Scalar MUOPS/s 1.0E-06*PMC1/time
SP MUOPS/s 1.0E-06*PMC2/time
DP MUOPS/s 1.0E-06*PMC3/time

LONG
Formula:
SP MFlops/s = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-
The Nehalem has no possibility to measure MFlops if mixed precision calculations are done.
Therefore both Single as well as Double precision are measured to ensure the correctness
of the measurements. You can check if your code was vectorized on the number of
FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
likwid-3.1.3/bench/phi/update.ptt000644 137545 027340 00000000622 12336605216 017167 0ustar00unrz254unrz000000 000000
STREAMS 1
TYPE DOUBLE
FLOPS 0
BYTES 16
LOOP 32
vprefetch0 [STR0 + GPR1 * 8 + 1024]
vmovaps zmm0, [STR0 + GPR1 * 8]
vmovaps zmm1, [STR0 + GPR1 * 8 + 64]
vmovaps zmm2, [STR0 + GPR1 * 8 + 128]
vmovaps zmm3, [STR0 + GPR1 * 8 + 192]
vmovaps [STR0 + GPR1 * 8] , zmm0
vmovaps [STR0 + GPR1 * 8 + 64], zmm1
vmovaps [STR0 + GPR1 * 8 + 128], zmm2
vmovaps [STR0 + GPR1 * 8 + 192], zmm3
likwid-3.1.3/perl/Template/Plugin/CGI.pm000644 137545 027340 00000006017 12336605216 020234 0ustar00unrz254unrz000000 000000
#============================================================= -*-Perl-*-
#
# Template::Plugin::CGI
#
# DESCRIPTION
#   Simple Template Toolkit plugin interfacing to the CGI.pm module.
#
# AUTHOR
#   Andy Wardley
#
# COPYRIGHT
#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
#
#   This module is free software; you can redistribute it and/or
#   modify it under the same terms as Perl itself.
#
#============================================================================

package Template::Plugin::CGI;

use strict;
use warnings;
use base 'Template::Plugin';
use CGI;

our $VERSION = 2.70;

# Plugin constructor: the context is dropped and every remaining argument
# is handed to CGI->new(), whose result is what the template works with.
sub new {
    my ($class, $context, @cgi_args) = @_;
    return CGI->new(@cgi_args);
}

# Added to the CGI package itself: returns a hash reference holding all
# request parameters.  The hash is built on the first call and kept in
# the object under _TT_PARAMS for all later calls.
sub CGI::params {
    my $self = shift;
    local $" = ', ';

    return $self->{ _TT_PARAMS } if $self->{ _TT_PARAMS };

    # Vars() has to be called in list context so that it hands back a
    # plain key/value list instead of a tied hash.
    my %param_for = $self->Vars();

    # Multi-valued parameters arrive joined by NUL characters; turn
    # each of those into a list reference.
    for my $name (keys %param_for) {
        my $value = $param_for{$name};
        $param_for{$name} = [ split /\0/, $value ]
            if $value =~ /\0/;
    }

    return $self->{ _TT_PARAMS } = \%param_for;
}

1;

__END__

=head1 NAME

Template::Plugin::CGI - Interface to the CGI module

=head1 SYNOPSIS

    [% USE CGI %]
    [% CGI.param('parameter') %]

    [% USE things = CGI %]
    [% things.param('name') %]

    # see CGI docs for other methods provided by the CGI object

=head1 DESCRIPTION

This is a very simple Template Toolkit Plugin interface to the C<CGI> module.
A C<CGI> object will be instantiated via the following directive:

    [% USE CGI %]

C<CGI> methods may then be called as follows:

    [% CGI.header %]
    [% CGI.param('parameter') %]

An alias can be used to provide an alternate name by which the object should
be identified.

    [% USE mycgi = CGI %]
    [% mycgi.start_form %]
    [% mycgi.popup_menu({ Name   => 'Color'
                          Values => [ 'Green' 'Black' 'Brown' ] }) %]

Parenthesised parameters to the C<USE> directive will be passed to the plugin
constructor:

    [% USE cgiprm = CGI('uid=abw&name=Andy+Wardley') %]
    [% cgiprm.param('uid') %]

=head1 METHODS

In addition to all the methods supported by the C<CGI> module, this
plugin defines the following.

=head2 params()

This method returns a reference to a hash of all the C<CGI> parameters.
Any parameters that have multiple values will be returned as lists.

    [% USE CGI('user=abw&item=foo&item=bar') %]
    [% CGI.params.user %]            # abw
    [% CGI.params.item.join(', ') %] # foo, bar

=head1 AUTHOR

Andy Wardley E<lt>abw@wardley.orgE<gt> L<http://wardley.org/>

=head1 COPYRIGHT

Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.

This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
=head1 SEE ALSO L<Template::Plugin>, L<CGI> =cut # Local Variables: # mode: perl # perl-indent-level: 4 # indent-tabs-mode: nil # End: # # vim: expandtab shiftwidth=4: likwid-3.1.3/groups/ivybridge/TLB_DATA.txt000644 137545 027340 00000002224 12421737566 020604 0ustar00unrz254unrz000000 000000 SHORT L1 Data TLB miss rate/ratio EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK PMC2 DTLB_LOAD_MISSES_WALK_DURATION PMC3 DTLB_STORE_MISSES_WALK_DURATION METRICS Runtime (RDTSC) [s] time Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock CPI FIXC1/FIXC0 L1 DTLB load misses PMC0 L1 DTLB load miss rate PMC0/FIXC0 L1 DTLB load miss duration PMC2 L1 DTLB store misses PMC1 L1 DTLB store miss rate PMC1/FIXC0 L1 DTLB store miss duration PMC3 LONG Formulas: L1 DTLB load misses DTLB_LOAD_MISSES_CAUSES_A_WALK L1 DTLB load miss rate DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY L1 DTLB load miss duration DTLB_LOAD_MISSES_WALK_DURATION L1 DTLB store misses DTLB_STORE_MISSES_CAUSES_A_WALK L1 DTLB store miss rate DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY L1 DTLB store miss duration DTLB_STORE_MISSES_WALK_DURATION - The DTLB load and store miss rates give a measure of how often a TLB miss occurred per instruction. The duration measures the time in cycles how long a walk did take. likwid-3.1.3/src/likwid.f90000644 137545 027340 00000003172 12426160352 015707 0ustar00unrz254unrz000000 000000 ! ======================================================================================= ! ! Filename: likwid.f90 ! ! Description: Marker API f90 module ! ! Version: 3.1.3 ! Released: 4.11.2014 ! ! Author: Jan Treibig (jt), jan.treibig@gmail.com ! Project: likwid ! ! Copyright (C) 2014 Jan Treibig ! ! This program is free software: you can redistribute it and/or modify it under ! the terms of the GNU General Public License as published by the Free Software !
Foundation, either version 3 of the License, or (at your option) any later
!     version.
!
!     This program is distributed in the hope that it will be useful, but WITHOUT ANY
!     WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
!     PARTICULAR PURPOSE.  See the GNU General Public License for more details.
!
!     You should have received a copy of the GNU General Public License along with
!     this program.  If not, see <http://www.gnu.org/licenses/>.
!
! =======================================================================================

! Explicit interfaces for the marker API routines.  The module contains
! declarations only; the routines themselves are defined outside this file.
module likwid

interface

  ! No arguments.
  subroutine likwid_markerInit()
  end subroutine likwid_markerInit

  ! No arguments.
  subroutine likwid_markerThreadInit()
  end subroutine likwid_markerThreadInit

  ! No arguments.
  subroutine likwid_markerClose()
  end subroutine likwid_markerClose

  ! regionTag: assumed-length character string (presumably the name of the
  ! region that starts here - confirm against the implementation).
  subroutine likwid_markerStartRegion( regionTag )
  character(*) :: regionTag
  end subroutine likwid_markerStartRegion

  ! regionTag: assumed-length character string (presumably the same name
  ! that was passed to likwid_markerStartRegion - confirm).
  subroutine likwid_markerStopRegion( regionTag )
  character(*) :: regionTag
  end subroutine likwid_markerStopRegion

end interface

end module likwid

likwid-3.1.3/src/strUtil.c000644 137545 027340 00000061304 12426160352 015717 0ustar00unrz254unrz000000 000000
/*
 * =======================================================================================
 *
 *      Filename:  strUtil.c
 *
 *      Description:  Utility routines for strings. Depends on bstring lib.
 *
 *      Version:   3.1.3
 *      Released:  4.11.2014
 *
 *      Author:  Jan Treibig (jt), jan.treibig@gmail.com
 *      Project:  likwid
 *
 *      Copyright (C) 2014 Jan Treibig
 *
 *      This program is free software: you can redistribute it and/or modify it under
 *      the terms of the GNU General Public License as published by the Free Software
 *      Foundation, either version 3 of the License, or (at your option) any later
 *      version.
 *
 *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
 *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
* * You should have received a copy of the GNU General Public License along with * this program. If not, see . * * ======================================================================================= */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */ static int cpu_count(cpu_set_t* set) { uint32_t i; int s = 0; const __cpu_mask *p = set->__bits; const __cpu_mask *end = &set->__bits[sizeof(cpu_set_t) / sizeof (__cpu_mask)]; while (p < end) { __cpu_mask l = *p++; if (l == 0) { continue; } for (i=0; i< (sizeof(__cpu_mask)*8); i++) { if (l&(1UL<qty;i++) { subtokens = bsplit(tokens->entry[i],'-'); if( subtokens->qty == 1 ) { threads[numThreads] = str2int((char *) bdata(subtokens->entry[0])); numThreads++; } else if ( subtokens->qty == 2 ) { rangeBegin = str2int((char*) bdata(subtokens->entry[0])); rangeEnd = str2int((char*) bdata(subtokens->entry[1])); if (!(rangeBegin <= rangeEnd)) { ERROR_PRINT(Range End %d bigger than begin %d, rangeEnd, rangeBegin); } while (rangeBegin <= rangeEnd) { threads[numThreads] = rangeBegin; numThreads++; rangeBegin++; } } else { ERROR_PLAIN_PRINT(Parse Error); } bstrListDestroy(subtokens); } if (numThreads > MAX_NUM_THREADS) { ERROR_PRINT(Number Of threads %d too large, numThreads); } bstrListDestroy(tokens); return numThreads; } uint32_t bstr_to_cpuset_logical(uint32_t* threads, const_bstring q) { int i; uint32_t j; int id; uint32_t tmpThreads[MAX_NUM_THREADS]; int globalNumThreads=0; uint32_t numThreads=0; struct bstrList* tokens; struct bstrList* subtokens; const AffinityDomain* domain; tokens = bsplit(q,'@'); for (i=0;iqty;i++) { subtokens = bsplit(tokens->entry[i],':'); if ( subtokens->qty == 2 ) { domain = affinity_getDomain(subtokens->entry[0]); if (!domain) { ERROR_PRINT(Unknown domain ##%s##,bdata(subtokens->entry[0])); } numThreads = bstr_to_cpuset_physical(tmpThreads, 
subtokens->entry[1]); for (j=0; j= domain->numberOfProcessors)) { id = (tmpThreads[j]/domain->numberOfCores) + (tmpThreads[j]%domain->numberOfCores) * cpuid_topology.numThreadsPerCore; threads[globalNumThreads++] = domain->processorList[id]; } else { ERROR_PRINT(Too many threads requested. Avaialable 0-%d,domain->numberOfProcessors-1); } } } else { ERROR_PLAIN_PRINT(Parse Error); } bstrListDestroy(subtokens); } bstrListDestroy(tokens); return globalNumThreads; } #define PRINT_EXPR_ERR printf("SYNTAX ERROR: Expression must have the format E::[:chunk size>:]\n") uint32_t bstr_to_cpuset_expression(uint32_t* threads, const_bstring qi) { int i; uint32_t j; bstring q = (bstring) qi; int globalNumThreads=0; uint32_t numThreads=0; struct bstrList* tokens; struct bstrList* subtokens; const AffinityDomain* domain; bdelete (q, 0, 2); tokens = bsplit(q,'@'); for (i=0;iqty;i++) { subtokens = bsplit(tokens->entry[i],':'); if ( subtokens->qty == 2 ) { domain = affinity_getDomain(subtokens->entry[0]); if (!domain) { ERROR_PRINT(Unknown domain ##%s##,bdata(subtokens->entry[0])); } numThreads = str2int(bdata(subtokens->entry[1])); if (numThreads > domain->numberOfProcessors) { ERROR_PRINT(Invalid processor id requested. Avaialable 0-%d, domain->numberOfProcessors-1); } for (j=0; jprocessorList[j]; } } else if ( subtokens->qty == 4 ) { int counter; int currentId = 0; int startId = 0; int chunksize = str2int(bdata(subtokens->entry[2])); int stride = str2int(bdata(subtokens->entry[3])); domain = affinity_getDomain(subtokens->entry[0]); if (!domain) { ERROR_PRINT(Unknown domain ##%s##,bdata(subtokens->entry[0])); } numThreads = str2int(bdata(subtokens->entry[1])); if (numThreads > domain->numberOfProcessors) { ERROR_PRINT(Invalid number of processors requested. 
Available 0-%d, domain->numberOfProcessors-1); } counter = 0; for (j=0; jprocessorList[counter+i]; } counter += stride; if (counter >= domain->numberOfProcessors) { counter = 0; } } } else { PRINT_EXPR_ERR; ERROR_PLAIN_PRINT(Parse Error); } bstrListDestroy(subtokens); } bstrListDestroy(tokens); return globalNumThreads; } uint32_t bstr_to_cpuset_scatter(uint32_t* threads, const_bstring qi) { int domainId = 0; int id = 0; int threadId = 0; bstring q = (bstring) qi; bstring domaintag; int globalNumThreads=0; struct bstrList* subtokens; int numberOfDomains = 0; AffinityDomain* domain; AffinityDomain* tmpDomainPtr; domain = (AffinityDomain*) malloc(cpuid_topology.numHWThreads * sizeof(AffinityDomain)); subtokens = bsplit(q,':'); if ( subtokens->qty == 2 ) { for(int i =0;;i++) { domaintag = bformat("%s%d",bdata(subtokens->entry[0]),i); tmpDomainPtr = (AffinityDomain*) affinity_getDomain(domaintag); if (tmpDomainPtr == NULL) { break; } else { memcpy(domain+i,tmpDomainPtr,sizeof(AffinityDomain)); numberOfDomains++; } } threads[globalNumThreads++] = domain[domainId].processorList[0]; for (uint32_t i=1; inumberOfCores) + (threadId%domain->numberOfCores) * cpuid_topology.numThreadsPerCore; threads[globalNumThreads++] = domain[domainId].processorList[id]; } } else { PRINT_EXPR_ERR; ERROR_PLAIN_PRINT(Parse Error); } bstrListDestroy(subtokens); free(domain); return globalNumThreads; } #define CPUSET_ERROR \ if (cpuid_isInCpuset()) { \ ERROR_PLAIN_PRINT(You are running inside a cpuset. 
In cpusets only logical pinning inside set is allowed!); \ } int bstr_to_cpuset(int* threadsIN, const_bstring q) { uint32_t i; int num=0; int cpuMapping[cpuid_topology.numHWThreads]; cpu_set_t cpu_set; uint32_t numThreads; bstring domainStr = bformat("NSCM"); const_bstring scatter = bformat("scatter"); struct bstrList* tokens; CPU_ZERO(&cpu_set); sched_getaffinity(0,sizeof(cpu_set_t), &cpu_set); uint32_t* threads = (uint32_t*) threadsIN; if (binchr (q, 0, domainStr) != BSTR_ERR) { CPUSET_ERROR; if (binstr (q, 0 , scatter ) != BSTR_ERR) { numThreads = bstr_to_cpuset_scatter(threads,q); } else if (bstrchr (q, 'E') != BSTR_ERR) { numThreads = bstr_to_cpuset_expression(threads,q); } else { numThreads = bstr_to_cpuset_logical(threads,q); } } else if (bstrchr (q, 'L') != BSTR_ERR) { uint32_t count = cpu_count(&cpu_set); tokens = bsplit(q,':'); numThreads = bstr_to_cpuset_physical(threads,tokens->entry[1]); for (i=0; i < cpuid_topology.numHWThreads; i++) { if (CPU_ISSET(i,&cpu_set)) { cpuMapping[num++]=i; } } for (i=0; i < numThreads; i++) { if (!(threads[i] >= count)) { threads[i] = cpuMapping[threads[i]]; } else { fprintf(stderr, "Available CPUs: "); for (int j=0; j< num-1;j++) { fprintf(stderr, "%d,", cpuMapping[j]); } fprintf(stderr, "%d\n", cpuMapping[num-1]); ERROR_PRINT(Index %d out of range.,threads[i]); } } bstrListDestroy(tokens); } else { CPUSET_ERROR; numThreads = bstr_to_cpuset_physical(threads,q); } bdestroy(domainStr); return (int) numThreads; } void bstr_to_eventset(StrUtilEventSet* set, const_bstring q) { int i; struct bstrList* tokens; struct bstrList* subtokens; tokens = bsplit(q,','); set->numberOfEvents = tokens->qty; set->events = (StrUtilEvent*) malloc(set->numberOfEvents * sizeof(StrUtilEvent)); for (i=0;iqty;i++) { subtokens = bsplit(tokens->entry[i],':'); if ( subtokens->qty != 2 ) { fprintf(stderr, "Cannot parse event string %s, probably missing counter name\n" ,bdata(tokens->entry[i])); fprintf(stderr, "Format: :,...\n"); msr_finalize(); 
pci_finalize(); exit(EXIT_FAILURE); } else { set->events[i].eventName = bstrcpy(subtokens->entry[0]); set->events[i].counterName = bstrcpy(subtokens->entry[1]); } bstrListDestroy(subtokens); } bstrListDestroy(tokens); } FILE* bstr_to_outstream(const_bstring argString, bstring filter) { int i; char* cstr; FILE* STREAM; struct bstrList* tokens; bstring base; bstring suffix = bfromcstr("."); bstring filename; /* configure filter */ tokens = bsplit(argString,'.'); if (tokens->qty < 2) { fprintf(stderr, "Outputfile has no filetype suffix!\n"); fprintf(stderr, "Add suffix .txt for raw output or any supported filter suffix.\n"); exit(EXIT_FAILURE); } base = bstrcpy(tokens->entry[0]); if (biseqcstr(tokens->entry[1],"txt")) { bassigncstr(filter, "NO"); } else { bassigncstr(filter, TOSTRING(LIKWIDFILTERPATH)); bconchar(filter,'/'); bconcat(filter,tokens->entry[1]); } bconcat(suffix,tokens->entry[1]); bstrListDestroy(tokens); tokens = bsplit(base,'_'); if (tokens->qty < 1) { ERROR_PLAIN_PRINT(Error in parsing file string); } filename = bstrcpy(tokens->entry[0]); for (i=1; iqty; i++) { if (biseqcstr(tokens->entry[i],"%j")) { cstr = getenv("PBS_JOBID"); if (cstr != NULL) { bcatcstr(filename, "_"); bcatcstr(filename, cstr); } } else if (biseqcstr(tokens->entry[i],"%r")) { cstr = getenv("PMI_RANK"); if (cstr == NULL) { cstr = getenv("OMPI_COMM_WORLD_RANK"); } if (cstr != NULL) { bcatcstr(filename, "_"); bcatcstr(filename, cstr); } } else if (biseqcstr(tokens->entry[i],"%h")) { cstr = (char*) malloc(HOST_NAME_MAX * sizeof(char)); gethostname(cstr,HOST_NAME_MAX); bcatcstr(filename, "_"); bcatcstr(filename, cstr); free(cstr); } else if (biseqcstr(tokens->entry[i],"%p")) { bstring pid = bformat("_%d",getpid()); bconcat(filename, pid); bdestroy(pid); } else { ERROR_PLAIN_PRINT(Unsupported placeholder in filename!); } } if (biseqcstr(filter,"NO")) { bconcat(filename, suffix); } else { bcatcstr(filter, " "); bcatcstr(filename, ".tmp"); bconcat(filter, filename); } 
bstrListDestroy(tokens); STREAM = fopen(bdata(filename),"w"); bdestroy(filename); bdestroy(suffix); bdestroy(base); return STREAM; } uint64_t bstr_to_doubleSize(const_bstring str, DataType type) { bstring unit = bmidstr(str, blength(str)-2, 2); bstring sizeStr = bmidstr(str, 0, blength(str)-2); uint64_t sizeU = str2int(bdata(sizeStr)); uint64_t junk = 0; uint64_t bytesize = 0; switch (type) { case SINGLE: case SINGLE_RAND: bytesize = sizeof(float); break; case DOUBLE: case DOUBLE_RAND: bytesize = sizeof(double); break; } if (biseqcstr(unit, "kB")) { junk = (sizeU *1024)/bytesize; } else if (biseqcstr(unit, "MB")) { junk = (sizeU *1024*1024)/bytesize; } else if (biseqcstr(unit, "GB")) { junk = (sizeU *1024*1024*1024)/bytesize; } return junk; } void bstr_to_interval(const_bstring str, struct timespec* interval) { int size; int pos; bstring ms = bformat("ms"); if ((pos = bstrrchr (str, 's')) != BSTR_ERR) { if (pos != (blength(str)-1)) { fprintf(stderr, "You need to specify a time unit s or ms like 200ms\n"); msr_finalize(); exit(EXIT_FAILURE); } /* unit is ms */ if (binstrr (str, blength(str), ms) != BSTR_ERR) { bstring sizeStr = bmidstr(str, 0, blength(str)-2); size = str2int(bdata(sizeStr)); if (size >= 1000) { interval->tv_sec = size/1000; interval->tv_nsec = (size%1000) * 1.E06; } else { interval->tv_sec = 0L; interval->tv_nsec = size * 1.E06; } } /* unit is s */ else { bstring sizeStr = bmidstr(str, 0, blength(str)-1); size = str2int(bdata(sizeStr)); interval->tv_sec = size; interval->tv_nsec = 0L; } } else { fprintf(stderr, "You need to specify a time unit s or ms like 200ms\n"); msr_finalize(); exit(EXIT_FAILURE); } } void bstr_to_workgroup(Workgroup* group, const_bstring str, DataType type, int numberOfStreams) { uint32_t i; int parseStreams = 0; bstring threadInfo; bstring streams= bformat("0"); struct bstrList* tokens; struct bstrList* subtokens; const AffinityDomain* domain; /* split the workgroup into the thread and the streams part */ tokens = 
bsplit(str,'-'); if (tokens->qty == 2) { threadInfo = bstrcpy(tokens->entry[0]); streams = bstrcpy(tokens->entry[1]); parseStreams = 1; } else if (tokens->qty == 1) { threadInfo = bstrcpy(tokens->entry[0]); } else { ERROR_PLAIN_PRINT(Error in parsing workgroup string); } bstrListDestroy (tokens); tokens = bsplit(threadInfo,':'); if (tokens->qty == 5) { uint32_t maxNumThreads; int chunksize; int stride; int counter; int currentId = 0; int startId = 0; domain = affinity_getDomain(tokens->entry[0]); if (domain == NULL) { fprintf(stderr, "Error: Domain %s not available on current machine.\nTry likwid-bench -p for supported domains.", bdata(tokens->entry[0])); exit(EXIT_FAILURE); } group->size = bstr_to_doubleSize(tokens->entry[1], type); group->numberOfThreads = str2int(bdata(tokens->entry[2])); chunksize = str2int(bdata(tokens->entry[3])); stride = str2int(bdata(tokens->entry[4])); maxNumThreads = (domain->numberOfProcessors / stride) * chunksize; if (group->numberOfThreads > maxNumThreads) { fprintf(stderr, "Error: Domain %s supports only up to %d threads with used expression.\n", bdata(tokens->entry[0]), maxNumThreads); exit(EXIT_FAILURE); } group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int)); counter = chunksize; for (i=0; inumberOfThreads; i++) { if (counter) { group->processorIds[i] = domain->processorList[currentId++]; } else { startId += stride; currentId = startId; group->processorIds[i] = domain->processorList[currentId++]; counter = chunksize; } counter--; } } else if (tokens->qty == 3) { domain = affinity_getDomain(tokens->entry[0]); if (domain == NULL) { fprintf(stderr, "Error: Domain %s not available on current machine.\n", bdata(tokens->entry[0])); fprintf(stderr, "Try likwid-bench -p for supported domains.\n"); exit(EXIT_FAILURE); } group->size = bstr_to_doubleSize(tokens->entry[1], type); group->numberOfThreads = str2int(bdata(tokens->entry[2])); if (group->numberOfThreads > domain->numberOfProcessors) { fprintf(stderr, "Error: 
Domain %s supports only up to %d threads.\n", bdata(tokens->entry[0]),domain->numberOfProcessors); exit(EXIT_FAILURE); } group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int)); for (i=0; inumberOfThreads; i++) { group->processorIds[i] = domain->processorList[i]; } } else if (tokens->qty == 2) { domain = affinity_getDomain(tokens->entry[0]); if (domain == NULL) { fprintf(stderr, "Error: Domain %s not available on current machine.\n", bdata(tokens->entry[0])); fprintf(stderr, "Try likwid-bench -p for supported domains.\n"); exit(EXIT_FAILURE); } group->size = bstr_to_doubleSize(tokens->entry[1], type); group->numberOfThreads = domain->numberOfProcessors; group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int)); for (i=0; inumberOfThreads; i++) { group->processorIds[i] = domain->processorList[i]; } } else { ERROR_PLAIN_PRINT(Error in parsing workgroup string); } bstrListDestroy(tokens); /* parse stream list */ if (parseStreams) { tokens = bsplit(streams,','); if (tokens->qty < numberOfStreams) { ERROR_PRINT(Testcase requires at least %d streams, numberOfStreams); } group->streams = (Stream*) malloc(numberOfStreams * sizeof(Stream)); for (i=0;i<(uint32_t) tokens->qty;i++) { subtokens = bsplit(tokens->entry[i],':'); if ( subtokens->qty == 3 ) { int index = str2int(bdata(subtokens->entry[0])); if (index >= numberOfStreams) { ERROR_PRINT(Stream Index %d out of range,index); } group->streams[index].domain = bstrcpy(subtokens->entry[1]); group->streams[index].offset = str2int(bdata(subtokens->entry[2])); } else if ( subtokens->qty == 2 ) { int index = str2int(bdata(subtokens->entry[0])); if (index >= numberOfStreams) { ERROR_PRINT(Stream Index %d out of range,index); } group->streams[index].domain = bstrcpy(subtokens->entry[1]); group->streams[index].offset = 0; } else { ERROR_PLAIN_PRINT(Error in parsing event string); } bstrListDestroy(subtokens); } bstrListDestroy(tokens); } else { group->streams = (Stream*) malloc(numberOfStreams * 
sizeof(Stream)); for (i=0; i< (uint32_t)numberOfStreams; i++) { group->streams[i].domain = domain->tag; group->streams[i].offset = 0; } } group->size /= numberOfStreams; } #define INIT_SECURE_INPUT_LENGTH 256 bstring bSecureInput (int maxlen, char* vgcCtx) { int i, m, c = 1; bstring b, t; int termchar = 0; if (!vgcCtx) return NULL; b = bfromcstralloc (INIT_SECURE_INPUT_LENGTH, ""); for (i=0; ; i++) { if (termchar == c) { break; } else if ((maxlen > 0) && (i >= maxlen)) { b = NULL; return b; } else { c = *(vgcCtx++); } if (EOF == c) { break; } if (i+1 >= b->mlen) { /* Double size, but deal with unusual case of numeric overflows */ if ((m = b->mlen << 1) <= b->mlen && (m = b->mlen + 1024) <= b->mlen && (m = b->mlen + 16) <= b->mlen && (m = b->mlen + 1) <= b->mlen) { t = NULL; } else { t = bfromcstralloc (m, ""); } if (t) { memcpy (t->data, b->data, i); } bdestroy (b); /* Clean previous buffer */ b = t; if (!b) { return b; } } b->data[i] = (unsigned char) c; } i--; b->slen = i; b->data[i] = (unsigned char) '\0'; return b; } int bJustifyCenter (bstring b, int width) { unsigned char space = ' '; int alignSpace = (width - b->slen) / 2; int restSpace = (width - b->slen) % 2; if (width <= 0) return -__LINE__; if (b->slen <= width) { binsertch (b, 0, alignSpace, space); } binsertch (b, b->slen , alignSpace+restSpace, space); return BSTR_OK; } likwid-3.1.3/perl/Template/Stash/XS.pm000644 137545 027340 00000006427 12336605216 020015 0ustar00unrz254unrz000000 000000 #============================================================= -*-Perl-*- # # Template::Stash::XS # # DESCRIPTION # # Perl bootstrap for XS module. Inherits methods from # Template::Stash when not implemented in the XS module. 
#
#========================================================================

package Template::Stash::XS;

use strict;
use warnings;
use Template;
use Template::Stash;

our $AUTOLOAD;

# Load the compiled half of the module at compile time.  Methods that the
# XS code does not implement are inherited from Template::Stash.
BEGIN {
    require DynaLoader;
    our @ISA = ('DynaLoader', 'Template::Stash');

    eval { Template::Stash::XS->bootstrap($Template::VERSION) };

    if ($@) {
        die "Couldn't load Template::Stash::XS $Template::VERSION:\n\n$@\n";
    }
}

# Explicit no-op destructor, so that object destruction is never routed
# through AUTOLOAD below.
sub DESTROY {
    1;
}

# Catch calls to missing methods here, so that perl does not go looking
# for *.al files.  The error is reported at the position of the caller.
sub AUTOLOAD {
    my ($invocant) = @_;
    my (undef, $file, $line) = caller(0);

    # bare method name, without the package qualifier
    (my $method = $AUTOLOAD) =~ s/.*:://;

    # "Some::Class=HASH(0x...)" becomes "Some::Class"
    (my $package = $invocant) =~ s/=.*//;

    die "Can't locate object method \"$method\""
      . " via package \"$package\" at $file line $line\n";
}

1;

__END__

=head1 NAME

Template::Stash::XS - High-speed variable stash written in C

=head1 SYNOPSIS

    use Template;
    use Template::Stash::XS;

    my $stash = Template::Stash::XS->new(\%vars);
    my $tt2   = Template->new({ STASH => $stash });

=head1 DESCRIPTION

The Template::Stash::XS module is an implementation of the
Template::Stash written in C.  The "XS" in the name refers to Perl's
XS extension system for interfacing Perl to C code.  It works just
like the regular Perl implementation of Template::Stash but runs about
twice as fast.

The easiest way to use the XS stash is to configure the Template
Toolkit to use it by default.  You can do this at installation time
(when you run C<perl Makefile.PL>) by answering 'y' to the questions:

    Do you want to build the XS Stash module?      y
    Do you want to use the XS Stash by default?    y

See the F<INSTALL> file distributed with the Template Toolkit for further
details on installation.

If you don't elect to use the XS stash by default then you should use
the C<STASH> configuration item when you create a new Template object.
This should reference an XS stash object that you have created
manually.
use Template; use Template::Stash::XS; my $stash = Template::Stash::XS->new(\%vars); my $tt2 = Template->new({ STASH => $stash }); Alternately, you can set the C<$Template::Config::STASH> package variable like so: use Template; use Template::Config; $Template::Config::STASH = 'Template::Stash::XS'; my $tt2 = Template->new(); The XS stash will then be automatically used. If you want to use the XS stash by default and don't want to re-install the Template Toolkit, then you can manually modify the C