trunk/0000755000175000017500000000000011240605573011546 5ustar benoitbenoittrunk/write_lat.c0000755000175000017500000007311711237107527013722 0ustar benoitbenoit/* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2005 Hewlett Packard, Inc (Grant Grundler) * Copyright (c) 2009 HNR Consulting. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* * $Id$ */ #if HAVE_CONFIG_H #include #endif /* HAVE_CONFIG_H */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "get_clock.h" #define PINGPONG_RDMA_WRID 3 #define VERSION 1.0 #define ALL 1 #define MAX_INLINE 400 static int sl = 0; static int page_size; cycles_t *tstamp; struct user_parameters { const char *servername; int connection_type; int mtu; int all; /* run all msg size */ int iters; int tx_depth; int inline_size; int qp_timeout; int gid_index; /* if value not negative, we use gid AND gid_index=value */ }; struct report_options { int unsorted; int histogram; int cycles; /* report delta's in cycles, not microsec's */ }; struct pingpong_context { struct ibv_context *context; struct ibv_pd *pd; struct ibv_mr *mr; struct ibv_cq *cq; struct ibv_qp *qp; void *buf; volatile char *post_buf; volatile char *poll_buf; int size; int tx_depth; struct ibv_sge list; struct ibv_send_wr wr; union ibv_gid dgid; }; struct pingpong_dest { int lid; int qpn; int psn; unsigned rkey; unsigned long long vaddr; union ibv_gid dgid; }; static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port) { struct ibv_port_attr attr; if (ibv_query_port(ctx->context, port, &attr)) return 0; return attr.lid; } static struct ibv_device *pp_find_dev(const char *ib_devname) { struct ibv_device **dev_list; struct ibv_device *ib_dev = NULL; dev_list = ibv_get_device_list(NULL); if (!ib_devname) { ib_dev = dev_list[0]; if (!ib_dev) fprintf(stderr, "No IB devices found\n"); } else { for (; (ib_dev = *dev_list); ++dev_list) if (!strcmp(ibv_get_device_name(ib_dev), ib_devname)) break; if (!ib_dev) fprintf(stderr, "IB device %s not found\n", ib_devname); } return ib_dev; } #define KEY_MSG_SIZE (sizeof "0000:000000:000000:00000000:0000000000000000") #define KEY_PRINT_FMT "%04x:%06x:%06x:%08x:%016Lx" #define KEY_MSG_SIZE_GID (sizeof 
"0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00") #define KEY_PRINT_FMT_GID "%04x:%06x:%06x:%08x:%016Lx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x" static int pp_write_keys(int sockfd, const struct pingpong_dest *my_dest, struct user_parameters *user_parm) { if (user_parm->gid_index < 0) { char msg[KEY_MSG_SIZE]; sprintf(msg, KEY_PRINT_FMT, my_dest->lid, my_dest->qpn, my_dest->psn, my_dest->rkey, my_dest->vaddr); if (write(sockfd, msg, sizeof msg) != sizeof msg) { perror("client write"); fprintf(stderr, "Couldn't send local address\n"); return -1; } return 0; } else { char msg[KEY_MSG_SIZE_GID]; sprintf(msg, KEY_PRINT_FMT_GID, my_dest->lid, my_dest->qpn, my_dest->psn, my_dest->rkey, my_dest->vaddr, my_dest->dgid.raw[0], my_dest->dgid.raw[1], my_dest->dgid.raw[2], my_dest->dgid.raw[3], my_dest->dgid.raw[4], my_dest->dgid.raw[5], my_dest->dgid.raw[6], my_dest->dgid.raw[7], my_dest->dgid.raw[8], my_dest->dgid.raw[9], my_dest->dgid.raw[10], my_dest->dgid.raw[11], my_dest->dgid.raw[12], my_dest->dgid.raw[13], my_dest->dgid.raw[14], my_dest->dgid.raw[15]); if (write(sockfd, msg, sizeof msg) != sizeof msg) { perror("client write"); fprintf(stderr, "Couldn't send local address\n"); return -1; } return 0; } } static int pp_read_keys(int sockfd, const struct pingpong_dest *my_dest, struct pingpong_dest *rem_dest, struct user_parameters *user_parm) { if (user_parm->gid_index < 0) { int parsed; char msg[KEY_MSG_SIZE]; if (read(sockfd, msg, sizeof msg) != sizeof msg) { perror("pp_read_keys"); fprintf(stderr, "Couldn't read remote address\n"); return -1; } parsed = sscanf(msg, KEY_PRINT_FMT, &rem_dest->lid, &rem_dest->qpn, &rem_dest->psn, &rem_dest->rkey, &rem_dest->vaddr); if (parsed != 5) { fprintf(stderr, "Couldn't parse line <%.*s>\n", (int)sizeof msg, msg); return -1; } return 0; } else { char msg[KEY_MSG_SIZE_GID]; if (read(sockfd, msg, sizeof msg) != sizeof msg) { perror("pp_read_keys"); 
fprintf(stderr, "Couldn't read remote address\n"); return -1; } char *pstr = msg, *term; char tmp[20]; int i; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->lid = (int)strtol(tmp, NULL, 16); // LID pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->qpn = (int)strtol(tmp, NULL, 16); // QPN pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->psn = (int)strtol(tmp, NULL, 16); // PSN pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->rkey = (unsigned)strtol(tmp, NULL, 16); // RKEY pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->vaddr = strtoull(tmp, NULL, 16); // VA for (i = 0; i < 15; ++i) { pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->dgid.raw[i] = (unsigned char)strtoll(tmp, NULL, 16); } pstr += term - pstr + 1; strcpy(tmp, pstr); rem_dest->dgid.raw[15] = (unsigned char)strtoll(tmp, NULL, 16); return 0; } } static int pp_client_connect(const char *servername, int port) { struct addrinfo *res, *t; struct addrinfo hints = { .ai_family = AF_UNSPEC, .ai_socktype = SOCK_STREAM }; char *service; int n; int sockfd = -1; if (asprintf(&service, "%d", port) < 0) return -1; n = getaddrinfo(servername, service, &hints, &res); if (n < 0) { fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port); return n; } for (t = res; t; t = t->ai_next) { sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { if (!connect(sockfd, t->ai_addr, t->ai_addrlen)) break; close(sockfd); sockfd = -1; } } freeaddrinfo(res); if (sockfd < 0) { fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port); return sockfd; } return sockfd; } static int pp_client_exch_dest(int sockfd, const 
struct pingpong_dest *my_dest, struct pingpong_dest *rem_dest, struct user_parameters *user_parm)
{
	/* Client side of the exchange: send our keys, then read the peer's. */
	if (pp_write_keys(sockfd, my_dest, user_parm))
		return -1;

	return pp_read_keys(sockfd, my_dest, rem_dest, user_parm);
}

/*
 * Listen on the given TCP port and accept a single connection.
 * Returns the connected socket, or a negative value on failure.
 */
static int pp_server_connect(int port)
{
	struct addrinfo *res, *t;
	struct addrinfo hints = {
		.ai_flags    = AI_PASSIVE,
		.ai_family   = AF_UNSPEC,
		.ai_socktype = SOCK_STREAM
	};
	char *service;
	int sockfd = -1, connfd;
	int n;

	if (asprintf(&service, "%d", port) < 0)
		return -1;

	n = getaddrinfo(NULL, service, &hints, &res);
	free(service);	/* only needed for the lookup */

	/* getaddrinfo() reports failure with any non-zero value. */
	if (n != 0) {
		fprintf(stderr, "%s for port %d\n", gai_strerror(n), port);
		return n < 0 ? n : -1;
	}

	for (t = res; t; t = t->ai_next) {
		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
		if (sockfd >= 0) {
			n = 1;
			setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n);

			if (!bind(sockfd, t->ai_addr, t->ai_addrlen))
				break;
			close(sockfd);
			sockfd = -1;
		}
	}

	freeaddrinfo(res);

	if (sockfd < 0) {
		fprintf(stderr, "Couldn't listen to port %d\n", port);
		return sockfd;
	}

	if (listen(sockfd, 1) < 0) {
		perror("server listen");
		fprintf(stderr, "Couldn't listen to port %d\n", port);
		close(sockfd);
		return -1;
	}

	connfd = accept(sockfd, NULL, NULL);
	if (connfd < 0) {
		perror("server accept");
		fprintf(stderr, "accept() failed\n");
	}

	/* The listening socket is no longer needed either way. */
	close(sockfd);
	return connfd;
}

/* Server side of the exchange: read the peer's keys, then send ours. */
static int pp_server_exch_dest(int sockfd, const struct pingpong_dest *my_dest,
			       struct pingpong_dest* rem_dest,
			       struct user_parameters *user_parm)
{
	if (pp_read_keys(sockfd, my_dest, rem_dest, user_parm))
		return -1;

	return pp_write_keys(sockfd, my_dest, user_parm);
}

/*
 * Allocate the test context: a zeroed buffer of 2 * size bytes, the device
 * context, PD, MR, CQ and a QP moved to the INIT state, plus the RDMA write
 * work request template.  When user_parm->mtu is 0 a default MTU is chosen
 * and stored back into user_parm.
 *
 * Returns the new context, or NULL on failure; everything acquired up to
 * the point of failure is released again.
 */
static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
					    int tx_depth, int port,
					    struct user_parameters *user_parm)
{
	struct pingpong_context *ctx;
	struct ibv_device_attr device_attr;
	struct ibv_qp_init_attr init_attr;
	struct ibv_qp_attr attr;

	ctx = malloc(sizeof *ctx);
	if (!ctx)
		return NULL;

	ctx->size     = size;
	ctx->tx_depth = tx_depth;

	ctx->buf = memalign(page_size, size * 2);
	if (!ctx->buf) {
		fprintf(stderr, "Couldn't allocate work buf.\n");
		goto err_free_ctx;
	}

	memset(ctx->buf, 0, size * 2);

	/* Last byte of the first half is posted, last byte of the second polled. */
	ctx->post_buf = (char*)ctx->buf + (size - 1);
	ctx->poll_buf = (char*)ctx->buf + (2 * size - 1);

	ctx->context = ibv_open_device(ib_dev);
	if (!ctx->context) {
		fprintf(stderr, "Couldn't get context for %s\n",
			ibv_get_device_name(ib_dev));
		goto err_free_buf;
	}

	if (user_parm->mtu == 0) {/*user did not ask for specific mtu */
		if (ibv_query_device(ctx->context, &device_attr)) {
			fprintf(stderr, "Failed to query device props\n");
			goto err_close_device;
		}
		if (device_attr.vendor_part_id == 23108 || user_parm->gid_index > -1) {
			user_parm->mtu = 1024;
		} else {
			user_parm->mtu = 2048;
		}
	}

	ctx->pd = ibv_alloc_pd(ctx->context);
	if (!ctx->pd) {
		fprintf(stderr, "Couldn't allocate PD\n");
		goto err_close_device;
	}

	ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size * 2,
			     IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
	if (!ctx->mr) {
		fprintf(stderr, "Couldn't allocate MR\n");
		goto err_dealloc_pd;
	}

	ctx->cq = ibv_create_cq(ctx->context, tx_depth, NULL, NULL, 0);
	if (!ctx->cq) {
		fprintf(stderr, "Couldn't create CQ\n");
		goto err_dereg_mr;
	}

	memset(&init_attr, 0, sizeof(struct ibv_qp_init_attr));
	init_attr.send_cq = ctx->cq;
	init_attr.recv_cq = ctx->cq;
	init_attr.cap.max_send_wr = tx_depth;
	/* Work around: driver doesnt support
	 * recv_wr = 0 */
	init_attr.cap.max_recv_wr = 1;
	init_attr.cap.max_send_sge = 1;
	init_attr.cap.max_recv_sge = 1;
	init_attr.cap.max_inline_data = user_parm->inline_size;
	if (user_parm->connection_type==1) {
		init_attr.qp_type = IBV_QPT_UC;
	} else {
		init_attr.qp_type = IBV_QPT_RC;
	}

	ctx->qp = ibv_create_qp(ctx->pd, &init_attr);
	if (!ctx->qp) {
		fprintf(stderr, "Couldn't create QP\n");
		goto err_destroy_cq;
	}

	memset(&attr, 0, sizeof attr);
	attr.qp_state        = IBV_QPS_INIT;
	attr.pkey_index      = 0;
	attr.port_num        = port;
	attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE;

	if (ibv_modify_qp(ctx->qp, &attr,
			  IBV_QP_STATE |
			  IBV_QP_PKEY_INDEX |
			  IBV_QP_PORT |
			  IBV_QP_ACCESS_FLAGS)) {
		fprintf(stderr, "Failed to modify QP to INIT\n");
		goto err_destroy_qp;
	}

	memset(&ctx->wr, 0, sizeof(ctx->wr));
	ctx->wr.wr_id   = PINGPONG_RDMA_WRID;
	ctx->wr.sg_list = &ctx->list;
	ctx->wr.num_sge = 1;
	ctx->wr.opcode  = IBV_WR_RDMA_WRITE;
	ctx->wr.next    = NULL;

	return ctx;

	/* Unwind in the reverse order of acquisition. */
err_destroy_qp:
	ibv_destroy_qp(ctx->qp);
err_destroy_cq:
	ibv_destroy_cq(ctx->cq);
err_dereg_mr:
	ibv_dereg_mr(ctx->mr);
err_dealloc_pd:
	ibv_dealloc_pd(ctx->pd);
err_close_device:
	ibv_close_device(ctx->context);
err_free_buf:
	free(ctx->buf);
err_free_ctx:
	free(ctx);
	return NULL;
}

static int
pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn, struct pingpong_dest *dest,struct user_parameters *user_parm) { struct ibv_qp_attr attr; memset(&attr, 0, sizeof(struct ibv_qp_attr)); attr.qp_state = IBV_QPS_RTR; switch (user_parm->mtu) { case 256 : attr.path_mtu = IBV_MTU_256; break; case 512 : attr.path_mtu = IBV_MTU_512; break; case 1024 : attr.path_mtu = IBV_MTU_1024; break; case 2048 : attr.path_mtu = IBV_MTU_2048; break; case 4096 : attr.path_mtu = IBV_MTU_4096; break; } printf("Mtu : %d\n", user_parm->mtu); attr.dest_qp_num = dest->qpn; attr.rq_psn = dest->psn; if (user_parm->connection_type==0) { attr.max_dest_rd_atomic = 1; attr.min_rnr_timer = 12; } if (user_parm->gid_index < 0) { attr.ah_attr.is_global = 0; attr.ah_attr.dlid = dest->lid; attr.ah_attr.sl = sl; } else { attr.ah_attr.is_global = 1; attr.ah_attr.grh.dgid = dest->dgid; attr.ah_attr.grh.hop_limit = 1; attr.ah_attr.sl = 0; } attr.ah_attr.src_path_bits = 0; attr.ah_attr.port_num = port; if (user_parm->connection_type == 0) { if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MIN_RNR_TIMER | IBV_QP_MAX_DEST_RD_ATOMIC)) { fprintf(stderr, "Failed to modify RC QP to RTR\n"); return 1; } attr.timeout = user_parm->qp_timeout; attr.retry_cnt = 7; attr.rnr_retry = 7; } else { if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN)) { fprintf(stderr, "Failed to modify UC QP to RTR\n"); return 1; } } attr.qp_state = IBV_QPS_RTS; attr.sq_psn = my_psn; if (user_parm->connection_type == 0) { attr.max_rd_atomic = 1; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_MAX_QP_RD_ATOMIC)) { fprintf(stderr, "Failed to modify RC QP to RTS\n"); return 1; } } else { if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) { fprintf(stderr, "Failed to modify UC QP to RTS\n"); return 1; } } 
return 0; } static int pp_open_port(struct pingpong_context *ctx, const char * servername, int ib_port, int port, struct pingpong_dest *rem_dest,struct user_parameters *user_parm) { char addr_fmt[] = "%8s address: LID %#04x QPN %#06x PSN %#06x RKey %#08x VAddr %#016Lx\n"; struct pingpong_dest my_dest; int sockfd; int rc; union ibv_gid gid; /* Create connection between client and server. * We do it by exchanging data over a TCP socket connection. */ if (user_parm->gid_index != -1) { int err=0; err = ibv_query_gid (ctx->context, ib_port, user_parm->gid_index, &gid); if (err) { return -1; } ctx->dgid=gid; } my_dest.lid = pp_get_local_lid(ctx, ib_port); my_dest.dgid = gid; my_dest.qpn = ctx->qp->qp_num; my_dest.psn = lrand48() & 0xffffff; if (user_parm->gid_index < 0) {/*We do not fail test upon lid in RDMAoE/Eth conf*/ if (!my_dest.lid) { fprintf(stderr, "Local lid 0x0 detected. Is an SM running? If you are running on an RMDAoE interface you must use GIDs\n"); return 1; } } my_dest.rkey = ctx->mr->rkey; my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size; printf(addr_fmt, "local", my_dest.lid, my_dest.qpn, my_dest.psn, my_dest.rkey, my_dest.vaddr); if (user_parm->gid_index > -1) { printf(" GID: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", my_dest.dgid.raw[0],my_dest.dgid.raw[1], my_dest.dgid.raw[2], my_dest.dgid.raw[3], my_dest.dgid.raw[4], my_dest.dgid.raw[5], my_dest.dgid.raw[6], my_dest.dgid.raw[7], my_dest.dgid.raw[8], my_dest.dgid.raw[9], my_dest.dgid.raw[10], my_dest.dgid.raw[11], my_dest.dgid.raw[12], my_dest.dgid.raw[13], my_dest.dgid.raw[14], my_dest.dgid.raw[15]); } sockfd = servername ? pp_client_connect(servername, port) : pp_server_connect(port); if (sockfd < 0) { printf("pp_connect_sock(%s,%d) failed (%d)!\n", servername, port, sockfd); return sockfd; } rc = servername ? 
pp_client_exch_dest(sockfd, &my_dest, rem_dest, user_parm) : pp_server_exch_dest(sockfd, &my_dest, rem_dest, user_parm); if (rc) return rc; printf(addr_fmt, "remote", rem_dest->lid, rem_dest->qpn, rem_dest->psn, rem_dest->rkey, rem_dest->vaddr); if (user_parm->gid_index > -1) { printf(" GID: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", rem_dest->dgid.raw[0],rem_dest->dgid.raw[1], rem_dest->dgid.raw[2], rem_dest->dgid.raw[3], rem_dest->dgid.raw[4], rem_dest->dgid.raw[5], rem_dest->dgid.raw[6], rem_dest->dgid.raw[7], rem_dest->dgid.raw[8], rem_dest->dgid.raw[9], rem_dest->dgid.raw[10], rem_dest->dgid.raw[11], rem_dest->dgid.raw[12], rem_dest->dgid.raw[13], rem_dest->dgid.raw[14], rem_dest->dgid.raw[15]); } if ((rc = pp_connect_ctx(ctx, ib_port, my_dest.psn, rem_dest,user_parm))) return rc; /* An additional handshake is required *after* moving qp to RTR. * Arbitrarily reuse exch_dest for this purpose. */ rc = servername ? pp_client_exch_dest(sockfd, &my_dest, rem_dest, user_parm) : pp_server_exch_dest(sockfd, &my_dest, rem_dest, user_parm); if (rc) return rc; if (write(sockfd, "done", sizeof "done") != sizeof "done"){ perror("write"); fprintf(stderr, "Couldn't write to socket\n"); return 1; } close(sockfd); return 0; } static void usage(const char *argv0) { printf("Usage:\n"); printf(" %s start a server and wait for connection\n", argv0); printf(" %s connect to server at \n", argv0); printf("\n"); printf("Options:\n"); printf(" -p, --port= listen on/connect to port (default 18515)\n"); printf(" -c, --connection= connection type RC/UC (default RC)\n"); printf(" -m, --mtu= mtu size (256 - 4096. 
default for hermon is 2048)\n"); printf(" -d, --ib-dev= use IB device (default first device found)\n"); printf(" -i, --ib-port= use port of IB device (default 1)\n"); printf(" -s, --size= size of message to exchange (default 1)\n"); printf(" -a, --all Run sizes from 2 till 2^23\n"); printf(" -t, --tx-depth= size of tx queue (default 50)\n"); printf(" -n, --iters= number of exchanges (at least 2, default 1000)\n"); printf(" -I, --inline_size= max size of message to be sent in inline mode (default 400)\n"); printf(" -u, --qp-timeout= QP timeout, timeout value is 4 usec * 2 ^(timeout), default 14\n"); printf(" -S, --sl= SL (default 0)\n"); printf(" -x, --gid-index= test uses GID with GID index taken from command line (for RDMAoE index should be 0)\n"); printf(" -C, --report-cycles report times in cpu cycle units (default microseconds)\n"); printf(" -H, --report-histogram print out all results (default print summary only)\n"); printf(" -U, --report-unsorted (implies -H) print out unsorted results (default sorted)\n"); printf(" -V, --version display version number\n"); printf(" -F, --CPU-freq do not fail even if cpufreq_ondemand module is loaded\n"); } /* * When there is an * odd number of samples, the median is the middle number. * even number of samples, the median is the mean of the * two middle numbers. 
* */ static inline cycles_t get_median(int n, cycles_t delta[]) { if ((n - 1) % 2) return(delta[n / 2] + delta[n / 2 - 1]) / 2; else return delta[n / 2]; } static int cycles_compare(const void * aptr, const void * bptr) { const cycles_t *a = aptr; const cycles_t *b = bptr; if (*a < *b) return -1; if (*a > *b) return 1; return 0; } static void print_report(struct report_options * options, unsigned int iters, cycles_t *tstamp, int size, int no_cpu_freq_fail) { double cycles_to_units; cycles_t median; unsigned int i; const char* units; cycles_t *delta = malloc((iters - 1) * sizeof *delta); if (!delta) { perror("malloc"); return; } for (i = 0; i < iters - 1; ++i) delta[i] = tstamp[i + 1] - tstamp[i]; if (options->cycles) { cycles_to_units = 1; units = "cycles"; } else { cycles_to_units = get_cpu_mhz(no_cpu_freq_fail); units = "usec"; } if (options->unsorted) { printf("#, %s\n", units); for (i = 0; i < iters - 1; ++i) printf("%d, %g\n", i + 1, delta[i] / cycles_to_units / 2); } qsort(delta, iters - 1, sizeof *delta, cycles_compare); if (options->histogram) { printf("#, %s\n", units); for (i = 0; i < iters - 1; ++i) printf("%d, %g\n", i + 1, delta[i] / cycles_to_units / 2); } median = get_median(iters - 1, delta); printf("%7d %d %7.2f %7.2f %7.2f\n", size,iters,delta[0] / cycles_to_units / 2, delta[iters - 2] / cycles_to_units / 2,median / cycles_to_units / 2); free(delta); } int run_iter(struct pingpong_context *ctx, struct user_parameters *user_param, struct pingpong_dest *rem_dest, int size) { struct ibv_qp *qp; struct ibv_send_wr *wr; volatile char *poll_buf; volatile char *post_buf; int scnt, ccnt, rcnt; int iters; int tx_depth; int inline_size; iters = user_param->iters; tx_depth = user_param->tx_depth; inline_size = user_param->inline_size; wr = &ctx->wr; ctx->list.addr = (uintptr_t) ctx->buf ; ctx->list.length = size; ctx->list.lkey = ctx->mr->lkey; wr->wr.rdma.remote_addr = rem_dest->vaddr; wr->wr.rdma.rkey = rem_dest->rkey; if (size > inline_size) {/* 
complaince to perf_main */ ctx->wr.send_flags = IBV_SEND_SIGNALED; } else { ctx->wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE; } scnt = 0; rcnt = 0; ccnt = 0; if(user_param->all == ALL) { post_buf = (char*)ctx->buf + size - 1; poll_buf = (char*)ctx->buf + 8388608 + size - 1; } else { poll_buf = ctx->poll_buf; post_buf = ctx->post_buf; } qp = ctx->qp; /* Done with setup. Start the test. */ while (scnt < iters || ccnt < iters || rcnt < iters) { /* Wait till buffer changes. */ if (rcnt < user_param->iters && !(scnt < 1 && user_param->servername)) { ++rcnt; while (*poll_buf != (char)rcnt) ; /* Here the data is already in the physical memory. If we wanted to actually use it, we may need a read memory barrier here. */ } if (scnt < user_param->iters) { struct ibv_send_wr *bad_wr; tstamp[scnt] = get_cycles(); *post_buf = (char)++scnt; if (ibv_post_send(qp, wr, &bad_wr)) { fprintf(stderr, "Couldn't post send: scnt=%d\n", scnt); return 11; } } if (ccnt < user_param->iters) { struct ibv_wc wc; int ne; ++ccnt; do { ne = ibv_poll_cq(ctx->cq, 1, &wc); } while (ne == 0); if (ne < 0) { fprintf(stderr, "poll CQ failed %d\n", ne); return 12; } if (wc.status != IBV_WC_SUCCESS) { fprintf(stderr, "Completion wth error at %s:\n", user_param->servername ? 
"client" : "server"); fprintf(stderr, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); fprintf(stderr, "scnt=%d, rcnt=%d, ccnt=%d\n", scnt, rcnt, ccnt); return 13; } } } return(0); } int main(int argc, char *argv[]) { const char *ib_devname = NULL; int port = 18515; int ib_port = 1; int size = 2; int i = 0; struct report_options report = {}; struct pingpong_context *ctx; struct pingpong_dest rem_dest; struct ibv_device *ib_dev; struct user_parameters user_param; int no_cpu_freq_fail = 0; /* init default values to user's parameters */ memset(&user_param, 0, sizeof(struct user_parameters)); user_param.mtu = 0; /* signal choose default by device */ user_param.iters = 1000; user_param.tx_depth = 50; user_param.servername = NULL; user_param.inline_size = MAX_INLINE; user_param.qp_timeout = 14; user_param.gid_index = -1; /*gid will not be used*/ /* Parameter parsing. */ while (1) { int c; static struct option long_options[] = { { .name = "port", .has_arg = 1, .val = 'p' }, { .name = "connection", .has_arg = 1, .val = 'c' }, { .name = "mtu", .has_arg = 1, .val = 'm' }, { .name = "ib-dev", .has_arg = 1, .val = 'd' }, { .name = "ib-port", .has_arg = 1, .val = 'i' }, { .name = "size", .has_arg = 1, .val = 's' }, { .name = "iters", .has_arg = 1, .val = 'n' }, { .name = "tx-depth", .has_arg = 1, .val = 't' }, { .name = "inline_size", .has_arg = 1, .val = 'I' }, { .name = "qp-timeout", .has_arg = 1, .val = 'u' }, { .name = "sl", .has_arg = 1, .val = 'S' }, { .name = "gid-index", .has_arg = 1, .val = 'x' }, { .name = "all", .has_arg = 0, .val = 'a' }, { .name = "report-cycles", .has_arg = 0, .val = 'C' }, { .name = "report-histogram",.has_arg = 0, .val = 'H' }, { .name = "report-unsorted",.has_arg = 0, .val = 'U' }, { .name = "version", .has_arg = 0, .val = 'V' }, { .name = "CPU-freq", .has_arg = 0, .val = 'F' }, { 0 } }; c = getopt_long(argc, argv, "p:c:m:d:i:s:n:t:I:u:S:x:aCHUVF", long_options, NULL);///cpufreq if (c == -1) break; switch (c) { case 'p': port = 
strtol(optarg, NULL, 0); if (port < 0 || port > 65535) { usage(argv[0]); return 1; } break; case 'c': if (strcmp("UC",optarg)==0) user_param.connection_type=1; /* default is 0 for any other option RC*/ break; case 'm': user_param.mtu = strtol(optarg, NULL, 0); break; case 'a': user_param.all = ALL; break; case 'V': printf("perftest version : %.2f\n",VERSION); return 0; break; case 'd': ib_devname = strdupa(optarg); break; case 'i': ib_port = strtol(optarg, NULL, 0); if (ib_port < 0) { usage(argv[0]); return 2; } break; case 's': size = strtol(optarg, NULL, 0); if (size < 1) { usage(argv[0]); return 3; } break; case 't': user_param.tx_depth = strtol(optarg, NULL, 0); if (user_param.tx_depth < 1) { usage(argv[0]); return 4; } break; case 'I': user_param.inline_size = strtol(optarg, NULL, 0); if (user_param.inline_size > MAX_INLINE) { usage(argv[0]); return 7; } break; case 'n': user_param.iters = strtol(optarg, NULL, 0); if (user_param.iters < 2) { usage(argv[0]); return 5; } break; case 'C': report.cycles = 1; break; case 'H': report.histogram = 1; break; case 'U': report.unsorted = 1; break; case 'F': no_cpu_freq_fail = 1; break; case 'u': user_param.qp_timeout = strtol(optarg, NULL, 0); break; case 'S': sl = strtol(optarg, NULL, 0); if (sl > 15) { usage(argv[0]); return 6; } break; case 'x': user_param.gid_index = strtol(optarg, NULL, 0); if (user_param.gid_index > 63) { usage(argv[0]); return 1; } break; default: usage(argv[0]); return 7; } } if (optind == argc - 1) user_param.servername = strdupa(argv[optind]); else if (optind < argc) { usage(argv[0]); return 6; } /* * Done with parameter parsing. Perform setup. 
*/ tstamp = malloc(user_param.iters * sizeof *tstamp); if (!tstamp) { perror("malloc"); return 10; } printf("------------------------------------------------------------------\n"); printf(" RDMA_Write Latency Test\n"); printf("Inline data is used up to %d bytes message\n", user_param.inline_size); if (user_param.connection_type==0) { printf("Connection type : RC\n"); } else { printf("Connection type : UC\n"); } if (user_param.gid_index > -1) { printf("Using GID to support RDMAoE configuration. Refer to port type as Ethernet, default MTU 1024B\n"); } if (user_param.all == ALL) { /*since we run all sizes */ size = 8388608; /*2^23 */ } srand48(getpid() * time(NULL)); page_size = sysconf(_SC_PAGESIZE); ib_dev = pp_find_dev(ib_devname); if (!ib_dev) return 7; ctx = pp_init_ctx(ib_dev, size, user_param.tx_depth, ib_port,&user_param); if (!ctx) return 8; if (pp_open_port(ctx, user_param.servername, ib_port, port, &rem_dest,&user_param)) return 9; printf("------------------------------------------------------------------\n"); printf(" #bytes #iterations t_min[usec] t_max[usec] t_typical[usec]\n"); if (user_param.all == ALL) { for (i = 1; i < 24 ; ++i) { size = 1 << i; if(run_iter(ctx, &user_param, &rem_dest, size)) return 17; print_report(&report, user_param.iters, tstamp, size, no_cpu_freq_fail); } } else { if(run_iter(ctx, &user_param, &rem_dest, size)) return 18; print_report(&report, user_param.iters, tstamp, size, no_cpu_freq_fail); } printf("------------------------------------------------------------------\n"); free(tstamp); return 0; } trunk/rdma_bw.c0000755000175000017500000010042111240600240013310 0ustar benoitbenoit/* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2009 HNR Consulting. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* * $Id$ */ #if HAVE_CONFIG_H # include #endif /* HAVE_CONFIG_H */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "get_clock.h" #define PINGPONG_RDMA_WRID 3 static int sl = 0; static int page_size; static pid_t pid; struct pingpong_context { struct ibv_context *context; struct ibv_pd *pd; struct ibv_mr *mr; struct ibv_cq *rcq; struct ibv_cq *scq; struct ibv_qp *qp; struct ibv_comp_channel *ch; void *buf; unsigned size; int tx_depth; struct ibv_sge list; struct ibv_send_wr wr; }; struct pingpong_dest { int lid; int qpn; int psn; unsigned rkey; unsigned long long vaddr; }; struct pp_data { int port; int ib_port; unsigned size; int tx_depth; int use_cma; int sockfd; char *servername; struct pingpong_dest my_dest; struct pingpong_dest *rem_dest; struct ibv_device *ib_dev; struct rdma_event_channel *cm_channel; struct rdma_cm_id *cm_id; }; static void pp_post_recv(struct pingpong_context *); static void pp_wait_for_done(struct pingpong_context *); static void pp_send_done(struct pingpong_context *); static void pp_wait_for_start(struct pingpong_context *); static void pp_send_start(struct pingpong_context *); static void pp_close_cma(struct pp_data ); static struct pingpong_context *pp_init_ctx(void *, struct pp_data *); static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port) { struct ibv_port_attr attr; if (ibv_query_port(ctx->context, port, &attr)) return 0; return attr.lid; } static struct pingpong_context *pp_client_connect(struct pp_data *data) { struct addrinfo *res, *t; struct addrinfo hints = { .ai_family = AF_UNSPEC, .ai_socktype = SOCK_STREAM }; char *service; int n; int sockfd = -1; int n_retries = 10; struct rdma_cm_event *event; struct sockaddr_in sin; struct pingpong_context *ctx = NULL; struct rdma_conn_param conn_param; if (asprintf(&service, "%d", data->port) < 0) goto err4; n = getaddrinfo(data->servername, service, &hints, &res); 
if (n < 0) { fprintf(stderr, "%d:%s: %s for %s:%d\n", pid, __func__, gai_strerror(n), data->servername, data->port); goto err4; } if (data->use_cma) { sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr; sin.sin_family = AF_INET; sin.sin_port = htons(data->port); retry_addr: if (rdma_resolve_addr(data->cm_id, NULL, (struct sockaddr *)&sin, 2000)) { fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n", pid, __func__ ); goto err2; } if (rdma_get_cm_event(data->cm_channel, &event)) goto err2; if (event->event == RDMA_CM_EVENT_ADDR_ERROR && n_retries-- > 0) { rdma_ack_cm_event (event); goto retry_addr; } if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) { fprintf(stderr, "%d:%s: unexpected CM event %d\n", pid, __func__, event->event); goto err1; } rdma_ack_cm_event(event); retry_route: if (rdma_resolve_route(data->cm_id, 2000)) { fprintf(stderr, "%d:%s: rdma_resolve_route failed\n", pid, __func__); goto err2; } if (rdma_get_cm_event(data->cm_channel, &event)) goto err2; if (event->event == RDMA_CM_EVENT_ROUTE_ERROR && n_retries-- > 0) { rdma_ack_cm_event(event); goto retry_route; } if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) { fprintf(stderr, "%d:%s: unexpected CM event %d\n", pid, __func__, event->event); rdma_ack_cm_event(event); goto err1; } rdma_ack_cm_event(event); ctx = pp_init_ctx(data->cm_id, data); if (!ctx) { fprintf(stderr, "%d:%s: pp_init_ctx failed\n", pid, __func__); goto err2; } data->my_dest.psn = lrand48() & 0xffffff; data->my_dest.qpn = 0; data->my_dest.rkey = ctx->mr->rkey; data->my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size; memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; conn_param.initiator_depth = 1; conn_param.retry_count = 5; conn_param.private_data = &data->my_dest; conn_param.private_data_len = sizeof(data->my_dest); if (rdma_connect(data->cm_id, &conn_param)) { fprintf(stderr, "%d:%s: rdma_connect failure\n", pid, __func__); goto err2; } if (rdma_get_cm_event(data->cm_channel, 
&event)) goto err2; if (event->event != RDMA_CM_EVENT_ESTABLISHED) { fprintf(stderr, "%d:%s: unexpected CM event %d\n", pid, __func__, event->event); goto err1; } if (!event->param.conn.private_data || (event->param.conn.private_data_len < sizeof(*data->rem_dest))) { fprintf(stderr, "%d:%s: bad private data ptr %p len %d\n", pid, __func__, event->param.conn.private_data, event->param.conn.private_data_len); goto err1; } data->rem_dest = malloc(sizeof *data->rem_dest); if (!data->rem_dest) goto err1; memcpy(data->rem_dest, event->param.conn.private_data, sizeof(*data->rem_dest)); rdma_ack_cm_event(event); } else { for (t = res; t; t = t->ai_next) { sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { if (!connect(sockfd, t->ai_addr, t->ai_addrlen)) break; close(sockfd); sockfd = -1; } } if (sockfd < 0) { fprintf(stderr, "%d:%s: Couldn't connect to %s:%d\n", pid, __func__, data->servername, data->port); goto err3; } ctx = pp_init_ctx(data->ib_dev, data); if (!ctx) goto err3; data->sockfd = sockfd; } freeaddrinfo(res); return ctx; err1: rdma_ack_cm_event(event); err2: rdma_destroy_id(data->cm_id); rdma_destroy_event_channel(data->cm_channel); err3: freeaddrinfo(res); err4: return NULL; } static int pp_client_exch_dest(struct pp_data *data) { char msg[sizeof "0000:000000:000000:00000000:0000000000000000"]; int parsed; if (!data->use_cma) { sprintf(msg, "%04x:%06x:%06x:%08x:%016Lx", data->my_dest.lid, data->my_dest.qpn, data->my_dest.psn, data->my_dest.rkey, data->my_dest.vaddr); if (write(data->sockfd, msg, sizeof msg) != sizeof msg) { perror("client write"); fprintf(stderr, "%d:%s: Couldn't send local address\n", pid, __func__); goto err; } if (read(data->sockfd, msg, sizeof msg) != sizeof msg) { perror("client read"); fprintf(stderr, "%d:%s: Couldn't read remote address\n", pid, __func__); goto err; } if (data->rem_dest != NULL) free(data->rem_dest); data->rem_dest = malloc(sizeof *data->rem_dest); if (!data->rem_dest) goto err; parsed = 
sscanf(msg, "%x:%x:%x:%x:%Lx", &data->rem_dest->lid, &data->rem_dest->qpn, &data->rem_dest->psn, &data->rem_dest->rkey, &data->rem_dest->vaddr); if (parsed != 5) { fprintf(stderr, "%d:%s: Couldn't parse line <%.*s>\n", pid, __func__, (int)sizeof msg, msg); free(data->rem_dest); goto err; } } return 0; err: return 1; } static struct pingpong_context *pp_server_connect(struct pp_data *data) { struct addrinfo *res, *t; struct addrinfo hints = { .ai_flags = AI_PASSIVE, .ai_family = AF_UNSPEC, .ai_socktype = SOCK_STREAM }; char *service; int sockfd = -1, connfd; int n; struct rdma_cm_event *event; struct sockaddr_in sin; struct pingpong_context *ctx = NULL; struct rdma_cm_id *child_cm_id; struct rdma_conn_param conn_param; if (asprintf(&service, "%d", data->port) < 0) goto err5; if ( (n = getaddrinfo(NULL, service, &hints, &res)) < 0 ) { fprintf(stderr, "%d:%s: %s for port %d\n", pid, __func__, gai_strerror(n), data->port); goto err5; } if (data->use_cma) { sin.sin_addr.s_addr = 0; sin.sin_family = AF_INET; sin.sin_port = htons(data->port); if (rdma_bind_addr(data->cm_id, (struct sockaddr *)&sin)) { fprintf(stderr, "%d:%s: rdma_bind_addr failed\n", pid, __func__); goto err3; } if (rdma_listen(data->cm_id, 0)) { fprintf(stderr, "%d:%s: rdma_listen failed\n", pid, __func__); goto err3; } if (rdma_get_cm_event(data->cm_channel, &event)) goto err3; if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST) { fprintf(stderr, "%d:%s: bad event waiting for connect request %d\n", pid, __func__, event->event); goto err2; } if (!event->param.conn.private_data || (event->param.conn.private_data_len < sizeof(*data->rem_dest))) { fprintf(stderr, "%d:%s: bad private data len %d\n", pid, __func__, event->param.conn.private_data_len); goto err2; } data->rem_dest = malloc(sizeof *data->rem_dest); if (!data->rem_dest) goto err2; memcpy(data->rem_dest, event->param.conn.private_data, sizeof(*data->rem_dest)); child_cm_id = (struct rdma_cm_id *)event->id; ctx = pp_init_ctx(child_cm_id, data); if 
(!ctx) { free(data->rem_dest); goto err1; } data->my_dest.psn = lrand48() & 0xffffff; data->my_dest.qpn = 0; data->my_dest.rkey = ctx->mr->rkey; data->my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size; memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; conn_param.initiator_depth = 1; conn_param.private_data = &data->my_dest; conn_param.private_data_len = sizeof(data->my_dest); if (rdma_accept(child_cm_id, &conn_param)) { fprintf(stderr, "%d:%s: rdma_accept failed\n", pid, __func__); goto err1; } rdma_ack_cm_event(event); if (rdma_get_cm_event(data->cm_channel, &event)) { fprintf(stderr, "%d:%s: rdma_get_cm_event error\n", pid, __func__); rdma_destroy_id(child_cm_id); goto err3; } if (event->event != RDMA_CM_EVENT_ESTABLISHED) { fprintf(stderr, "%d:%s: bad event waiting for established %d\n", pid, __func__, event->event); goto err1; } rdma_ack_cm_event(event); } else { for (t = res; t; t = t->ai_next) { sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { n = 1; setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n); if (!bind(sockfd, t->ai_addr, t->ai_addrlen)) break; close(sockfd); sockfd = -1; } } if (sockfd < 0) { fprintf(stderr, "%d:%s: Couldn't listen to port %d\n", pid, __func__, data->port); goto err4; } listen(sockfd, 1); connfd = accept(sockfd, NULL, 0); if (connfd < 0) { perror("server accept"); fprintf(stderr, "%d:%s: accept() failed\n", pid, __func__); close(sockfd); goto err4; } close(sockfd); ctx = pp_init_ctx(data->ib_dev, data); if (!ctx) goto err4; data->sockfd = connfd; } freeaddrinfo(res); return ctx; err1: rdma_destroy_id(child_cm_id); err2: rdma_ack_cm_event(event); err3: rdma_destroy_id(data->cm_id); rdma_destroy_event_channel(data->cm_channel); err4: freeaddrinfo(res); err5: return NULL; } static int pp_server_exch_dest(struct pp_data *data) { char msg[sizeof "0000:000000:000000:00000000:0000000000000000"]; int parsed; int n; if (!data->use_cma) { n = read(data->sockfd, msg, sizeof 
msg); if (n != sizeof msg) { perror("server read"); fprintf(stderr, "%d:%s: %d/%d Couldn't read remote address\n", pid, __func__, n, (int) sizeof msg); goto err; } if (data->rem_dest != NULL) free(data->rem_dest); data->rem_dest = malloc(sizeof *data->rem_dest); if (!data->rem_dest) goto err; parsed = sscanf(msg, "%x:%x:%x:%x:%Lx", &data->rem_dest->lid, &data->rem_dest->qpn, &data->rem_dest->psn, &data->rem_dest->rkey, &data->rem_dest->vaddr); if (parsed != 5) { fprintf(stderr, "%d:%s: Couldn't parse line <%.*s>\n", pid, __func__, (int)sizeof msg, msg); free(data->rem_dest); goto err; } sprintf(msg, "%04x:%06x:%06x:%08x:%016Lx", data->my_dest.lid, data->my_dest.qpn, data->my_dest.psn, data->my_dest.rkey, data->my_dest.vaddr); if (write(data->sockfd, msg, sizeof msg) != sizeof msg) { perror("server write"); fprintf(stderr, "%d:%s: Couldn't send local address\n", pid, __func__); free(data->rem_dest); goto err; } } return 0; err: return 1; } static struct pingpong_context *pp_init_ctx(void *ptr, struct pp_data *data) { struct pingpong_context *ctx; struct ibv_device *ib_dev; struct rdma_cm_id *cm_id; ctx = malloc(sizeof *ctx); if (!ctx) return NULL; ctx->size = data->size; ctx->tx_depth = data->tx_depth; ctx->buf = memalign(page_size, ctx->size * 2); if (!ctx->buf) { fprintf(stderr, "%d:%s: Couldn't allocate work buf.\n", pid, __func__); return NULL; } memset(ctx->buf, 0, ctx->size * 2); if (data->use_cma) { cm_id = (struct rdma_cm_id *)ptr; ctx->context = cm_id->verbs; if (!ctx->context) { fprintf(stderr, "%d:%s: Unbound cm_id!!\n", pid, __func__); return NULL; } } else { ib_dev = (struct ibv_device *)ptr; ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { fprintf(stderr, "%d:%s: Couldn't get context for %s\n", pid, __func__, ibv_get_device_name(ib_dev)); return NULL; } } ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "%d:%s: Couldn't allocate PD\n", pid, __func__); return NULL; } /* We dont really want IBV_ACCESS_LOCAL_WRITE, but IB 
spec says: * The Consumer is not allowed to assign Remote Write or Remote Atomic to * a Memory Region that has not been assigned Local Write. */ ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, ctx->size * 2, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "%d:%s: Couldn't allocate MR\n", pid, __func__); return NULL; } ctx->ch = ibv_create_comp_channel(ctx->context); if (!ctx->ch) { fprintf(stderr, "%d:%s: Couldn't create comp channel\n", pid, __func__); return NULL; } ctx->rcq = ibv_create_cq(ctx->context, 1, NULL, NULL, 0); if (!ctx->rcq) { fprintf(stderr, "%d:%s: Couldn't create recv CQ\n", pid, __func__); return NULL; } ctx->scq = ibv_create_cq(ctx->context, ctx->tx_depth, ctx, ctx->ch, 0); if (!ctx->scq) { fprintf(stderr, "%d:%s: Couldn't create send CQ\n", pid, __func__); return NULL; } struct ibv_qp_init_attr attr = { .send_cq = ctx->scq, .recv_cq = ctx->rcq, .cap = { .max_send_wr = ctx->tx_depth, /* Work around: driver doesnt support * recv_wr = 0 */ .max_recv_wr = 1, .max_send_sge = 1, .max_recv_sge = 1, .max_inline_data = 0 }, .qp_type = IBV_QPT_RC }; if (data->use_cma) { if (rdma_create_qp(cm_id, ctx->pd, &attr)) { fprintf(stderr, "%d:%s: Couldn't create QP\n", pid, __func__); return NULL; } ctx->qp = cm_id->qp; pp_post_recv(ctx); return ctx; } else { ctx->qp = ibv_create_qp(ctx->pd, &attr); if (!ctx->qp) { fprintf(stderr, "%d:%s: Couldn't create QP\n", pid, __func__); return NULL; } { struct ibv_qp_attr attr; attr.qp_state = IBV_QPS_INIT; attr.pkey_index = 0; attr.port_num = data->ib_port; attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "%d:%s: Failed to modify QP to INIT\n", pid, __func__); return NULL; } } return ctx; } } static int pp_connect_ctx(struct pingpong_context *ctx, struct pp_data data) { struct ibv_qp_attr attr; memset(&attr, 0, sizeof attr); attr.qp_state = IBV_QPS_RTR; attr.path_mtu = 
IBV_MTU_2048; attr.dest_qp_num = data.rem_dest->qpn; attr.rq_psn = data.rem_dest->psn; attr.max_dest_rd_atomic = 1; attr.min_rnr_timer = 12; attr.ah_attr.is_global = 0; attr.ah_attr.dlid = data.rem_dest->lid; attr.ah_attr.sl = sl; attr.ah_attr.src_path_bits = 0; attr.ah_attr.port_num = data.ib_port; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)) { fprintf(stderr, "%d:%s: Failed to modify QP to RTR\n", pid, __func__); return 1; } attr.qp_state = IBV_QPS_RTS; attr.timeout = 14; attr.retry_cnt = 7; attr.rnr_retry = 7; attr.sq_psn = data.my_dest.psn; attr.max_rd_atomic = 1; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC)) { fprintf(stderr, "%d:%s: Failed to modify QP to RTS\n", pid, __func__); return 1; } return 0; } static void pp_post_recv(struct pingpong_context *ctx) { struct ibv_sge list; struct ibv_recv_wr wr, *bad_wr; int rc; list.addr = (uintptr_t) ctx->buf; list.length = 1; list.lkey = ctx->mr->lkey; wr.next = NULL; wr.wr_id = 0xdeadbeef; wr.sg_list = &list; wr.num_sge = 1; rc = ibv_post_recv(ctx->qp, &wr, &bad_wr); if (rc) { perror("ibv_post_recv"); fprintf(stderr, "%d:%s: ibv_post_recv failed %d\n", pid, __func__, rc); } } static void pp_wait_for_done(struct pingpong_context *ctx) { struct ibv_wc wc; int ne; do { usleep(500); ne = ibv_poll_cq(ctx->rcq, 1, &wc); } while (ne == 0); if (wc.status) fprintf(stderr, "%d:%s: bad wc status %d\n", pid, __func__, wc.status); if (!(wc.opcode & IBV_WC_RECV)) fprintf(stderr, "%d:%s: bad wc opcode %d\n", pid, __func__, wc.opcode); if (wc.wr_id != 0xdeadbeef) fprintf(stderr, "%d:%s: bad wc wr_id 0x%x\n", pid, __func__, (int)wc.wr_id); } static void pp_send_done(struct pingpong_context *ctx) { struct ibv_send_wr *bad_wr; struct ibv_wc wc; int ne; ctx->list.addr = (uintptr_t) ctx->buf; ctx->list.length = 1; 
ctx->list.lkey = ctx->mr->lkey; ctx->wr.wr_id = 0xcafebabe; ctx->wr.sg_list = &ctx->list; ctx->wr.num_sge = 1; ctx->wr.opcode = IBV_WR_SEND; ctx->wr.send_flags = IBV_SEND_SIGNALED; ctx->wr.next = NULL; if (ibv_post_send(ctx->qp, &ctx->wr, &bad_wr)) { fprintf(stderr, "%d:%s: ibv_post_send failed\n", pid, __func__); return; } do { usleep(500); ne = ibv_poll_cq(ctx->scq, 1, &wc); } while (ne == 0); if (wc.status) fprintf(stderr, "%d:%s: bad wc status %d\n", pid, __func__, wc.status); if (wc.opcode != IBV_WC_SEND) fprintf(stderr, "%d:%s: bad wc opcode %d\n", pid, __func__, wc.opcode); if (wc.wr_id != 0xcafebabe) fprintf(stderr, "%d:%s: bad wc wr_id 0x%x\n", pid, __func__, (int)wc.wr_id); } static void pp_wait_for_start(struct pingpong_context *ctx) { struct ibv_wc wc; int ne; do { usleep(500); ne = ibv_poll_cq(ctx->rcq, 1, &wc); } while (ne == 0); if (wc.status) fprintf(stderr, "%d:%s: bad wc status %d\n", pid, __func__, wc.status); if (!(wc.opcode & IBV_WC_RECV)) fprintf(stderr, "%d:%s: bad wc opcode %d\n", pid, __func__, wc.opcode); if (wc.wr_id != 0xdeadbeef) fprintf(stderr, "%d:%s: bad wc wr_id 0x%x\n", pid, __func__, (int)wc.wr_id); pp_post_recv(ctx); } static void pp_send_start(struct pingpong_context *ctx) { struct ibv_send_wr *bad_wr; struct ibv_wc wc; int ne; ctx->list.addr = (uintptr_t) ctx->buf; ctx->list.length = 1; ctx->list.lkey = ctx->mr->lkey; ctx->wr.wr_id = 0xabbaabba; ctx->wr.sg_list = &ctx->list; ctx->wr.num_sge = 1; ctx->wr.opcode = IBV_WR_SEND; ctx->wr.send_flags = IBV_SEND_SIGNALED; ctx->wr.next = NULL; if (ibv_post_send(ctx->qp, &ctx->wr, &bad_wr)) { fprintf(stderr, "%d:%s: ibv_post_send failed\n", pid, __func__); return; } do { usleep(500); ne = ibv_poll_cq(ctx->scq, 1, &wc); } while (ne == 0); if (wc.status) fprintf(stderr, "%d:%s: bad wc status %d\n", pid, __func__, wc.status); if (wc.opcode != IBV_WC_SEND) fprintf(stderr, "%d:%s: bad wc opcode %d\n", pid, __func__, wc.opcode); if (wc.wr_id != 0xabbaabba) fprintf(stderr, "%d:%s: bad wc wr_id 
0x%x\n", pid, __func__, (int)wc.wr_id); } static void pp_close_cma(struct pp_data data) { struct rdma_cm_event *event; int rc; if (data.servername) { rc = rdma_disconnect(data.cm_id); if (rc) { perror("rdma_disconnect"); fprintf(stderr, "%d:%s: rdma disconnect error\n", pid, __func__); return; } } rdma_get_cm_event(data.cm_channel, &event); if (event->event != RDMA_CM_EVENT_DISCONNECTED) fprintf(stderr, "%d:%s: unexpected event during disconnect %d\n", pid, __func__, event->event); rdma_ack_cm_event(event); rdma_destroy_id(data.cm_id); rdma_destroy_event_channel(data.cm_channel); } static void usage(const char *argv0) { printf("Usage:\n"); printf(" %s start a server and wait for connection\n", argv0); printf(" %s connect to server at \n", argv0); printf("\n"); printf("Options:\n"); printf(" -p, --port= listen on/connect to port (default 18515)\n"); printf(" -d, --ib-dev= use IB device (default first device found)\n"); printf(" -i, --ib-port= use port of IB device (default 1)\n"); printf(" -s, --size= size of message to exchange (default 65536)\n"); printf(" -t, --tx-depth= size of tx queue (default 100)\n"); printf(" -n, --iters= number of exchanges (at least 2, default 1000)\n"); printf(" -S, --sl= SL (default 0)\n"); printf(" -b, --bidirectional measure bidirectional bandwidth (default unidirectional)\n"); printf(" -c, --cma use RDMA CM\n"); } static void print_report(unsigned int iters, unsigned size, int duplex, cycles_t *tposted, cycles_t *tcompleted) { double cycles_to_units; unsigned long tsize; /* Transferred size, in megabytes */ int i, j; int opt_posted = 0, opt_completed = 0; cycles_t opt_delta; cycles_t t; opt_delta = tcompleted[opt_posted] - tposted[opt_completed]; /* Find the peak bandwidth */ for (i = 0; i < iters; ++i) for (j = i; j < iters; ++j) { t = (tcompleted[j] - tposted[i]) / (j - i + 1); if (t < opt_delta) { opt_delta = t; opt_posted = i; opt_completed = j; } } cycles_to_units = get_cpu_mhz(0) * 1000000; tsize = duplex ? 
2 : 1; tsize = tsize * size; printf("\n%d: Bandwidth peak (#%d to #%d): %g MB/sec\n", pid, opt_posted, opt_completed, tsize * cycles_to_units / opt_delta / 0x100000); printf("%d: Bandwidth average: %g MB/sec\n", pid, tsize * iters * cycles_to_units / (tcompleted[iters - 1] - tposted[0]) / 0x100000); printf("%d: Service Demand peak (#%d to #%d): %ld cycles/KB\n", pid, opt_posted, opt_completed, (unsigned long)opt_delta * 1024 / tsize); printf("%d: Service Demand Avg : %ld cycles/KB\n", pid, (unsigned long)(tcompleted[iters - 1] - tposted[0]) * 1024 / (tsize * iters)); } int main(int argc, char *argv[]) { struct ibv_device **dev_list; struct pingpong_context *ctx = NULL; char *ib_devname = NULL; int iters = 1000; int scnt, ccnt; int duplex = 0; struct ibv_qp *qp; cycles_t *tposted; cycles_t *tcompleted; struct pp_data data = { .port = 18515, .ib_port = 1, .size = 65536, .tx_depth = 100, .use_cma = 0, .servername = NULL, .rem_dest = NULL, .ib_dev = NULL, .cm_channel = NULL, .cm_id = NULL }; /* Parameter parsing. 
*/ while (1) { int c; static struct option long_options[] = { { .name = "port", .has_arg = 1, .val = 'p' }, { .name = "ib-dev", .has_arg = 1, .val = 'd' }, { .name = "ib-port", .has_arg = 1, .val = 'i' }, { .name = "size", .has_arg = 1, .val = 's' }, { .name = "iters", .has_arg = 1, .val = 'n' }, { .name = "tx-depth", .has_arg = 1, .val = 't' }, { .name = "sl", .has_arg = 1, .val = 'S' }, { .name = "bidirectional", .has_arg = 0, .val = 'b' }, { .name = "cma", .has_arg = 0, .val = 'c' }, { 0 } }; c = getopt_long(argc, argv, "p:d:i:s:n:t:S:bc", long_options, NULL); if (c == -1) break; switch (c) { case 'p': data.port = strtol(optarg, NULL, 0); if (data.port < 0 || data.port > 65535) { usage(argv[0]); return 1; } break; case 'd': ib_devname = strdupa(optarg); break; case 'i': data.ib_port = strtol(optarg, NULL, 0); if (data.ib_port < 0) { usage(argv[0]); return 1; } break; case 's': data.size = strtoll(optarg, NULL, 0); if (data.size < 1 || data.size > UINT_MAX / 2) { usage(argv[0]); return 1; } break; case 't': data.tx_depth = strtol(optarg, NULL, 0); if (data.tx_depth < 1) { usage(argv[0]); return 1; } break; case 'n': iters = strtol(optarg, NULL, 0); if (iters < 2) { usage(argv[0]); return 1; } break; case 'S': sl = strtol(optarg, NULL, 0); if (sl > 15) { usage(argv[0]); return 1; } break; case 'b': duplex = 1; break; case 'c': data.use_cma = 1; break; default: usage(argv[0]); return 1; } } if (optind == argc - 1) data.servername = strdupa(argv[optind]); else if (optind < argc) { usage(argv[0]); return 1; } /* Get the PID and prepend it to every output on stdout/stderr * This helps to parse output when multiple client/server are * run from single host */ pid = getpid(); printf("%d: | port=%d | ib_port=%d | size=%d | tx_depth=%d | sl=%d | iters=%d | duplex=%d | cma=%d |\n", pid, data.port, data.ib_port, data.size, data.tx_depth, sl, iters, duplex, data.use_cma); /* Done with parameter parsing. Perform setup. 
*/ srand48(pid * time(NULL)); page_size = sysconf(_SC_PAGESIZE); if (data.use_cma) { data.cm_channel = rdma_create_event_channel(); if (!data.cm_channel) { fprintf(stderr, "%d:%s: rdma_create_event_channel failed\n", pid, __func__); return 1; } if (rdma_create_id(data.cm_channel, &data.cm_id, NULL, RDMA_PS_TCP)) { fprintf(stderr, "%d:%s: rdma_create_id failed\n", pid, __func__); return 1; } if (data.servername) { ctx = pp_client_connect(&data); if (!ctx) return 1; } else { ctx = pp_server_connect(&data); if (!ctx) return 1; } } else { dev_list = ibv_get_device_list(NULL); if (!ib_devname) { data.ib_dev = dev_list[0]; if (!data.ib_dev) { fprintf(stderr, "%d:%s: No IB devices found\n", pid, __func__); return 1; } } else { for (; (data.ib_dev = *dev_list); ++dev_list) if (!strcmp(ibv_get_device_name(data.ib_dev), ib_devname)) break; if (!data.ib_dev) { fprintf(stderr, "%d:%s: IB device %s not found\n", pid, __func__, ib_devname); return 1; } } if (data.servername) { ctx = pp_client_connect(&data); if (!ctx) return 1; } else { ctx = pp_server_connect(&data); if (!ctx) return 1; } data.my_dest.lid = pp_get_local_lid(ctx, data.ib_port); if (!data.my_dest.lid) { fprintf(stderr, "%d:%s: Local lid 0x0 detected. Is an SM running?\n", pid, __func__); return 1; } data.my_dest.qpn = ctx->qp->qp_num; data.my_dest.psn = lrand48() & 0xffffff; data.my_dest.rkey = ctx->mr->rkey; data.my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size; /* Create connection between client and server. * We do it by exchanging data over a TCP socket connection. 
*/ if (data.servername) { if (pp_client_exch_dest(&data)) return 1; } else { if (pp_server_exch_dest(&data)) return 1; } } printf("%d: Local address: LID %#04x, QPN %#06x, PSN %#06x " "RKey %#08x VAddr %#016Lx\n", pid, data.my_dest.lid, data.my_dest.qpn, data.my_dest.psn, data.my_dest.rkey, data.my_dest.vaddr); printf("%d: Remote address: LID %#04x, QPN %#06x, PSN %#06x, " "RKey %#08x VAddr %#016Lx\n\n", pid, data.rem_dest->lid, data.rem_dest->qpn, data.rem_dest->psn, data.rem_dest->rkey, data.rem_dest->vaddr); if (data.use_cma) { /* * Synch up and force the server to wait for the client to send * the first message (MPA requirement). */ if (data.servername) { pp_send_start(ctx); } else { pp_wait_for_start(ctx); } } else { if (pp_connect_ctx(ctx, data)) return 1; /* An additional handshake is required *after* moving qp to RTR. Arbitrarily reuse exch_dest for this purpose. */ if (data.servername) { if (pp_client_exch_dest(&data)) return 1; } else { if (pp_server_exch_dest(&data)) return 1; } } /* For half duplex tests, server just waits for client to exit */ if (!data.servername && !duplex) { if (data.use_cma) { pp_wait_for_done(ctx); pp_send_done(ctx); pp_close_cma(data); } else { pp_server_exch_dest(&data); write(data.sockfd, "done", sizeof "done"); close(data.sockfd); } return 0; } ctx->list.addr = (uintptr_t) ctx->buf; ctx->list.length = ctx->size; ctx->list.lkey = ctx->mr->lkey; ctx->wr.wr.rdma.remote_addr = data.rem_dest->vaddr; ctx->wr.wr.rdma.rkey = data.rem_dest->rkey; ctx->wr.wr_id = PINGPONG_RDMA_WRID; ctx->wr.sg_list = &ctx->list; ctx->wr.num_sge = 1; ctx->wr.opcode = IBV_WR_RDMA_WRITE; ctx->wr.send_flags = IBV_SEND_SIGNALED; ctx->wr.next = NULL; scnt = 0; ccnt = 0; qp = ctx->qp; tposted = malloc(iters * sizeof *tposted); if (!tposted) { perror("malloc"); return 1; } tcompleted = malloc(iters * sizeof *tcompleted); if (!tcompleted) { perror("malloc"); return 1; } /* Done with setup. Start the test. 
*/ while (scnt < iters || ccnt < iters) { while (scnt < iters && scnt - ccnt < data.tx_depth) { struct ibv_send_wr *bad_wr; tposted[scnt] = get_cycles(); if (ibv_post_send(qp, &ctx->wr, &bad_wr)) { fprintf(stderr, "%d:%s: Couldn't post send: scnt=%d\n", pid, __func__, scnt); return 1; } ++scnt; } if (ccnt < iters) { struct ibv_wc wc; int ne; do { ne = ibv_poll_cq(ctx->scq, 1, &wc); } while (ne == 0); tcompleted[ccnt] = get_cycles(); if (ne < 0) { fprintf(stderr, "%d:%s: poll CQ failed %d\n", pid, __func__, ne); return 1; } if (wc.status != IBV_WC_SUCCESS) { fprintf(stderr, "%d:%s: Completion with error at %s:\n", pid, __func__, data.servername ? "client" : "server"); fprintf(stderr, "%d:%s: Failed status %d: wr_id %d\n", pid, __func__, wc.status, (int) wc.wr_id); fprintf(stderr, "%d:%s: scnt=%d, ccnt=%d\n", pid, __func__, scnt, ccnt); return 1; } ccnt += 1; } } if (data.use_cma) { /* This is racy when duplex mode is used*/ pp_send_done(ctx); pp_wait_for_done(ctx); pp_close_cma(data); } else { if (data.servername) pp_client_exch_dest(&data); else pp_server_exch_dest(&data); write(data.sockfd, "done", sizeof "done"); close(data.sockfd); } print_report(iters, data.size, duplex, tposted, tcompleted); free(tposted); free(tcompleted); return 0; } trunk/Makefile0000755000175000017500000000113211234043545013204 0ustar benoitbenoitTESTS = write_bw_postlist rdma_lat rdma_bw send_lat send_bw write_lat write_bw read_lat read_bw UTILS = clock_test all: ${TESTS} ${UTILS} CFLAGS += -Wall -g -D_GNU_SOURCE -O2 EXTRA_FILES = get_clock.c EXTRA_HEADERS = get_clock.h #The following seems to help GNU make on some platforms LOADLIBES += LDFLAGS += ${TESTS}: LOADLIBES += -libverbs -lrdmacm ${TESTS} ${UTILS}: %: %.c ${EXTRA_FILES} ${EXTRA_HEADERS} $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $< ${EXTRA_FILES} $(LOADLIBES) $(LDLIBS) -o ib_$@ clean: $(foreach fname,${TESTS} ${UTILS}, rm -f ib_${fname}) .DELETE_ON_ERROR: .PHONY: all clean 
trunk/perftest.spec0000755000175000017500000000312311240600160014243 0ustar benoitbenoitName: perftest Summary: IB Performance tests Version: 1.2 Release: 1.ofed1.4.2 License: BSD 3-Clause, GPL v2 or later Group: Productivity/Networking/Diagnostic Source: http://www.openfabrics.org/downloads/perftest-1.2.tar.gz Url: http://www.openfabrics.org BuildRoot: %{_tmppath}/%{name}-%{version}-build BuildRequires: libibverbs-devel librdmacm-devel %description gen2 uverbs microbenchmarks %prep %setup -q %build export CFLAGS="$RPM_OPT_FLAGS" %{__make} chmod -x runme %install install -D -m 0755 ib_rdma_lat $RPM_BUILD_ROOT%{_bindir}/ib_rdma_lat install -D -m 0755 ib_rdma_bw $RPM_BUILD_ROOT%{_bindir}/ib_rdma_bw install -D -m 0755 ib_write_lat $RPM_BUILD_ROOT%{_bindir}/ib_write_lat install -D -m 0755 ib_write_bw $RPM_BUILD_ROOT%{_bindir}/ib_write_bw install -D -m 0755 ib_send_lat $RPM_BUILD_ROOT%{_bindir}/ib_send_lat install -D -m 0755 ib_send_bw $RPM_BUILD_ROOT%{_bindir}/ib_send_bw install -D -m 0755 ib_read_lat $RPM_BUILD_ROOT%{_bindir}/ib_read_lat install -D -m 0755 ib_read_bw $RPM_BUILD_ROOT%{_bindir}/ib_read_bw install -D -m 0755 ib_write_bw_postlist $RPM_BUILD_ROOT%{_bindir}/ib_write_bw_postlist install -D -m 0755 ib_clock_test $RPM_BUILD_ROOT%{_bindir}/ib_clock_test %clean rm -rf ${RPM_BUILD_ROOT} %files %defattr(-, root, root) %doc README COPYING runme %_bindir/* %changelog * Mon Jul 09 2007 - hvogel@suse.de - Use correct version * Wed Jul 04 2007 - hvogel@suse.de - Add GPL COPYING file [#289509] * Mon Jul 02 2007 - hvogel@suse.de - Update to the OFED 1.2 version * Fri Jun 22 2007 - hvogel@suse.de - Initial Package, Version 1.1 trunk/runme0000755000175000017500000000055311234043545012623 0ustar benoitbenoit#!/bin/sh # trivial script to launch a server/client test with ssh # must be launched from client # example: runme 10.0.0.1 /home/perftest/rdma_lat -s 10 if [ $# -lt 1 ] ; then echo "Usage: runme " exit 3 fi server=$1 shift ssh $server $* & #give server time to start 
sleep 2 $* $server status=$? wait exit $status trunk/write_bw_postlist.c0000755000175000017500000010631311237107527015506 0ustar benoitbenoit/* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2009 HNR Consulting. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* * $Id$ */ #if HAVE_CONFIG_H # include #endif /* HAVE_CONFIG_H */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "get_clock.h" #define PINGPONG_RDMA_WRID 3 #define VERSION 1.0 #define ALL 1 #define MAX_INLINE 400 #define RC 0 #define UC 1 struct user_parameters { const char *servername; int connection_type; int mtu; int all; /* run all msg size */ int iters; int tx_depth; int numofqps; int maxpostsofqpiniteration; int inline_size; int qp_timeout; int gid_index; /* if value not negative, we use gid AND gid_index=value */ }; struct extended_qp { struct ibv_qp *qp; int scnt, ccnt ; }; static int sl = 0; static int page_size; cycles_t *tposted; cycles_t *tcompleted; struct pingpong_context { struct ibv_context *context; struct ibv_pd *pd; struct ibv_mr *mr; struct ibv_cq *cq; struct ibv_qp **qp; void *buf; unsigned size; int tx_depth; struct ibv_sge list; struct ibv_send_wr wr; int *scnt; int *ccnt ; union ibv_gid dgid; }; struct pingpong_dest { int lid; int qpn; int psn; unsigned rkey; unsigned long long vaddr; union ibv_gid dgid; }; static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port) { struct ibv_port_attr attr; if (ibv_query_port(ctx->context, port, &attr)) return 0; return attr.lid; } static int pp_client_connect(const char *servername, int port) { struct addrinfo *res, *t; struct addrinfo hints = { .ai_family = AF_UNSPEC, .ai_socktype = SOCK_STREAM }; char *service; int n; int sockfd = -1; if (asprintf(&service, "%d", port) < 0) return -1; n = getaddrinfo(servername, service, &hints, &res); if (n < 0) { fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port); return n; } for (t = res; t; t = t->ai_next) { sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { if (!connect(sockfd, t->ai_addr, t->ai_addrlen)) break; close(sockfd); sockfd = -1; } } freeaddrinfo(res); if (sockfd < 0) { fprintf(stderr, "Couldn't 
connect to %s:%d\n", servername, port); return sockfd; } return sockfd; } struct pingpong_dest * pp_client_exch_dest(int sockfd, const struct pingpong_dest *my_dest, struct user_parameters *user_parm) { struct pingpong_dest *rem_dest = NULL; char msg[sizeof "0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00"]; int parsed; sprintf(msg, "%04x:%06x:%06x:%08x:%016Lx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x", my_dest->lid, my_dest->qpn, my_dest->psn,my_dest->rkey,my_dest->vaddr, my_dest->dgid.raw[0], my_dest->dgid.raw[1], my_dest->dgid.raw[2], my_dest->dgid.raw[3], my_dest->dgid.raw[4], my_dest->dgid.raw[5], my_dest->dgid.raw[6], my_dest->dgid.raw[7], my_dest->dgid.raw[8], my_dest->dgid.raw[9], my_dest->dgid.raw[10], my_dest->dgid.raw[11], my_dest->dgid.raw[12], my_dest->dgid.raw[13], my_dest->dgid.raw[14], my_dest->dgid.raw[15]); if (write(sockfd, msg, sizeof msg) != sizeof msg) { perror("client write"); fprintf(stderr, "Couldn't send local address\n"); goto out; } if (read(sockfd, msg, sizeof msg) != sizeof msg) { perror("client read"); fprintf(stderr, "Couldn't read remote address\n"); goto out; } rem_dest = malloc(sizeof *rem_dest); if (!rem_dest) goto out; if (user_parm->gid_index < 0) { parsed = sscanf(msg, "%x:%x:%x:%x:%Lx", &rem_dest->lid, &rem_dest->qpn, &rem_dest->psn, &rem_dest->rkey, &rem_dest->vaddr); if (parsed != 5) { fprintf(stderr, "Couldn't parse line <%.*s>\n",(int)sizeof msg, msg); free(rem_dest); rem_dest = NULL; goto out; } }else{ char *pstr = msg, *term; char tmp[20]; int i; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->lid = (int)strtol(tmp, NULL, 16); // LID pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->qpn = (int)strtol(tmp, NULL, 16); // QPN pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; 
rem_dest->psn = (int)strtol(tmp, NULL, 16); // PSN pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->rkey = (unsigned)strtol(tmp, NULL, 16); // RKEY pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->vaddr = strtoull(tmp, NULL, 16); // VA for (i = 0; i < 15; ++i) { pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->dgid.raw[i] = (unsigned char)strtoll(tmp, NULL, 16); } pstr += term - pstr + 1; strcpy(tmp, pstr); rem_dest->dgid.raw[15] = (unsigned char)strtoll(tmp, NULL, 16); } out: return rem_dest; } int pp_server_connect(int port) { struct addrinfo *res, *t; struct addrinfo hints = { .ai_flags = AI_PASSIVE, .ai_family = AF_UNSPEC, .ai_socktype = SOCK_STREAM }; char *service; int sockfd = -1, connfd; int n; if (asprintf(&service, "%d", port) < 0) return -1; n = getaddrinfo(NULL, service, &hints, &res); if (n < 0) { fprintf(stderr, "%s for port %d\n", gai_strerror(n), port); return n; } for (t = res; t; t = t->ai_next) { sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { n = 1; setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n); if (!bind(sockfd, t->ai_addr, t->ai_addrlen)) break; close(sockfd); sockfd = -1; } } freeaddrinfo(res); if (sockfd < 0) { fprintf(stderr, "Couldn't listen to port %d\n", port); return sockfd; } listen(sockfd, 1); connfd = accept(sockfd, NULL, 0); if (connfd < 0) { perror("server accept"); fprintf(stderr, "accept() failed\n"); close(sockfd); return connfd; } close(sockfd); return connfd; } static struct pingpong_dest *pp_server_exch_dest(int connfd, const struct pingpong_dest *my_dest, struct user_parameters *user_parm) { char msg[sizeof "0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00"]; struct pingpong_dest *rem_dest = NULL; int parsed; int n; n = read(connfd, msg, 
sizeof msg); if (n != sizeof msg) { perror("server read"); fprintf(stderr, "%d/%d: Couldn't read remote address\n", n, (int) sizeof msg); goto out; } rem_dest = malloc(sizeof *rem_dest); if (!rem_dest) goto out; if (user_parm->gid_index < 0) { parsed = sscanf(msg, "%x:%x:%x:%x:%Lx", &rem_dest->lid, &rem_dest->qpn, &rem_dest->psn, &rem_dest->rkey, &rem_dest->vaddr); if (parsed != 5) { fprintf(stderr, "Couldn't parse line <%.*s>\n",(int)sizeof msg, msg); free(rem_dest); rem_dest = NULL; goto out; } }else{ char *pstr = msg, *term; char tmp[20]; int i; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->lid = (int)strtol(tmp, NULL, 16); // LID pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->qpn = (int)strtol(tmp, NULL, 16); // QPN pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->psn = (int)strtol(tmp, NULL, 16); // PSN pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->rkey = (unsigned)strtol(tmp, NULL, 16); // RKEY pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->vaddr = strtoull(tmp, NULL, 16); // VA for (i = 0; i < 15; ++i) { pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->dgid.raw[i] = (unsigned char)strtoll(tmp, NULL, 16); } pstr += term - pstr + 1; strcpy(tmp, pstr); rem_dest->dgid.raw[15] = (unsigned char)strtoll(tmp, NULL, 16); } sprintf(msg, "%04x:%06x:%06x:%08x:%016Lx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x", my_dest->lid, my_dest->qpn, my_dest->psn, my_dest->rkey, my_dest->vaddr, my_dest->dgid.raw[0], my_dest->dgid.raw[1], my_dest->dgid.raw[2], my_dest->dgid.raw[3], my_dest->dgid.raw[4], my_dest->dgid.raw[5], my_dest->dgid.raw[6], my_dest->dgid.raw[7], 
my_dest->dgid.raw[8], my_dest->dgid.raw[9], my_dest->dgid.raw[10], my_dest->dgid.raw[11], my_dest->dgid.raw[12], my_dest->dgid.raw[13], my_dest->dgid.raw[14], my_dest->dgid.raw[15]); if (write(connfd, msg, sizeof msg) != sizeof msg) { perror("server write"); fprintf(stderr, "Couldn't send local address\n"); free(rem_dest); rem_dest = NULL; goto out; } out: return rem_dest; } static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, unsigned size, int tx_depth, int port, struct user_parameters *user_parm) { struct pingpong_context *ctx; struct ibv_device_attr device_attr; int counter; ctx = malloc(sizeof *ctx); if (!ctx) return NULL; ctx->qp = malloc(sizeof (struct ibv_qp*) * user_parm->numofqps ); ctx->size = size; ctx->tx_depth = tx_depth; ctx->scnt = malloc(user_parm->numofqps * sizeof (int)); if (!ctx->scnt) { perror("malloc"); return NULL; } ctx->ccnt = malloc(user_parm->numofqps * sizeof (int)); if (!ctx->ccnt) { perror("malloc"); return NULL; } memset(ctx->scnt, 0, user_parm->numofqps * sizeof (int)); memset(ctx->ccnt, 0, user_parm->numofqps * sizeof (int)); ctx->buf = memalign(page_size, size * 2 * user_parm->numofqps ); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); return NULL; } memset(ctx->buf, 0, size * 2 * user_parm->numofqps); ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(ib_dev)); return NULL; } if (user_parm->mtu == 0) {/*user did not ask for specific mtu */ if (ibv_query_device(ctx->context, &device_attr)) { fprintf(stderr, "Failed to query device props"); return NULL; } if (device_attr.vendor_part_id == 23108 || user_parm->gid_index > -1) { user_parm->mtu = 1024; } else { user_parm->mtu = 2048; } } ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "Couldn't allocate PD\n"); return NULL; } /* We dont really want IBV_ACCESS_LOCAL_WRITE, but IB spec says: * The Consumer is not allowed to assign Remote Write or 
Remote Atomic to * a Memory Region that has not been assigned Local Write. */ ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size * 2 * user_parm->numofqps, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "Couldn't allocate MR\n"); return NULL; } ctx->cq = ibv_create_cq(ctx->context, tx_depth * user_parm->numofqps , NULL, NULL, 0); if (!ctx->cq) { fprintf(stderr, "Couldn't create CQ\n"); return NULL; } for (counter =0 ; counter < user_parm->numofqps ; counter++) { struct ibv_qp_init_attr initattr; struct ibv_qp_attr attr; memset(&initattr, 0, sizeof(struct ibv_qp_init_attr)); initattr.send_cq = ctx->cq; initattr.recv_cq = ctx->cq; initattr.cap.max_send_wr = tx_depth; /* Work around: driver doesnt support * recv_wr = 0 */ initattr.cap.max_recv_wr = 1; initattr.cap.max_send_sge = 1; initattr.cap.max_recv_sge = 1; initattr.cap.max_inline_data = user_parm->inline_size; if (user_parm->connection_type == 1) { initattr.qp_type = IBV_QPT_UC; } else { initattr.qp_type = IBV_QPT_RC; } ctx->qp[counter] = ibv_create_qp(ctx->pd, &initattr); if (!ctx->qp[counter]) { fprintf(stderr, "Couldn't create QP\n"); return NULL; } attr.qp_state = IBV_QPS_INIT; attr.pkey_index = 0; attr.port_num = port; attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE; if (ibv_modify_qp(ctx->qp[counter], &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify QP to INIT\n"); return NULL; } } return ctx; } static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn, struct pingpong_dest *dest, struct user_parameters *user_parm, int qpindex) { struct ibv_qp_attr attr; memset(&attr, 0, sizeof attr); attr.qp_state = IBV_QPS_RTR; switch (user_parm->mtu) { case 256 : attr.path_mtu = IBV_MTU_256; break; case 512 : attr.path_mtu = IBV_MTU_512; break; case 1024 : attr.path_mtu = IBV_MTU_1024; break; case 2048 : attr.path_mtu = IBV_MTU_2048; break; case 4096 : attr.path_mtu = IBV_MTU_4096; break; } printf("Mtu : 
/*
 * Print the command-line synopsis followed by the option summary to
 * stdout.  argv0 is substituted for the program name in the two synopsis
 * lines; the option lines are fixed text.
 */
static void usage(const char *argv0)
{
	/* One entry per option line, emitted in this order. */
	static const char *const option_help[] = {
		" -p, --port= listen on/connect to port (default 18515)\n",
		" -d, --ib-dev= use IB device (default first device found)\n",
		" -i, --ib-port= use port of IB device (default 1)\n",
		" -c, --connection= connection type RC/UC (default RC)\n",
		" -m, --mtu= mtu size (256 - 4096. default for hermon is 2048)\n",
		" -g, --post= number of posts for each qp in the chain (default tx_depth)\n",
		" -q, --qp= Num of qp's(default 1)\n",
		" -s, --size= size of message to exchange (default 65536)\n",
		" -a, --all Run sizes from 2 till 2^23\n",
		" -t, --tx-depth= size of tx queue (default 100)\n",
		" -n, --iters= number of exchanges (at least 2, default 5000)\n",
		" -I, --inline_size= max size of message to be sent in inline mode (default 400)\n",
		" -u, --qp-timeout= QP timeout, timeout value is 4 usec * 2 ^(timeout), default 14\n",
		" -S, --sl= SL (default 0)\n",
		" -x, --gid-index= test uses GID with GID index taken from command line (for RDMAoE index should be 0)\n",
		" -b, --bidirectional measure bidirectional bandwidth (default unidirectional)\n",
		" -V, --version display version number\n",
		" -F, --CPU-freq do not fail even if cpufreq_ondemand module is loaded\n",
	};
	size_t k;

	fputs("Usage:\n", stdout);
	printf(" %s start a server and wait for connection\n", argv0);
	printf(" %s connect to server at \n", argv0);
	fputs("\n", stdout);
	fputs("Options:\n", stdout);

	for (k = 0; k < sizeof option_help / sizeof option_help[0]; k++)
		fputs(option_help[k], stdout);
}
2 : 1; tsize = tsize * size; printf("%7d %d %7.2f %7.2f\n", size,iters,tsize * cycles_to_units / opt_delta / 0x100000, tsize * iters * user_param->numofqps * cycles_to_units /(tcompleted[(iters* user_param->numofqps) - 1] - tposted[0]) / 0x100000); } int run_iter(struct pingpong_context *ctx, struct user_parameters *user_param, struct pingpong_dest **rem_dest, int size) { struct ibv_qp *qp; int totscnt, totccnt ; int index , qpindex; int numpostperqp ; struct ibv_send_wr *wrlist; struct ibv_send_wr *bad_wr; struct ibv_wc wc; wrlist = malloc(user_param->numofqps * sizeof (struct ibv_send_wr) * user_param->tx_depth); if (!wrlist) { perror("malloc"); return -1; } ctx->list.addr = (uintptr_t) ctx->buf; ctx->list.length = size; ctx->list.lkey = ctx->mr->lkey; /* prepare the wqe */ ctx->wr.sg_list = &ctx->list; ctx->wr.num_sge = 1; ctx->wr.opcode = IBV_WR_RDMA_WRITE; if (size > user_param->inline_size) {/* complaince to perf_main */ ctx->wr.send_flags = IBV_SEND_SIGNALED; } else { ctx->wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE; } ctx->wr.next = NULL; /*These should be the i'th place ... 
*/ ctx->wr.wr.rdma.remote_addr = rem_dest[0]->vaddr; ctx->wr.wr.rdma.rkey = rem_dest[0]->rkey; /* lets make the list with the right id's*/ for (qpindex=0 ; qpindex < user_param->numofqps ; qpindex++) { for (index =0 ; index < user_param->maxpostsofqpiniteration ; index++) { wrlist[qpindex*user_param->maxpostsofqpiniteration+index]=ctx->wr; wrlist[qpindex*user_param->maxpostsofqpiniteration+ index].wr_id = qpindex ; if(index < user_param->maxpostsofqpiniteration -1) { wrlist[qpindex*user_param->maxpostsofqpiniteration+index].next=&wrlist[qpindex*user_param->maxpostsofqpiniteration+index+1]; } else { wrlist[qpindex*user_param->maxpostsofqpiniteration+index].next=NULL; } } } totscnt = 0; totccnt = 0; /*clear the scnt ccnt counters for each iteration*/ for (index =0 ; index < user_param->numofqps ; index++) { ctx->scnt[index] = 0; ctx->ccnt[index] = 0; } index = 0; /* Done with setup. Start the test. */ while (totscnt < (user_param->iters * user_param->numofqps) || totccnt < (user_param->iters * user_param->numofqps) ) { /* main loop to run over all the qps and post for each accumulated 40 wq's */ for (qpindex =0 ; qpindex < user_param->numofqps ; qpindex++) { qp = ctx->qp[qpindex]; if (user_param->iters > ctx->scnt[qpindex] ) { numpostperqp = user_param->maxpostsofqpiniteration - (ctx->scnt[qpindex] - ctx->ccnt[qpindex]); if (numpostperqp > 40 || ((user_param->iters - ctx->scnt[qpindex]) <= 40 && numpostperqp > 0) ){ wrlist[qpindex*user_param->maxpostsofqpiniteration+numpostperqp-1].next=NULL; tposted[totscnt] = get_cycles(); if (ibv_post_send(qp, &wrlist[qpindex*user_param->maxpostsofqpiniteration], &bad_wr)) { fprintf(stderr, "Couldn't post %d send: qp index = %d qp scnt=%d total scnt %d qp scnt=%d total ccnt=%d\n", numpostperqp,qpindex,ctx->scnt[qpindex],totscnt,ctx->ccnt[qpindex],totccnt); return 1; } ctx->scnt[qpindex]= ctx->scnt[qpindex]+numpostperqp; totscnt=totscnt + numpostperqp; 
wrlist[qpindex*user_param->maxpostsofqpiniteration+numpostperqp-1].next=&wrlist[qpindex*user_param->maxpostsofqpiniteration+numpostperqp]; } } /*FINISHED POSTING */ } if (totccnt < (user_param->iters * user_param->numofqps) ) { int ne; do { ne = ibv_poll_cq(ctx->cq, 1, &wc); } while (ne == 0); tcompleted[totccnt] = get_cycles(); if (ne < 0) { fprintf(stderr, "poll CQ failed %d\n", ne); return 1; } if (wc.status != IBV_WC_SUCCESS) { fprintf(stderr, "Completion wth error at %s:\n", user_param->servername ? "client" : "server"); fprintf(stderr, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); fprintf(stderr, "qp index %d ,qp scnt=%d, qp ccnt=%d total scnt %d total ccnt %d\n", (int)wc.wr_id, ctx->scnt[(int)wc.wr_id], ctx->ccnt[(int)wc.wr_id], totscnt, totccnt); return 1; } /*here the id is the index to the qp num */ ctx->ccnt[(int)wc.wr_id] = ctx->ccnt[(int)wc.wr_id]+1; totccnt += 1; } } free(wrlist); return(0); } int main(int argc, char *argv[]) { struct ibv_device **dev_list; struct ibv_device *ib_dev; struct pingpong_context *ctx; struct pingpong_dest *my_dest; struct pingpong_dest **rem_dest; struct user_parameters user_param; struct ibv_device_attr device_attribute; char *ib_devname = NULL; int port = 18515; int ib_port = 1; long long size = 65536; int sockfd; int duplex = 0; int i = 0; int inline_given_in_cmd = 0; struct ibv_context *context; int no_cpu_freq_fail = 0; union ibv_gid gid; /* init default values to user's parameters */ memset(&user_param, 0, sizeof(struct user_parameters)); user_param.mtu = 0; user_param.iters = 5000; user_param.tx_depth = 100; user_param.servername = NULL; user_param.numofqps = 1; user_param.maxpostsofqpiniteration = 100; user_param.inline_size = MAX_INLINE; user_param.qp_timeout = 14; user_param.gid_index = -1; /*gid will not be used*/ /* Parameter parsing. 
*/ while (1) { int c; static struct option long_options[] = { { .name = "port", .has_arg = 1, .val = 'p' }, { .name = "ib-dev", .has_arg = 1, .val = 'd' }, { .name = "ib-port", .has_arg = 1, .val = 'i' }, { .name = "mtu", .has_arg = 1, .val = 'm' }, { .name = "qp", .has_arg = 1, .val = 'q' }, { .name = "post", .has_arg = 1, .val = 'g' }, { .name = "connection", .has_arg = 1, .val = 'c' }, { .name = "size", .has_arg = 1, .val = 's' }, { .name = "iters", .has_arg = 1, .val = 'n' }, { .name = "tx-depth", .has_arg = 1, .val = 't' }, { .name = "inline_size", .has_arg = 1, .val = 'I' }, { .name = "qp-timeout", .has_arg = 1, .val = 'u' }, { .name = "sl", .has_arg = 1, .val = 'S' }, { .name = "gid-index", .has_arg = 1, .val = 'x' }, { .name = "all", .has_arg = 0, .val = 'a' }, { .name = "bidirectional", .has_arg = 0, .val = 'b' }, { .name = "version", .has_arg = 0, .val = 'V' }, { .name = "CPU-freq", .has_arg = 0, .val = 'F' }, { 0 } }; c = getopt_long(argc, argv, "p:d:i:m:q:g:c:s:n:t:I:u:S:x:baVF", long_options, NULL); if (c == -1) break; switch (c) { case 'p': port = strtol(optarg, NULL, 0); if (port < 0 || port > 65535) { usage(argv[0]); return 1; } break; case 'd': ib_devname = strdupa(optarg); break; case 'c': if (strcmp("UC",optarg)==0) user_param.connection_type=UC; break; case 'm': user_param.mtu = strtol(optarg, NULL, 0); break; case 'q': user_param.numofqps = strtol(optarg, NULL, 0); break; case 'g': user_param.maxpostsofqpiniteration = strtol(optarg, NULL, 0); break; case 'a': user_param.all = ALL; break; case 'V': printf("rdma_bw version : %.2f\n",VERSION); return 0; break; case 'i': ib_port = strtol(optarg, NULL, 0); if (ib_port < 0) { usage(argv[0]); return 1; } break; case 's': size = strtoll(optarg, NULL, 0); if (size < 1 || size > UINT_MAX / 2) { usage(argv[0]); return 1; } break; case 't': user_param.tx_depth = strtol(optarg, NULL, 0); if (user_param.tx_depth < 1) { usage(argv[0]); return 1; } break; case 'I': user_param.inline_size = strtol(optarg, NULL, 
0); inline_given_in_cmd =1; if (user_param.inline_size > MAX_INLINE) { usage(argv[0]); return 7; } break; case 'n': user_param.iters = strtol(optarg, NULL, 0); if (user_param.iters < 2) { usage(argv[0]); return 1; } break; case 'b': duplex = 1; break; case 'F': no_cpu_freq_fail = 1; break; case 'u': user_param.qp_timeout = strtol(optarg, NULL, 0); break; case 'S': sl = strtol(optarg, NULL, 0); if (sl > 15) { usage(argv[0]); return 1; } break; case 'x': user_param.gid_index = strtol(optarg, NULL, 0); if (user_param.gid_index > 63) { usage(argv[0]); return 1; } break; default: usage(argv[0]); return 1; } } if (optind == argc - 1) user_param.servername = strdupa(argv[optind]); else if (optind < argc) { usage(argv[0]); return 1; } printf("------------------------------------------------------------------\n"); if (duplex == 1) { printf(" RDMA_Write Bidirectional Post List BW Test\n"); } else { printf(" RDMA_Write Post List BW Test\n"); } printf("Number of qp's running %d\n",user_param.numofqps); if (user_param.connection_type==RC) { printf("Connection type : RC\n"); } else { printf("Connection type : UC\n"); } if (user_param.maxpostsofqpiniteration > user_param.tx_depth ) { printf("Can not post more than tx_depth , adjusting number of post to tx_depth\n"); user_param.maxpostsofqpiniteration = user_param.tx_depth; } else { printf("Each Qp will post %d messages each time\n",user_param.maxpostsofqpiniteration); } if (user_param.gid_index > -1) { printf("Using GID to support RDMAoE configuration. Refer to port type as Ethernet, default MTU 1024B\n"); } /* Done with parameter parsing. Perform setup. 
*/ if (user_param.all == ALL) { /*since we run all sizes */ size = 8388608; /*2^23 */ } srand48(getpid() * time(NULL)); page_size = sysconf(_SC_PAGESIZE); dev_list = ibv_get_device_list(NULL); if (!ib_devname) { ib_dev = dev_list[0]; if (!ib_dev) { fprintf(stderr, "No IB devices found\n"); return 1; } } else { for (; (ib_dev = *dev_list); ++dev_list) if (!strcmp(ibv_get_device_name(ib_dev), ib_devname)) break; if (!ib_dev) { fprintf(stderr, "IB device %s not found\n", ib_devname); return 1; } } context = ibv_open_device(ib_dev); if (ibv_query_device(context, &device_attribute)) { fprintf(stderr, "Failed to query device props"); return 1; } if ((device_attribute.vendor_part_id == 25408 || device_attribute.vendor_part_id == 25418 || device_attribute.vendor_part_id == 26408 || device_attribute.vendor_part_id == 26418 || device_attribute.vendor_part_id == 26428) && (!inline_given_in_cmd)) { user_param.inline_size = 1; } printf("Inline data is used up to %d bytes message\n", user_param.inline_size); ctx = pp_init_ctx(ib_dev, size, user_param.tx_depth, ib_port, &user_param); if (!ctx) return 1; if (user_param.gid_index != -1) { int err=0; err = ibv_query_gid (ctx->context, ib_port, user_param.gid_index, &gid); if (err) { return -1; } ctx->dgid=gid; } if (user_param.servername) { sockfd = pp_client_connect(user_param.servername, port); if (sockfd < 0) return 1; } else { sockfd = pp_server_connect(port); if (sockfd < 0) return 1; } my_dest = malloc(user_param.numofqps * sizeof *my_dest); rem_dest = malloc(sizeof (struct pingpong_dest*) * user_param.numofqps ); for (i =0 ; iqp[i]->qp_num; /* TBD this should be changed into VA and diffreent key to each qp */ my_dest[i].rkey = ctx->mr->rkey; my_dest[i].vaddr = (uintptr_t)ctx->buf + ctx->size; printf(" local address: LID %#04x, QPN %#06x, PSN %#06x " "RKey %#08x VAddr %#016Lx\n", my_dest[i].lid, my_dest[i].qpn, my_dest[i].psn, my_dest[i].rkey, my_dest[i].vaddr); if (user_param.gid_index > -1) { printf(" GID 
%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", my_dest[i].dgid.raw[0],my_dest[i].dgid.raw[1], my_dest[i].dgid.raw[2], my_dest[i].dgid.raw[3], my_dest[i].dgid.raw[4], my_dest[i].dgid.raw[5], my_dest[i].dgid.raw[6], my_dest[i].dgid.raw[7], my_dest[i].dgid.raw[8], my_dest[i].dgid.raw[9], my_dest[i].dgid.raw[10], my_dest[i].dgid.raw[11], my_dest[i].dgid.raw[12], my_dest[i].dgid.raw[13], my_dest[i].dgid.raw[14], my_dest[i].dgid.raw[15]); } if (user_param.servername) { rem_dest[i] = pp_client_exch_dest(sockfd, &my_dest[i], &user_param); } else { rem_dest[i] = pp_server_exch_dest(sockfd, &my_dest[i], &user_param); } if (!rem_dest[i]) return 1; printf(" remote address: LID %#04x, QPN %#06x, PSN %#06x, " "RKey %#08x VAddr %#016Lx\n", rem_dest[i]->lid, rem_dest[i]->qpn, rem_dest[i]->psn, rem_dest[i]->rkey, rem_dest[i]->vaddr); if (user_param.gid_index > -1) { printf(" GID %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", rem_dest[i]->dgid.raw[0],rem_dest[i]->dgid.raw[1], rem_dest[i]->dgid.raw[2], rem_dest[i]->dgid.raw[3], rem_dest[i]->dgid.raw[4], rem_dest[i]->dgid.raw[5], rem_dest[i]->dgid.raw[6], rem_dest[i]->dgid.raw[7], rem_dest[i]->dgid.raw[8], rem_dest[i]->dgid.raw[9], rem_dest[i]->dgid.raw[10], rem_dest[i]->dgid.raw[11], rem_dest[i]->dgid.raw[12], rem_dest[i]->dgid.raw[13], rem_dest[i]->dgid.raw[14], rem_dest[i]->dgid.raw[15]); } if (pp_connect_ctx(ctx, ib_port, my_dest[i].psn, rem_dest[i], &user_param, i)) return 1; /* An additional handshake is required *after* moving qp to RTR. Arbitrarily reuse exch_dest for this purpose. 
*/ if (user_param.servername) { rem_dest[i] = pp_client_exch_dest(sockfd, &my_dest[i], &user_param); } else { rem_dest[i] = pp_server_exch_dest(sockfd, &my_dest[i], &user_param); } } printf("------------------------------------------------------------------\n"); printf(" #bytes #iterations BW peak[MB/sec] BW average[MB/sec] \n"); /* For half duplex tests, server just waits for client to exit */ /* the 0th place is arbitrary to signal finish ... */ if (!user_param.servername && !duplex) { rem_dest[0] = pp_server_exch_dest(sockfd, &my_dest[0], &user_param); if (write(sockfd, "done", sizeof "done") != sizeof "done"){ perror("server write"); fprintf(stderr, "Couldn't write to socket\n"); return 1; } close(sockfd); return 0; } tposted = malloc(user_param.iters * user_param.numofqps * sizeof *tposted); if (!tposted) { perror("malloc"); return 1; } tcompleted = malloc(user_param.iters * user_param.numofqps * sizeof *tcompleted); if (!tcompleted) { perror("malloc"); return 1; } if (user_param.all == ALL) { for (i = 1; i < 24 ; ++i) { size = 1 << i; if(run_iter(ctx, &user_param, rem_dest, size)) return 17; print_report(user_param.iters, size, duplex, tposted, tcompleted, &user_param, no_cpu_freq_fail); } } else { if(run_iter(ctx, &user_param, rem_dest, size)) return 18; print_report(user_param.iters, size, duplex, tposted, tcompleted, &user_param, no_cpu_freq_fail); } /* the 0th place is arbitrary to signal finish ... 
*/ if (user_param.servername) { rem_dest[0] = pp_client_exch_dest(sockfd, &my_dest[0], &user_param); } else { rem_dest[0] = pp_server_exch_dest(sockfd, &my_dest[0], &user_param); } if (write(sockfd, "done", sizeof "done") != sizeof "done"){ perror("write"); fprintf(stderr, "Couldn't write to socket\n"); return 1; } close(sockfd); free(tposted); free(tcompleted); printf("------------------------------------------------------------------\n"); return 0; } trunk/README0000755000175000017500000001254111234043545012432 0ustar benoitbenoit Open Fabrics Enterprise Distribution (OFED) Performance Tests README for OFED 1.3 March 2008 =============================================================================== Table of Contents =============================================================================== 1. Overview 2. Notes on Testing Method 3. Test Descriptions 4. Running Tests =============================================================================== 1. Overview =============================================================================== This is a collection of tests written over uverbs intended for use as a performance micro-benchmark. As an example, the tests can be used for hardware or software tuning and/or functional testing. Please post results and observations to the openib-general mailing list. See "Contact Us" at http://openib.org/mailman/listinfo/openib-general and http://www.openib.org. =============================================================================== 2. Notes on Testing Method =============================================================================== - The benchmark uses the CPU cycle counter to get time stamps without a context switch. Some CPU architectures (e.g., Intel's 80486 or older PPC) do NOT have such capability. - The benchmark measures round-trip time but reports half of that as one-way latency. This means that it may not be sufficiently accurate for asymmetrical configurations. - Min/Median/Max results are reported. 
The Median (vs average) is less sensitive to extreme scores.
  Typically, the Max value is the first value measured.

- Larger samples only help marginally. The default (1000) is very
  satisfactory. Note that an array of cycles_t (typically an unsigned
  long) is allocated once to collect samples and again to store the
  difference between them. Really big sample sizes (e.g., 1 million)
  might expose other problems with the program.

- The "-H" option will dump the histogram for additional statistical
  analysis. See xgraph, ygraph, r-base (http://www.r-project.org/),
  pspp, or other statistical math programs.

Architectures tested: i686, x86_64, ia64

===============================================================================
3. Test Descriptions
===============================================================================

The following tests are mainly useful for hardware/software benchmarking.

write_lat.c     latency test with RDMA write transactions
write_bw.c      bandwidth test with RDMA write transactions
send_lat.c      latency test with send transactions
send_bw.c       bandwidth test with send transactions
read_lat.c      latency test with RDMA read transactions
read_bw.c       bandwidth test with RDMA read transactions

Legacy tests: (To be removed in the next release)
rdma_lat.c      latency test with RDMA write transactions
rdma_bw.c       streaming bandwidth test with RDMA write transactions

The executable name of each test starts with the general prefix "ib_";
for example, ib_write_lat.

===============================================================================
4. Running Tests
===============================================================================

Prerequisites:
        kernel 2.6
        ib_uverbs (kernel module) matches libibverbs
                ("match" means binary compatible, but ideally of the same SVN rev)

Server:         ./<test name> <options>
Client:         ./<test name> <options> <server IP address>

        o  <server IP address> is IPv4 or IPv6 address.
You can use the IPoIB diags_release_notes.txt mpi-selector_release_notes.txt rdma_cm_release_notes.txt MSTFLINT_README.txt open_mpi_release_notes.txt RDS_README.txt ib-bonding.txt mthca_release_notes.txt opensm_release_notes.txt rds_release_notes.txt ibutils_release_notes.txt* mvapich_release_notes.txt PERF_TEST_README.txt sdp_release_notes.txt ipoib_release_notes.txt srp_release_notes.txt QoS_in_OFED.txt SRPT_README.txt mlx4_release_notes.txt QoS_management_in_OpenSM. address if IPoIB is configured. o --help lists the available *** IMPORTANT NOTE: The SAME OPTIONS must be passed to both server and client. Common Options to all tests: -p, --port= listen on/connect to port (default: 18515) -m, --mtu= mtu size (default: 1024) -d, --ib-dev= use IB device (default: first device found) -i, --ib-port= use port of IB device (default: 1) -s, --size= size of message to exchange (default: 1) -a, --all run sizes from 2 till 2^23 -t, --tx-depth= size of tx queue (default: 50) -n, --iters= number of exchanges (at least 100, default: 1000) -C, --report-cycles report times in cpu cycle units (default: microseconds) -H, --report-histogram print out all results (default: print summary only) -U, --report-unsorted (implies -H) print out unsorted results (default: sorted) -V, --version display version number *** IMPORTANT NOTE: You need to be running a Subnet Manager on the switch or on one of the nodes in your fabric. Example: Run "ib_write_lat -a" on the server side. Then run "ib_write_lat -a " on the client side. ib_write_lat will exit on both server and client after printing results. trunk/write_bw.c0000755000175000017500000010631211237107527013544 0ustar benoitbenoit/* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2009 HNR Consulting. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
/*
 * Open a TCP connection to `servername` on `port`.
 *
 * Every address returned by getaddrinfo() is tried in order; the first one
 * that accepts the connection wins.
 *
 * Returns the connected socket descriptor on success, or a negative value
 * on failure (a diagnostic has already been written to stderr).
 */
static int pp_client_connect(const char *servername, int port)
{
	struct addrinfo *res, *t;
	struct addrinfo hints = {
		.ai_family   = AF_UNSPEC,
		.ai_socktype = SOCK_STREAM
	};
	char service[16];
	int n;
	int sockfd = -1;

	/*
	 * Format the port into a stack buffer: a decimal int always fits,
	 * and nothing has to be freed on any of the exit paths below.
	 */
	n = snprintf(service, sizeof service, "%d", port);
	if (n < 0 || (size_t)n >= sizeof service)
		return -1;

	/* getaddrinfo() reports failure with any nonzero code, not only
	 * negative ones. */
	n = getaddrinfo(servername, service, &hints, &res);
	if (n != 0) {
		fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port);
		return -1;
	}

	for (t = res; t; t = t->ai_next) {
		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
		if (sockfd < 0)
			continue;

		if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
			break;

		close(sockfd);
		sockfd = -1;
	}

	freeaddrinfo(res);

	if (sockfd < 0)
		fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port);

	return sockfd;
}
rem_dest->psn = (int)strtol(tmp, NULL, 16); // PSN pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->rkey = (unsigned)strtol(tmp, NULL, 16); // RKEY pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->vaddr = strtoull(tmp, NULL, 16); // VA for (i = 0; i < 15; ++i) { pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->dgid.raw[i] = (unsigned char)strtoll(tmp, NULL, 16); } pstr += term - pstr + 1; strcpy(tmp, pstr); rem_dest->dgid.raw[15] = (unsigned char)strtoll(tmp, NULL, 16); } out: return rem_dest; } int pp_server_connect(int port) { struct addrinfo *res, *t; struct addrinfo hints = { .ai_flags = AI_PASSIVE, .ai_family = AF_UNSPEC, .ai_socktype = SOCK_STREAM }; char *service; int sockfd = -1, connfd; int n; if (asprintf(&service, "%d", port) < 0) return -1; n = getaddrinfo(NULL, service, &hints, &res); if (n < 0) { fprintf(stderr, "%s for port %d\n", gai_strerror(n), port); return n; } for (t = res; t; t = t->ai_next) { sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { n = 1; setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n); if (!bind(sockfd, t->ai_addr, t->ai_addrlen)) break; close(sockfd); sockfd = -1; } } freeaddrinfo(res); if (sockfd < 0) { fprintf(stderr, "Couldn't listen to port %d\n", port); return sockfd; } listen(sockfd, 1); connfd = accept(sockfd, NULL, 0); if (connfd < 0) { perror("server accept"); fprintf(stderr, "accept() failed\n"); close(sockfd); return connfd; } close(sockfd); return connfd; } static struct pingpong_dest *pp_server_exch_dest(int connfd, const struct pingpong_dest *my_dest, struct user_parameters *user_parm) { char msg[sizeof "0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00"]; struct pingpong_dest *rem_dest = NULL; int parsed; int n; n = read(connfd, msg, 
sizeof msg); if (n != sizeof msg) { perror("server read"); fprintf(stderr, "%d/%d: Couldn't read remote address\n", n, (int) sizeof msg); goto out; } rem_dest = malloc(sizeof *rem_dest); if (!rem_dest) goto out; if (user_parm->gid_index < 0) { parsed = sscanf(msg, "%x:%x:%x:%x:%Lx", &rem_dest->lid, &rem_dest->qpn, &rem_dest->psn, &rem_dest->rkey, &rem_dest->vaddr); if (parsed != 5) { fprintf(stderr, "Couldn't parse line <%.*s>\n",(int)sizeof msg, msg); free(rem_dest); rem_dest = NULL; goto out; } }else{ char *pstr = msg, *term; char tmp[20]; int i; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->lid = (int)strtol(tmp, NULL, 16); // LID pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->qpn = (int)strtol(tmp, NULL, 16); // QPN pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->psn = (int)strtol(tmp, NULL, 16); // PSN pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->rkey = (unsigned)strtol(tmp, NULL, 16); // RKEY pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->vaddr = strtoull(tmp, NULL, 16); // VA for (i = 0; i < 15; ++i) { pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->dgid.raw[i] = (unsigned char)strtoll(tmp, NULL, 16); } pstr += term - pstr + 1; strcpy(tmp, pstr); rem_dest->dgid.raw[15] = (unsigned char)strtoll(tmp, NULL, 16); } sprintf(msg, "%04x:%06x:%06x:%08x:%016Lx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x", my_dest->lid, my_dest->qpn, my_dest->psn, my_dest->rkey, my_dest->vaddr, my_dest->dgid.raw[0], my_dest->dgid.raw[1], my_dest->dgid.raw[2], my_dest->dgid.raw[3], my_dest->dgid.raw[4], my_dest->dgid.raw[5], my_dest->dgid.raw[6], my_dest->dgid.raw[7], 
my_dest->dgid.raw[8], my_dest->dgid.raw[9], my_dest->dgid.raw[10], my_dest->dgid.raw[11], my_dest->dgid.raw[12], my_dest->dgid.raw[13], my_dest->dgid.raw[14], my_dest->dgid.raw[15]); if (write(connfd, msg, sizeof msg) != sizeof msg) { perror("server write"); fprintf(stderr, "Couldn't send local address\n"); free(rem_dest); rem_dest = NULL; goto out; } out: return rem_dest; } static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, unsigned size, int tx_depth, int port, struct user_parameters *user_parm) { struct pingpong_context *ctx; struct ibv_device_attr device_attr; int counter; ctx = malloc(sizeof *ctx); if (!ctx) return NULL; ctx->qp = malloc(sizeof (struct ibv_qp*) * user_parm->numofqps ); ctx->size = size; ctx->tx_depth = tx_depth; ctx->scnt = malloc(user_parm->numofqps * sizeof (int)); if (!ctx->scnt) { perror("malloc"); return NULL; } ctx->ccnt = malloc(user_parm->numofqps * sizeof (int)); if (!ctx->ccnt) { perror("malloc"); return NULL; } memset(ctx->scnt, 0, user_parm->numofqps * sizeof (int)); memset(ctx->ccnt, 0, user_parm->numofqps * sizeof (int)); ctx->buf = memalign(page_size, size * 2 * user_parm->numofqps ); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); return NULL; } memset(ctx->buf, 0, size * 2 * user_parm->numofqps); ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(ib_dev)); return NULL; } if (user_parm->mtu == 0) {/*user did not ask for specific mtu */ if (ibv_query_device(ctx->context, &device_attr)) { fprintf(stderr, "Failed to query device props"); return NULL; } if (device_attr.vendor_part_id == 23108 || user_parm->gid_index > -1) { user_parm->mtu = 1024; } else { user_parm->mtu = 2048; } } ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "Couldn't allocate PD\n"); return NULL; } /* We dont really want IBV_ACCESS_LOCAL_WRITE, but IB spec says: * The Consumer is not allowed to assign Remote Write or 
Remote Atomic to * a Memory Region that has not been assigned Local Write. */ ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size * 2 * user_parm->numofqps, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "Couldn't allocate MR\n"); return NULL; } ctx->cq = ibv_create_cq(ctx->context, tx_depth * user_parm->numofqps , NULL, NULL, 0); if (!ctx->cq) { fprintf(stderr, "Couldn't create CQ\n"); return NULL; } for (counter =0 ; counter < user_parm->numofqps ; counter++) { struct ibv_qp_init_attr initattr; struct ibv_qp_attr attr; memset(&initattr, 0, sizeof(struct ibv_qp_init_attr)); initattr.send_cq = ctx->cq; initattr.recv_cq = ctx->cq; initattr.cap.max_send_wr = tx_depth; /* Work around: driver doesnt support * recv_wr = 0 */ initattr.cap.max_recv_wr = 1; initattr.cap.max_send_sge = 1; initattr.cap.max_recv_sge = 1; initattr.cap.max_inline_data = user_parm->inline_size; if (user_parm->connection_type == 1) { initattr.qp_type = IBV_QPT_UC; } else { initattr.qp_type = IBV_QPT_RC; } ctx->qp[counter] = ibv_create_qp(ctx->pd, &initattr); if (!ctx->qp[counter]) { fprintf(stderr, "Couldn't create QP\n"); return NULL; } attr.qp_state = IBV_QPS_INIT; attr.pkey_index = 0; attr.port_num = port; attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE; if (ibv_modify_qp(ctx->qp[counter], &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify QP to INIT\n"); return NULL; } } return ctx; } static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn, struct pingpong_dest *dest, struct user_parameters *user_parm, int qpindex) { struct ibv_qp_attr attr; memset(&attr, 0, sizeof attr); attr.qp_state = IBV_QPS_RTR; switch (user_parm->mtu) { case 256 : attr.path_mtu = IBV_MTU_256; break; case 512 : attr.path_mtu = IBV_MTU_512; break; case 1024 : attr.path_mtu = IBV_MTU_1024; break; case 2048 : attr.path_mtu = IBV_MTU_2048; break; case 4096 : attr.path_mtu = IBV_MTU_4096; break; } printf("Mtu : 
%d\n", user_parm->mtu); attr.dest_qp_num = dest->qpn; attr.rq_psn = dest->psn; if (user_parm->connection_type==RC) { attr.max_dest_rd_atomic = 1; attr.min_rnr_timer = 12; } if (user_parm->gid_index<0) { attr.ah_attr.is_global = 0; attr.ah_attr.dlid = dest->lid; attr.ah_attr.sl = sl; } else { attr.ah_attr.is_global = 1; attr.ah_attr.grh.dgid = dest->dgid; attr.ah_attr.grh.hop_limit = 1; attr.ah_attr.sl = 0; } attr.ah_attr.src_path_bits = 0; attr.ah_attr.port_num = port; if (user_parm->connection_type == RC) { if (ibv_modify_qp(ctx->qp[qpindex], &attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MIN_RNR_TIMER | IBV_QP_MAX_DEST_RD_ATOMIC)) { fprintf(stderr, "Failed to modify RC QP to RTR\n"); return 1; } attr.timeout = user_parm->qp_timeout; attr.retry_cnt = 7; attr.rnr_retry = 7; } else { if (ibv_modify_qp(ctx->qp[qpindex], &attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN)) { fprintf(stderr, "Failed to modify UC QP to RTR\n"); return 1; } } attr.qp_state = IBV_QPS_RTS; attr.sq_psn = my_psn; attr.max_rd_atomic = 1; if (user_parm->connection_type == 0) { attr.max_rd_atomic = 1; if (ibv_modify_qp(ctx->qp[qpindex], &attr, IBV_QP_STATE | IBV_QP_SQ_PSN | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_MAX_QP_RD_ATOMIC)) { fprintf(stderr, "Failed to modify RC QP to RTS\n"); return 1; } } else { if (ibv_modify_qp(ctx->qp[qpindex], &attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) { fprintf(stderr, "Failed to modify UC QP to RTS\n"); return 1; } } return 0; } static void usage(const char *argv0) { printf("Usage:\n"); printf(" %s start a server and wait for connection\n", argv0); printf(" %s connect to server at \n", argv0); printf("\n"); printf("Options:\n"); printf(" -p, --port= listen on/connect to port (default 18515)\n"); printf(" -d, --ib-dev= use IB device (default first device found)\n"); printf(" -i, --ib-port= use port of IB device (default 1)\n"); printf(" -c, --connection= connection 
type RC/UC (default RC)\n"); printf(" -m, --mtu= mtu size (256 - 4096. default for hermon is 2048)\n"); printf(" -g, --post= number of posts for each qp in the chain (default tx_depth)\n"); printf(" -q, --qp= Num of qp's(default 1)\n"); printf(" -s, --size= size of message to exchange (default 65536)\n"); printf(" -a, --all Run sizes from 2 till 2^23\n"); printf(" -t, --tx-depth= size of tx queue (default 100)\n"); printf(" -n, --iters= number of exchanges (at least 2, default 5000)\n"); printf(" -I, --inline_size= max size of message to be sent in inline mode (default 400)\n"); printf(" -u, --qp-timeout= QP timeout, timeout value is 4 usec * 2 ^(timeout), default 14\n"); printf(" -S, --sl= SL (default 0)\n"); printf(" -x, --gid-index= test uses GID with GID index taken from command line (for RDMAoE index should be 0)\n"); printf(" -b, --bidirectional measure bidirectional bandwidth (default unidirectional)\n"); printf(" -V, --version display version number\n"); printf(" -N, --no peak-bw cancel peak-bw calculation (default with peak-bw)\n"); printf(" -F, --CPU-freq do not fail even if cpufreq_ondemand module is loaded\n"); } static void print_report(unsigned int iters, unsigned size, int duplex, cycles_t *tposted, cycles_t *tcompleted, struct user_parameters *user_param, int noPeak, int no_cpu_freq_fail) { double cycles_to_units; unsigned long tsize; /* Transferred size, in megabytes */ int i, j; int opt_posted = 0, opt_completed = 0; cycles_t opt_delta; cycles_t t; opt_delta = tcompleted[opt_posted] - tposted[opt_completed]; if (!noPeak) { /* Find the peak bandwidth unless asked not to in command line*/ for (i = 0; i < iters * user_param->numofqps; ++i) for (j = i; j < iters * user_param->numofqps; ++j) { t = (tcompleted[j] - tposted[i]) / (j - i + 1); if (t < opt_delta) { opt_delta = t; opt_posted = i; opt_completed = j; } } } cycles_to_units = get_cpu_mhz(no_cpu_freq_fail) * 1000000; tsize = duplex ? 
2 : 1; tsize = tsize * size; printf("%7d %d %7.2f %7.2f\n", size,iters,!(noPeak) * tsize * cycles_to_units / opt_delta / 0x100000, tsize * iters * user_param->numofqps * cycles_to_units /(tcompleted[(iters* user_param->numofqps) - 1] - tposted[0]) / 0x100000); } int run_iter(struct pingpong_context *ctx, struct user_parameters *user_param, struct pingpong_dest **rem_dest, int size) { struct ibv_qp *qp; int totscnt, totccnt ; int index ,warmindex; int inline_size; struct ibv_send_wr *bad_wr; struct ibv_wc wc; ctx->list.addr = (uintptr_t) ctx->buf; ctx->list.length = size; ctx->list.lkey = ctx->mr->lkey; ctx->wr.sg_list = &ctx->list; ctx->wr.num_sge = 1; ctx->wr.opcode = IBV_WR_RDMA_WRITE; inline_size = user_param->inline_size; if (size > inline_size) {/* complaince to perf_main */ ctx->wr.send_flags = IBV_SEND_SIGNALED; } else { ctx->wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE; } ctx->wr.next = NULL; totscnt = 0; totccnt = 0; /*clear the scnt ccnt counters for each iteration*/ for (index =0 ; index < user_param->numofqps ; index++) { ctx->scnt[index] = 0; ctx->ccnt[index] = 0; } index = 0; /* Done with setup. Start the test. 
warm up posting of total 100 wq's per qp 1 for each qp till all qps have 100 */ for (warmindex =0 ;warmindex < user_param->maxpostsofqpiniteration ;warmindex ++ ) { for (index =0 ; index < user_param->numofqps ; index++) { ctx->wr.wr.rdma.remote_addr = rem_dest[index]->vaddr; ctx->wr.wr.rdma.rkey = rem_dest[index]->rkey; qp = ctx->qp[index]; ctx->wr.wr_id = index ; tposted[totscnt] = get_cycles(); if (ibv_post_send(qp, &ctx->wr, &bad_wr)) { fprintf(stderr, "Couldn't post warmup send: qp index = %d qp scnt=%d total scnt %d\n", index,ctx->scnt[index],totscnt); return 1; } ctx->scnt[index]= ctx->scnt[index]+1; ++totscnt; } } /* main loop for posting */ while (totscnt < (user_param->iters * user_param->numofqps) || totccnt < (user_param->iters * user_param->numofqps) ) { /* main loop to run over all the qps and post each time n messages */ for (index =0 ; index < user_param->numofqps ; index++) { ctx->wr.wr.rdma.remote_addr = rem_dest[index]->vaddr; ctx->wr.wr.rdma.rkey = rem_dest[index]->rkey; qp = ctx->qp[index]; ctx->wr.wr_id = index ; while (ctx->scnt[index] < user_param->iters && (ctx->scnt[index] - ctx->ccnt[index]) < user_param->maxpostsofqpiniteration) { tposted[totscnt] = get_cycles(); if (ibv_post_send(qp, &ctx->wr, &bad_wr)) { fprintf(stderr, "Couldn't post send: qp index = %d qp scnt=%d total scnt %d\n", index,ctx->scnt[index],totscnt); return 1; } ctx->scnt[index]= ctx->scnt[index]+1; ++totscnt; } } /* finished posting now polling */ if (totccnt < (user_param->iters * user_param->numofqps) ) { int ne; do { ne = ibv_poll_cq(ctx->cq, 1, &wc); } while (ne == 0); tcompleted[totccnt] = get_cycles(); if (ne < 0) { fprintf(stderr, "poll CQ failed %d\n", ne); return 1; } if (wc.status != IBV_WC_SUCCESS) { fprintf(stderr, "Completion wth error at %s:\n", user_param->servername ? 
"client" : "server"); fprintf(stderr, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); fprintf(stderr, "qp index %d ,qp scnt=%d, qp ccnt=%d total scnt %d total ccnt %d\n", (int)wc.wr_id, ctx->scnt[(int)wc.wr_id], ctx->ccnt[(int)wc.wr_id], totscnt, totccnt); return 1; } /*here the id is the index to the qp num */ ctx->ccnt[(int)wc.wr_id] = ctx->ccnt[(int)wc.wr_id]+1; totccnt += 1; } } return(0); } int main(int argc, char *argv[]) { struct ibv_device **dev_list; struct ibv_device *ib_dev; struct pingpong_context *ctx; struct pingpong_dest *my_dest; struct pingpong_dest **rem_dest; struct user_parameters user_param; struct ibv_device_attr device_attribute; char *ib_devname = NULL; int port = 18515; int ib_port = 1; long long size = 65536; int sockfd; int duplex = 0; int i = 0; int noPeak = 0;/*noPeak == 0: regular peak-bw calculation done*/ int inline_given_in_cmd = 0; struct ibv_context *context; int no_cpu_freq_fail = 0; union ibv_gid gid; /* init default values to user's parameters */ memset(&user_param, 0, sizeof(struct user_parameters)); user_param.mtu = 0; user_param.iters = 5000; user_param.tx_depth = 100; user_param.servername = NULL; user_param.numofqps = 1; user_param.maxpostsofqpiniteration = 100; user_param.inline_size = MAX_INLINE; user_param.qp_timeout = 14; user_param.gid_index = -1; /*gid will not be used*/ /* Parameter parsing. 
*/ while (1) { int c; static struct option long_options[] = { { .name = "port", .has_arg = 1, .val = 'p' }, { .name = "ib-dev", .has_arg = 1, .val = 'd' }, { .name = "ib-port", .has_arg = 1, .val = 'i' }, { .name = "mtu", .has_arg = 1, .val = 'm' }, { .name = "qp", .has_arg = 1, .val = 'q' }, { .name = "post", .has_arg = 1, .val = 'g' }, { .name = "connection", .has_arg = 1, .val = 'c' }, { .name = "size", .has_arg = 1, .val = 's' }, { .name = "iters", .has_arg = 1, .val = 'n' }, { .name = "tx-depth", .has_arg = 1, .val = 't' }, { .name = "inline_size", .has_arg = 1, .val = 'I' }, { .name = "qp-timeout", .has_arg = 1, .val = 'u' }, { .name = "sl", .has_arg = 1, .val = 'S' }, { .name = "gid-index", .has_arg = 1, .val = 'x' }, { .name = "all", .has_arg = 0, .val = 'a' }, { .name = "bidirectional", .has_arg = 0, .val = 'b' }, { .name = "version", .has_arg = 0, .val = 'V' }, { .name = "noPeak", .has_arg = 0, .val = 'N' }, { .name = "CPU-freq", .has_arg = 0, .val = 'F' }, { 0 } }; c = getopt_long(argc, argv, "p:d:i:m:q:g:c:s:n:t:I:u:S:x:baVNF", long_options, NULL); if (c == -1) break; switch (c) { case 'p': port = strtol(optarg, NULL, 0); if (port < 0 || port > 65535) { usage(argv[0]); return 1; } break; case 'd': ib_devname = strdupa(optarg); break; case 'c': if (strcmp("UC",optarg)==0) user_param.connection_type=UC; break; case 'm': user_param.mtu = strtol(optarg, NULL, 0); break; case 'q': user_param.numofqps = strtol(optarg, NULL, 0); break; case 'g': user_param.maxpostsofqpiniteration = strtol(optarg, NULL, 0); break; case 'a': user_param.all = ALL; break; case 'V': printf("rdma_bw version : %.2f\n",VERSION); return 0; break; case 'i': ib_port = strtol(optarg, NULL, 0); if (ib_port < 0) { usage(argv[0]); return 1; } break; case 's': size = strtoll(optarg, NULL, 0); if (size < 1 || size > UINT_MAX / 2) { usage(argv[0]); return 1; } break; case 't': user_param.tx_depth = strtol(optarg, NULL, 0); if (user_param.tx_depth < 1) { usage(argv[0]); return 1; } break; case 
'I': user_param.inline_size = strtol(optarg, NULL, 0); inline_given_in_cmd =1; if (user_param.inline_size > MAX_INLINE) { usage(argv[0]); return 7; } break; case 'n': user_param.iters = strtol(optarg, NULL, 0); if (user_param.iters < 2) { usage(argv[0]); return 1; } break; case 'b': duplex = 1; break; case 'N': noPeak = 1; break; case 'F': no_cpu_freq_fail = 1; break; case 'u': user_param.qp_timeout = strtol(optarg, NULL, 0); break; case 'S': sl = strtol(optarg, NULL, 0); if (sl > 15) { usage(argv[0]); return 1; } break; case 'x': user_param.gid_index = strtol(optarg, NULL, 0); if (user_param.gid_index > 63) { usage(argv[0]); return 1; } break; default: usage(argv[0]); return 1; } } if (optind == argc - 1) user_param.servername = strdupa(argv[optind]); else if (optind < argc) { usage(argv[0]); return 1; } printf("------------------------------------------------------------------\n"); if (duplex == 1) { printf(" RDMA_Write Bidirectional BW Test\n"); } else { printf(" RDMA_Write BW Test\n"); } printf("Number of qp's running %d\n",user_param.numofqps); if (user_param.connection_type==RC) { printf("Connection type : RC\n"); } else { printf("Connection type : UC\n"); } if (user_param.maxpostsofqpiniteration > user_param.tx_depth ) { printf("Can not post more than tx_depth , adjusting number of post to tx_depth\n"); user_param.maxpostsofqpiniteration = user_param.tx_depth; } if (user_param.maxpostsofqpiniteration > user_param.iters ) { printf("Can not post more than iterations per qp , adjusting max number of post to num of iteration\n"); user_param.maxpostsofqpiniteration = user_param.iters; } if (user_param.gid_index > -1) { printf("Using GID to support RDMAoE configuration. Refer to port type as Ethernet, default MTU 1024B\n"); } printf("Each Qp will post up to %d messages each time\n",user_param.maxpostsofqpiniteration); /* Done with parameter parsing. Perform setup. 
*/ if (user_param.all == ALL) { /*since we run all sizes */ size = 8388608; /*2^23 */ } srand48(getpid() * time(NULL)); page_size = sysconf(_SC_PAGESIZE); dev_list = ibv_get_device_list(NULL); if (!ib_devname) { ib_dev = dev_list[0]; if (!ib_dev) { fprintf(stderr, "No IB devices found\n"); return 1; } } else { for (; (ib_dev = *dev_list); ++dev_list) if (!strcmp(ibv_get_device_name(ib_dev), ib_devname)) break; if (!ib_dev) { fprintf(stderr, "IB device %s not found\n", ib_devname); return 1; } } context = ibv_open_device(ib_dev); if (ibv_query_device(context, &device_attribute)) { fprintf(stderr, "Failed to query device props"); return 1; } if ((device_attribute.vendor_part_id == 25408 || device_attribute.vendor_part_id == 25418 || device_attribute.vendor_part_id == 26408 || device_attribute.vendor_part_id == 26418 || device_attribute.vendor_part_id == 26428) && (!inline_given_in_cmd)) { user_param.inline_size = 1; } printf("Inline data is used up to %d bytes message\n", user_param.inline_size); ctx = pp_init_ctx(ib_dev, size, user_param.tx_depth, ib_port, &user_param); if (!ctx) return 1; if (user_param.gid_index != -1) { int err=0; err = ibv_query_gid (ctx->context, ib_port, user_param.gid_index, &gid); if (err) { return -1; } ctx->dgid=gid; } if (user_param.servername) { sockfd = pp_client_connect(user_param.servername, port); if (sockfd < 0) return 1; } else { sockfd = pp_server_connect(port); if (sockfd < 0) return 1; } my_dest = malloc(user_param.numofqps * sizeof *my_dest); if (!my_dest) { perror("malloc my_dest"); return 1; } rem_dest = malloc(sizeof (struct pingpong_dest*) * user_param.numofqps ); if (!rem_dest ) { perror("malloc rem_dest"); return 1; } for (i =0 ;iqp[i]->qp_num; /* TBD this should be changed inot VA and different key to each qp */ my_dest[i].rkey = ctx->mr->rkey; my_dest[i].vaddr = (uintptr_t)ctx->buf + ctx->size; printf(" local address: LID %#04x, QPN %#06x, PSN %#06x " "RKey %#08x VAddr %#016Lx\n", my_dest[i].lid, my_dest[i].qpn, 
my_dest[i].psn, my_dest[i].rkey, my_dest[i].vaddr); if (user_param.gid_index > -1) { printf(" GID %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", my_dest[i].dgid.raw[0],my_dest[i].dgid.raw[1], my_dest[i].dgid.raw[2], my_dest[i].dgid.raw[3], my_dest[i].dgid.raw[4], my_dest[i].dgid.raw[5], my_dest[i].dgid.raw[6], my_dest[i].dgid.raw[7], my_dest[i].dgid.raw[8], my_dest[i].dgid.raw[9], my_dest[i].dgid.raw[10], my_dest[i].dgid.raw[11], my_dest[i].dgid.raw[12], my_dest[i].dgid.raw[13], my_dest[i].dgid.raw[14], my_dest[i].dgid.raw[15]); } if (user_param.servername) { rem_dest[i] = pp_client_exch_dest(sockfd, &my_dest[i], &user_param); } else { rem_dest[i] = pp_server_exch_dest(sockfd, &my_dest[i], &user_param); } if (!rem_dest[i]) return 1; printf(" remote address: LID %#04x, QPN %#06x, PSN %#06x, " "RKey %#08x VAddr %#016Lx\n", rem_dest[i]->lid, rem_dest[i]->qpn, rem_dest[i]->psn, rem_dest[i]->rkey, rem_dest[i]->vaddr); if (user_param.gid_index > -1) { printf(" GID %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", rem_dest[i]->dgid.raw[0],rem_dest[i]->dgid.raw[1], rem_dest[i]->dgid.raw[2], rem_dest[i]->dgid.raw[3], rem_dest[i]->dgid.raw[4], rem_dest[i]->dgid.raw[5], rem_dest[i]->dgid.raw[6], rem_dest[i]->dgid.raw[7], rem_dest[i]->dgid.raw[8], rem_dest[i]->dgid.raw[9], rem_dest[i]->dgid.raw[10], rem_dest[i]->dgid.raw[11], rem_dest[i]->dgid.raw[12], rem_dest[i]->dgid.raw[13], rem_dest[i]->dgid.raw[14], rem_dest[i]->dgid.raw[15]); } if (pp_connect_ctx(ctx, ib_port, my_dest[i].psn, rem_dest[i], &user_param, i)) return 1; /* An additional handshake is required *after* moving qp to RTR. Arbitrarily reuse exch_dest for this purpose. 
*/ if (user_param.servername) { rem_dest[i] = pp_client_exch_dest(sockfd, &my_dest[i], &user_param); } else { rem_dest[i] = pp_server_exch_dest(sockfd, &my_dest[i], &user_param); } } printf("------------------------------------------------------------------\n"); printf(" #bytes #iterations BW peak[MB/sec] BW average[MB/sec] \n"); /* For half duplex tests, server just waits for client to exit */ /* the 0th place is arbitrary to signal finish ... */ if (!user_param.servername && !duplex) { rem_dest[0] = pp_server_exch_dest(sockfd, &my_dest[0], &user_param); if (write(sockfd, "done", sizeof "done") != sizeof "done"){ perror("server write"); fprintf(stderr, "Couldn't write to socket\n"); return 1; } close(sockfd); return 0; } tposted = malloc(user_param.iters * user_param.numofqps * sizeof *tposted); if (!tposted) { perror("malloc"); return 1; } tcompleted = malloc(user_param.iters * user_param.numofqps * sizeof *tcompleted); if (!tcompleted) { perror("malloc"); return 1; } if (user_param.all == ALL) { for (i = 1; i < 24 ; ++i) { size = 1 << i; if(run_iter(ctx, &user_param, rem_dest, size)) return 17; print_report(user_param.iters, size, duplex, tposted, tcompleted, &user_param, noPeak, no_cpu_freq_fail); } } else { if(run_iter(ctx, &user_param, rem_dest, size)) return 18; print_report(user_param.iters, size, duplex, tposted, tcompleted, &user_param, noPeak, no_cpu_freq_fail); } /* the 0th place is arbitrary to signal finish ... 
*/ if (user_param.servername) { rem_dest[0] = pp_client_exch_dest(sockfd, &my_dest[0], &user_param); } else { rem_dest[0] = pp_server_exch_dest(sockfd, &my_dest[0], &user_param); } if (write(sockfd, "done", sizeof "done") != sizeof "done"){ perror("write"); fprintf(stderr, "Couldn't write to socket\n"); return 1; } close(sockfd); free(tposted); free(tcompleted); printf("------------------------------------------------------------------\n"); return 0; } trunk/read_lat.c0000755000175000017500000007417611240605460013502 0ustar benoitbenoit/* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2005 Hewlett Packard, Inc (Grant Grundler) * Copyright (c) 2009 HNR Consulting. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * $Id$ */ #if HAVE_CONFIG_H #include #endif /* HAVE_CONFIG_H */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "get_clock.h" #define PINGPONG_READ_WRID 1 #define VERSION 1.1 #define ALL 1 static int sl = 0; static int page_size; cycles_t *tstamp; struct pingpong_dest my_dest; struct user_parameters { const char *servername; int connection_type; int mtu; int all; /* run all msg size */ int iters; int tx_depth; int sockfd; int max_out_read; int use_event; int qp_timeout; int gid_index; /* if value not negative, we use gid AND gid_index=value */ }; struct report_options { int unsorted; int histogram; int cycles; /* report delta's in cycles, not microsec's */ }; struct pingpong_context { struct ibv_context *context; struct ibv_comp_channel *channel; struct ibv_pd *pd; struct ibv_mr *mr; struct ibv_cq *cq; struct ibv_qp *qp; void *buf; volatile char *post_buf; volatile char *poll_buf; int size; int tx_depth; struct ibv_sge list; struct ibv_send_wr wr; union ibv_gid dgid; }; struct pingpong_dest { int lid; int qpn; int psn; unsigned rkey; unsigned long long vaddr; union ibv_gid dgid; }; static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port) { struct ibv_port_attr attr; if (ibv_query_port(ctx->context, port, &attr)) return 0; return attr.lid; } static struct ibv_device *pp_find_dev(const char *ib_devname) { struct ibv_device **dev_list; struct ibv_device *ib_dev = NULL; dev_list = ibv_get_device_list(NULL); if (!ib_devname) { ib_dev = dev_list[0]; if (!ib_dev) fprintf(stderr, "No IB devices found\n"); } else { for (; (ib_dev = *dev_list); ++dev_list) if (!strcmp(ibv_get_device_name(ib_dev), ib_devname)) 
break; if (!ib_dev) fprintf(stderr, "IB device %s not found\n", ib_devname); } return ib_dev; }

/*
 * Wire format of the address message exchanged over the TCP side channel:
 * fixed-width, colon-separated hexadecimal fields LID:QPN:PSN:RKEY:VADDR,
 * optionally followed by the 16 GID bytes.  The *_SIZE macros count the
 * terminating NUL, which is transmitted as part of the message.
 *
 * The 64-bit address uses the standard "ll" length modifier; "L" is only
 * defined for long double conversions.  The bytes produced are unchanged.
 */
#define KEY_MSG_SIZE (sizeof "0000:000000:000000:00000000:0000000000000000")
#define KEY_PRINT_FMT "%04x:%06x:%06x:%08x:%016llx"
#define KEY_MSG_SIZE_GID (sizeof "0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00")
#define KEY_PRINT_FMT_GID "%04x:%06x:%06x:%08x:%016llx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x"

/* Number of fields in a key message: five scalars, then the GID bytes. */
enum { PP_KEY_SCALARS = 5, PP_GID_BYTES = 16 };

/*
 * Format my_dest and send it on sockfd.
 *
 * The short form is used when user_parm->gid_index is negative, the GID form
 * otherwise.  Returns 0 on success, -1 if the values do not fit the fixed
 * width message or the write is short.
 */
static int pp_write_keys(int sockfd, const struct pingpong_dest *my_dest,
			 struct user_parameters *user_parm)
{
	char msg[KEY_MSG_SIZE_GID];
	size_t msg_len;
	int n;

	if (user_parm->gid_index < 0) {
		msg_len = KEY_MSG_SIZE;
		n = snprintf(msg, msg_len, KEY_PRINT_FMT,
			     my_dest->lid, my_dest->qpn, my_dest->psn,
			     my_dest->rkey, my_dest->vaddr);
	} else {
		const uint8_t *g = my_dest->dgid.raw;

		msg_len = KEY_MSG_SIZE_GID;
		n = snprintf(msg, msg_len, KEY_PRINT_FMT_GID,
			     my_dest->lid, my_dest->qpn, my_dest->psn,
			     my_dest->rkey, my_dest->vaddr,
			     g[0], g[1], g[2], g[3], g[4], g[5], g[6], g[7],
			     g[8], g[9], g[10], g[11], g[12], g[13], g[14], g[15]);
	}

	/* A value wider than its field would otherwise overrun the buffer. */
	if (n < 0 || (size_t)n >= msg_len) {
		fprintf(stderr, "Local address does not fit the key message\n");
		return -1;
	}

	if (write(sockfd, msg, msg_len) != (ssize_t)msg_len) {
		perror("client write");
		fprintf(stderr, "Couldn't send local address\n");
		return -1;
	}

	return 0;
}

/*
 * Parse one hexadecimal field starting at *cursor into *value.
 *
 * When more_fields is non-zero the field must be followed by ':' and *cursor
 * is advanced past it.  Returns 0 on success, -1 if no digits were found or
 * the separator is missing.
 */
static int pp_parse_hex_field(const char **cursor, int more_fields,
			      unsigned long long *value)
{
	char *end;

	*value = strtoull(*cursor, &end, 16);
	if (end == *cursor)
		return -1;

	if (more_fields) {
		if (*end != ':')
			return -1;
		++end;
	}

	*cursor = end;
	return 0;
}

/*
 * Receive the peer's key message from sockfd and decode it into rem_dest.
 *
 * The message form (with or without GID bytes) is selected by
 * user_parm->gid_index, mirroring pp_write_keys().  my_dest is not used.
 * rem_dest->dgid is only written in the GID form.
 * Returns 0 on success, -1 on a short read or a malformed message.
 */
static int pp_read_keys(int sockfd, const struct pingpong_dest *my_dest,
			struct pingpong_dest *rem_dest, struct user_parameters *user_parm)
{
	const int use_gid = user_parm->gid_index >= 0;
	const size_t msg_len = use_gid ? KEY_MSG_SIZE_GID : KEY_MSG_SIZE;
	const int nfields = use_gid ? PP_KEY_SCALARS + PP_GID_BYTES : PP_KEY_SCALARS;
	unsigned long long field[PP_KEY_SCALARS + PP_GID_BYTES];
	char msg[KEY_MSG_SIZE_GID];
	const char *cursor = msg;
	int i;

	(void)my_dest;

	if (read(sockfd, msg, msg_len) != (ssize_t)msg_len) {
		perror("pp_read_keys");
		fprintf(stderr, "Couldn't read remote address\n");
		return -1;
	}
	/* Never trust the peer to have sent the terminator. */
	msg[msg_len - 1] = '\0';

	for (i = 0; i < nfields; ++i) {
		if (pp_parse_hex_field(&cursor, i < nfields - 1, &field[i])) {
			fprintf(stderr, "Couldn't parse line <%.*s>\n", (int)msg_len, msg);
			return -1;
		}
	}

	rem_dest->lid   = (int)field[0];
	rem_dest->qpn   = (int)field[1];
	rem_dest->psn   = (int)field[2];
	rem_dest->rkey  = (unsigned)field[3];
	rem_dest->vaddr = field[4];

	if (use_gid) {
		for (i = 0; i < PP_GID_BYTES; ++i)
			rem_dest->dgid.raw[i] = (unsigned char)field[PP_KEY_SCALARS + i];
	}

	return 0;
}

/*
 * Open a TCP connection to servername:port.
 *
 * Every address returned by getaddrinfo() is tried in turn.
 * Returns the connected socket, or -1 on failure.
 */
static int pp_client_connect(const char *servername, int port)
{
	struct addrinfo hints = {
		.ai_family   = AF_UNSPEC,
		.ai_socktype = SOCK_STREAM
	};
	struct addrinfo *res, *t;
	char *service;
	int sockfd = -1;
	int n;

	if (asprintf(&service, "%d", port) < 0)
		return -1;

	n = getaddrinfo(servername, service, &hints, &res);
	free(service);	/* only needed for the lookup */

	/* getaddrinfo() reports failure with any non-zero value. */
	if (n != 0) {
		fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port);
		return -1;
	}

	for (t = res; t; t = t->ai_next) {
		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
		if (sockfd < 0)
			continue;
		if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
			break;
		close(sockfd);
		sockfd = -1;
	}

	freeaddrinfo(res);

	if (sockfd < 0)
		fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port);

	return sockfd;
}

/*
 * Client side of the address exchange: send our keys first, then read the
 * peer's.  Returns 0 on success, -1 on failure.
 */
static int pp_client_exch_dest(int sockfd, const struct pingpong_dest *my_dest,
			       struct pingpong_dest *rem_dest, struct user_parameters *user_parm)
{
	if (pp_write_keys(sockfd, my_dest, user_parm))
		return -1;

	return pp_read_keys(sockfd, my_dest, rem_dest, user_parm);
}

/*
 * Listen on port and accept a single TCP connection.
 *
 * The listening socket is closed once a peer has been accepted.
 * Returns the connected socket, or a negative value on failure.
 */
static int pp_server_connect(int port)
{
	struct addrinfo hints = {
		.ai_flags    = AI_PASSIVE,
		.ai_family   = AF_UNSPEC,
		.ai_socktype = SOCK_STREAM
	};
	struct addrinfo *res, *t;
	char *service;
	int sockfd = -1, connfd;
	int n;

	if (asprintf(&service, "%d", port) < 0)
		return -1;

	n = getaddrinfo(NULL, service, &hints, &res);
	free(service);	/* only needed for the lookup */

	/* getaddrinfo() reports failure with any non-zero value. */
	if (n != 0) {
		fprintf(stderr, "%s for port %d\n", gai_strerror(n), port);
		return -1;
	}

	for (t = res; t; t = t->ai_next) {
		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
		if (sockfd < 0)
			continue;

		n = 1;
		setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n);

		if (!bind(sockfd, t->ai_addr, t->ai_addrlen))
			break;
		close(sockfd);
		sockfd = -1;
	}

	freeaddrinfo(res);

	if (sockfd < 0) {
		fprintf(stderr, "Couldn't listen to port %d\n", port);
		return sockfd;
	}

	if (listen(sockfd, 1)) {
		perror("server listen");
		close(sockfd);
		return -1;
	}

	connfd = accept(sockfd, NULL, NULL);
	if (connfd < 0) {
		perror("server accept");
		fprintf(stderr, "accept() failed\n");
	}

	close(sockfd);
	return connfd;
}

/*
 * Server side of the address exchange: read the peer's keys first, then send
 * ours.  Returns 0 on success, -1 on failure.
 */
static int pp_server_exch_dest(int sockfd, const struct pingpong_dest *my_dest,
			       struct pingpong_dest* rem_dest, struct user_parameters *user_parm)
{
	if (pp_read_keys(sockfd, my_dest, rem_dest, user_parm))
		return -1;

	return pp_write_keys(sockfd, my_dest, user_parm);
}

static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, int tx_depth, int port, struct user_parameters *user_parm) { struct pingpong_context *ctx; struct ibv_device_attr device_attr; ctx = malloc(sizeof *ctx); if (!ctx) return NULL; 
ctx->size = size; ctx->tx_depth = tx_depth; ctx->buf = memalign(page_size, size * 2); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); return NULL; } memset(ctx->buf, 0, size * 2); ctx->post_buf = (char*)ctx->buf + (size - 1); ctx->poll_buf = (char*)ctx->buf + (2 * size - 1); ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(ib_dev)); return NULL; } if (user_parm->mtu == 0) {/*user did not ask for specific mtu */ if (ibv_query_device(ctx->context, &device_attr)) { fprintf(stderr, "Failed to query device props"); return NULL; } if (device_attr.vendor_part_id == 23108 || user_parm->gid_index > -1) { user_parm->mtu = 1024; } else { user_parm->mtu = 2048; } } if (user_parm->use_event) { ctx->channel = ibv_create_comp_channel(ctx->context); if (!ctx->channel) { fprintf(stderr, "Couldn't create completion channel\n"); return NULL; } } else ctx->channel = NULL; ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "Couldn't allocate PD\n"); return NULL; } ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size * 2, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ); if (!ctx->mr) { fprintf(stderr, "Couldn't allocate MR\n"); return NULL; } ctx->cq = ibv_create_cq(ctx->context, tx_depth, NULL, ctx->channel, 0); if (!ctx->cq) { fprintf(stderr, "Couldn't create CQ\n"); return NULL; } { struct ibv_qp_init_attr attr; memset(&attr, 0, sizeof(struct ibv_qp_init_attr)); attr.send_cq = ctx->cq; attr.recv_cq = ctx->cq; attr.cap.max_send_wr = tx_depth; /* Work around: driver doesnt support * recv_wr = 0 */ attr.cap.max_recv_wr = 1; attr.cap.max_send_sge = 1; attr.cap.max_recv_sge = 1; if (user_parm->connection_type==1) { attr.qp_type = IBV_QPT_UC; } else { attr.qp_type = IBV_QPT_RC; } ctx->qp = ibv_create_qp(ctx->pd, &attr); if (!ctx->qp) { fprintf(stderr, "Couldn't create QP\n"); return NULL; } } { struct ibv_qp_attr attr = { .qp_state = IBV_QPS_INIT, 
.pkey_index = 0, .port_num = port, .qp_access_flags = IBV_ACCESS_REMOTE_READ }; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify QP to INIT\n"); return NULL; } } ctx->wr.wr_id = PINGPONG_READ_WRID; ctx->wr.sg_list = &ctx->list; ctx->wr.num_sge = 1; ctx->wr.opcode = IBV_WR_RDMA_READ; ctx->wr.send_flags = IBV_SEND_SIGNALED; ctx->wr.next = NULL; return ctx; } static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn, struct pingpong_dest *dest,struct user_parameters *user_parm) { struct ibv_qp_attr attr; memset(&attr, 0, sizeof(struct ibv_qp_attr)); attr.qp_state = IBV_QPS_RTR; switch (user_parm->mtu) { case 256 : attr.path_mtu = IBV_MTU_256; break; case 512 : attr.path_mtu = IBV_MTU_512; break; case 1024 : attr.path_mtu = IBV_MTU_1024; break; case 2048 : attr.path_mtu = IBV_MTU_2048; break; case 4096 : attr.path_mtu = IBV_MTU_4096; break; } printf("Mtu : %d\n", user_parm->mtu); attr.dest_qp_num = dest->qpn; attr.rq_psn = dest->psn; attr.max_dest_rd_atomic = user_parm->max_out_read; attr.min_rnr_timer = 12; if (user_parm->gid_index < 0) { attr.ah_attr.is_global = 0; attr.ah_attr.dlid = dest->lid; attr.ah_attr.sl = sl; } else { attr.ah_attr.is_global = 1; attr.ah_attr.grh.dgid = dest->dgid; attr.ah_attr.grh.hop_limit = 1; attr.ah_attr.sl = 0; } attr.ah_attr.src_path_bits = 0; attr.ah_attr.port_num = port; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MIN_RNR_TIMER | IBV_QP_MAX_DEST_RD_ATOMIC)) { fprintf(stderr, "Failed to modify RC QP to RTR\n"); return 1; } attr.timeout = user_parm->qp_timeout; attr.retry_cnt = 7; attr.rnr_retry = 7; attr.qp_state = IBV_QPS_RTS; attr.sq_psn = my_psn; if (user_parm->connection_type==0) { attr.max_rd_atomic = user_parm->max_out_read; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | 
IBV_QP_MAX_QP_RD_ATOMIC)) { fprintf(stderr, "Failed to modify RC QP to RTS\n"); return 1; } } else { if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) { fprintf(stderr, "Failed to modify UC QP to RTS\n"); return 1; } } return 0; } static int pp_open_port(struct pingpong_context *ctx, const char * servername, int ib_port, int port, struct pingpong_dest *rem_dest, struct user_parameters *user_parm) { char addr_fmt[] = "%8s address: LID %#04x QPN %#06x PSN %#06x RKey %#08x VAddr %#016Lx\n"; int sockfd; int rc; union ibv_gid gid; /* Create connection between client and server. * We do it by exchanging data over a TCP socket connection. */ if (user_parm->gid_index != -1) { int err=0; err = ibv_query_gid (ctx->context, ib_port, user_parm->gid_index, &gid); if (err) { return -1; } ctx->dgid=gid; } my_dest.lid = pp_get_local_lid(ctx, ib_port); my_dest.dgid = gid; my_dest.qpn = ctx->qp->qp_num; my_dest.psn = lrand48() & 0xffffff; if (user_parm->gid_index < 0) {/*We do not fail test upon lid in RDMAoE/Eth conf*/ if (!my_dest.lid) { fprintf(stderr, "Local lid 0x0 detected. Is an SM running? If you are running on an RMDAoE interface you must use GIDs\n"); return 1; } } my_dest.rkey = ctx->mr->rkey; my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size; printf(addr_fmt, "local", my_dest.lid, my_dest.qpn, my_dest.psn, my_dest.rkey, my_dest.vaddr); if (user_parm->gid_index > -1) { printf(" GID: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", my_dest.dgid.raw[0],my_dest.dgid.raw[1], my_dest.dgid.raw[2], my_dest.dgid.raw[3], my_dest.dgid.raw[4], my_dest.dgid.raw[5], my_dest.dgid.raw[6], my_dest.dgid.raw[7], my_dest.dgid.raw[8], my_dest.dgid.raw[9], my_dest.dgid.raw[10], my_dest.dgid.raw[11], my_dest.dgid.raw[12], my_dest.dgid.raw[13], my_dest.dgid.raw[14], my_dest.dgid.raw[15]); } sockfd = servername ? 
pp_client_connect(servername, port) : pp_server_connect(port); if (sockfd < 0) { printf("pp_connect_sock(%s,%d) failed (%d)!\n", servername, port, sockfd); return sockfd; } rc = servername ? pp_client_exch_dest(sockfd, &my_dest, rem_dest, user_parm) : pp_server_exch_dest(sockfd, &my_dest, rem_dest, user_parm); if (rc) return rc; printf(addr_fmt, "remote", rem_dest->lid, rem_dest->qpn, rem_dest->psn, rem_dest->rkey, rem_dest->vaddr); if (user_parm->gid_index > -1) { printf(" GID: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", rem_dest->dgid.raw[0],rem_dest->dgid.raw[1], rem_dest->dgid.raw[2], rem_dest->dgid.raw[3], rem_dest->dgid.raw[4], rem_dest->dgid.raw[5], rem_dest->dgid.raw[6], rem_dest->dgid.raw[7], rem_dest->dgid.raw[8], rem_dest->dgid.raw[9], rem_dest->dgid.raw[10], rem_dest->dgid.raw[11], rem_dest->dgid.raw[12], rem_dest->dgid.raw[13], rem_dest->dgid.raw[14], rem_dest->dgid.raw[15]); } if ((rc = pp_connect_ctx(ctx, ib_port, my_dest.psn, rem_dest,user_parm))) return rc; /* An additional handshake is required *after* moving qp to RTR. * Arbitrarily reuse exch_dest for this purpose. */ rc = servername ? pp_client_exch_dest(sockfd, &my_dest, rem_dest, user_parm) : pp_server_exch_dest(sockfd, &my_dest, rem_dest, user_parm); if (rc) return rc; return sockfd; } static void usage(const char *argv0) { printf("Usage:\n"); printf(" %s start a server and wait for connection\n", argv0); printf(" %s connect to server at \n", argv0); printf("\n"); printf("Options:\n"); printf(" -p, --port= listen on/connect to port (default 18515)\n"); printf(" -c, --connection= connection type RC/UC (default RC)\n"); printf(" -m, --mtu= mtu size (256 - 4096. 
default for hermon is 2048)\n"); printf(" -d, --ib-dev= use IB device (default first device found)\n"); printf(" -i, --ib-port= use port of IB device (default 1)\n"); printf(" -s, --size= size of message to exchange (default 1)\n"); printf(" -t, --tx-depth= size of tx queue (default 50)\n"); printf(" -n, --iters= number of exchanges (at least 2, default 1000)\n"); printf(" -o, --outs= num of outstanding read/atom(default 4)\n"); printf(" -u, --qp-timeout= QP timeout, timeout value is 4 usec * 2 ^(timeout), default 14\n"); printf(" -S, --sl= SL (default 0)\n"); printf(" -x, --gid-index= test uses GID with GID index taken from command line (for RDMAoE index should be 0)\n"); printf(" -a, --all Run sizes from 2 till 2^23\n"); printf(" -C, --report-cycles report times in cpu cycle units (default microseconds)\n"); printf(" -H, --report-histogram print out all results (default print summary only)\n"); printf(" -U, --report-unsorted (implies -H) print out unsorted results (default sorted)\n"); printf(" -V, --version display version number\n"); printf(" -e, --events sleep on CQ events (default poll)\n"); printf(" -F, --CPU-freq do not fail test on different cpu frequencies\n"); } /* * When there is an * odd number of samples, the median is the middle number. * even number of samples, the median is the mean of the * two middle numbers. 
* */ static inline cycles_t get_median(int n, cycles_t delta[]) { if ((n - 1) % 2) return (delta[n / 2] + delta[n / 2 - 1]) / 2; else return delta[n / 2]; } static int cycles_compare(const void * aptr, const void * bptr) { const cycles_t *a = aptr; const cycles_t *b = bptr; if (*a < *b) return -1; if (*a > *b) return 1; return 0; } static void print_report(struct report_options * options, unsigned int iters, cycles_t *tstamp,int size, int no_cpu_freq_fail) { double cycles_to_units; cycles_t median; unsigned int i; const char* units; cycles_t *delta = malloc((iters - 1) * sizeof *delta); if (!delta) { perror("malloc"); return; } for (i = 0; i < iters - 1; ++i) delta[i] = tstamp[i + 1] - tstamp[i]; if (options->cycles) { cycles_to_units = 1; units = "cycles"; } else { cycles_to_units = get_cpu_mhz(no_cpu_freq_fail); units = "usec"; } if (options->unsorted) { printf("#, %s\n", units); for (i = 0; i < iters - 1; ++i) printf("%d, %g\n", i + 1, delta[i] / cycles_to_units ); } qsort(delta, iters - 1, sizeof *delta, cycles_compare); if (options->histogram) { printf("#, %s\n", units); for (i = 0; i < iters - 1; ++i) printf("%d, %g\n", i + 1, delta[i] / cycles_to_units ); } median = get_median(iters - 1, delta); printf("%7d %d %7.2f %7.2f %7.2f\n", size,iters,delta[0] / cycles_to_units , delta[iters - 2] / cycles_to_units ,median / cycles_to_units ); free(delta); } int run_iter(struct pingpong_context *ctx, struct user_parameters *user_param, struct pingpong_dest *rem_dest, int size) { struct ibv_qp *qp; struct ibv_send_wr *wr; volatile char *poll_buf; volatile char *post_buf; int scnt, ccnt; int iters; int tx_depth; struct ibv_wc wc; int ne; if (!user_param->servername) return 0; iters = user_param->iters; tx_depth = user_param->tx_depth; wr = &ctx->wr; ctx->list.addr = (uintptr_t) ctx->buf; ctx->list.length = size; ctx->list.lkey = ctx->mr->lkey; wr->wr.rdma.remote_addr = rem_dest->vaddr; wr->wr.rdma.rkey = rem_dest->rkey; scnt = 0; ccnt = 0; poll_buf = ctx->poll_buf; 
post_buf = ctx->post_buf; qp = ctx->qp; /* Done with setup. Start the test. */ while (scnt < user_param->iters ) { struct ibv_send_wr *bad_wr; *post_buf = (char)++scnt; tstamp[scnt - 1] = get_cycles(); if (ibv_post_send(qp, wr, &bad_wr)) { fprintf(stderr, "Couldn't post send: scnt=%d\n", scnt); return 11; } if (user_param->use_event) { struct ibv_cq *ev_cq; void *ev_ctx; if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) { fprintf(stderr, "Failed to get cq_event\n"); return 1; } if (ev_cq != ctx->cq) { fprintf(stderr, "CQ event for unknown RCQ %p\n", ev_cq); return 1; } if (ibv_req_notify_cq(ctx->cq, 0)) { fprintf(stderr, "Couldn't request CQ notification\n"); return 1; } } do { ne = ibv_poll_cq(ctx->cq, 1, &wc); } while (!user_param->use_event && ne < 1); if (ne < 0) { fprintf(stderr, "poll CQ failed %d\n", ne); return 12; } if (wc.status != IBV_WC_SUCCESS) { fprintf(stderr, "Completion wth error at %s:\n", user_param->servername ? "client" : "server"); fprintf(stderr, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); fprintf(stderr, "scnt=%d, ccnt=%d\n", scnt, ccnt); return 13; } } return 0; } int main(int argc, char *argv[]) { const char *ib_devname = NULL; int port = 18515; int ib_port = 1; int size = 2; int tmp_size; int i = 0; struct report_options report = {}; struct pingpong_context *ctx; struct pingpong_dest rem_dest; struct ibv_device *ib_dev; struct user_parameters user_param; int no_cpu_freq_fail = 0; /* init default values to user's parameters */ memset(&user_param, 0, sizeof(struct user_parameters)); user_param.mtu = 0; user_param.iters = 1000; user_param.tx_depth = 50; user_param.servername = NULL; user_param.use_event = 0; user_param.max_out_read = 4; /* the device capability on gen2 */ user_param.qp_timeout = 14; user_param.gid_index = -1; /*gid will not be used*/ /* Parameter parsing. 
*/ while (1) { int c; static struct option long_options[] = { { .name = "port", .has_arg = 1, .val = 'p' }, { .name = "connection", .has_arg = 1, .val = 'c' }, { .name = "mtu", .has_arg = 1, .val = 'm' }, { .name = "ib-dev", .has_arg = 1, .val = 'd' }, { .name = "ib-port", .has_arg = 1, .val = 'i' }, { .name = "size", .has_arg = 1, .val = 's' }, { .name = "iters", .has_arg = 1, .val = 'n' }, { .name = "outs", .has_arg = 1, .val = 'o' }, { .name = "tx-depth", .has_arg = 1, .val = 't' }, { .name = "qp-timeout", .has_arg = 1, .val = 'u' }, { .name = "sl", .has_arg = 1, .val = 'S' }, { .name = "gid-index", .has_arg = 1, .val = 'x' }, { .name = "all", .has_arg = 0, .val = 'a' }, { .name = "report-cycles", .has_arg = 0, .val = 'C' }, { .name = "report-histogram",.has_arg = 0, .val = 'H' }, { .name = "report-unsorted",.has_arg = 0, .val = 'U' }, { .name = "version", .has_arg = 0, .val = 'V' }, { .name = "events", .has_arg = 0, .val = 'e' }, { .name = "CPU-freq", .has_arg = 0, .val = 'F' }, { 0 } }; c = getopt_long(argc, argv, "p:c:m:d:i:s:o:n:t:u:S:x:aeHUVF", long_options, NULL); if (c == -1) break; switch (c) { case 'p': port = strtol(optarg, NULL, 0); if (port < 0 || port > 65535) { usage(argv[0]); return 1; } break; case 'c': if (strcmp("UC",optarg)==0) user_param.connection_type=1; /* default is 0 for any other option RC*/ break; case 'e': ++user_param.use_event; break; case 'm': user_param.mtu = strtol(optarg, NULL, 0); break; case 'o': user_param.max_out_read = strtol(optarg, NULL, 0); break; case 'a': user_param.all = ALL; break; case 'V': printf("perftest version : %.2f\n",VERSION); return 0; break; case 'd': ib_devname = strdupa(optarg); break; case 'i': ib_port = strtol(optarg, NULL, 0); if (ib_port < 0) { usage(argv[0]); return 2; } break; case 's': size = strtol(optarg, NULL, 0); if (size < 1) { usage(argv[0]); return 3; } break; case 't': user_param.tx_depth = strtol(optarg, NULL, 0); if (user_param.tx_depth < 1) { usage(argv[0]); return 4; } break; case 'n': 
user_param.iters = strtol(optarg, NULL, 0); if (user_param.iters < 2) { usage(argv[0]); return 5; } break; case 'C': report.cycles = 1; break; case 'H': report.histogram = 1; break; case 'U': report.unsorted = 1; break; case 'F': no_cpu_freq_fail = 1; break; case 'u': user_param.qp_timeout = strtol(optarg, NULL, 0); break; case 'S': sl = strtol(optarg, NULL, 0); if (sl > 15) { usage(argv[0]); return 5; } break; case 'x': user_param.gid_index = strtol(optarg, NULL, 0); if (user_param.gid_index > 63) { usage(argv[0]); return 1; } break; default: usage(argv[0]); return 6; } } if (optind == argc - 1) user_param.servername = strdupa(argv[optind]); else if (optind < argc) { usage(argv[0]); return 6; } /* * Done with parameter parsing. Perform setup. */ tstamp = malloc(user_param.iters * sizeof *tstamp); if (!tstamp) { perror("malloc"); return 10; } printf("------------------------------------------------------------------\n"); printf(" RDMA_Read Latency Test\n"); printf("Connection type : RC\n"); /* anyway make sure the connection is RC */ if (user_param.gid_index > -1) { printf("Using GID to support RDMAoE configuration. 
Refer to port type as Ethernet, default MTU 1024B\n"); } tmp_size = size; if (user_param.all == ALL) { /*since we run all sizes */ size = 8388608; /*2^23 */ } else if (size < 128) { /* can cut up to 70 nsec probably related to cache line size */ size = 128; } user_param.connection_type = 0; srand48(getpid() * time(NULL)); page_size = sysconf(_SC_PAGESIZE); ib_dev = pp_find_dev(ib_devname); if (!ib_dev) return 7; ctx = pp_init_ctx(ib_dev, size, user_param.tx_depth, ib_port,&user_param); if (!ctx) return 8; user_param.sockfd=pp_open_port(ctx, user_param.servername, ib_port, port, &rem_dest,&user_param); if (user_param.sockfd==-1) { return 9; } /* fix for true size in small msg size */ if (tmp_size < 128) { size = tmp_size ; } if (user_param.use_event) { printf("Test with events.\n"); if (ibv_req_notify_cq(ctx->cq, 0)) { fprintf(stderr, "Couldn't request RCQ notification\n"); return 1; } } printf("------------------------------------------------------------------\n"); printf(" #bytes #iterations t_min[usec] t_max[usec] t_typical[usec]\n"); if (user_param.all == ALL) { for (i = 1; i < 24 ; ++i) { size = 1 << i; if(run_iter(ctx, &user_param, &rem_dest, size)) return 17; if(user_param.servername) { print_report(&report, user_param.iters, tstamp, size, no_cpu_freq_fail); } } } else { if(run_iter(ctx, &user_param, &rem_dest, size)) return 18; if(user_param.servername) { print_report(&report, user_param.iters, tstamp, size, no_cpu_freq_fail); } } /* done close sockets */ if(user_param.servername) { /*Signal client is finished */ pp_client_exch_dest(user_param.sockfd, &my_dest, &rem_dest, &user_param); if (write(user_param.sockfd, "done", sizeof "done") != sizeof "done"){ perror("client write"); fprintf(stderr, "Couldn't write to socket\n"); return 1; } close(user_param.sockfd); } else { /*Server is finished wait for client */ pp_server_exch_dest(user_param.sockfd, &my_dest, &rem_dest, &user_param); if (write(user_param.sockfd, "done", sizeof "done") != sizeof "done"){ 
perror("server write"); fprintf(stderr, "Couldn't write to socket\n"); return 1; } close(user_param.sockfd); } printf("------------------------------------------------------------------\n"); free(tstamp); return 0; } trunk/send_lat.c0000755000175000017500000011364211237107527013517 0ustar benoitbenoit/* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2005 Hewlett Packard, Inc (Grant Grundler) * Copyright (c) 2009 HNR Consulting. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* * $Id$ */ #if HAVE_CONFIG_H #include #endif /* HAVE_CONFIG_H */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "get_clock.h" #define PINGPONG_SEND_WRID 1 #define PINGPONG_RECV_WRID 2 #define RC 0 #define UC 1 #define UD 3 #define VERSION 1.1 #define SIGNAL 1 #define MAX_INLINE 400 #define MCG_LID 0xc001 #define MCG_GID {255,1,0,0,0,2,201,133,0,0,0,0,0,0,0,0} static int sl = 0; static int page_size; cycles_t *tstamp; struct user_parameters { const char *servername; int connection_type; int mtu; int signal_comp; int all; /* run all msg size */ int iters; int tx_depth; int use_event; int inline_size; int use_mcg; int qp_timeout; int gid_index; /* if value not negative, we use gid AND gid_index=value */ }; struct report_options { int unsorted; int histogram; int cycles; /* report delta's in cycles, not microsec's */ }; struct pingpong_context { struct ibv_sge list; struct ibv_sge recv_list; struct ibv_send_wr wr; struct ibv_recv_wr rwr; struct ibv_context *context; struct ibv_comp_channel *channel; struct ibv_pd *pd; struct ibv_mr *mr; struct ibv_cq *scq; struct ibv_cq *rcq; struct ibv_qp *qp; struct ibv_ah *ah; void *buf; volatile char *post_buf; volatile char *poll_buf; int size; int tx_depth; union ibv_gid dgid; }; struct pingpong_dest { unsigned long long vaddr; int lid; int qpn; int psn; unsigned int rkey; union ibv_gid dgid; }; static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port) { struct ibv_port_attr attr; if (ibv_query_port(ctx->context, port, &attr)) return 0; return attr.lid; } static struct ibv_device *pp_find_dev(const char *ib_devname) { struct ibv_device **dev_list; struct ibv_device *ib_dev = NULL; dev_list = ibv_get_device_list(NULL); if (!ib_devname) { ib_dev = dev_list[0]; if (!ib_dev) fprintf(stderr, "No IB devices found\n"); } else { for (; (ib_dev = *dev_list); ++dev_list) if (!strcmp(ibv_get_device_name(ib_dev), ib_devname)) break; if 
(!ib_dev) fprintf(stderr, "IB device %s not found\n", ib_devname); } return ib_dev; } #define KEY_MSG_SIZE (sizeof "0000:000000:000000:00000000:0000000000000000") #define KEY_PRINT_FMT "%04x:%06x:%06x:%08x:%016Lx" #define KEY_MSG_SIZE_GID (sizeof "0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00") #define KEY_PRINT_FMT_GID "%04x:%06x:%06x:%08x:%016Lx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x" static int pp_write_keys(int sockfd, const struct pingpong_dest *my_dest, struct user_parameters *user_parm) { if (user_parm->gid_index < 0) { char msg[KEY_MSG_SIZE]; sprintf(msg, KEY_PRINT_FMT, my_dest->lid, my_dest->qpn, my_dest->psn, my_dest->rkey, my_dest->vaddr); if (write(sockfd, msg, sizeof msg) != sizeof msg) { perror("client write"); fprintf(stderr, "Couldn't send local address\n"); return -1; } return 0; } else { char msg[KEY_MSG_SIZE_GID]; sprintf(msg, KEY_PRINT_FMT_GID, my_dest->lid, my_dest->qpn, my_dest->psn, my_dest->rkey, my_dest->vaddr, my_dest->dgid.raw[0], my_dest->dgid.raw[1], my_dest->dgid.raw[2], my_dest->dgid.raw[3], my_dest->dgid.raw[4], my_dest->dgid.raw[5], my_dest->dgid.raw[6], my_dest->dgid.raw[7], my_dest->dgid.raw[8], my_dest->dgid.raw[9], my_dest->dgid.raw[10], my_dest->dgid.raw[11], my_dest->dgid.raw[12], my_dest->dgid.raw[13], my_dest->dgid.raw[14], my_dest->dgid.raw[15]); if (write(sockfd, msg, sizeof msg) != sizeof msg) { perror("client write"); fprintf(stderr, "Couldn't send local address\n"); return -1; } return 0; } } static int pp_read_keys(int sockfd, const struct pingpong_dest *my_dest, struct pingpong_dest *rem_dest, struct user_parameters *user_parm) { if (user_parm->gid_index < 0) { int parsed; char msg[KEY_MSG_SIZE]; if (read(sockfd, msg, sizeof msg) != sizeof msg) { perror("pp_read_keys"); fprintf(stderr, "Couldn't read remote address\n"); return -1; } parsed = sscanf(msg, KEY_PRINT_FMT, &rem_dest->lid, &rem_dest->qpn, &rem_dest->psn, &rem_dest->rkey, 
&rem_dest->vaddr); if (parsed != 5) { fprintf(stderr, "Couldn't parse line <%.*s>\n", (int)sizeof msg, msg); return -1; } return 0; } else { char msg[KEY_MSG_SIZE_GID]; if (read(sockfd, msg, sizeof msg) != sizeof msg) { perror("pp_read_keys"); fprintf(stderr, "Couldn't read remote address\n"); return -1; } char *pstr = msg, *term; char tmp[20]; int i; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->lid = (int)strtol(tmp, NULL, 16); // LID pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->qpn = (int)strtol(tmp, NULL, 16); // QPN pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->psn = (int)strtol(tmp, NULL, 16); // PSN pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->rkey = (unsigned)strtol(tmp, NULL, 16); // RKEY pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->vaddr = strtoull(tmp, NULL, 16); // VA for (i = 0; i < 15; ++i) { pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->dgid.raw[i] = (unsigned char)strtoll(tmp, NULL, 16); } pstr += term - pstr + 1; strcpy(tmp, pstr); rem_dest->dgid.raw[15] = (unsigned char)strtoll(tmp, NULL, 16); return 0; } } static int pp_client_connect(const char *servername, int port) { struct addrinfo *res, *t; struct addrinfo hints = { .ai_family = AF_UNSPEC, .ai_socktype = SOCK_STREAM }; char *service; int n; int sockfd = -1; if (asprintf(&service, "%d", port) < 0) return -1; n = getaddrinfo(servername, service, &hints, &res); if (n < 0) { fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port); return n; } for (t = res; t; t = t->ai_next) { sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { if (!connect(sockfd, t->ai_addr, 
t->ai_addrlen)) break; close(sockfd); sockfd = -1; } } freeaddrinfo(res); if (sockfd < 0) { fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port); return sockfd; } return sockfd; } static int pp_client_exch_dest(int sockfd, const struct pingpong_dest *my_dest, struct pingpong_dest *rem_dest, struct user_parameters *user_parm) { if (pp_write_keys(sockfd, my_dest, user_parm)) return -1; return pp_read_keys(sockfd, my_dest, rem_dest, user_parm); } static int pp_server_connect(int port) { struct addrinfo *res, *t; struct addrinfo hints = { .ai_flags = AI_PASSIVE, .ai_family = AF_UNSPEC, .ai_socktype = SOCK_STREAM }; char *service; int sockfd = -1, connfd; int n; if (asprintf(&service, "%d", port) < 0) return -1; n = getaddrinfo(NULL, service, &hints, &res); if (n < 0) { fprintf(stderr, "%s for port %d\n", gai_strerror(n), port); return n; } for (t = res; t; t = t->ai_next) { sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { n = 1; setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n); if (!bind(sockfd, t->ai_addr, t->ai_addrlen)) break; close(sockfd); sockfd = -1; } } freeaddrinfo(res); if (sockfd < 0) { fprintf(stderr, "Couldn't listen to port %d\n", port); return sockfd; } listen(sockfd, 1); connfd = accept(sockfd, NULL, 0); if (connfd < 0) { perror("server accept"); fprintf(stderr, "accept() failed\n"); close(sockfd); return connfd; } close(sockfd); return connfd; } static int pp_server_exch_dest(int sockfd, const struct pingpong_dest *my_dest, struct pingpong_dest* rem_dest, struct user_parameters *user_parm) { if (pp_read_keys(sockfd, my_dest, rem_dest, user_parm)) return -1; return pp_write_keys(sockfd, my_dest, user_parm); } static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, int tx_depth, int port,struct user_parameters *user_parm) { struct pingpong_context *ctx; struct ibv_device_attr device_attr; ctx = malloc(sizeof *ctx); if (!ctx) return NULL; ctx->size = size; ctx->tx_depth = 
tx_depth; /* in case of UD need space for the GRH */ if (user_parm->connection_type==UD) { ctx->buf = memalign(page_size, ( size + 40 ) * 2); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); return NULL; } memset(ctx->buf, 0, ( size + 40 ) * 2); } else { ctx->buf = memalign(page_size, size * 2); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); return NULL; } memset(ctx->buf, 0, size * 2); } ctx->post_buf = (char*)ctx->buf + (size - 1); ctx->poll_buf = (char*)ctx->buf + (2 * size - 1); ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(ib_dev)); return NULL; } if (user_parm->mtu == 0) {/*user did not ask for specific mtu */ if (ibv_query_device(ctx->context, &device_attr)) { fprintf(stderr, "Failed to query device props"); return NULL; } if (device_attr.vendor_part_id == 23108 || user_parm->gid_index > -1) { user_parm->mtu = 1024; } else { user_parm->mtu = 2048; } } if (user_parm->use_event) { ctx->channel = ibv_create_comp_channel(ctx->context); if (!ctx->channel) { fprintf(stderr, "Couldn't create completion channel\n"); return NULL; } } else ctx->channel = NULL; ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "Couldn't allocate PD\n"); return NULL; } if (user_parm->connection_type==UD) { ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, (size + 40 ) * 2, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "Couldn't allocate MR\n"); return NULL; } } else { ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size * 2, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "Couldn't allocate MR\n"); return NULL; } } ctx->scq = ibv_create_cq(ctx->context, tx_depth, NULL, ctx->channel, 0); if (!ctx->scq) { fprintf(stderr, "Couldn't create CQ\n"); return NULL; } ctx->rcq = ibv_create_cq(ctx->context, tx_depth, NULL, ctx->channel, 0); if (!ctx->rcq) { fprintf(stderr, "Couldn't create Recieve CQ\n"); 
return NULL; } { struct ibv_qp_init_attr attr; memset(&attr, 0, sizeof(struct ibv_qp_init_attr)); attr.send_cq = ctx->scq; attr.recv_cq = ctx->rcq; attr.cap.max_send_wr = tx_depth; /* Work around: driver doesnt support * recv_wr = 0 */ attr.cap.max_recv_wr = tx_depth; attr.cap.max_send_sge = 1; attr.cap.max_recv_sge = 1; attr.cap.max_inline_data = user_parm->inline_size; switch (user_parm->connection_type) { case RC : attr.qp_type = IBV_QPT_RC; break; case UC : attr.qp_type = IBV_QPT_UC; break; case UD : attr.qp_type = IBV_QPT_UD; break; default: fprintf(stderr, "Unknown connection type %d \n",user_parm->connection_type); return NULL; } attr.sq_sig_all = 0; ctx->qp = ibv_create_qp(ctx->pd, &attr); if (!ctx->qp) { fprintf(stderr, "Couldn't create QP\n"); return NULL; } } { struct ibv_qp_attr attr; memset(&attr, 0, sizeof(struct ibv_qp_init_attr)); attr.qp_state = IBV_QPS_INIT; attr.pkey_index = 0; attr.port_num = port; if (user_parm->connection_type==UD) { attr.qkey = 0x11111111; } else { attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE; } if (user_parm->connection_type==UD) { if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY)) { fprintf(stderr, "Failed to modify UD QP to INIT\n"); return NULL; } if (user_parm->use_mcg) { union ibv_gid gid; uint8_t mcg_gid[16] = MCG_GID; /* use the local QP number as part of the mcg */ mcg_gid[11] = (user_parm->servername) ? 
0 : 1; *(uint32_t *)(&mcg_gid[12]) = ctx->qp->qp_num; memcpy(gid.raw, mcg_gid, 16); if (ibv_attach_mcast(ctx->qp, &gid, MCG_LID)) { fprintf(stderr, "Couldn't attach QP to mcg\n"); return NULL; } } } else if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify QP to INIT\n"); return NULL; } } //send ctx->wr.wr_id = PINGPONG_SEND_WRID; ctx->wr.sg_list = &ctx->list; ctx->wr.num_sge = 1; ctx->wr.opcode = IBV_WR_SEND; ctx->wr.next = NULL; //recieve ctx->rwr.wr_id = PINGPONG_RECV_WRID; ctx->rwr.sg_list = &ctx->recv_list; ctx->rwr.num_sge = 1; ctx->rwr.next = NULL; return ctx; } static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn, struct pingpong_dest *dest,struct user_parameters *user_parm) { struct ibv_qp_attr attr; memset(&attr, 0, sizeof(struct ibv_qp_attr)); attr.qp_state = IBV_QPS_RTR; if (user_parm->connection_type != UD) { switch (user_parm->mtu) { case 256 : attr.path_mtu = IBV_MTU_256; break; case 512 : attr.path_mtu = IBV_MTU_512; break; case 1024 : attr.path_mtu = IBV_MTU_1024; break; case 2048 : attr.path_mtu = IBV_MTU_2048; break; case 4096 : attr.path_mtu = IBV_MTU_4096; break; } printf("Mtu : %d\n", user_parm->mtu); attr.dest_qp_num = dest->qpn; attr.rq_psn = dest->psn; } if (user_parm->connection_type==RC) { attr.max_dest_rd_atomic = 1; attr.min_rnr_timer = 12; } if (user_parm->gid_index < 0) { attr.ah_attr.is_global = 0; attr.ah_attr.dlid = dest->lid; attr.ah_attr.sl = sl; } else { attr.ah_attr.is_global = 1; attr.ah_attr.grh.dgid = dest->dgid; attr.ah_attr.grh.hop_limit = 1; attr.ah_attr.sl = 0; } attr.ah_attr.src_path_bits = 0; attr.ah_attr.port_num = port; if ((user_parm->connection_type==UD) && (user_parm->use_mcg)) { uint8_t mcg_gid[16] = MCG_GID; /* send the message to the mcg of the other side */ mcg_gid[11] = (user_parm->servername) ? 
1 : 0; *(uint32_t *)(&mcg_gid[12]) = dest->qpn; attr.ah_attr.dlid = MCG_LID; attr.ah_attr.is_global = 1; attr.ah_attr.grh.sgid_index = 0; memcpy(attr.ah_attr.grh.dgid.raw, mcg_gid, 16); } if (user_parm->connection_type==RC) { if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MIN_RNR_TIMER | IBV_QP_MAX_DEST_RD_ATOMIC)) { fprintf(stderr, "Failed to modify RC QP to RTR\n"); return 1; } attr.timeout = user_parm->qp_timeout; attr.retry_cnt = 7; attr.rnr_retry = 7; } else if (user_parm->connection_type==UC) { if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN)) { fprintf(stderr, "Failed to modify UC QP to RTR\n"); return 1; } } else { if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE )) { fprintf(stderr, "Failed to modify UC QP to RTR\n"); return 1; } } attr.qp_state = IBV_QPS_RTS; attr.sq_psn = my_psn; if (user_parm->connection_type==RC) { attr.max_rd_atomic = 1; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_MAX_QP_RD_ATOMIC)) { fprintf(stderr, "Failed to modify RC QP to RTS\n"); return 1; } } else { /*both UC and UD */ if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) { fprintf(stderr, "Failed to modify UC/UD QP to RTS\n"); return 1; } } if (user_parm->connection_type==UD) { ctx->ah = ibv_create_ah(ctx->pd, &attr.ah_attr); if (!ctx->ah) { fprintf(stderr, "Failed to create AH for UD\n"); return 1; } } /* post recieve max msg size*/ { int i; struct ibv_recv_wr *bad_wr_recv; ctx->recv_list.addr = (uintptr_t) ctx->buf; if (user_parm->connection_type==UD) { ctx->recv_list.length = ctx->size + 40; } else { ctx->recv_list.length = ctx->size; } ctx->recv_list.lkey = ctx->mr->lkey; for (i = 0; i < user_parm->tx_depth / 2; ++i) { if (ibv_post_recv(ctx->qp, &ctx->rwr, &bad_wr_recv)) { fprintf(stderr, "Couldn't post recv: counter=%d\n", i); return 14; } } } 
return 0; } static int pp_open_port(struct pingpong_context *ctx, const char * servername, int ib_port, int port, struct pingpong_dest *rem_dest,struct user_parameters *user_parm) { char addr_fmt[] = "%8s address: LID %#04x QPN %#06x PSN %#06x\n"; struct pingpong_dest my_dest; int sockfd; int rc; union ibv_gid gid; /* Create connection between client and server. * We do it by exchanging data over a TCP socket connection. */ my_dest.lid = pp_get_local_lid(ctx, ib_port); my_dest.qpn = ctx->qp->qp_num; my_dest.psn = lrand48() & 0xffffff; if (user_parm->gid_index < 0) {/*We do not fail test upon lid in RDMA0E/Eth conf*/ if (!my_dest.lid) { fprintf(stderr, "Local lid 0x0 detected. Is an SM running?\n"); return -1; } } if (user_parm->gid_index != -1) { int err=0; err = ibv_query_gid (ctx->context, ib_port, user_parm->gid_index, &gid); if (err) { return -1; } ctx->dgid=gid; } my_dest.dgid = gid; my_dest.rkey = ctx->mr->rkey; my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size; printf(addr_fmt, "local", my_dest.lid, my_dest.qpn, my_dest.psn); if (user_parm->gid_index > -1) { printf(" GID: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", my_dest.dgid.raw[0],my_dest.dgid.raw[1], my_dest.dgid.raw[2], my_dest.dgid.raw[3], my_dest.dgid.raw[4], my_dest.dgid.raw[5], my_dest.dgid.raw[6], my_dest.dgid.raw[7], my_dest.dgid.raw[8], my_dest.dgid.raw[9], my_dest.dgid.raw[10], my_dest.dgid.raw[11], my_dest.dgid.raw[12], my_dest.dgid.raw[13], my_dest.dgid.raw[14], my_dest.dgid.raw[15]); } sockfd = servername ? pp_client_connect(servername, port) : pp_server_connect(port); if (sockfd < 0) { printf("pp_connect_sock(%s,%d) failed (%d)!\n", servername, port, sockfd); return sockfd; } rc = servername ? 
pp_client_exch_dest(sockfd, &my_dest, rem_dest, user_parm) : pp_server_exch_dest(sockfd, &my_dest, rem_dest, user_parm); if (rc) return rc; printf(addr_fmt, "remote", rem_dest->lid, rem_dest->qpn, rem_dest->psn, rem_dest->rkey, rem_dest->vaddr); if (user_parm->gid_index > -1) { printf(" GID: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", rem_dest->dgid.raw[0],rem_dest->dgid.raw[1], rem_dest->dgid.raw[2], rem_dest->dgid.raw[3], rem_dest->dgid.raw[4], rem_dest->dgid.raw[5], rem_dest->dgid.raw[6], rem_dest->dgid.raw[7], rem_dest->dgid.raw[8], rem_dest->dgid.raw[9], rem_dest->dgid.raw[10], rem_dest->dgid.raw[11], rem_dest->dgid.raw[12], rem_dest->dgid.raw[13], rem_dest->dgid.raw[14], rem_dest->dgid.raw[15]); } if ((rc = pp_connect_ctx(ctx, ib_port, my_dest.psn, rem_dest,user_parm))) return rc; /* An additional handshake is required *after* moving qp to RTR. * Arbitrarily reuse exch_dest for this purpose. */ rc = servername ? pp_client_exch_dest(sockfd, &my_dest, rem_dest, user_parm) : pp_server_exch_dest(sockfd, &my_dest, rem_dest, user_parm); if (rc) return rc; if (write(sockfd, "done", sizeof "done") != sizeof "done"){ perror("write"); fprintf(stderr, "Couldn't write to socket\n"); return 1; } close(sockfd); return 0; } static void usage(const char *argv0) { printf("Usage:\n"); printf(" %s start a server and wait for connection\n", argv0); printf(" %s connect to server at \n", argv0); printf("\n"); printf("Options:\n"); printf(" -p, --port= listen on/connect to port (default 18515)\n"); printf(" -c, --connection= connection type RC/UC/UD (default RC)\n"); printf(" -m, --mtu= mtu size (256 - 4096. 
default for hermon is 2048)\n"); printf(" -d, --ib-dev= use IB device (default first device found)\n"); printf(" -i, --ib-port= use port of IB device (default 1)\n"); printf(" -s, --size= size of message to exchange (default 1)\n"); printf(" -t, --tx-depth= size of tx queue (default 50)\n"); printf(" -l, --signal signal completion on each msg\n"); printf(" -a, --all Run sizes from 2 till 2^23\n"); printf(" -n, --iters= number of exchanges (at least 2, default 1000)\n"); printf(" -I, --inline_size= max size of message to be sent in inline mode (default 400)\n"); printf(" -u, --qp-timeout= QP timeout, timeout value is 4 usec * 2 ^(timeout), default 14\n"); printf(" -S, --sl= SL (default 0)\n"); printf(" -x, --gid-index= test uses GID with GID index taken from command line (for RDMAoE index should be 0)\n"); printf(" -C, --report-cycles report times in cpu cycle units (default microseconds)\n"); printf(" -H, --report-histogram print out all results (default print summary only)\n"); printf(" -U, --report-unsorted (implies -H) print out unsorted results (default sorted)\n"); printf(" -V, --version display version number\n"); printf(" -e, --events sleep on CQ events (default poll)\n"); printf(" -g, --mcg send messages to multicast group(only available in UD connection\n"); printf(" -F, --CPU-freq do not fail even if cpufreq_ondemand module is loaded\n"); } /* * When there is an * odd number of samples, the median is the middle number. * even number of samples, the median is the mean of the * two middle numbers. 
* */ static inline cycles_t get_median(int n, cycles_t delta[]) { if ((n - 1) % 2) return(delta[n / 2] + delta[n / 2 - 1]) / 2; else return delta[n / 2]; } static int cycles_compare(const void * aptr, const void * bptr) { const cycles_t *a = aptr; const cycles_t *b = bptr; if (*a < *b) return -1; if (*a > *b) return 1; return 0; } static void print_report(struct report_options * options, unsigned int iters, cycles_t *tstamp,int size, int no_cpu_freq_fail) { double cycles_to_units; cycles_t median; unsigned int i; const char* units; cycles_t *delta = malloc((iters - 1) * sizeof *delta); if (!delta) { perror("malloc"); return; } for (i = 0; i < iters - 1; ++i) delta[i] = tstamp[i + 1] - tstamp[i]; if (options->cycles) { cycles_to_units = 1; units = "cycles"; } else { cycles_to_units = get_cpu_mhz(no_cpu_freq_fail); units = "usec"; } if (options->unsorted) { printf("#, %s\n", units); for (i = 0; i < iters - 1; ++i) printf("%d, %g\n", i + 1, delta[i] / cycles_to_units / 2); } qsort(delta, iters - 1, sizeof *delta, cycles_compare); if (options->histogram) { printf("#, %s\n", units); for (i = 0; i < iters - 1; ++i) printf("%d, %g\n", i + 1, delta[i] / cycles_to_units / 2); } median = get_median(iters - 1, delta); printf("%7d %d %7.2f %7.2f %7.2f\n", size,iters,delta[0] / cycles_to_units / 2, delta[iters - 2] / cycles_to_units / 2,median / cycles_to_units / 2); free(delta); } int run_iter(struct pingpong_context *ctx, struct user_parameters *user_param, struct pingpong_dest *rem_dest, int size) { struct ibv_qp *qp; struct ibv_send_wr *wr; struct ibv_recv_wr rwr; struct ibv_recv_wr *bad_wr_recv; volatile char *poll_buf; volatile char *post_buf; int scnt, rcnt, ccnt, poll; int iters; int tx_depth; iters = user_param->iters; tx_depth = user_param->tx_depth; if (user_param->connection_type==UD) { if (size > 2048) { if (user_param->gid_index < 0) { size = 2048; } else { size = 1024; } } } ///send // wr = &ctx->wr; if (user_param->connection_type==UD) { ctx->list.addr = 
(uintptr_t) ctx->buf + 40; } else { ctx->list.addr = (uintptr_t) ctx->buf; } ctx->list.length = size; ctx->list.lkey = ctx->mr->lkey; if (user_param->connection_type==UD) { ctx->wr.wr.ud.ah = ctx->ah; ctx->wr.wr.ud.remote_qpn = rem_dest->qpn; ctx->wr.wr.ud.remote_qkey = 0x11111111; if (user_param->use_mcg) { ctx->wr.wr.ud.remote_qpn = 0xffffff; } else { ctx->wr.wr.ud.remote_qpn = rem_dest->qpn; } } /// receive // rwr = ctx->rwr; ctx->recv_list.addr = (uintptr_t) ctx->buf; if (user_param->connection_type==UD) { ctx->recv_list.length = ctx->size + 40; } else { ctx->recv_list.length = ctx->size; } ctx->recv_list.lkey = ctx->mr->lkey; scnt = 0; rcnt = 0; ccnt = 0; poll = 0; poll_buf = ctx->poll_buf; post_buf = ctx->post_buf; qp = ctx->qp; if (size > user_param->inline_size || size == 0) {/* complaince to perf_main don't signal*/ ctx->wr.send_flags = 0; } else { ctx->wr.send_flags = IBV_SEND_INLINE; } while (scnt < iters || rcnt < iters) { if (rcnt < iters && !(scnt < 1 && user_param->servername)) { int ne; struct ibv_wc wc; /*Server is polling on recieve first */ ++rcnt; if (ibv_post_recv(qp, &rwr, &bad_wr_recv)) { fprintf(stderr, "Couldn't post recv: rcnt=%d\n", rcnt); return 15; } if (user_param->use_event) { struct ibv_cq *ev_cq; void *ev_ctx; if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) { fprintf(stderr, "Failed to get receive cq_event\n"); return 1; } if (ev_cq != ctx->rcq) { fprintf(stderr, "CQ event for unknown RCQ %p\n", ev_cq); return 1; } if (ibv_req_notify_cq(ctx->rcq, 0)) { fprintf(stderr, "Couldn't request RCQ notification\n"); return 1; } } do { ne = ibv_poll_cq(ctx->rcq, 1, &wc); } while (!user_param->use_event && ne < 1); if (ne < 0) { fprintf(stderr, "Poll Recieve CQ failed %d\n", ne); return 12; } if (wc.status != IBV_WC_SUCCESS) { fprintf(stderr, "Recieve Completion wth error at %s:\n", user_param->servername ? 
"client" : "server"); fprintf(stderr, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); fprintf(stderr, "scnt=%d, rcnt=%d, ccnt=%d\n", scnt, rcnt, ccnt); return 13; } } if (scnt < iters ) { if (ccnt == (tx_depth - 2) || (user_param->signal_comp == SIGNAL) || (scnt == (iters - 1)) ) { ccnt = 0; poll=1; if (size > user_param->inline_size || size == 0) {/* complaince to perf_main */ ctx->wr.send_flags = IBV_SEND_SIGNALED; } else { ctx->wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE; } } struct ibv_send_wr *bad_wr; /* client post first */ tstamp[scnt] = get_cycles(); *post_buf = (char)++scnt; if (ibv_post_send(qp, wr, &bad_wr)) { fprintf(stderr, "Couldn't post send: scnt=%d\n", scnt); return 11; } } if (poll == 1) { struct ibv_wc wc; int ne; if (user_param->use_event) { struct ibv_cq *ev_cq; void *ev_ctx; if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) { fprintf(stderr, "Failed to get send cq_event\n"); return 1; } if (ev_cq != ctx->scq) { fprintf(stderr, "CQ event for unknown SCQ %p\n", ev_cq); return 1; } if (ibv_req_notify_cq(ctx->scq, 0)) { fprintf(stderr, "Couldn't request SCQ notification\n"); return 1; } } /* poll on scq */ do { ne = ibv_poll_cq(ctx->scq, 1, &wc); } while (!user_param->use_event && ne < 1); if (ne < 0) { fprintf(stderr, "poll SCQ failed %d\n", ne); return 12; } if (wc.status != IBV_WC_SUCCESS) { fprintf(stderr, "Completion wth error at %s:\n", user_param->servername ? 
"client" : "server"); fprintf(stderr, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); fprintf(stderr, "scnt=%d, rcnt=%d, ccnt=%d\n", scnt, rcnt, ccnt); return 13; } poll = 0; if (size > user_param->inline_size || size == 0) {/* complaince to perf_main don't signal*/ ctx->wr.send_flags = 0; } else { ctx->wr.send_flags = IBV_SEND_INLINE; } } ++ccnt; } return(0); } int main(int argc, char *argv[]) { const char *ib_devname = NULL; int port = 18515; int ib_port = 1; int size = 2; int i = 0; int size_max_pow = 24; struct report_options report = {}; struct pingpong_context *ctx; struct pingpong_dest rem_dest; struct ibv_device *ib_dev; struct user_parameters user_param; int no_cpu_freq_fail = 0; /* init default values to user's parameters */ memset(&user_param, 0, sizeof(struct user_parameters)); user_param.mtu = 0; user_param.iters = 1000; user_param.tx_depth = 50; user_param.servername = NULL; user_param.use_event = 0; user_param.use_mcg = 0; user_param.inline_size = MAX_INLINE; user_param.signal_comp = 0; user_param.qp_timeout = 14; user_param.gid_index = -1; /*gid will not be used*/ /* Parameter parsing. 
*/ while (1) { int c; static struct option long_options[] = { { .name = "port", .has_arg = 1, .val = 'p' }, { .name = "connection", .has_arg = 1, .val = 'c' }, { .name = "mtu", .has_arg = 1, .val = 'm' }, { .name = "ib-dev", .has_arg = 1, .val = 'd' }, { .name = "ib-port", .has_arg = 1, .val = 'i' }, { .name = "size", .has_arg = 1, .val = 's' }, { .name = "iters", .has_arg = 1, .val = 'n' }, { .name = "tx-depth", .has_arg = 1, .val = 't' }, { .name = "inline_size", .has_arg = 1, .val = 'I' }, { .name = "qp-timeout", .has_arg = 1, .val = 'u' }, { .name = "sl", .has_arg = 1, .val = 'S' }, { .name = "gid-index", .has_arg = 1, .val = 'x' }, { .name = "signal", .has_arg = 0, .val = 'l' }, { .name = "all", .has_arg = 0, .val = 'a' }, { .name = "report-cycles", .has_arg = 0, .val = 'C' }, { .name = "report-histogram",.has_arg = 0, .val = 'H' }, { .name = "report-unsorted",.has_arg = 0, .val = 'U' }, { .name = "version", .has_arg = 0, .val = 'V' }, { .name = "events", .has_arg = 0, .val = 'e' }, { .name = "mcg", .has_arg = 0, .val = 'g' }, { .name = "CPU-freq", .has_arg = 0, .val = 'F' }, { 0 } }; c = getopt_long(argc, argv, "p:c:m:d:i:s:n:t:I:u:S:x:laeCHUVgF", long_options, NULL); if (c == -1) break; switch (c) { case 'p': port = strtol(optarg, NULL, 0); if (port < 0 || port > 65535) { usage(argv[0]); return 1; } break; case 'c': if (strcmp("UC",optarg)==0) user_param.connection_type=UC; if (strcmp("UD",optarg)==0) user_param.connection_type=UD; /* default is 0 for any other option RC*/ break; case 'e': ++user_param.use_event; break; case 'g': ++user_param.use_mcg; break; case 'm': user_param.mtu = strtol(optarg, NULL, 0); break; case 'l': user_param.signal_comp = SIGNAL; break; case 'a': user_param.all = SIGNAL; break; case 'V': printf("perftest version : %.2f\n",VERSION); return 0; break; case 'd': ib_devname = strdupa(optarg); break; case 'i': ib_port = strtol(optarg, NULL, 0); if (ib_port < 0) { usage(argv[0]); return 2; } break; case 's': size = strtol(optarg, NULL, 
0); if (size < 1) { usage(argv[0]); return 3; } break; case 'x': user_param.gid_index = strtol(optarg, NULL, 0); if (user_param.gid_index > 63) { usage(argv[0]); return 1; } break; case 't': user_param.tx_depth = strtol(optarg, NULL, 0); if (user_param.tx_depth < 1) { usage(argv[0]); return 4; } break; case 'I': user_param.inline_size = strtol(optarg, NULL, 0); if (user_param.inline_size > MAX_INLINE) { usage(argv[0]); return 19; } break; case 'n': user_param.iters = strtol(optarg, NULL, 0); if (user_param.iters < 2) { usage(argv[0]); return 5; } break; case 'C': report.cycles = 1; break; case 'H': report.histogram = 1; break; case 'U': report.unsorted = 1; break; case 'F': no_cpu_freq_fail = 1; break; case 'u': user_param.qp_timeout = strtol(optarg, NULL, 0); break; case 'S': sl = strtol(optarg, NULL, 0); if (sl > 15) { usage(argv[0]); return 6; } break; default: usage(argv[0]); return 7; } } if (optind == argc - 1) user_param.servername = strdupa(argv[optind]); else if (optind < argc) { usage(argv[0]); return 6; } /* * Done with parameter parsing. Perform setup. */ tstamp = malloc(user_param.iters * sizeof *tstamp); if (!tstamp) { perror("malloc"); return 10; } /* Print header data */ printf("------------------------------------------------------------------\n"); if (user_param.use_mcg && (user_param.connection_type == UD)) printf(" Send Latency Multicast Test\n"); else printf(" Send Latency Test\n"); printf("Inline data is used up to %d bytes message\n", user_param.inline_size); if (user_param.connection_type==RC) { printf("Connection type : RC\n"); } else if (user_param.connection_type==UC) { printf("Connection type : UC\n"); } else { printf("Connection type : UD\n"); } if (user_param.gid_index > -1) { printf("Using GID to support RDMAoE configuration. 
Refer to port type as Ethernet, default MTU 1024B\n"); } if (user_param.all == 1) { /*since we run all sizes lets allocate big enough buffer */ size = 8388608; /*2^23 */ } if (user_param.connection_type == UD && size > 2048) { printf("Max msg size in UD is 2048 changing to 2048\n"); size = 2048; } if (user_param.connection_type == UD && user_param.gid_index > -1 && size > 1024) { printf("Max msg size in UD RDMAoE is 1024. changing to 1024\n"); size = 1024; } srand48(getpid() * time(NULL)); page_size = sysconf(_SC_PAGESIZE); ib_dev = pp_find_dev(ib_devname); if (!ib_dev) return 7; ctx = pp_init_ctx(ib_dev, size, user_param.tx_depth, ib_port,&user_param); if (!ctx) return 8; if (pp_open_port(ctx, user_param.servername, ib_port, port, &rem_dest,&user_param)) return 9; if (user_param.use_event) { printf("Test with events.\n"); if (ibv_req_notify_cq(ctx->rcq, 0)) { fprintf(stderr, "Couldn't request RCQ notification\n"); return 1; } if (ibv_req_notify_cq(ctx->scq, 0)) { fprintf(stderr, "Couldn't request SCQ notification\n"); return 1; } } printf("------------------------------------------------------------------\n"); printf(" #bytes #iterations t_min[usec] t_max[usec] t_typical[usec]\n"); if (user_param.all == 1) { if (user_param.connection_type==UD) { if (user_param.gid_index < 0) { size_max_pow = 12; } else { size_max_pow = 11; } } for (i = 1; i < size_max_pow ; ++i) { size = 1 << i; if(run_iter(ctx, &user_param, &rem_dest, size)) return 17; print_report(&report, user_param.iters, tstamp, size, no_cpu_freq_fail); } } else { if(run_iter(ctx, &user_param, &rem_dest, size)) return 18; print_report(&report, user_param.iters, tstamp, size, no_cpu_freq_fail); } printf("------------------------------------------------------------------\n"); free(tstamp); return 0; } trunk/send_bw.c0000755000175000017500000012414211237107527013344 0ustar benoitbenoit/* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Mellanox Technologies Ltd. 
All rights reserved. * Copyright (c) 2009 HNR Consulting. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* * $Id$ */ #if HAVE_CONFIG_H # include #endif /* HAVE_CONFIG_H */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "get_clock.h" #define PINGPONG_SEND_WRID 1 #define PINGPONG_RECV_WRID 2 #define RC 0 #define UC 1 #define UD 3 #define VERSION 1.1 #define SIGNAL 1 #define MAX_INLINE 400 #define ALL 1 #define MCG_LID 0xc001 #define MCG_GID {255,1,0,0,0,2,201,133,0,0,0,0,0,0,0,0} struct user_parameters { const char *servername; int connection_type; int mtu; int all; /* run all msg size */ int signal_comp; int iters; int tx_depth; int rx_depth; int duplex; int use_event; int use_mcg; int inline_size; int qp_timeout; int gid_index; /* if value not negative, we use gid AND gid_index=value */ }; static int sl = 0; static int page_size; cycles_t *tposted; cycles_t *tcompleted; int post_recv; struct pingpong_context { struct ibv_context *context; struct ibv_comp_channel *channel; struct ibv_pd *pd; struct ibv_mr *mr; struct ibv_cq *cq; struct ibv_qp *qp; void *buf; unsigned size; int tx_depth; int rx_depth; struct ibv_sge list; struct ibv_sge recv_list; struct ibv_send_wr wr; struct ibv_recv_wr rwr; struct ibv_ah *ah; union ibv_gid dgid; }; struct pingpong_dest { int lid; int qpn; int psn; unsigned rkey; unsigned long long vaddr; union ibv_gid dgid; }; static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port) { struct ibv_port_attr attr; if (ibv_query_port(ctx->context, port, &attr)) return 0; return attr.lid; } static int pp_client_connect(const char *servername, int port) { struct addrinfo *res, *t; struct addrinfo hints = { .ai_family = AF_UNSPEC, .ai_socktype = SOCK_STREAM }; char *service; int n; int sockfd = -1; if (asprintf(&service, "%d", port) < 0) return -1; n = getaddrinfo(servername, service, &hints, &res); if (n < 0) { fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port); return n; } for (t = res; t; t = t->ai_next) { sockfd = 
socket(t->ai_family, t->ai_socktype, t->ai_protocol);
		if (sockfd >= 0) {
			if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
				break;
			close(sockfd);
			sockfd = -1;
		}
	}
	freeaddrinfo(res);
	if (sockfd < 0) {
		fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port);
		return sockfd;
	}
	return sockfd;
}

/*
 * Wire format of one endpoint description: fixed-width hex fields
 * lid:qpn:psn:rkey:vaddr followed by the 16 GID bytes, ':'-separated.
 * Both sides always transfer exactly PP_DEST_MSG_SIZE bytes, including
 * the terminating NUL.
 */
#define PP_DEST_MSG_TEMPLATE \
	"0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00"
#define PP_DEST_MSG_SIZE (sizeof PP_DEST_MSG_TEMPLATE)

/*
 * Render dest into msg using the wire format above.
 * Returns 0 on success, -1 when the text does not fit in len bytes.
 */
static int pp_format_dest(char *msg, size_t len, const struct pingpong_dest *dest)
{
	const uint8_t *g = dest->dgid.raw;
	int n;

	n = snprintf(msg, len,
		     "%04x:%06x:%06x:%08x:%016llx:"
		     "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:"
		     "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x",
		     (unsigned)dest->lid, (unsigned)dest->qpn,
		     (unsigned)dest->psn, dest->rkey, dest->vaddr,
		     g[0], g[1], g[2], g[3], g[4], g[5], g[6], g[7],
		     g[8], g[9], g[10], g[11], g[12], g[13], g[14], g[15]);

	return (n < 0 || (size_t)n >= len) ? -1 : 0;
}

/*
 * Convert the hex number at s.  Returns a pointer to the first character
 * after the digits, or NULL when no digits were found.
 */
static const char *pp_hex_field(const char *s, unsigned long long *value)
{
	char *end;

	*value = strtoull(s, &end, 16);
	return (end == s) ? NULL : end;
}

/*
 * Parse a NUL-terminated wire message into dest.  The five leading fields
 * are always read; the GID bytes are read only when with_gid is non-zero
 * (otherwise dgid is left zeroed).  Returns 0 on success, -1 on malformed
 * input.
 */
static int pp_parse_dest(const char *msg, int with_gid, struct pingpong_dest *dest)
{
	unsigned long long field[5];
	const char *p = msg;
	int i;

	memset(dest, 0, sizeof *dest);

	for (i = 0; i < 5; ++i) {
		p = pp_hex_field(p, &field[i]);
		if (!p)
			return -1;
		/* A separator must follow unless this is the last field wanted. */
		if (i < 4 || with_gid) {
			if (*p != ':')
				return -1;
			++p;
		}
	}

	dest->lid = (int)(unsigned)field[0];
	dest->qpn = (int)(unsigned)field[1];
	dest->psn = (int)(unsigned)field[2];
	dest->rkey = (unsigned)field[3];
	dest->vaddr = field[4];

	if (!with_gid)
		return 0;

	for (i = 0; i < 16; ++i) {
		unsigned long long byte;

		p = pp_hex_field(p, &byte);
		if (!p || byte > 0xff)
			return -1;
		if (i < 15) {
			if (*p != ':')
				return -1;
			++p;
		}
		dest->dgid.raw[i] = (unsigned char)byte;
	}

	return 0;
}

/*
 * pp_client_exch_dest - client half of the address exchange: send the
 * local description, then read and parse the peer's.
 *
 * sockfd:    connected TCP socket
 * my_dest:   local attributes to advertise
 * user_parm: gid_index >= 0 makes the GID part of the reply mandatory
 *
 * Returns a malloc()ed description of the peer that the caller must
 * free(), or NULL on any I/O, allocation or parse failure.
 */
struct pingpong_dest * pp_client_exch_dest(int sockfd, const struct pingpong_dest *my_dest,
					   struct user_parameters *user_parm)
{
	struct pingpong_dest *rem_dest = NULL;
	char msg[PP_DEST_MSG_SIZE];

	if (pp_format_dest(msg, sizeof msg, my_dest)) {
		fprintf(stderr, "Couldn't format local address\n");
		goto out;
	}

	if (write(sockfd, msg, sizeof msg) != (ssize_t)sizeof msg) {
		perror("client write");
		fprintf(stderr, "Couldn't send local address\n");
		goto out;
	}

	if (read(sockfd, msg, sizeof msg) != (ssize_t)sizeof msg) {
		perror("client read");
		fprintf(stderr, "Couldn't read remote address\n");
		goto out;
	}
	/* Never trust the peer to have terminated the buffer. */
	msg[sizeof msg - 1] = '\0';

	rem_dest = malloc(sizeof *rem_dest);
	if (!rem_dest)
		goto out;

	if (pp_parse_dest(msg, user_parm->gid_index >= 0, rem_dest)) {
		fprintf(stderr, "Couldn't parse line <%.*s>\n",(int)sizeof msg, msg);
		free(rem_dest);
		rem_dest = NULL;
	}

out:
	return rem_dest;
}

/*
 * pp_server_connect - listen on the given TCP port and accept one client.
 *
 * Returns the connected socket on success or a negative value on failure.
 * The listening socket is always closed before returning.
 */
int pp_server_connect(int port)
{
	struct addrinfo *res, *t;
	struct addrinfo hints = {
		.ai_flags    = AI_PASSIVE,
		.ai_family   = AF_UNSPEC,
		.ai_socktype = SOCK_STREAM
	};
	char service[16];
	int sockfd = -1, connfd;
	int n;

	/* A stack buffer replaces asprintf(), whose result was never freed. */
	n = snprintf(service, sizeof service, "%d", port);
	if (n < 0 || (size_t)n >= sizeof service)
		return -1;

	/* getaddrinfo() reports failure with any non-zero value. */
	n = getaddrinfo(NULL, service, &hints, &res);
	if (n != 0) {
		fprintf(stderr, "%s for port %d\n", gai_strerror(n), port);
		return n < 0 ? n : -1;
	}

	for (t = res; t; t = t->ai_next) {
		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
		if (sockfd >= 0) {
			n = 1;
			setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n);
			if (!bind(sockfd, t->ai_addr, t->ai_addrlen))
				break;
			close(sockfd);
			sockfd = -1;
		}
	}
	freeaddrinfo(res);

	if (sockfd < 0) {
		fprintf(stderr, "Couldn't listen to port %d\n", port);
		return sockfd;
	}

	if (listen(sockfd, 1) < 0) {
		perror("server listen");
		fprintf(stderr, "Couldn't listen to port %d\n", port);
		close(sockfd);
		return -1;
	}

	connfd = accept(sockfd, NULL, NULL);
	if (connfd < 0) {
		perror("server accept");
		fprintf(stderr, "accept() failed\n");
	}

	close(sockfd);
	return connfd;
}

/*
 * pp_server_exch_dest - server half of the address exchange: read and
 * parse the peer's description, then send the local one.
 *
 * connfd:    accepted TCP socket
 * my_dest:   local attributes to advertise
 * user_parm: gid_index >= 0 makes the GID part of the request mandatory
 *
 * Returns a malloc()ed description of the peer that the caller must
 * free(), or NULL on any I/O, allocation or parse failure.
 */
static struct pingpong_dest *pp_server_exch_dest(int connfd, const struct pingpong_dest *my_dest,
						 struct user_parameters *user_parm)
{
	char msg[PP_DEST_MSG_SIZE];
	struct pingpong_dest *rem_dest = NULL;
	int n;

	n = read(connfd, msg, sizeof msg);
	if (n != (int)sizeof msg) {
		perror("server read");
		fprintf(stderr, "%d/%d: Couldn't read remote address\n", n, (int) sizeof msg);
		goto out;
	}
	/* Never trust the peer to have terminated the buffer. */
	msg[sizeof msg - 1] = '\0';

	rem_dest = malloc(sizeof *rem_dest);
	if (!rem_dest)
		goto out;

	if (pp_parse_dest(msg, user_parm->gid_index >= 0, rem_dest)) {
		fprintf(stderr, "Couldn't parse line <%.*s>\n",(int)sizeof msg, msg);
		goto fail;
	}

	if (pp_format_dest(msg, sizeof msg, my_dest)) {
		fprintf(stderr, "Couldn't format local address\n");
		goto fail;
	}

	if (write(connfd, msg, sizeof msg) != (ssize_t)sizeof msg) {
		perror("server write");
		fprintf(stderr, "Couldn't send local address\n");
		goto fail;
	}

out:
	return rem_dest;

fail:
	free(rem_dest);
	return NULL;
}

static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev,
					    unsigned size,
					    int tx_depth, int rx_depth, int port,
					    struct user_parameters *user_parm)
{
	struct pingpong_context *ctx;
	struct ibv_device_attr device_attr;
	ctx = malloc(sizeof *ctx);
	if (!ctx)
		return NULL;
	ctx->size = size;
	ctx->tx_depth = tx_depth;
	ctx->rx_depth = rx_depth + tx_depth;
	/* in case of UD need space for the GRH */
	if (user_parm->connection_type==UD) {
		ctx->buf = memalign(page_size, ( size + 40 ) * 2);
		if (!ctx->buf) {
			fprintf(stderr, "Couldn't allocate work buf.\n");
			return NULL;
		}
		memset(ctx->buf, 0, ( size + 40 ) * 2);
	} else {
		ctx->buf = memalign(page_size, size * 2);
		if (!ctx->buf) {
			fprintf(stderr, "Couldn't allocate work buf.\n");
			return NULL;
		}
		memset(ctx->buf, 0, size * 2);
	}
	ctx->context = ibv_open_device(ib_dev);
	if (!ctx->context) {
		fprintf(stderr, "Couldn't get context for %s\n",
			ibv_get_device_name(ib_dev));
		return NULL;
	}
	if (user_parm->mtu == 0) {/*user did not ask for specific mtu */
		if (ibv_query_device(ctx->context, &device_attr)) {
			fprintf(stderr, "Failed to query device props");
			return NULL;
		}
		if (device_attr.vendor_part_id == 23108 || user_parm->gid_index > -1) {
			user_parm->mtu = 1024;
		} else {
			user_parm->mtu = 2048;
		}
	}
	if (user_parm->use_event) {
		ctx->channel = ibv_create_comp_channel(ctx->context);
		if (!ctx->channel) {
			fprintf(stderr, "Couldn't create 
completion channel\n"); return NULL; } } else ctx->channel = NULL; ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "Couldn't allocate PD\n"); return NULL; } /* We dont really want IBV_ACCESS_LOCAL_WRITE, but IB spec says: * The Consumer is not allowed to assign Remote Write or Remote Atomic to * a Memory Region that has not been assigned Local Write. */ if (user_parm->connection_type==UD) { ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, (size + 40 ) * 2, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "Couldn't allocate MR\n"); return NULL; } } else { ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size * 2, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "Couldn't allocate MR\n"); return NULL; } } ctx->cq = ibv_create_cq(ctx->context, ctx->rx_depth, NULL, ctx->channel, 0); if (!ctx->cq) { fprintf(stderr, "Couldn't create CQ\n"); return NULL; } { struct ibv_qp_init_attr attr; memset(&attr, 0, sizeof(struct ibv_qp_init_attr)); attr.send_cq = ctx->cq; attr.recv_cq = ctx->cq; attr.cap.max_send_wr = tx_depth; /* Work around: driver doesnt support * recv_wr = 0 */ attr.cap.max_recv_wr = ctx->rx_depth; attr.cap.max_send_sge = 1; attr.cap.max_recv_sge = 1; attr.cap.max_inline_data = user_parm->inline_size; switch (user_parm->connection_type) { case RC : attr.qp_type = IBV_QPT_RC; break; case UC : attr.qp_type = IBV_QPT_UC; break; case UD : attr.qp_type = IBV_QPT_UD; break; default: fprintf(stderr, "Unknown connection type %d \n",user_parm->connection_type); return NULL; } /*attr.sq_sig_all = 0;*/ ctx->qp = ibv_create_qp(ctx->pd, &attr); if (!ctx->qp) { fprintf(stderr, "Couldn't create QP\n"); return NULL; } } { struct ibv_qp_attr attr; attr.qp_state = IBV_QPS_INIT; attr.pkey_index = 0; attr.port_num = port; if (user_parm->connection_type==UD) attr.qkey = 0x11111111; else attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE; if (user_parm->connection_type==UD) { if 
(ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY)) { fprintf(stderr, "Failed to modify UD QP to INIT\n"); return NULL; } if ((user_parm->use_mcg) && (!user_parm->servername || user_parm->duplex)) { union ibv_gid gid; uint8_t mcg_gid[16] = MCG_GID; /* use the local QP number as part of the mcg */ mcg_gid[11] = (user_parm->servername) ? 0 : 1; *(uint32_t *)(&mcg_gid[12]) = ctx->qp->qp_num; memcpy(gid.raw, mcg_gid, 16); if (ibv_attach_mcast(ctx->qp, &gid, MCG_LID)) { fprintf(stderr, "Couldn't attach QP to mcg\n"); return NULL; } } } else if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify QP to INIT\n"); return NULL; } } return ctx; } static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn, struct pingpong_dest *dest, struct user_parameters *user_parm) { struct ibv_qp_attr attr; memset(&attr, 0, sizeof attr); attr.qp_state = IBV_QPS_RTR; switch (user_parm->mtu) { case 256 : attr.path_mtu = IBV_MTU_256; break; case 512 : attr.path_mtu = IBV_MTU_512; break; case 1024 : attr.path_mtu = IBV_MTU_1024; break; case 2048 : attr.path_mtu = IBV_MTU_2048; break; case 4096 : attr.path_mtu = IBV_MTU_4096; break; } printf("Mtu : %d\n", user_parm->mtu); attr.dest_qp_num = dest->qpn; attr.rq_psn = dest->psn; if (user_parm->connection_type == RC) { attr.max_dest_rd_atomic = 1; attr.min_rnr_timer = 12; } if (user_parm->gid_index < 0) { attr.ah_attr.is_global = 0; attr.ah_attr.dlid = dest->lid; attr.ah_attr.sl = sl; } else { attr.ah_attr.is_global = 1; attr.ah_attr.grh.dgid = dest->dgid; attr.ah_attr.grh.hop_limit = 1; attr.ah_attr.sl = 0; } attr.ah_attr.src_path_bits = 0; attr.ah_attr.port_num = port; if ((user_parm->connection_type==UD) && (user_parm->use_mcg)) { uint8_t mcg_gid[16] = MCG_GID; /* send the message to the mcg of the other side */ mcg_gid[11] = (user_parm->servername) ? 
1 : 0; *(uint32_t *)(&mcg_gid[12]) = dest->qpn; attr.ah_attr.dlid = MCG_LID; attr.ah_attr.is_global = 1; attr.ah_attr.grh.sgid_index = 0; memcpy(attr.ah_attr.grh.dgid.raw, mcg_gid, 16); } if (user_parm->connection_type == RC) { if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MIN_RNR_TIMER | IBV_QP_MAX_DEST_RD_ATOMIC)) { fprintf(stderr, "Failed to modify RC QP to RTR\n"); return 1; } attr.timeout = user_parm->qp_timeout; attr.retry_cnt = 7; attr.rnr_retry = 7; } else if (user_parm->connection_type == UC) { if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN)) { fprintf(stderr, "Failed to modify UC QP to RTR\n"); return 1; } } else { if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE )) { fprintf(stderr, "Failed to modify UC QP to RTR\n"); return 1; } } attr.qp_state = IBV_QPS_RTS; attr.sq_psn = my_psn; attr.max_rd_atomic = 1; if (user_parm->connection_type == RC) { attr.max_rd_atomic = 1; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_MAX_QP_RD_ATOMIC)) { fprintf(stderr, "Failed to modify RC QP to RTS\n"); return 1; } } else { /*both UC and UD */ if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) { fprintf(stderr, "Failed to modify UC QP to RTS\n"); return 1; } } if (user_parm->connection_type==UD) { ctx->ah = ibv_create_ah(ctx->pd, &attr.ah_attr); if (!ctx->ah) { fprintf(stderr, "Failed to create AH for UD\n"); return 1; } } /* post recieve max msg size*/ { int i; struct ibv_recv_wr *bad_wr_recv; //recieve ctx->rwr.wr_id = PINGPONG_RECV_WRID; ctx->rwr.sg_list = &ctx->recv_list; ctx->rwr.num_sge = 1; ctx->rwr.next = NULL; ctx->recv_list.addr = (uintptr_t) ctx->buf; if (user_parm->connection_type==UD) { ctx->recv_list.length = ctx->size + 40; } else { ctx->recv_list.length = ctx->size; } ctx->recv_list.lkey = ctx->mr->lkey; for (i = 0; i < 
/*
 * usage() - print the command-line synopsis and option list to stdout.
 *
 * @argv0: program name as invoked, echoed in the two synopsis lines.
 *
 * The long option names listed here mirror the getopt_long() table in
 * main(); "--noPeak" in particular is the spelling that table registers.
 */
static void usage(const char *argv0)
{
	printf("Usage:\n");
	printf("  %s            start a server and wait for connection\n", argv0);
	printf("  %s <host>     connect to server at <host>\n", argv0);
	printf("\n");
	printf("Options:\n");
	printf("  -p, --port=<port>            listen on/connect to port <port> (default 18515)\n");
	printf("  -d, --ib-dev=<dev>           use IB device <dev> (default first device found)\n");
	printf("  -i, --ib-port=<port>         use port <port> of IB device (default 1)\n");
	printf("  -c, --connection=<RC/UC/UD>  connection type RC/UC/UD (default RC)\n");
	printf("  -m, --mtu=<mtu>              mtu size (256 - 4096. default for hermon is 2048)\n");
	printf("  -s, --size=<size>            size of message to exchange (default 65536)\n");
	printf("  -a, --all                    Run sizes from 2 till 2^23\n");
	printf("  -t, --tx-depth=<dep>         size of tx queue (default 300)\n");
	printf("  -g, --mcg                    send messages to multicast group (only available in UD connection)\n");
	printf("  -r, --rx-depth=<dep>         make rx queue bigger than tx (default 600)\n");
	printf("  -n, --iters=<iters>          number of exchanges (at least 2, default 1000)\n");
	printf("  -I, --inline_size=<size>     max size of message to be sent in inline mode (default 400)\n");
	printf("  -u, --qp-timeout=<timeout>   QP timeout, timeout value is 4 usec * 2 ^(timeout), default 14\n");
	printf("  -S, --sl=<sl>                SL (default 0)\n");
	printf("  -x, --gid-index=<index>      test uses GID with GID index taken from command line (for RDMAoE index should be 0)\n");
	printf("  -b, --bidirectional          measure bidirectional bandwidth (default unidirectional)\n");
	printf("  -V, --version                display version number\n");
	printf("  -e, --events                 sleep on CQ events (default poll)\n");
	printf("  -N, --noPeak                 cancel peak-bw calculation (default with peak-bw)\n");
	printf("  -F, --CPU-freq               do not fail even if cpufreq_ondemand module is loaded\n");
}
no_cpu_freq_fail) { double cycles_to_units; unsigned long tsize; /* Transferred size, in megabytes */ int i, j; int opt_posted = 0, opt_completed = 0; cycles_t opt_delta; cycles_t t; opt_delta = tcompleted[opt_posted] - tposted[opt_completed]; if (!noPeak) { /* Find the peak bandwidth, unless asked not to in command line */ for (i = 0; i < iters; ++i) for (j = i; j < iters; ++j) { t = (tcompleted[j] - tposted[i]) / (j - i + 1); if (t < opt_delta) { opt_delta = t; opt_posted = i; opt_completed = j; } } } cycles_to_units = get_cpu_mhz(no_cpu_freq_fail) * 1000000; tsize = duplex ? 2 : 1; tsize = tsize * size; printf("%7d %d %7.2f %7.2f\n", size,iters,!(noPeak) * tsize * cycles_to_units / opt_delta / 0x100000, tsize * iters * cycles_to_units /(tcompleted[iters - 1] - tposted[0]) / 0x100000); } int run_iter_bi(struct pingpong_context *ctx, struct user_parameters *user_param, struct pingpong_dest *rem_dest, int size) { struct ibv_qp *qp; int scnt, ccnt, rcnt; struct ibv_recv_wr *bad_wr_recv; if (user_param->connection_type == UD) { if (size > 2048) { if (user_param->gid_index < 0) { size = 2048; } else { size = 1024; } } } /********************************************* * Important note : * In case of UD/UC this is NOT the way to measure * BW sicen we are running with loop on the send side * while we should run on the recieve side or enable retry in SW * Since the sender may be faster than the reciver than although * we had posted recieve it is not enough and might end this will * result in deadlock of test since both sides are stuck on poll cq * In this test i do not solve this for the general test ,need to write * seperate test for UC/UD but in case the tx_depth is ~1/3 from the * number of iterations this should be ok . 
/*
 * run_iter_bi() - bidirectional run: post sends and consume receives on the
 * same CQ until user_param->iters of each have completed.
 *
 * @ctx:        connected context; its wr/rwr/list/recv_list members are
 *              used as the send and receive work requests.
 * @user_param: iteration count, tx depth, inline threshold, event mode.
 * @rem_dest:   not referenced in this function.
 * @size:       message length for this run (clamped for UD, see below).
 *
 * Send post times go to tposted[] and send completion times to
 * tcompleted[]; both arrays and the post_recv counter are declared outside
 * this function.
 *
 * Returns 0 on success, 1 on a post-send, CQ-event, poll or completion
 * error, and 15 if re-posting a receive fails.
 */
int run_iter_bi(struct pingpong_context *ctx, struct user_parameters *user_param, struct pingpong_dest *rem_dest, int size)
{
	struct ibv_qp *qp;
	int scnt, ccnt, rcnt;
	struct ibv_recv_wr *bad_wr_recv;

	/* UD messages are capped: 2048 bytes without a GID index, 1024 with one. */
	if (user_param->connection_type == UD) {
		if (size > 2048) {
			if (user_param->gid_index < 0) {
				size = 2048;
			} else {
				size = 1024;
			}
		}
	}

	/*********************************************
	 * Important note:
	 * For UD/UC this is NOT the right way to measure bandwidth, because
	 * the loop is driven from the send side, while it should be driven
	 * from the receive side (or retries should be done in software).
	 * The sender may be faster than the receiver, so the receives that
	 * were posted may not be enough; the test can then deadlock with
	 * both sides stuck polling the CQ.
	 * This is not solved for the general case here; a separate test is
	 * needed for UC/UD.  When tx_depth is about 1/3 of the number of
	 * iterations this should be OK.
	 * Also note that the sender is limited in the number of outstanding
	 * sends, and the receive queue is kept as full as possible.
	 *********************************************/

	/* A UD receive buffer entry is 40 bytes longer than the message. */
	if (user_param->connection_type == UD)
		ctx->recv_list.length = ctx->size + 40;
	else
		ctx->recv_list.length = ctx->size;

	/* Messages up to inline_size are sent inline. */
	if (size > user_param->inline_size) /* compliance with perf_main */
		ctx->wr.send_flags = IBV_SEND_SIGNALED;
	else
		ctx->wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;

	ctx->list.length = size;
	scnt = 0;	/* sends posted */
	ccnt = 0;	/* send completions seen */
	rcnt = 0;	/* receive completions seen */
	qp = ctx->qp;

	while (ccnt < user_param->iters || rcnt < user_param->iters ) {
		struct ibv_wc wc;
		int ne;

		/* Keep at most tx_depth / 2 sends in flight. */
		while (scnt < user_param->iters &&
		       (scnt - ccnt) < user_param->tx_depth / 2) {
			struct ibv_send_wr *bad_wr;
			tposted[scnt] = get_cycles();
			if (ibv_post_send(qp, &ctx->wr, &bad_wr)) {
				fprintf(stderr, "Couldn't post send: scnt=%d\n", scnt);
				return 1;
			}
			++scnt;
		}

		/* Event mode: block for a CQ event and re-arm before polling. */
		/* NOTE(review): no ibv_ack_cq_events() call is visible here - confirm whether events are acknowledged elsewhere. */
		if (user_param->use_event) {
			struct ibv_cq *ev_cq;
			void *ev_ctx;
			if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) {
				fprintf(stderr, "Failed to get cq_event\n");
				return 1;
			}
			if (ev_cq != ctx->cq) {
				fprintf(stderr, "CQ event for unknown CQ %p\n", ev_cq);
				return 1;
			}
			if (ibv_req_notify_cq(ctx->cq, 0)) {
				fprintf(stderr, "Couldn't request CQ notification\n");
				return 1;
			}
		}

		/* Drain the CQ one completion at a time. */
		for (;;) {
			ne = ibv_poll_cq(ctx->cq, 1, &wc);
			if (ne <= 0)
				break;

			if (wc.status != IBV_WC_SUCCESS) {
				fprintf(stderr, "Completion wth error at %s:\n", user_param->servername ? "client" : "server");
				fprintf(stderr, "Failed status %d: wr_id %d syndrom 0x%x\n", wc.status, (int) wc.wr_id, wc.vendor_err);
				fprintf(stderr, "scnt=%d, ccnt=%d\n", scnt, ccnt);
				return 1;
			}

			switch ((int) wc.wr_id) {
			case PINGPONG_SEND_WRID:
				tcompleted[ccnt] = get_cycles();
				ccnt += 1;
				break;
			case PINGPONG_RECV_WRID:
				/* Once two or more receive slots are free, refill the queue up to rx_depth. */
				if (--post_recv <= ctx->rx_depth - 2) {
					while (rcnt < user_param->iters && (ctx->rx_depth - post_recv) > 0 ) {
						++post_recv;
						if (ibv_post_recv(qp, &ctx->rwr, &bad_wr_recv)) {
							fprintf(stderr, "Couldn't post recv: rcnt=%d\n", rcnt);
							return 15;
						}
					}
				}
				rcnt += 1;
				break;
			default:
				fprintf(stderr, "Completion for unknown wr_id %d\n", (int) wc.wr_id);
				break;
			}
		}

		if (ne < 0) {
			fprintf(stderr, "poll CQ failed %d\n", ne);
			return 1;
		}
	}

	return(0);
}

/*
 * run_iter_uni() - unidirectional run.
 *
 * Without a server name (server role) the function only consumes receive
 * completions, re-posting one receive per completion, until
 * user_param->iters messages have arrived.  With a server name (client
 * role) it only posts sends and collects their completions.
 *
 * @ctx:        connected context holding the send/receive work requests.
 * @user_param: iteration count, tx depth, inline threshold, event mode.
 * @rem_dest:   not referenced in this function.
 * @size:       message length for this run (clamped for UD).
 *
 * Returns 0 on success; 1 on post-send, CQ-event, client-side poll or
 * completion errors; 12 if the server-side poll fails; 15 if re-posting a
 * receive fails.
 */
int run_iter_uni(struct pingpong_context *ctx, struct user_parameters *user_param, struct pingpong_dest *rem_dest, int size)
{
	struct ibv_qp *qp;
	int scnt, ccnt, rcnt;
	struct ibv_recv_wr *bad_wr_recv;

	/* UD messages are capped: 2048 bytes without a GID index, 1024 with one. */
	if (user_param->connection_type == UD) {
		if (size > 2048) {
			if (user_param->gid_index < 0) {
				size = 2048;
			} else {
				size = 1024;
			}
		}
	}

	/* A UD receive buffer entry is 40 bytes longer than the message. */
	if (user_param->connection_type == UD)
		ctx->recv_list.length = ctx->size + 40;
	else
		ctx->recv_list.length = ctx->size;

	if (size > user_param->inline_size) { /* compliance with perf_main */
		ctx->wr.send_flags = IBV_SEND_SIGNALED;
	} else {
		ctx->wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
	}

	ctx->list.length = size;
	scnt = 0;	/* sends posted */
	ccnt = 0;	/* send completions seen */
	rcnt = 0;	/* receive completions seen */
	qp = ctx->qp;

	if (!user_param->servername) {
		while (rcnt < user_param->iters) {
			int ne;
			struct ibv_wc wc;

			/* Server is polling on receive first */
			if (user_param->use_event) {
				struct ibv_cq *ev_cq;
				void *ev_ctx;
				if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) {
					fprintf(stderr, "Failed to get cq_event\n");
					return 1;
				}
				if (ev_cq != ctx->cq) {
					fprintf(stderr, "CQ event for unknown CQ %p\n", ev_cq);
					return 1;
				}
				if (ibv_req_notify_cq(ctx->cq, 0)) {
					fprintf(stderr, "Couldn't request CQ notification\n");
					return 1;
				}
			}

			do {
				ne = ibv_poll_cq(ctx->cq, 1, &wc);
				/* NOTE(review): this test is also true for ne < 0, in which case wc is examined after a failed poll - confirm intent. */
				if (ne) {
					/* ccnt is never advanced on this path, so slot 0 is overwritten each time. */
					tcompleted[ccnt] = get_cycles();
					if (wc.status != IBV_WC_SUCCESS) {
						fprintf(stderr, "Completion wth error at %s:\n", user_param->servername ? "client" : "server");
						fprintf(stderr, "Failed status %d: wr_id %d syndrom 0x%x\n", wc.status, (int) wc.wr_id, wc.vendor_err);
						fprintf(stderr, "scnt=%d, ccnt=%d\n", scnt, ccnt);
						return 1;
					}
					++rcnt;
					/* Replace the receive that was just consumed. */
					if (ibv_post_recv(qp, &ctx->rwr, &bad_wr_recv)) {
						fprintf(stderr, "Couldn't post recv: rcnt=%d\n", rcnt);
						return 15;
					}
				}
			} while (ne > 0 );

			if (ne < 0) {
				fprintf(stderr, "Poll Recieve CQ failed %d\n", ne);
				return 12;
			}
		}
	} else {
		/* client is posting and not receiving. */
		while (scnt < user_param->iters || ccnt < user_param->iters) {
			/* Keep at most tx_depth sends in flight. */
			while (scnt < user_param->iters && (scnt - ccnt) < user_param->tx_depth ) {
				struct ibv_send_wr *bad_wr;
				tposted[scnt] = get_cycles();
				if (ibv_post_send(qp, &ctx->wr, &bad_wr)) {
					fprintf(stderr, "Couldn't post send: scnt=%d\n", scnt);
					return 1;
				}
				++scnt;
			}

			if (ccnt < user_param->iters) {
				struct ibv_wc wc;
				int ne;

				if (user_param->use_event) {
					struct ibv_cq *ev_cq;
					void *ev_ctx;
					if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) {
						fprintf(stderr, "Failed to get cq_event\n");
						return 1;
					}
					if (ev_cq != ctx->cq) {
						fprintf(stderr, "CQ event for unknown CQ %p\n", ev_cq);
						return 1;
					}
					if (ibv_req_notify_cq(ctx->cq, 0)) {
						fprintf(stderr, "Couldn't request CQ notification\n");
						return 1;
					}
				}

				/* Drain the CQ; the poll asks for one entry at a time, so ne is 1 whenever the body runs. */
				for (;;) {
					ne = ibv_poll_cq(ctx->cq, 1, &wc);
					if (ne <= 0)
						break;

					tcompleted[ccnt] = get_cycles();
					if (wc.status != IBV_WC_SUCCESS) {
						fprintf(stderr, "Completion wth error at %s:\n", user_param->servername ? "client" : "server");
						fprintf(stderr, "Failed status %d: wr_id %d syndrom 0x%x\n", wc.status, (int) wc.wr_id, wc.vendor_err);
						fprintf(stderr, "scnt=%d, ccnt=%d\n", scnt, ccnt);
						return 1;
					}
					ccnt += ne;
				}

				if (ne < 0) {
					fprintf(stderr, "poll CQ failed %d\n", ne);
					return 1;
				}
			}
		}
	}

	return 0;
}
*/ while (1) { int c; static struct option long_options[] = { { .name = "port", .has_arg = 1, .val = 'p' }, { .name = "ib-dev", .has_arg = 1, .val = 'd' }, { .name = "ib-port", .has_arg = 1, .val = 'i' }, { .name = "mtu", .has_arg = 1, .val = 'm' }, { .name = "connection", .has_arg = 1, .val = 'c' }, { .name = "size", .has_arg = 1, .val = 's' }, { .name = "iters", .has_arg = 1, .val = 'n' }, { .name = "tx-depth", .has_arg = 1, .val = 't' }, { .name = "inline_size", .has_arg = 1, .val = 'I' }, { .name = "rx-depth", .has_arg = 1, .val = 'r' }, { .name = "qp-timeout", .has_arg = 1, .val = 'u' }, { .name = "sl", .has_arg = 1, .val = 'S' }, { .name = "gid-index", .has_arg = 1, .val = 'x' }, { .name = "all", .has_arg = 0, .val = 'a' }, { .name = "bidirectional", .has_arg = 0, .val = 'b' }, { .name = "version", .has_arg = 0, .val = 'V' }, { .name = "events", .has_arg = 0, .val = 'e' }, { .name = "mcg", .has_arg = 0, .val = 'g' }, { .name = "noPeak", .has_arg = 0, .val = 'N' }, { .name = "CPU-freq", .has_arg = 0, .val = 'F' }, { 0 } }; c = getopt_long(argc, argv, "p:d:i:m:c:s:n:t:I:r:u:S:x:ebaVgNF", long_options, NULL); if (c == -1) break; switch (c) { case 'p': port = strtol(optarg, NULL, 0); if (port < 0 || port > 65535) { usage(argv[0]); return 1; } break; case 'e': ++user_param.use_event; break; case 'g': ++user_param.use_mcg; break; case 'd': ib_devname = strdupa(optarg); break; case 'c': if (strcmp("UC",optarg)==0) user_param.connection_type=UC; if (strcmp("UD",optarg)==0) user_param.connection_type=UD; break; case 'm': user_param.mtu = strtol(optarg, NULL, 0); break; case 'a': user_param.all = ALL; break; case 'V': printf("send_bw version : %.2f\n",VERSION); return 0; break; case 'i': ib_port = strtol(optarg, NULL, 0); if (ib_port < 0) { usage(argv[0]); return 1; } break; case 's': size = strtoll(optarg, NULL, 0); if (size < 1 || size > UINT_MAX / 2) { usage(argv[0]); return 1; } break; case 'x': user_param.gid_index = strtol(optarg, NULL, 0); if 
(user_param.gid_index > 63) { usage(argv[0]); return 1; } break; case 't': user_param.tx_depth = strtol(optarg, NULL, 0); if (user_param.tx_depth < 1) { usage(argv[0]); return 1; } break; case 'I': user_param.inline_size = strtol(optarg, NULL, 0); inline_given_in_cmd =1; if (user_param.inline_size > MAX_INLINE) { usage(argv[0]); return 7; } case 'r': errno = 0; user_param.rx_depth = strtol(optarg, NULL, 0); if (errno) { usage(argv[0]); return 1; } break; case 'n': user_param.iters = strtol(optarg, NULL, 0); if (user_param.iters < 2) { usage(argv[0]); return 1; } break; case 'b': user_param.duplex = 1; break; case 'N': noPeak = 1; break; case 'F': no_cpu_freq_fail = 1; break; case 'u': user_param.qp_timeout = strtol(optarg, NULL, 0); break; case 'S': sl = strtol(optarg, NULL, 0); if (sl > 15) { usage(argv[0]); return 1; } break; default: usage(argv[0]); return 1; } } if (optind == argc - 1) user_param.servername = strdupa(argv[optind]); else if (optind < argc) { usage(argv[0]); return 1; } printf("------------------------------------------------------------------\n"); if (user_param.duplex == 1 && (!user_param.use_mcg || !(user_param.connection_type == UD))) printf(" Send Bidirectional BW Test\n"); else if (user_param.duplex == 1 && user_param.use_mcg && (user_param.connection_type == UD)) printf(" Send Bidirectional BW Multicast Test\n"); else if (!user_param.duplex == 1 && user_param.use_mcg && (user_param.connection_type == UD)) printf(" Send BW Multicast Test\n"); else printf(" Send BW Test\n"); if (user_param.connection_type == RC) printf("Connection type : RC\n"); else if (user_param.connection_type == UC) printf("Connection type : UC\n"); else{ printf("Connection type : UD\n"); } if (user_param.gid_index > -1) { printf("Using GID to support RDMAoE configuration. Refer to port type as Ethernet, default MTU 1024B\n"); } /* Done with parameter parsing. Perform setup. 
*/ if (user_param.all == ALL) /*since we run all sizes */ size = 8388608; /*2^23 */ else if (user_param.connection_type == UD && size > 2048) { printf("Max msg size in UD is 2048 changing to 2048\n"); size = 2048; } if (user_param.connection_type == UD && user_param.gid_index > -1 && size > 1024) { printf("Max msg size in UD RDMAoE is 1024. changing to 1024\n"); size = 1024; } srand48(getpid() * time(NULL)); page_size = sysconf(_SC_PAGESIZE); dev_list = ibv_get_device_list(NULL); if (!ib_devname) { ib_dev = dev_list[0]; if (!ib_dev) { fprintf(stderr, "No IB devices found\n"); return 1; } } else { for (; (ib_dev = *dev_list); ++dev_list) if (!strcmp(ibv_get_device_name(ib_dev), ib_devname)) break; if (!ib_dev) { fprintf(stderr, "IB device %s not found\n", ib_devname); return 1; } } context = ibv_open_device(ib_dev); if (ibv_query_device(context, &device_attribute)) { fprintf(stderr, "Failed to query device props"); return 1; } if ((device_attribute.vendor_part_id == 25408 || device_attribute.vendor_part_id == 25418 || device_attribute.vendor_part_id == 26408 || device_attribute.vendor_part_id == 26418 || device_attribute.vendor_part_id == 26428) && (!inline_given_in_cmd)) { user_param.inline_size = 1; } printf("Inline data is used up to %d bytes message\n", user_param.inline_size); ctx = pp_init_ctx(ib_dev, size, user_param.tx_depth, user_param.rx_depth, ib_port, &user_param); if (!ctx) return 1; /* Create connection between client and server. * We do it by exchanging data over a TCP socket connection. */ my_dest.lid = pp_get_local_lid(ctx, ib_port); my_dest.qpn = ctx->qp->qp_num; my_dest.psn = lrand48() & 0xffffff; if (user_param.gid_index != -1) { int err=0; err = ibv_query_gid (ctx->context, ib_port, user_param.gid_index, &gid); if (err) { return -1; } ctx->dgid=gid; } if (user_param.gid_index < 0) {/*We do not fail test upon lid in RDMA0E/Eth conf*/ if (!my_dest.lid) { fprintf(stderr, "Local lid 0x0 detected. Is an SM running? 
If you are running on an RMDAoE interface you must use GIDs\n"); return 1; } } my_dest.dgid = gid; my_dest.rkey = ctx->mr->rkey; my_dest.vaddr = (uintptr_t)ctx->buf + size; printf(" local address: LID %#04x, QPN %#06x, PSN %#06x\n", my_dest.lid, my_dest.qpn, my_dest.psn); if (user_param.gid_index > -1) { printf(" GID %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", my_dest.dgid.raw[0],my_dest.dgid.raw[1], my_dest.dgid.raw[2], my_dest.dgid.raw[3], my_dest.dgid.raw[4], my_dest.dgid.raw[5], my_dest.dgid.raw[6], my_dest.dgid.raw[7], my_dest.dgid.raw[8], my_dest.dgid.raw[9], my_dest.dgid.raw[10], my_dest.dgid.raw[11], my_dest.dgid.raw[12], my_dest.dgid.raw[13], my_dest.dgid.raw[14], my_dest.dgid.raw[15]); } if (user_param.servername) { sockfd = pp_client_connect(user_param.servername, port); if (sockfd < 0) return 1; rem_dest = pp_client_exch_dest(sockfd, &my_dest, &user_param); } else { sockfd = pp_server_connect(port); if (sockfd < 0) return 1; rem_dest = pp_server_exch_dest(sockfd, &my_dest, &user_param); } if (!rem_dest) return 1; printf(" remote address: LID %#04x, QPN %#06x, PSN %#06x\n", rem_dest->lid, rem_dest->qpn, rem_dest->psn); if (user_param.gid_index > -1) { printf(" GID %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", rem_dest->dgid.raw[0],rem_dest->dgid.raw[1], rem_dest->dgid.raw[2], rem_dest->dgid.raw[3], rem_dest->dgid.raw[4], rem_dest->dgid.raw[5], rem_dest->dgid.raw[6], rem_dest->dgid.raw[7], rem_dest->dgid.raw[8], rem_dest->dgid.raw[9], rem_dest->dgid.raw[10], rem_dest->dgid.raw[11], rem_dest->dgid.raw[12], rem_dest->dgid.raw[13], rem_dest->dgid.raw[14], rem_dest->dgid.raw[15]); } if (pp_connect_ctx(ctx, ib_port, my_dest.psn, rem_dest, &user_param)) return 1; /* An additional handshake is required *after* moving qp to RTR. Arbitrarily reuse exch_dest for this purpose. 
*/ if (user_param.servername) { rem_dest = pp_client_exch_dest(sockfd, &my_dest, &user_param); } else { rem_dest = pp_server_exch_dest(sockfd, &my_dest, &user_param); } if (user_param.use_event) { printf("Test with events.\n"); if (ibv_req_notify_cq(ctx->cq, 0)) { fprintf(stderr, "Couldn't request CQ notification\n"); return 1; } } printf("------------------------------------------------------------------\n"); printf(" #bytes #iterations BW peak[MB/sec] BW average[MB/sec] \n"); tposted = malloc(user_param.iters * sizeof *tposted); if (!tposted) { perror("malloc"); return 1; } tcompleted = malloc(user_param.iters * sizeof *tcompleted); if (!tcompleted) { perror("malloc"); return 1; } /* send */ if (user_param.connection_type == UD) { ctx->list.addr = (uintptr_t) ctx->buf + 40; ctx->wr.wr.ud.ah = ctx->ah; ctx->wr.wr.ud.remote_qpn = rem_dest->qpn; ctx->wr.wr.ud.remote_qkey = 0x11111111; if (user_param.use_mcg) { ctx->wr.wr.ud.remote_qpn = 0xffffff; } else { ctx->wr.wr.ud.remote_qpn = rem_dest->qpn; } } else ctx->list.addr = (uintptr_t) ctx->buf; ctx->list.lkey = ctx->mr->lkey; ctx->wr.wr_id = PINGPONG_SEND_WRID; ctx->wr.sg_list = &ctx->list; ctx->wr.num_sge = 1; ctx->wr.opcode = IBV_WR_SEND; ctx->wr.next = NULL; /* recieve */ ctx->rwr.wr_id = PINGPONG_RECV_WRID; ctx->rwr.sg_list = &ctx->recv_list; ctx->rwr.num_sge = 1; ctx->rwr.next = NULL; ctx->recv_list.addr = (uintptr_t) ctx->buf; ctx->recv_list.lkey = ctx->mr->lkey; if (user_param.all == ALL) { if (user_param.connection_type == UD) { if (user_param.gid_index < 0) { size_max_pow = 12; } else { size_max_pow = 11; } } for (i = 1; i < size_max_pow ; ++i) { size = 1 << i; if (user_param.duplex) { if(run_iter_bi(ctx, &user_param, rem_dest, size)) return 17; } else { if(run_iter_uni(ctx, &user_param, rem_dest, size)) return 17; } if (user_param.servername) { print_report(user_param.iters, size, user_param.duplex, tposted, tcompleted, noPeak, no_cpu_freq_fail); /* sync again for the sake of UC/UC */ rem_dest = 
pp_client_exch_dest(sockfd, &my_dest, &user_param); } else rem_dest = pp_server_exch_dest(sockfd, &my_dest, &user_param); } } else { if (user_param.duplex) { if (run_iter_bi(ctx, &user_param, rem_dest, size)) return 18; } else { if(run_iter_uni(ctx, &user_param, rem_dest, size)) return 18; } if (user_param.servername) print_report(user_param.iters, size, user_param.duplex, tposted, tcompleted, noPeak, no_cpu_freq_fail); } /* close sockets */ if (user_param.servername) rem_dest = pp_client_exch_dest(sockfd, &my_dest, &user_param); else rem_dest = pp_server_exch_dest(sockfd, &my_dest, &user_param); if (write(sockfd, "done", sizeof "done") != sizeof "done"){ perror("write"); fprintf(stderr, "Couldn't write to socket\n"); return 1; } close(sockfd); free(tposted); free(tcompleted); printf("------------------------------------------------------------------\n"); return 0; } trunk/rdma_lat.c0000755000175000017500000010163311240600321013466 0ustar benoitbenoit/* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2005 Hewlett Packard, Inc (Grant Grundler) * Copyright (c) 2009 HNR Consulting. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. 
/* NOTE(review): presumably the wr_id tagged onto RDMA work requests; its use is not visible in this part of the file. */
#define PINGPONG_RDMA_WRID	3
/* Initial value of inline_size below. */
#define MAX_INLINE 400

static int inline_size = MAX_INLINE;
static int sl = 0;
static int page_size;
static pid_t pid;

/* Flags selecting how results are reported. */
struct report_options {
	int unsorted;
	int histogram;
	int cycles;   /* report delta's in cycles, not microsec's */
};

/*
 * Verbs objects and buffers for one test run.  Unlike a single shared CQ,
 * this context keeps separate receive (rcq) and send (scq) completion
 * queue pointers.
 */
struct pingpong_context {
	struct ibv_context *context;
	struct ibv_pd      *pd;
	struct ibv_mr      *mr;
	struct ibv_cq      *rcq;
	struct ibv_cq      *scq;
	struct ibv_qp      *qp;
	void               *buf;
	/* NOTE(review): presumably positions inside buf that are written and polled; confirm against the code that sets them. */
	volatile char      *post_buf;
	volatile char      *poll_buf;
	int                 size;
	int                 tx_depth;
	struct ibv_sge      list;
	struct ibv_send_wr  wr;
};

/*
 * Address information exchanged with the peer.  pp_write_keys() and
 * pp_read_keys() serialise the members in exactly this order.
 */
struct pingpong_dest {
	int lid;
	int qpn;
	int psn;
	unsigned rkey;
	unsigned long long vaddr;
};

/*
 * Connection parameters and state for one run.
 * NOTE(review): member meanings are inferred from their names only; use_cma,
 * cm_channel and cm_id presumably select and hold the RDMA CM path - confirm.
 */
struct pp_data {
	int				port;
	int				ib_port;
	unsigned			size;
	int				tx_depth;
	int				use_cma;
	int				sockfd;
	char				*servername;
	struct pingpong_dest		my_dest;
	struct pingpong_dest		*rem_dest;
	struct ibv_device		*ib_dev;
	struct rdma_event_channel	*cm_channel;
	struct rdma_cm_id		*cm_id;
};

/* Forward declarations of helpers defined later in the file. */
static void pp_post_recv(struct pingpong_context *);
static void pp_wait_for_done(struct pingpong_context *);
static void pp_send_done(struct pingpong_context *);
void pp_wait_for_start(struct pingpong_context *); static void pp_send_start(struct pingpong_context *); static void pp_close_cma(struct pp_data ); static struct pingpong_context *pp_init_ctx(void *, struct pp_data *); static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port) { struct ibv_port_attr attr; if (ibv_query_port(ctx->context, port, &attr)) return 0; return attr.lid; } static struct ibv_device *pp_find_dev(const char *ib_devname) { struct ibv_device **dev_list; struct ibv_device *ib_dev = NULL; dev_list = ibv_get_device_list(NULL); if (!ib_devname) { ib_dev = dev_list[0]; if (!ib_dev) fprintf(stderr, "No IB devices found\n"); } else { for (; (ib_dev = *dev_list); ++dev_list) { if (!strcmp(ibv_get_device_name(ib_dev), ib_devname)) break; } if (!ib_dev) fprintf(stderr, "IB device %s not found\n", ib_devname); } return ib_dev; } #define KEY_MSG_SIZE (sizeof "0000:000000:000000:00000000:0000000000000000") #define KEY_PRINT_FMT "%04x:%06x:%06x:%08x:%016Lx" static int pp_write_keys(int sockfd, const struct pingpong_dest *my_dest) { char msg[KEY_MSG_SIZE]; sprintf(msg, KEY_PRINT_FMT, my_dest->lid, my_dest->qpn, my_dest->psn, my_dest->rkey, my_dest->vaddr); if (write(sockfd, msg, sizeof msg) != sizeof msg) { perror("client write"); fprintf(stderr, "Couldn't send local address\n"); return -1; } return 0; } static int pp_read_keys(int sockfd, const struct pingpong_dest *my_dest, struct pingpong_dest *rem_dest) { int parsed; char msg[KEY_MSG_SIZE]; if (read(sockfd, msg, sizeof msg) != sizeof msg) { perror("pp_read_keys"); fprintf(stderr, "Couldn't read remote address\n"); return -1; } parsed = sscanf(msg, KEY_PRINT_FMT, &rem_dest->lid, &rem_dest->qpn, &rem_dest->psn, &rem_dest->rkey, &rem_dest->vaddr); if (parsed != 5) { fprintf(stderr, "Couldn't parse line <%.*s>\n", (int)sizeof msg, msg); return -1; } return 0; } static struct pingpong_context *pp_client_connect(struct pp_data *data) { struct addrinfo *res, *t; struct addrinfo hints = { .ai_family = 
AF_UNSPEC, .ai_socktype = SOCK_STREAM }; char *service; int n; int sockfd = -1; int n_retries = 10; struct rdma_cm_event *event; struct sockaddr_in sin; struct pingpong_context *ctx = NULL; struct rdma_conn_param conn_param; if (asprintf(&service, "%d", data->port) < 0) goto err4; n = getaddrinfo(data->servername, service, &hints, &res); if (n < 0) { fprintf(stderr, "%d:%s: %s for %s:%d\n", pid, __func__, gai_strerror(n), data->servername, data->port); goto err4; } if (data->use_cma) { sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr; sin.sin_family = AF_INET; sin.sin_port = htons(data->port); retry_addr: if (rdma_resolve_addr(data->cm_id, NULL, (struct sockaddr *)&sin, 2000)) { fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n", pid, __func__ ); goto err2; } if (rdma_get_cm_event(data->cm_channel, &event)) goto err2; if (event->event == RDMA_CM_EVENT_ADDR_ERROR && n_retries-- > 0) { rdma_ack_cm_event(event); goto retry_addr; } if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) { fprintf(stderr, "%d:%s: unexpected CM event %d\n", pid, __func__, event->event); goto err1; } rdma_ack_cm_event(event); retry_route: if (rdma_resolve_route(data->cm_id, 2000)) { fprintf(stderr, "%d:%s: rdma_resolve_route failed\n", pid, __func__); goto err2; } if (rdma_get_cm_event(data->cm_channel, &event)) goto err2; if (event->event == RDMA_CM_EVENT_ROUTE_ERROR && n_retries-- > 0) { rdma_ack_cm_event(event); goto retry_route; } if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) { fprintf(stderr, "%d:%s: unexpected CM event %d\n", pid, __func__, event->event); rdma_ack_cm_event(event); goto err1; } rdma_ack_cm_event(event); ctx = pp_init_ctx(data->cm_id, data); if (!ctx) { fprintf(stderr, "%d:%s: pp_init_ctx failed\n", pid, __func__); goto err2; } data->my_dest.psn = lrand48() & 0xffffff; data->my_dest.qpn = 0; data->my_dest.rkey = ctx->mr->rkey; data->my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size; memset(&conn_param, 0, sizeof conn_param); 
conn_param.responder_resources = 1; conn_param.initiator_depth = 1; conn_param.retry_count = 5; conn_param.private_data = &data->my_dest; conn_param.private_data_len = sizeof(data->my_dest); if (rdma_connect(data->cm_id, &conn_param)) { fprintf(stderr, "%d:%s: rdma_connect failure\n", pid, __func__); goto err2; } if (rdma_get_cm_event(data->cm_channel, &event)) goto err2; if (event->event != RDMA_CM_EVENT_ESTABLISHED) { fprintf(stderr, "%d:%s: unexpected CM event %d\n", pid, __func__, event->event); goto err1; } if (!event->param.conn.private_data || (event->param.conn.private_data_len < sizeof(*data->rem_dest))) { fprintf(stderr, "%d:%s: bad private data ptr %p len %d\n", pid, __func__, event->param.conn.private_data, event->param.conn.private_data_len); goto err1; } data->rem_dest = malloc(sizeof *data->rem_dest); if (!data->rem_dest) goto err1; memcpy(data->rem_dest, event->param.conn.private_data, sizeof(*data->rem_dest)); rdma_ack_cm_event(event); } else { for (t = res; t; t = t->ai_next) { sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { if (!connect(sockfd, t->ai_addr, t->ai_addrlen)) break; close(sockfd); sockfd = -1; } } if (sockfd < 0) { fprintf(stderr, "%d:%s: Couldn't connect to %s:%d\n", pid, __func__, data->servername, data->port); goto err3; } ctx = pp_init_ctx(data->ib_dev, data); if (!ctx) goto err3; data->sockfd = sockfd; } freeaddrinfo(res); return ctx; err1: rdma_ack_cm_event(event); err2: rdma_destroy_id(data->cm_id); rdma_destroy_event_channel(data->cm_channel); err3: freeaddrinfo(res); err4: return NULL; } static int pp_client_exch_dest(struct pp_data *data) { if (data->rem_dest != NULL) free(data->rem_dest); data->rem_dest = malloc(sizeof *data->rem_dest); if (!data->rem_dest) return -1; if (pp_write_keys(data->sockfd, &data->my_dest)) return -1; return pp_read_keys(data->sockfd, &data->my_dest, data->rem_dest); } static struct pingpong_context *pp_server_connect(struct pp_data *data) { struct addrinfo *res, 
*t; struct addrinfo hints = { .ai_flags = AI_PASSIVE, .ai_family = AF_UNSPEC, .ai_socktype = SOCK_STREAM }; char *service; int sockfd = -1, connfd; int n; struct rdma_cm_event *event; struct sockaddr_in sin; struct pingpong_context *ctx = NULL; struct rdma_cm_id *child_cm_id; struct rdma_conn_param conn_param; if (asprintf(&service, "%d", data->port) < 0) goto err5; if ( (n = getaddrinfo(NULL, service, &hints, &res)) < 0 ) { fprintf(stderr, "%d:%s: %s for port %d\n", pid, __func__, gai_strerror(n), data->port); goto err5; } if (data->use_cma) { sin.sin_addr.s_addr = 0; sin.sin_family = AF_INET; sin.sin_port = htons(data->port); if (rdma_bind_addr(data->cm_id, (struct sockaddr *)&sin)) { fprintf(stderr, "%d:%s: rdma_bind_addr failed\n", pid, __func__); goto err3; } if (rdma_listen(data->cm_id, 0)) { fprintf(stderr, "%d:%s: rdma_listen failed\n", pid, __func__); goto err3; } if (rdma_get_cm_event(data->cm_channel, &event)) goto err3; if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST) { fprintf(stderr, "%d:%s: bad event waiting for connect request %d\n", pid, __func__, event->event); goto err2; } if (!event->param.conn.private_data || (event->param.conn.private_data_len < sizeof(*data->rem_dest))) { fprintf(stderr, "%d:%s: bad private data len %d\n", pid, __func__, event->param.conn.private_data_len); goto err2; } data->rem_dest = malloc(sizeof *data->rem_dest); if (!data->rem_dest) goto err2; memcpy(data->rem_dest, event->param.conn.private_data, sizeof(*data->rem_dest)); child_cm_id = (struct rdma_cm_id *)event->id; ctx = pp_init_ctx(child_cm_id, data); if (!ctx) { free(data->rem_dest); goto err1; } data->my_dest.psn = lrand48() & 0xffffff; data->my_dest.qpn = 0; data->my_dest.rkey = ctx->mr->rkey; data->my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size; memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; conn_param.initiator_depth = 1; conn_param.private_data = &data->my_dest; conn_param.private_data_len = sizeof(data->my_dest); if 
(rdma_accept(child_cm_id, &conn_param)) { fprintf(stderr, "%d:%s: rdma_accept failed\n", pid, __func__); goto err1; } rdma_ack_cm_event(event); if (rdma_get_cm_event(data->cm_channel, &event)) { fprintf(stderr, "%d:%s: rdma_get_cm_event error\n", pid, __func__); rdma_destroy_id(child_cm_id); goto err3; } if (event->event != RDMA_CM_EVENT_ESTABLISHED) { fprintf(stderr, "%d:%s: bad event waiting for established %d\n", pid, __func__, event->event); goto err1; } rdma_ack_cm_event(event); } else { for (t = res; t; t = t->ai_next) { sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { n = 1; setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n); if (!bind(sockfd, t->ai_addr, t->ai_addrlen)) break; close(sockfd); sockfd = -1; } } if (sockfd < 0) { fprintf(stderr, "%d:%s: Couldn't listen to port %d\n", pid, __func__, data->port); goto err4; } listen(sockfd, 1); connfd = accept(sockfd, NULL, 0); if (connfd < 0) { perror("server accept"); fprintf(stderr, "%d:%s: accept() failed\n", pid, __func__); close(sockfd); goto err4; } close(sockfd); ctx = pp_init_ctx(data->ib_dev, data); if (!ctx) goto err4; data->sockfd = connfd; } freeaddrinfo(res); return ctx; err1: rdma_destroy_id(child_cm_id); err2: rdma_ack_cm_event(event); err3: rdma_destroy_id(data->cm_id); rdma_destroy_event_channel(data->cm_channel); err4: freeaddrinfo(res); err5: return NULL; } static int pp_server_exch_dest(struct pp_data *data) { if (data->rem_dest != NULL) free(data->rem_dest); data->rem_dest = malloc(sizeof *data->rem_dest); if (!data->rem_dest) return -1; if (pp_read_keys(data->sockfd, &data->my_dest, data->rem_dest)) return -1; return pp_write_keys(data->sockfd, &data->my_dest); } static struct pingpong_context *pp_init_ctx(void *ptr, struct pp_data *data) { struct pingpong_context *ctx; struct ibv_device *ib_dev; struct rdma_cm_id *cm_id; ctx = malloc(sizeof *ctx); if (!ctx) return NULL; ctx->size = data->size; ctx->tx_depth = data->tx_depth; ctx->buf = 
memalign(page_size, ctx->size * 2); if (!ctx->buf) { fprintf(stderr, "%d:%s: Couldn't allocate work buf.\n", pid, __func__); return NULL; } memset(ctx->buf, 0, ctx->size * 2); ctx->post_buf = (char *)ctx->buf + (ctx->size -1); ctx->poll_buf = (char *)ctx->buf + (2 * ctx->size -1); if (data->use_cma) { cm_id = (struct rdma_cm_id *)ptr; ctx->context = cm_id->verbs; if (!ctx->context) { fprintf(stderr, "%d:%s: Unbound cm_id!!\n", pid, __func__); return NULL; } } else { ib_dev = (struct ibv_device *)ptr; ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { fprintf(stderr, "%d:%s: Couldn't get context for %s\n", pid, __func__, ibv_get_device_name(ib_dev)); return NULL; } } ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "%d:%s: Couldn't allocate PD\n", pid, __func__); return NULL; } /* We dont really want IBV_ACCESS_LOCAL_WRITE, but IB spec says: * The Consumer is not allowed to assign Remote Write or Remote Atomic to * a Memory Region that has not been assigned Local Write. 
*/ ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, ctx->size * 2, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "%d:%s: Couldn't allocate MR\n", pid, __func__); return NULL; } ctx->rcq = ibv_create_cq(ctx->context, 1, NULL, NULL, 0); if (!ctx->rcq) { fprintf(stderr, "%d:%s: Couldn't create recv CQ\n", pid, __func__); return NULL; } ctx->scq = ibv_create_cq(ctx->context, ctx->tx_depth, ctx, NULL, 0); if (!ctx->scq) { fprintf(stderr, "%d:%s: Couldn't create send CQ\n", pid, __func__); return NULL; } struct ibv_qp_init_attr attr = { .send_cq = ctx->scq, .recv_cq = ctx->rcq, .cap = { .max_send_wr = ctx->tx_depth, /* Work around: driver doesnt support * recv_wr = 0 */ .max_recv_wr = 1, .max_send_sge = 1, .max_recv_sge = 1, .max_inline_data = inline_size, }, .qp_type = IBV_QPT_RC }; if (data->use_cma) { if (rdma_create_qp(cm_id, ctx->pd, &attr)) { fprintf(stderr, "%d:%s: Couldn't create QP\n", pid, __func__); return NULL; } ctx->qp = cm_id->qp; pp_post_recv(ctx); } else { ctx->qp = ibv_create_qp(ctx->pd, &attr); if (!ctx->qp) { fprintf(stderr, "%d:%s: Couldn't create QP\n", pid, __func__); return NULL; } { struct ibv_qp_attr attr; attr.qp_state = IBV_QPS_INIT; attr.pkey_index = 0; attr.port_num = data->ib_port; attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "%d:%s: Failed to modify QP to INIT\n", pid, __func__); return NULL; } } } return ctx; } static int pp_connect_ctx(struct pingpong_context *ctx, struct pp_data *data) { struct ibv_qp_attr attr = { .qp_state = IBV_QPS_RTR, .path_mtu = IBV_MTU_256, .dest_qp_num = data->rem_dest->qpn, .rq_psn = data->rem_dest->psn, .max_dest_rd_atomic = 1, .min_rnr_timer = 12, .ah_attr.is_global = 0, .ah_attr.dlid = data->rem_dest->lid, .ah_attr.sl = sl, .ah_attr.src_path_bits = 0, .ah_attr.port_num = data->ib_port }; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_AV | 
IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)) { fprintf(stderr, "%s: Failed to modify QP to RTR\n", __func__); return 1; } attr.qp_state = IBV_QPS_RTS; attr.timeout = 14; attr.retry_cnt = 7; attr.rnr_retry = 7; attr.sq_psn = data->my_dest.psn; attr.max_rd_atomic = 1; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC)) { fprintf(stderr, "%s: Failed to modify QP to RTS\n", __func__); return 1; } return 0; } static int pp_open_port(struct pingpong_context *ctx, struct pp_data *data ) { char addr_fmt[] = "%8s address: LID %#04x QPN %#06x PSN %#06x RKey %#08x VAddr %#016Lx\n"; /* Create connection between client and server. * We do it by exchanging data over a TCP socket connection. */ data->my_dest.lid = pp_get_local_lid(ctx, data->ib_port); data->my_dest.qpn = ctx->qp->qp_num; data->my_dest.psn = lrand48() & 0xffffff; if (!data->my_dest.lid) { fprintf(stderr, "Local lid 0x0 detected. Is an SM running?\n"); return -1; } data->my_dest.rkey = ctx->mr->rkey; data->my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size; printf(addr_fmt, "local", data->my_dest.lid, data->my_dest.qpn, data->my_dest.psn, data->my_dest.rkey, data->my_dest.vaddr); if (data->servername) { if (pp_client_exch_dest(data)) return 1; } else { if (pp_server_exch_dest(data)) return 1; } printf(addr_fmt, "remote", data->rem_dest->lid, data->rem_dest->qpn, data->rem_dest->psn, data->rem_dest->rkey, data->rem_dest->vaddr); if (pp_connect_ctx(ctx, data)) return 1; /* An additional handshake is required *after* moving qp to RTR. Arbitrarily reuse exch_dest for this purpose. 
*/ if (data->servername) { if (pp_client_exch_dest(data)) return -1; } else { if (pp_server_exch_dest(data)) return -1; } if (write(data->sockfd, "done", sizeof "done") != sizeof "done"){ perror("write"); fprintf(stderr, "Couldn't write to socket\n"); return 1; } close(data->sockfd); return 0; } static void pp_post_recv(struct pingpong_context *ctx) { struct ibv_sge list; struct ibv_recv_wr wr, *bad_wr; int rc; list.addr = (uintptr_t) ctx->buf; list.length = 1; list.lkey = ctx->mr->lkey; wr.next = NULL; wr.wr_id = 0xdeadbeef; wr.sg_list = &list; wr.num_sge = 1; rc = ibv_post_recv(ctx->qp, &wr, &bad_wr); if (rc) { perror("ibv_post_recv"); fprintf(stderr, "%d:%s: ibv_post_recv failed %d\n", pid, __func__, rc); } } static void pp_wait_for_done(struct pingpong_context *ctx) { struct ibv_wc wc; int ne; do { usleep(500); ne = ibv_poll_cq(ctx->rcq, 1, &wc); } while (ne == 0); if (wc.status) fprintf(stderr, "%d:%s: bad wc status %d\n", pid, __func__, wc.status); if (!(wc.opcode & IBV_WC_RECV)) fprintf(stderr, "%d:%s: bad wc opcode %d\n", pid, __func__, wc.opcode); if (wc.wr_id != 0xdeadbeef) fprintf(stderr, "%d:%s: bad wc wr_id 0x%x\n", pid, __func__, (int)wc.wr_id); } static void pp_send_done(struct pingpong_context *ctx) { struct ibv_send_wr *bad_wr; struct ibv_wc wc; int ne; ctx->list.addr = (uintptr_t) ctx->buf; ctx->list.length = 1; ctx->list.lkey = ctx->mr->lkey; ctx->wr.wr_id = 0xcafebabe; ctx->wr.sg_list = &ctx->list; ctx->wr.num_sge = 1; ctx->wr.opcode = IBV_WR_SEND; ctx->wr.send_flags = IBV_SEND_SIGNALED; ctx->wr.next = NULL; if (ibv_post_send(ctx->qp, &ctx->wr, &bad_wr)) { fprintf(stderr, "%d:%s: ibv_post_send failed\n", pid, __func__); return; } do { usleep(500); ne = ibv_poll_cq(ctx->scq, 1, &wc); } while (ne == 0); if (wc.status) fprintf(stderr, "%d:%s: bad wc status %d\n", pid, __func__, wc.status); if (wc.opcode != IBV_WC_SEND) fprintf(stderr, "%d:%s: bad wc opcode %d\n", pid, __func__, wc.opcode); if (wc.wr_id != 0xcafebabe) fprintf(stderr, "%d:%s: bad wc 
wr_id 0x%x\n", pid, __func__, (int)wc.wr_id); } static void pp_wait_for_start(struct pingpong_context *ctx) { struct ibv_wc wc; int ne; do { usleep(500); ne = ibv_poll_cq(ctx->rcq, 1, &wc); } while (ne == 0); if (wc.status) fprintf(stderr, "%d:%s: bad wc status %d\n", pid, __func__, wc.status); if (!(wc.opcode & IBV_WC_RECV)) fprintf(stderr, "%d:%s: bad wc opcode %d\n", pid, __func__, wc.opcode); if (wc.wr_id != 0xdeadbeef) fprintf(stderr, "%d:%s: bad wc wr_id 0x%x\n", pid, __func__, (int)wc.wr_id); pp_post_recv(ctx); } static void pp_send_start(struct pingpong_context *ctx) { struct ibv_send_wr *bad_wr; struct ibv_wc wc; int ne; ctx->list.addr = (uintptr_t) ctx->buf; ctx->list.length = 1; ctx->list.lkey = ctx->mr->lkey; ctx->wr.wr_id = 0xabbaabba; ctx->wr.sg_list = &ctx->list; ctx->wr.num_sge = 1; ctx->wr.opcode = IBV_WR_SEND; ctx->wr.send_flags = IBV_SEND_SIGNALED; ctx->wr.next = NULL; if (ibv_post_send(ctx->qp, &ctx->wr, &bad_wr)) { fprintf(stderr, "%d:%s: ibv_post_send failed\n", pid, __func__); return; } do { usleep(500); ne = ibv_poll_cq(ctx->scq, 1, &wc); } while (ne == 0); if (wc.status) fprintf(stderr, "%d:%s: bad wc status %d\n", pid, __func__, wc.status); if (wc.opcode != IBV_WC_SEND) fprintf(stderr, "%d:%s: bad wc opcode %d\n", pid, __func__, wc.opcode); if (wc.wr_id != 0xabbaabba) fprintf(stderr, "%d:%s: bad wc wr_id 0x%x\n", pid, __func__, (int)wc.wr_id); } static void pp_close_cma(struct pp_data data) { struct rdma_cm_event *event; int rc; if (data.servername) { rc = rdma_disconnect(data.cm_id); if (rc) { perror("rdma_disconnect"); fprintf(stderr, "%d:%s: rdma disconnect error\n", pid, __func__); return; } } rdma_get_cm_event(data.cm_channel, &event); if (event->event != RDMA_CM_EVENT_DISCONNECTED) fprintf(stderr, "%d:%s: unexpected event during disconnect %d\n", pid, __func__, event->event); rdma_ack_cm_event(event); rdma_destroy_id(data.cm_id); rdma_destroy_event_channel(data.cm_channel); } static void usage(const char *argv0) { printf("Usage:\n"); 
printf(" %s start a server and wait for connection\n", argv0); printf(" %s connect to server at \n", argv0); printf("\n"); printf("Options:\n"); printf(" -p, --port= listen on/connect to port (default 18515)\n"); printf(" -d, --ib-dev= use IB device (default first device found)\n"); printf(" -i, --ib-port= use port of IB device (default 1)\n"); printf(" -s, --size= size of message to exchange (default 1)\n"); printf(" -t, --tx-depth= size of tx queue (default 50)\n"); printf(" -n, --iters= number of exchanges (at least 2, default 1000)\n"); printf(" -S, --sl= SL (default 0)\n"); printf(" -I, --inline_size= max size of message to be sent in inline mode (default 400)\n"); printf(" -C, --report-cycles report times in cpu cycle units (default microseconds)\n"); printf(" -H, --report-histogram print out all results (default print summary only)\n"); printf(" -U, --report-unsorted (implies -H) print out unsorted results (default sorted)\n"); printf(" -c, --cma Use the RDMA CMA to setup the RDMA connection\n"); } /* * When there is an * odd number of samples, the median is the middle number. * even number of samples, the median is the mean of the * two middle numbers. 
* */ static inline cycles_t get_median(int n, cycles_t delta[]) { if ((n - 1) % 2) return (delta[n / 2] + delta[n / 2 - 1]) / 2; else return delta[n / 2]; } static int cycles_compare(const void * aptr, const void * bptr) { const cycles_t *a = aptr; const cycles_t *b = bptr; if (*a < *b) return -1; if (*a > *b) return 1; return 0; } static void print_report(struct report_options * options, unsigned int iters, cycles_t *tstamp) { double cycles_to_units; cycles_t median; unsigned int i; const char* units; cycles_t *delta = malloc((iters - 1) * sizeof *delta); if (!delta) { perror("malloc"); return; } for (i = 0; i < iters - 1; ++i) delta[i] = tstamp[i + 1] - tstamp[i]; if (options->cycles) { cycles_to_units = 1; units = "cycles"; } else { cycles_to_units = get_cpu_mhz(0); units = "usec"; } if (options->unsorted) { printf("#, %s\n", units); for(i = 0; i < iters - 1; ++i) printf("%d, %g\n", i + 1, delta[i] / cycles_to_units / 2); } qsort(delta, iters - 1, sizeof *delta, cycles_compare); if (options->histogram) { printf("#, %s\n", units); for(i = 0; i < iters - 1; ++i) printf("%d, %g\n", i + 1, delta[i] / cycles_to_units / 2); } median = get_median(iters - 1, delta); printf("Latency typical: %g %s\n", median / cycles_to_units / 2, units); printf("Latency best : %g %s\n", delta[0] / cycles_to_units / 2, units); printf("Latency worst : %g %s\n", delta[iters - 2] / cycles_to_units / 2, units); free(delta); } int main(int argc, char *argv[]) { const char *ib_devname = NULL; const char *servername = NULL; int iters = 1000; struct report_options report = {}; struct pingpong_context *ctx; struct ibv_qp *qp; struct ibv_send_wr *wr; volatile char *poll_buf; volatile char *post_buf; int scnt, rcnt, ccnt; cycles_t *tstamp; struct pp_data data = { .port = 18515, .ib_port = 1, .size = 1, .tx_depth = 50, .use_cma = 0, .servername = NULL, .rem_dest = NULL, .ib_dev = NULL, .cm_channel = NULL, .cm_id = NULL }; /* Parameter parsing. 
*/ while (1) { int c; static struct option long_options[] = { { .name = "port", .has_arg = 1, .val = 'p' }, { .name = "ib-dev", .has_arg = 1, .val = 'd' }, { .name = "ib-port", .has_arg = 1, .val = 'i' }, { .name = "size", .has_arg = 1, .val = 's' }, { .name = "iters", .has_arg = 1, .val = 'n' }, { .name = "tx-depth", .has_arg = 1, .val = 't' }, { .name = "sl", .has_arg = 1, .val = 'S' }, { .name = "inline_size", .has_arg = 1, .val = 'I' }, { .name = "report-cycles", .has_arg = 0, .val = 'C' }, { .name = "report-histogram",.has_arg = 0, .val = 'H' }, { .name = "report-unsorted",.has_arg = 0, .val = 'U' }, { .name = "cma", .has_arg = 0, .val = 'c' }, { 0 } }; c = getopt_long(argc, argv, "p:d:i:s:n:t:S:I:CHUc", long_options, NULL); if (c == -1) break; switch (c) { case 'p': data.port = strtol(optarg, NULL, 0); if (data.port < 0 || data.port > 65535) { usage(argv[0]); return 1; } break; case 'd': ib_devname = strdupa(optarg); break; case 'i': data.ib_port = strtol(optarg, NULL, 0); if (data.ib_port < 0) { usage(argv[0]); return 2; } break; case 's': data.size = strtol(optarg, NULL, 0); if (data.size < 1) { usage(argv[0]); return 3; } break; case 't': data.tx_depth = strtol(optarg, NULL, 0); if (data.tx_depth < 1) { usage(argv[0]); return 4; } break; case 'n': iters = strtol(optarg, NULL, 0); if (iters < 2) { usage(argv[0]); return 5; } break; case 'S': sl = strtol(optarg, NULL, 0); if (sl > 15) { usage(argv[0]); return 6; } break; case 'I': inline_size = strtol(optarg, NULL, 0); break; case 'C': report.cycles = 1; break; case 'H': report.histogram = 1; break; case 'U': report.unsorted = 1; break; case 'c': data.use_cma = 1; break; default: usage(argv[0]); return 7; } } if (optind == argc - 1) data.servername = strdupa(argv[optind]); else if (optind < argc) { usage(argv[0]); return 6; } /* * Done with parameter parsing. Perform setup. 
*/ pid = getpid(); srand48(pid * time(NULL)); page_size = sysconf(_SC_PAGESIZE); if (data.use_cma) { data.cm_channel = rdma_create_event_channel(); if (!data.cm_channel) { fprintf(stderr, "%d:%s: rdma_create_event_channel failed\n", pid, __func__); return 1; } if (rdma_create_id(data.cm_channel, &data.cm_id, NULL, RDMA_PS_TCP)) { fprintf(stderr, "%d:%s: rdma_create_id failed\n", pid, __func__); return 1; } if (data.servername) { ctx = pp_client_connect(&data); if (!ctx) return 1; } else { ctx = pp_server_connect(&data); if (!ctx) return 1; } printf("%d: Local address: LID %#04x, QPN %#06x, PSN %#06x " "RKey %#08x VAddr %#016Lx\n", pid, data.my_dest.lid, data.my_dest.qpn, data.my_dest.psn, data.my_dest.rkey, data.my_dest.vaddr); printf("%d: Remote address: LID %#04x, QPN %#06x, PSN %#06x, " "RKey %#08x VAddr %#016Lx\n\n", pid, data.rem_dest->lid, data.rem_dest->qpn, data.rem_dest->psn, data.rem_dest->rkey, data.rem_dest->vaddr); if (data.servername) { pp_send_start(ctx); } else { pp_wait_for_start(ctx); } } else { data.ib_dev = pp_find_dev(ib_devname); if (!data.ib_dev) return 7; if (data.servername) { ctx = pp_client_connect(&data); if (!ctx) return 8; } else { ctx = pp_server_connect(&data); if (!ctx) return 8; } if (pp_open_port(ctx, &data)) return 9; } wr = &ctx->wr; ctx->list.addr = (uintptr_t) ctx->buf; ctx->list.length = ctx->size; ctx->list.lkey = ctx->mr->lkey; wr->wr.rdma.remote_addr = data.rem_dest->vaddr; wr->wr.rdma.rkey = data.rem_dest->rkey; ctx->wr.wr_id = PINGPONG_RDMA_WRID; ctx->wr.sg_list = &ctx->list; ctx->wr.num_sge = 1; ctx->wr.opcode = IBV_WR_RDMA_WRITE; if (ctx->size > inline_size || ctx->size == 0) { ctx->wr.send_flags = IBV_SEND_SIGNALED; } else { ctx->wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE; } ctx->wr.next = NULL; scnt = 0; rcnt = 0; ccnt = 0; poll_buf = ctx->poll_buf; post_buf = ctx->post_buf; qp = ctx->qp; tstamp = malloc(iters * sizeof *tstamp); if (!tstamp) { perror("malloc"); return 10; } /* Done with setup. 
Start the test. */ while (scnt < iters || ccnt < iters || rcnt < iters) { /* Wait till buffer changes. */ if (rcnt < iters && !(scnt < 1 && data.servername)) { ++rcnt; while (*poll_buf != (char)rcnt) ; /* Here the data is already in the physical memory. If we wanted to actually use it, we may need a read memory barrier here. */ } if (scnt < iters) { struct ibv_send_wr *bad_wr; tstamp[scnt] = get_cycles(); *post_buf = (char)++scnt; if (ibv_post_send(qp, wr, &bad_wr)) { fprintf(stderr, "Couldn't post send: scnt=%d\n", scnt); return 11; } } if (ccnt < iters) { struct ibv_wc wc; int ne; ++ccnt; do { ne = ibv_poll_cq(ctx->scq, 1, &wc); } while (ne == 0); if (ne < 0) { fprintf(stderr, "poll CQ failed %d\n", ne); return 12; } if (wc.status != IBV_WC_SUCCESS) { fprintf(stderr, "Completion wth error at %s:\n", servername ? "client" : "server"); fprintf(stderr, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); fprintf(stderr, "scnt=%d, rcnt=%d, ccnt=%d\n", scnt, rcnt, ccnt); return 13; } } } if (data.use_cma) { pp_send_done(ctx); pp_wait_for_done(ctx); pp_close_cma(data); } print_report(&report, iters, tstamp); return 0; } trunk/get_clock.c0000755000175000017500000001123011234043545013642 0ustar benoitbenoit/* * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. 
* * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * $Id$ * * Author: Michael S. Tsirkin */ /* #define DEBUG 1 */ /* #define DEBUG_DATA 1 */ /* #define GET_CPU_MHZ_FROM_PROC 1 */ /* For gettimeofday */ #define _BSD_SOURCE #include #include #include #include "get_clock.h" #ifndef DEBUG #define DEBUG 0 #endif #ifndef DEBUG_DATA #define DEBUG_DATA 0 #endif #define MEASUREMENTS 200 #define USECSTEP 10 #define USECSTART 100 /* Use linear regression to calculate cycles per microsecond. 
http://en.wikipedia.org/wiki/Linear_regression#Parameter_estimation */ static double sample_get_cpu_mhz(void) { struct timeval tv1, tv2; cycles_t start; double sx = 0, sy = 0, sxx = 0, syy = 0, sxy = 0; double tx, ty; int i; /* Regression: y = a + b x */ long x[MEASUREMENTS]; cycles_t y[MEASUREMENTS]; double a; /* system call overhead in cycles */ double b; /* cycles per microsecond */ double r_2; for (i = 0; i < MEASUREMENTS; ++i) { start = get_cycles(); if (gettimeofday(&tv1, NULL)) { fprintf(stderr, "gettimeofday failed.\n"); return 0; } do { if (gettimeofday(&tv2, NULL)) { fprintf(stderr, "gettimeofday failed.\n"); return 0; } } while ((tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec) < USECSTART + i * USECSTEP); x[i] = (tv2.tv_sec - tv1.tv_sec) * 1000000 + tv2.tv_usec - tv1.tv_usec; y[i] = get_cycles() - start; if (DEBUG_DATA) fprintf(stderr, "x=%ld y=%Ld\n", x[i], (long long)y[i]); } for (i = 0; i < MEASUREMENTS; ++i) { tx = x[i]; ty = y[i]; sx += tx; sy += ty; sxx += tx * tx; syy += ty * ty; sxy += tx * ty; } b = (MEASUREMENTS * sxy - sx * sy) / (MEASUREMENTS * sxx - sx * sx); a = (sy - b * sx) / MEASUREMENTS; if (DEBUG) fprintf(stderr, "a = %g\n", a); if (DEBUG) fprintf(stderr, "b = %g\n", b); if (DEBUG) fprintf(stderr, "a / b = %g\n", a / b); r_2 = (MEASUREMENTS * sxy - sx * sy) * (MEASUREMENTS * sxy - sx * sy) / (MEASUREMENTS * sxx - sx * sx) / (MEASUREMENTS * syy - sy * sy); if (DEBUG) fprintf(stderr, "r^2 = %g\n", r_2); if (r_2 < 0.9) { fprintf(stderr,"Correlation coefficient r^2: %g < 0.9\n", r_2); return 0; } return b; } static double proc_get_cpu_mhz(int no_cpu_freq_fail) { FILE* f; char buf[256]; double mhz = 0.0; f = fopen("/proc/cpuinfo","r"); if (!f) return 0.0; while(fgets(buf, sizeof(buf), f)) { double m; int rc; rc = sscanf(buf, "cpu MHz : %lf", &m); if (rc != 1) { /* PPC has a different format */ rc = sscanf(buf, "clock : %lf", &m); if (rc != 1) continue; } if (mhz == 0.0) { mhz = m; continue; } if (mhz != m) { fprintf(stderr, 
"Conflicting CPU frequency values" " detected: %lf != %lf\n", mhz, m); if (no_cpu_freq_fail) { fprintf(stderr, "Test integrity may be harmed !\n"); }else{ return 0.0; } continue; } } fclose(f); return mhz; } double get_cpu_mhz(int no_cpu_freq_fail) { double sample, proc, delta; sample = sample_get_cpu_mhz(); proc = proc_get_cpu_mhz(no_cpu_freq_fail); if (!proc || !sample) return 0; delta = proc > sample ? proc - sample : sample - proc; if (delta / proc > 0.01) { fprintf(stderr, "Warning: measured timestamp frequency " "%g differs from nominal %g MHz\n", sample, proc); return sample; } return proc; } trunk/get_clock.h0000755000175000017500000000476511234043545013666 0ustar benoitbenoit/* * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * $Id$ * * Author: Michael S. Tsirkin */ #ifndef GET_CLOCK_H #define GET_CLOCK_H #if defined (__x86_64__) || defined(__i386__) /* Note: only x86 CPUs which have rdtsc instruction are supported. */ typedef unsigned long long cycles_t; static inline cycles_t get_cycles() { unsigned low, high; unsigned long long val; asm volatile ("rdtsc" : "=a" (low), "=d" (high)); val = high; val = (val << 32) | low; return val; } #elif defined(__PPC__) || defined(__PPC64__) /* Note: only PPC CPUs which have mftb instruction are supported. */ /* PPC64 has mftb */ typedef unsigned long cycles_t; static inline cycles_t get_cycles() { cycles_t ret; asm volatile ("mftb %0" : "=r" (ret) : ); return ret; } #elif defined(__ia64__) /* Itanium2 and up has ar.itc (Itanium1 has errata) */ typedef unsigned long cycles_t; static inline cycles_t get_cycles() { cycles_t ret; asm volatile ("mov %0=ar.itc" : "=r" (ret)); return ret; } #else #warning get_cycles not implemented for this architecture: attempt asm/timex.h #include #endif extern double get_cpu_mhz(int); #endif trunk/clock_test.c0000755000175000017500000000063211234043545014046 0ustar benoitbenoit#include #include #include "get_clock.h" int main() { int no_cpu_freq_fail = 0; double mhz; mhz = get_cpu_mhz(no_cpu_freq_fail); cycles_t c1, c2; if (!mhz) { printf("Unable to calibrate cycles. Exiting.\n"); return 2; } printf("Type CTRL-C to cancel.\n"); for(;;) { c1 = get_cycles(); sleep(1); c2 = get_cycles(); printf("1 sec = %g usec\n", (c2 - c1) / mhz); } } trunk/COPYING0000755000175000017500000000240111234043545012577 0ustar benoitbenoitThis software is available to you under a choice of one of two licenses. 
You may choose to be licensed under the terms of the GNU General Public License (GPL) Version 2, available from the file COPYING in the main directory of this source tree, or the OpenIB.org BSD license below: Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. trunk/read_bw.c0000755000175000017500000007215411240605171013323 0ustar benoitbenoit/* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2009 HNR Consulting. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* * $Id$ */ #if HAVE_CONFIG_H # include #endif /* HAVE_CONFIG_H */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "get_clock.h" #define PINGPONG_READ_WRID 1 #define VERSION 1.1 #define ALL 1 #define RC 0 struct user_parameters { const char *servername; int connection_type; int mtu; int all; /* run all msg size */ int iters; int tx_depth; int max_out_read; int use_event; int qp_timeout; int gid_index; /* if value not negative, we use gid AND gid_index=value */ }; static int sl = 0; static int page_size; cycles_t *tposted; cycles_t *tcompleted; struct pingpong_context { struct ibv_context *context; struct ibv_comp_channel *channel; struct ibv_pd *pd; struct ibv_mr *mr; struct ibv_cq *cq; struct ibv_qp *qp; void *buf; unsigned size; int tx_depth; struct ibv_sge list; struct ibv_send_wr wr; union ibv_gid dgid; }; struct pingpong_dest { int lid; int qpn; int psn; unsigned rkey; unsigned long long vaddr; union ibv_gid dgid; }; static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port) { struct ibv_port_attr attr; if (ibv_query_port(ctx->context, port, &attr)) return 0; return attr.lid; } static int pp_client_connect(const char *servername, int port) { struct addrinfo *res, *t; struct addrinfo hints = { .ai_family = AF_UNSPEC, .ai_socktype = SOCK_STREAM }; char *service; int n; int sockfd = -1; if (asprintf(&service, "%d", port) < 0) return -1; n = getaddrinfo(servername, service, &hints, &res); if (n < 0) { fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port); return n; } for (t = res; t; t = t->ai_next) { sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { if (!connect(sockfd, t->ai_addr, t->ai_addrlen)) break; close(sockfd); sockfd = -1; } } freeaddrinfo(res); if (sockfd < 0) { fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port); return sockfd; } return sockfd; } struct pingpong_dest * 
pp_client_exch_dest(int sockfd, const struct pingpong_dest *my_dest, struct user_parameters *user_parm) { struct pingpong_dest *rem_dest = NULL; char msg[sizeof "0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00"]; int parsed; sprintf(msg, "%04x:%06x:%06x:%08x:%016Lx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x", my_dest->lid, my_dest->qpn, my_dest->psn,my_dest->rkey,my_dest->vaddr, my_dest->dgid.raw[0], my_dest->dgid.raw[1], my_dest->dgid.raw[2], my_dest->dgid.raw[3], my_dest->dgid.raw[4], my_dest->dgid.raw[5], my_dest->dgid.raw[6], my_dest->dgid.raw[7], my_dest->dgid.raw[8], my_dest->dgid.raw[9], my_dest->dgid.raw[10], my_dest->dgid.raw[11], my_dest->dgid.raw[12], my_dest->dgid.raw[13], my_dest->dgid.raw[14], my_dest->dgid.raw[15]); if (write(sockfd, msg, sizeof msg) != sizeof msg) { perror("client write"); fprintf(stderr, "Couldn't send local address\n"); goto out; } if (read(sockfd, msg, sizeof msg) != sizeof msg) { perror("client read"); fprintf(stderr, "Couldn't read remote address\n"); goto out; } rem_dest = malloc(sizeof *rem_dest); if (!rem_dest) goto out; if (user_parm->gid_index < 0) { parsed = sscanf(msg, "%x:%x:%x:%x:%Lx", &rem_dest->lid, &rem_dest->qpn, &rem_dest->psn, &rem_dest->rkey, &rem_dest->vaddr); if (parsed != 5) { fprintf(stderr, "Couldn't parse line <%.*s>\n",(int)sizeof msg, msg); free(rem_dest); rem_dest = NULL; goto out; } }else{ char *pstr = msg, *term; char tmp[20]; int i; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->lid = (int)strtol(tmp, NULL, 16); // LID pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->qpn = (int)strtol(tmp, NULL, 16); // QPN pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->psn = (int)strtol(tmp, NULL, 16); // PSN pstr += term - pstr + 1; term = strpbrk(pstr, 
":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->rkey = (unsigned)strtol(tmp, NULL, 16); // RKEY pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->vaddr = strtoull(tmp, NULL, 16); // VA for (i = 0; i < 15; ++i) { pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->dgid.raw[i] = (unsigned char)strtoll(tmp, NULL, 16); } pstr += term - pstr + 1; strcpy(tmp, pstr); rem_dest->dgid.raw[15] = (unsigned char)strtoll(tmp, NULL, 16); } out: return rem_dest; } int pp_server_connect(int port) { struct addrinfo *res, *t; struct addrinfo hints = { .ai_flags = AI_PASSIVE, .ai_family = AF_UNSPEC, .ai_socktype = SOCK_STREAM }; char *service; int sockfd = -1, connfd; int n; if (asprintf(&service, "%d", port) < 0) return -1; n = getaddrinfo(NULL, service, &hints, &res); if (n < 0) { fprintf(stderr, "%s for port %d\n", gai_strerror(n), port); return n; } for (t = res; t; t = t->ai_next) { sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { n = 1; setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n); if (!bind(sockfd, t->ai_addr, t->ai_addrlen)) break; close(sockfd); sockfd = -1; } } freeaddrinfo(res); if (sockfd < 0) { fprintf(stderr, "Couldn't listen to port %d\n", port); return sockfd; } listen(sockfd, 1); connfd = accept(sockfd, NULL, 0); if (connfd < 0) { perror("server accept"); fprintf(stderr, "accept() failed\n"); close(sockfd); return connfd; } close(sockfd); return connfd; } static struct pingpong_dest *pp_server_exch_dest(int connfd, const struct pingpong_dest *my_dest, struct user_parameters *user_parm) { char msg[sizeof "0000:000000:000000:00000000:0000000000000000:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00"]; struct pingpong_dest *rem_dest = NULL; int parsed; int n; n = read(connfd, msg, sizeof msg); if (n != sizeof msg) { perror("server read"); fprintf(stderr, "%d/%d: Couldn't read 
remote address\n", n, (int) sizeof msg); goto out; } rem_dest = malloc(sizeof *rem_dest); if (!rem_dest) goto out; if (user_parm->gid_index < 0) { parsed = sscanf(msg, "%x:%x:%x:%x:%Lx", &rem_dest->lid, &rem_dest->qpn, &rem_dest->psn, &rem_dest->rkey, &rem_dest->vaddr); if (parsed != 5) { fprintf(stderr, "Couldn't parse line <%.*s>\n",(int)sizeof msg, msg); free(rem_dest); rem_dest = NULL; goto out; } }else{ char *pstr = msg, *term; char tmp[20]; int i; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->lid = (int)strtol(tmp, NULL, 16); // LID pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->qpn = (int)strtol(tmp, NULL, 16); // QPN pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->psn = (int)strtol(tmp, NULL, 16); // PSN pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->rkey = (unsigned)strtol(tmp, NULL, 16); // RKEY pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->vaddr = strtoull(tmp, NULL, 16); // VA for (i = 0; i < 15; ++i) { pstr += term - pstr + 1; term = strpbrk(pstr, ":"); memcpy(tmp, pstr, term - pstr); tmp[term - pstr] = 0; rem_dest->dgid.raw[i] = (unsigned char)strtoll(tmp, NULL, 16); } pstr += term - pstr + 1; strcpy(tmp, pstr); rem_dest->dgid.raw[15] = (unsigned char)strtoll(tmp, NULL, 16); } sprintf(msg, "%04x:%06x:%06x:%08x:%016Lx:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x", my_dest->lid, my_dest->qpn, my_dest->psn, my_dest->rkey, my_dest->vaddr, my_dest->dgid.raw[0], my_dest->dgid.raw[1], my_dest->dgid.raw[2], my_dest->dgid.raw[3], my_dest->dgid.raw[4], my_dest->dgid.raw[5], my_dest->dgid.raw[6], my_dest->dgid.raw[7], my_dest->dgid.raw[8], my_dest->dgid.raw[9], my_dest->dgid.raw[10], my_dest->dgid.raw[11], 
my_dest->dgid.raw[12], my_dest->dgid.raw[13], my_dest->dgid.raw[14], my_dest->dgid.raw[15]); if (write(connfd, msg, sizeof msg) != sizeof msg) { perror("server write"); fprintf(stderr, "Couldn't send local address\n"); free(rem_dest); rem_dest = NULL; goto out; } out: return rem_dest; } static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, unsigned size, int tx_depth, int port, struct user_parameters *user_parm) { struct pingpong_context *ctx; struct ibv_device_attr device_attr; ctx = malloc(sizeof *ctx); if (!ctx) return NULL; ctx->size = size; ctx->tx_depth = tx_depth; ctx->buf = memalign(page_size, size * 2); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); return NULL; } memset(ctx->buf, 0, size * 2); ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(ib_dev)); return NULL; } if (user_parm->mtu == 0) {/*user did not ask for specific mtu */ if (ibv_query_device(ctx->context, &device_attr)) { fprintf(stderr, "Failed to query device props"); return NULL; } if (device_attr.vendor_part_id == 23108 || user_parm->gid_index > -1) user_parm->mtu = 1024; else user_parm->mtu = 2048; } if (user_parm->use_event) { ctx->channel = ibv_create_comp_channel(ctx->context); if (!ctx->channel) { fprintf(stderr, "Couldn't create completion channel\n"); return NULL; } } else ctx->channel = NULL; ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "Couldn't allocate PD\n"); return NULL; } /* We dont really want IBV_ACCESS_LOCAL_WRITE, but IB spec says: * The Consumer is not allowed to assign Remote Write or Remote Atomic to * a Memory Region that has not been assigned Local Write. 
*/ ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size * 2, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ); if (!ctx->mr) { fprintf(stderr, "Couldn't allocate MR\n"); return NULL; } ctx->cq = ibv_create_cq(ctx->context, tx_depth, NULL, ctx->channel, 0); if (!ctx->cq) { fprintf(stderr, "Couldn't create CQ\n"); return NULL; } { struct ibv_qp_init_attr attr; memset(&attr, 0, sizeof(struct ibv_qp_init_attr)); attr.send_cq = ctx->cq; attr.recv_cq = ctx->cq; attr.cap.max_send_wr = tx_depth; /* Work around: driver doesnt support * recv_wr = 0 */ attr.cap.max_recv_wr = 1; attr.cap.max_send_sge = 1; attr.cap.max_recv_sge = 1; attr.qp_type = IBV_QPT_RC; ctx->qp = ibv_create_qp(ctx->pd, &attr); if (!ctx->qp) { fprintf(stderr, "Couldn't create QP\n"); return NULL; } } { struct ibv_qp_attr attr; attr.qp_state = IBV_QPS_INIT; attr.pkey_index = 0; attr.port_num = port; attr.qp_access_flags = IBV_ACCESS_REMOTE_READ; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify QP to INIT\n"); return NULL; } } return ctx; } static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn, struct pingpong_dest *dest, struct user_parameters *user_parm) { struct ibv_qp_attr attr; memset(&attr, 0, sizeof attr); attr.qp_state = IBV_QPS_RTR; switch (user_parm->mtu) { case 256 : attr.path_mtu = IBV_MTU_256; break; case 512 : attr.path_mtu = IBV_MTU_512; break; case 1024 : attr.path_mtu = IBV_MTU_1024; break; case 2048 : attr.path_mtu = IBV_MTU_2048; break; case 4096 : attr.path_mtu = IBV_MTU_4096; break; } printf("Mtu : %d\n", user_parm->mtu); attr.dest_qp_num = dest->qpn; attr.rq_psn = dest->psn; attr.max_dest_rd_atomic = user_parm->max_out_read; attr.min_rnr_timer = 12; if (user_parm->gid_index<0) { attr.ah_attr.is_global = 0; attr.ah_attr.dlid = dest->lid; attr.ah_attr.sl = sl; } else { attr.ah_attr.is_global = 1; attr.ah_attr.grh.dgid = dest->dgid; attr.ah_attr.grh.hop_limit 
= 1; attr.ah_attr.sl = 0; } attr.ah_attr.src_path_bits = 0; attr.ah_attr.port_num = port; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MIN_RNR_TIMER | IBV_QP_MAX_DEST_RD_ATOMIC)) { fprintf(stderr, "Failed to modify RC QP to RTR\n"); return 1; } attr.timeout = user_parm->qp_timeout; attr.retry_cnt = 7; attr.rnr_retry = 7; attr.qp_state = IBV_QPS_RTS; attr.sq_psn = my_psn; attr.max_rd_atomic = user_parm->max_out_read; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_MAX_QP_RD_ATOMIC)) { fprintf(stderr, "Failed to modify RC QP to RTS\n"); return 1; } return 0; } static void usage(const char *argv0) { printf("Usage:\n"); printf(" %s start a server and wait for connection\n", argv0); printf(" %s connect to server at \n", argv0); printf("\n"); printf("Options:\n"); printf(" -p, --port= listen on/connect to port (default 18515)\n"); printf(" -d, --ib-dev= use IB device (default first device found)\n"); printf(" -i, --ib-port= use port of IB device (default 1)\n"); printf(" -m, --mtu= mtu size (256 - 4096. 
default for hermon is 2048)\n"); printf(" -o, --outs= num of outstanding read/atom(default 4)\n"); printf(" -s, --size= size of message to exchange (default 65536)\n"); printf(" -a, --all Run sizes from 2 till 2^23\n"); printf(" -t, --tx-depth= size of tx queue (default 100)\n"); printf(" -n, --iters= number of exchanges (at least 2, default 1000)\n"); printf(" -u, --qp-timeout= QP timeout, timeout value is 4 usec * 2 ^(timeout), default 14\n"); printf(" -S, --sl= SL (default 0)\n"); printf(" -x, --gid-index= test uses GID with GID index taken from command line (for RDMAoE index should be 0)\n"); printf(" -b, --bidirectional measure bidirectional bandwidth (default unidirectional)\n"); printf(" -V, --version display version number\n"); printf(" -e, --events sleep on CQ events (default poll)\n"); printf(" -F, --CPU-freq do not fail even if cpufreq_ondemand module is loaded\n"); } static void print_report(unsigned int iters, unsigned size, int duplex, cycles_t *tposted, cycles_t *tcompleted, int no_cpu_freq_fail) { double cycles_to_units; unsigned long tsize; /* Transferred size, in megabytes */ int i, j; int opt_posted = 0, opt_completed = 0; cycles_t opt_delta; cycles_t t; opt_delta = tcompleted[opt_posted] - tposted[opt_completed]; /* Find the peak bandwidth */ for (i = 0; i < iters; ++i) for (j = i; j < iters; ++j) { t = (tcompleted[j] - tposted[i]) / (j - i + 1); if (t < opt_delta) { opt_delta = t; opt_posted = i; opt_completed = j; } } cycles_to_units = get_cpu_mhz(no_cpu_freq_fail) * 1000000; tsize = duplex ? 
2 : 1; tsize = tsize * size; printf("%7d %d %7.2f %7.2f\n", size,iters,tsize * cycles_to_units / opt_delta / 0x100000, tsize * iters * cycles_to_units /(tcompleted[iters - 1] - tposted[0]) / 0x100000); } int run_iter(struct pingpong_context *ctx, struct user_parameters *user_param, struct pingpong_dest *rem_dest, int size) { struct ibv_qp *qp; int scnt, ccnt ; ctx->list.addr = (uintptr_t) ctx->buf; ctx->list.length = size; ctx->list.lkey = ctx->mr->lkey; ctx->wr.wr.rdma.remote_addr = rem_dest->vaddr; ctx->wr.wr.rdma.rkey = rem_dest->rkey; ctx->wr.wr_id = PINGPONG_READ_WRID; ctx->wr.sg_list = &ctx->list; ctx->wr.num_sge = 1; ctx->wr.opcode = IBV_WR_RDMA_READ; ctx->wr.send_flags = IBV_SEND_SIGNALED; ctx->wr.next = NULL; scnt = 0; ccnt = 0; qp = ctx->qp; /* Done with setup. Start the test. */ while (scnt < user_param->iters || ccnt < user_param->iters) { while (scnt < user_param->iters && (scnt - ccnt) < user_param->tx_depth ) { struct ibv_send_wr *bad_wr; tposted[scnt] = get_cycles(); if (ibv_post_send(qp, &ctx->wr, &bad_wr)) { fprintf(stderr, "Couldn't post send: scnt=%d\n", scnt); return 1; } ++scnt; } if (ccnt < user_param->iters) { struct ibv_wc wc; int ne; if (user_param->use_event) { struct ibv_cq *ev_cq; void *ev_ctx; if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) { fprintf(stderr, "Failed to get cq_event\n"); return 1; } if (ev_cq != ctx->cq) { fprintf(stderr, "CQ event for unknown CQ %p\n", ev_cq); return 1; } if (ibv_req_notify_cq(ctx->cq, 0)) { fprintf(stderr, "Couldn't request CQ notification\n"); return 1; } } do { ne = ibv_poll_cq(ctx->cq, 1, &wc); if (ne) { tcompleted[ccnt] = get_cycles(); if (wc.status != IBV_WC_SUCCESS) { fprintf(stderr, "Completion wth error at %s:\n", user_param->servername ? 
"client" : "server"); fprintf(stderr, "Failed status %d: wr_id %d syndrom 0x%x\n", wc.status, (int) wc.wr_id, wc.vendor_err); fprintf(stderr, "scnt=%d, ccnt=%d\n", scnt, ccnt); return 1; } ccnt = ccnt + ne; } } while (ne > 0 ); if (ne < 0) { fprintf(stderr, "poll CQ failed %d\n", ne); return 1; } } } return 0; } int main(int argc, char *argv[]) { struct ibv_device **dev_list; struct ibv_device *ib_dev; struct pingpong_context *ctx; struct pingpong_dest my_dest; struct pingpong_dest *rem_dest; struct user_parameters user_param; char *ib_devname = NULL; int port = 18515; int ib_port = 1; long long size = 65536; int sockfd; int duplex = 0; int i = 0; int no_cpu_freq_fail = 0; union ibv_gid gid; /* init default values to user's parameters */ memset(&user_param, 0, sizeof(struct user_parameters)); user_param.mtu = 0; user_param.iters = 1000; user_param.tx_depth = 100; user_param.servername = NULL; user_param.use_event = 0; user_param.max_out_read = 4; /* the device capability on gen2 */ user_param.qp_timeout = 14; user_param.gid_index = -1; /*gid will not be used*/ /* Parameter parsing. 
*/ while (1) { int c; static struct option long_options[] = { { .name = "port", .has_arg = 1, .val = 'p' }, { .name = "ib-dev", .has_arg = 1, .val = 'd' }, { .name = "ib-port", .has_arg = 1, .val = 'i' }, { .name = "mtu", .has_arg = 1, .val = 'm' }, { .name = "outs", .has_arg = 1, .val = 'o' }, { .name = "size", .has_arg = 1, .val = 's' }, { .name = "iters", .has_arg = 1, .val = 'n' }, { .name = "tx-depth", .has_arg = 1, .val = 't' }, { .name = "qp-timeout", .has_arg = 1, .val = 'u' }, { .name = "sl", .has_arg = 1, .val = 'S' }, { .name = "gid-index", .has_arg = 1, .val = 'x' }, { .name = "all", .has_arg = 0, .val = 'a' }, { .name = "bidirectional", .has_arg = 0, .val = 'b' }, { .name = "version", .has_arg = 0, .val = 'V' }, { .name = "events", .has_arg = 0, .val = 'e' }, { .name = "CPU-freq", .has_arg = 0, .val = 'F' }, { 0 } }; c = getopt_long(argc, argv, "p:d:i:m:o:s:n:t:u:S:x:abVeF", long_options, NULL); if (c == -1) break; switch (c) { case 'p': port = strtol(optarg, NULL, 0); if (port < 0 || port > 65535) { usage(argv[0]); return 1; } break; case 'd': ib_devname = strdupa(optarg); break; case 'e': ++user_param.use_event; break; case 'm': user_param.mtu = strtol(optarg, NULL, 0); break; case 'o': user_param.max_out_read = strtol(optarg, NULL, 0); break; case 'a': user_param.all = ALL; break; case 'V': printf("read_bw version : %.2f\n",VERSION); return 0; break; case 'i': ib_port = strtol(optarg, NULL, 0); if (ib_port < 0) { usage(argv[0]); return 1; } break; case 's': size = strtoll(optarg, NULL, 0); if (size < 1 || size > UINT_MAX / 2) { usage(argv[0]); return 1; } break; case 't': user_param.tx_depth = strtol(optarg, NULL, 0); if (user_param.tx_depth < 1) { usage(argv[0]); return 1; } break; case 'n': user_param.iters = strtol(optarg, NULL, 0); if (user_param.iters < 2) { usage(argv[0]); return 1; } break; case 'b': duplex = 1; break; case 'F': no_cpu_freq_fail = 1; break; case 'u': user_param.qp_timeout = strtol(optarg, NULL, 0); break; case 'S': sl = 
strtol(optarg, NULL, 0); if (sl > 15) { usage(argv[0]); return 1; } break; case 'x': user_param.gid_index = strtol(optarg, NULL, 0); if (user_param.gid_index > 63) { usage(argv[0]); return 1; } break; default: usage(argv[0]); return 1; } } if (optind == argc - 1) user_param.servername = strdupa(argv[optind]); else if (optind < argc) { usage(argv[0]); return 1; } printf("------------------------------------------------------------------\n"); if (duplex == 1) printf(" RDMA_Read Bidirectional BW Test\n"); else printf(" RDMA_Read BW Test\n"); printf("Connection type : RC\n"); if (user_param.gid_index > -1) { printf("Using GID to support RDMAoE configuration. Refer to port type as Ethernet, default MTU 1024B\n"); } /* Done with parameter parsing. Perform setup. */ if (user_param.all == ALL) /*since we run all sizes */ size = 8388608; /*2^23 */ srand48(getpid() * time(NULL)); page_size = sysconf(_SC_PAGESIZE); dev_list = ibv_get_device_list(NULL); if (!ib_devname) { ib_dev = dev_list[0]; if (!ib_dev) { fprintf(stderr, "No IB devices found\n"); return 1; } } else { for (; (ib_dev = *dev_list); ++dev_list) if (!strcmp(ibv_get_device_name(ib_dev), ib_devname)) break; if (!ib_dev) { fprintf(stderr, "IB device %s not found\n", ib_devname); return 1; } } ctx = pp_init_ctx(ib_dev, size, user_param.tx_depth, ib_port, &user_param); if (!ctx) return 1; if (user_param.gid_index != -1) { int err=0; err = ibv_query_gid (ctx->context, ib_port, user_param.gid_index, &gid); if (err) { return -1; } ctx->dgid=gid; } /* Create connection between client and server. * We do it by exchanging data over a TCP socket connection. */ my_dest.lid = pp_get_local_lid(ctx, ib_port); my_dest.qpn = ctx->qp->qp_num; my_dest.psn = lrand48() & 0xffffff; if (user_param.gid_index < 0) {/*We do not fail test upon lid in RDMA0E/Eth conf*/ if (!my_dest.lid) { fprintf(stderr, "Local lid 0x0 detected. Is an SM running? 
If you are running on an RMDAoE interface you must use GIDs\n"); return 1; } } my_dest.dgid = gid; my_dest.rkey = ctx->mr->rkey; my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size; printf(" local address: LID %#04x, QPN %#06x, PSN %#06x " "RKey %#08x VAddr %#016Lx\n", my_dest.lid, my_dest.qpn, my_dest.psn, my_dest.rkey, my_dest.vaddr); if (user_param.gid_index > -1) { printf(" GID %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", my_dest.dgid.raw[0],my_dest.dgid.raw[1], my_dest.dgid.raw[2], my_dest.dgid.raw[3], my_dest.dgid.raw[4], my_dest.dgid.raw[5], my_dest.dgid.raw[6], my_dest.dgid.raw[7], my_dest.dgid.raw[8], my_dest.dgid.raw[9], my_dest.dgid.raw[10], my_dest.dgid.raw[11], my_dest.dgid.raw[12], my_dest.dgid.raw[13], my_dest.dgid.raw[14], my_dest.dgid.raw[15]); } if (user_param.servername) { sockfd = pp_client_connect(user_param.servername, port); if (sockfd < 0) return 1; rem_dest = pp_client_exch_dest(sockfd, &my_dest, &user_param); } else { sockfd = pp_server_connect(port); if (sockfd < 0) return 1; rem_dest = pp_server_exch_dest(sockfd, &my_dest, &user_param); } if (!rem_dest) return 1; printf(" remote address: LID %#04x, QPN %#06x, PSN %#06x, " "RKey %#08x VAddr %#016Lx\n", rem_dest->lid, rem_dest->qpn, rem_dest->psn, rem_dest->rkey, rem_dest->vaddr); if (user_param.gid_index > -1) { printf(" GID %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", rem_dest->dgid.raw[0],rem_dest->dgid.raw[1], rem_dest->dgid.raw[2], rem_dest->dgid.raw[3], rem_dest->dgid.raw[4], rem_dest->dgid.raw[5], rem_dest->dgid.raw[6], rem_dest->dgid.raw[7], rem_dest->dgid.raw[8], rem_dest->dgid.raw[9], rem_dest->dgid.raw[10], rem_dest->dgid.raw[11], rem_dest->dgid.raw[12], rem_dest->dgid.raw[13], rem_dest->dgid.raw[14], rem_dest->dgid.raw[15]); } if (pp_connect_ctx(ctx, ib_port, my_dest.psn, rem_dest, &user_param)) return 1; /* An additional handshake is required *after* moving qp to RTR. 
Arbitrarily reuse exch_dest for this purpose. */ if (user_param.servername) rem_dest = pp_client_exch_dest(sockfd, &my_dest, &user_param); else rem_dest = pp_server_exch_dest(sockfd, &my_dest, &user_param); if (!rem_dest) return 1; /* For half duplex tests, server just waits for client to exit */ if (!user_param.servername && !duplex) { rem_dest = pp_server_exch_dest(sockfd, &my_dest, &user_param); if (write(sockfd, "done", sizeof "done") != sizeof "done"){ perror("server write"); fprintf(stderr, "Couldn't write to socket\n"); return 1; } close(sockfd); return 0; } else if (user_param.use_event) { printf("Test with events.\n"); if (ibv_req_notify_cq(ctx->cq, 0)) { fprintf(stderr, "Couldn't request CQ notification\n"); return 1; } } printf("------------------------------------------------------------------\n"); printf(" #bytes #iterations BW peak[MB/sec] BW average[MB/sec] \n"); tposted = malloc(user_param.iters * sizeof *tposted); if (!tposted) { perror("malloc"); return 1; } tcompleted = malloc(user_param.iters * sizeof *tcompleted); if (!tcompleted) { perror("malloc"); return 1; } if (user_param.all == ALL) { for (i = 1; i < 24 ; ++i) { size = 1 << i; if(run_iter(ctx, &user_param, rem_dest, size)) return 17; print_report(user_param.iters, size, duplex, tposted, tcompleted, no_cpu_freq_fail); } } else { if(run_iter(ctx, &user_param, rem_dest, size)) return 18; print_report(user_param.iters, size, duplex, tposted, tcompleted, no_cpu_freq_fail); } if (user_param.servername) rem_dest = pp_client_exch_dest(sockfd, &my_dest, &user_param); else rem_dest = pp_server_exch_dest(sockfd, &my_dest, &user_param); if (write(sockfd, "done", sizeof "done") != sizeof "done"){ perror("server write"); fprintf(stderr, "Couldn't write to socket\n"); return 1; } close(sockfd); free(tposted); free(tcompleted); printf("------------------------------------------------------------------\n"); return 0; }