/* --------------------------------------------------------------  */
/* (C)Copyright 2006,2007,                                         */
/* International Business Machines Corporation                     */
/* All Rights Reserved.                                            */
/*                                                                 */
/* Redistribution and use in source and binary forms, with or      */
/* without modification, are permitted provided that the           */
/* following conditions are met:                                   */
/*                                                                 */
/* - Redistributions of source code must retain the above copyright*/
/*   notice, this list of conditions and the following disclaimer. */
/*                                                                 */
/* - Redistributions in binary form must reproduce the above       */
/*   copyright notice, this list of conditions and the following   */
/*   disclaimer in the documentation and/or other materials        */
/*   provided with the distribution.                               */
/*                                                                 */
/* - Neither the name of IBM Corporation nor the names of its      */
/*   contributors may be used to endorse or promote products       */
/*   derived from this software without specific prior written     */
/*   permission.                                                   */
/*                                                                 */
/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          */
/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        */
/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            */
/* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    */
/* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT    */
/* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;    */
/* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)        */
/* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN       */
/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR    */
/* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  */
/* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              */
/* --------------------------------------------------------------  */
/* PROLOG END TAG zYx                                              */
/* ********************************************************
 *
 * Filename : dmabench.c
 *
 * Purpose  : Linux program to run DMA benchmarks
 *
 ******************************************************** */

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <libspe2.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>
#include <stdint.h>
#include <sys/time.h>
#include <ppu_intrinsics.h>
#include "get_cpu_info.h"

#define _GNU_SOURCE
#include <getopt.h>

#include "dmabench.h"

/* Upper bound on the number of SPE contexts/threads handled by this
 * program. Sizes the per-SPE arrays (parms, tgt_buf, ctx, ls, thread) and
 * caps the value accepted for --numspes.
 */
#define MAX_SPES 	16

/* Compile-time request count used when sizing and initialising tgt_buf.
 * It can be overridden on the compiler command line; the run-time
 * --numreqs value is kept in a separate variable in main().
 */
#ifndef NREQS
#define NREQS 		1
#endif

/* Factor in the per-SPE length of tgt_buf (counted in uint64_t elements). */
#define BUFSIZE 	2048

/* Size in bytes used for the barrier block and for the per-SPE buffer
 * offset step applied when --offset is given.
 */
#define CACHE_LINE_SIZE 128

/* Default value of the --entrysize option. */
#define DEFAULT_ENTRY_SIZE	128

/* NUM_ITER is used to extend the test for improved timing accuracy.
 * It is passed to every SPE as num_iter and used again on the host to
 * average the reported tick counts.
 */
#define NUM_ITER 100

/* Smaller of two values. Each argument may be evaluated twice, so do not
 * pass expressions with side effects.
 */
#define MIN(a,b) (((a)<(b)) ? (a) : (b))

/* SPE program handles defined outside this file; benchmark_list selects
 * one of them for each benchmark name.
 */
extern spe_program_handle_t seqdma, dmalist;

/* Description of one selectable benchmark.
 *
 * name      - benchmark name as typed on the command line; points at a
 *             string literal, hence const.
 * spe_pgm   - SPE program that is loaded into every SPE context for this
 *             benchmark.
 * test_code - transfer mode handed to the SPE program (READ, WRITE, ...,
 *             as defined in dmabench.h).
 */
typedef struct benchmark {
    const char                  *name;
    spe_program_handle_t        *spe_pgm;
    int                         test_code;
} benchmark_t;

/* Number of entries in benchmark_list. */
#define NUM_BENCHMARKS 12

/* Table of supported benchmarks, searched by name in main() and listed by
 * print_usage(). The table is never modified at run time, so it is const.
 * The first six entries use the seqdma SPE program, the last six use the
 * dmalist SPE program; each group covers the same six test codes.
 */
static const benchmark_t benchmark_list[NUM_BENCHMARKS] = {
    { "seqdmar", &seqdma, READ },
    { "seqdmaw", &seqdma, WRITE },
    { "seqdmarw", &seqdma, READWRITE },
    { "seqdmalr", &seqdma, LOCAL_READ },
    { "seqdmalw", &seqdma, LOCAL_WRITE },
    { "seqdmalrw", &seqdma, LOCAL_READWRITE },
    { "dmalistr", &dmalist, READ },
    { "dmalistw", &dmalist, WRITE },
    { "dmalistrw", &dmalist, READWRITE },
    { "dmalistlr", &dmalist, LOCAL_READ },
    { "dmalistlw", &dmalist, LOCAL_WRITE },
    { "dmalistlrw", &dmalist, LOCAL_READWRITE }
};

/* Allocate space for parameters for each SPU.
 * Entry i is filled in by main() for SPE i, and the address of that entry
 * is sent to the SPE through its inbound mailbox. The result fields of the
 * entries are read back by main() after the SPE threads have been joined.
 */
static dmabench_parms parms[MAX_SPES] __attribute__ ((aligned (16)));

/* Cache-line sized blocks for use in barrier calls.
 * The address of this block is handed to every SPE via parms[i].barptr;
 * the host itself never reads or writes its contents in this file.
 */
static unsigned int bar[CACHE_LINE_SIZE/sizeof(unsigned int)]  __attribute__ ((aligned (128)));

/* Buffers in main memory that will be read or written by DMAs.
 * One row per SPE, each holding NUM_ITER*NREQS*BUFSIZE 64-bit elements,
 * with the whole array aligned to 4096 bytes. main() fills every row with
 * the element index before the benchmark starts.
 */
static uint64_t tgt_buf[MAX_SPES][NUM_ITER*NREQS*BUFSIZE] __attribute__ ((aligned (4096)));


/* Exit-status sentinels: spe_thread_function() returns the address of one
 * of these, and main() dereferences the joined status to count failures
 * (0 = success, 1 = failure).
 */
static int no_spe_error = 0;
static int spe_error = 1;

/* Thread body that runs one SPE context to completion.
 *
 * arg points at the spe_context_ptr_t to run. The context is started at
 * SPE_DEFAULT_ENTRY with no argument or environment pointers. If
 * spe_context_run() itself fails, the error is reported on stderr and the
 * whole process exits with status 1.
 *
 * Returns the address of no_spe_error when the SPE program stopped via a
 * normal exit with exit code 0, and the address of spe_error otherwise.
 */
void *spe_thread_function(void *arg) {
    spe_context_ptr_t spe;
    spe_stop_info_t stop_info;
    unsigned int entry_point = SPE_DEFAULT_ENTRY;

    spe = *((spe_context_ptr_t *)arg);
    if (spe_context_run(spe, &entry_point, 0, NULL, NULL, &stop_info) < 0) {
        fprintf(stderr, "Failed spe_context_run, errno=%d\n", errno);
        exit(1);
    }

    /* Success requires both a regular exit and a zero exit code. */
    if ((stop_info.stop_reason == SPE_EXIT) &&
        (stop_info.result.spe_exit_code == 0)) {
        return ((void *)&no_spe_error);
    }

    return ((void *)&spe_error);
}


/* Long command-line options understood by getopt_long() in main().
 * Every entry has a NULL flag, so getopt_long() returns the val character,
 * which main() handles in its option switch. The all-zero entry terminates
 * the table.
 */
static struct option longopts[] = {
    { .name = "help",      .has_arg = no_argument,       .flag = NULL, .val = 'h' },
    { .name = "affinity",  .has_arg = no_argument,       .flag = NULL, .val = 'a' },
    { .name = "minsize",   .has_arg = required_argument, .flag = NULL, .val = 's' },
    { .name = "maxsize",   .has_arg = required_argument, .flag = NULL, .val = 'S' },
    { .name = "entrysize", .has_arg = required_argument, .flag = NULL, .val = 'e' },
    { .name = "numspes",   .has_arg = required_argument, .flag = NULL, .val = 'n' },
    { .name = "numreqs",   .has_arg = required_argument, .flag = NULL, .val = 'r' },
    { .name = "offset",    .has_arg = no_argument,       .flag = NULL, .val = 'o' },
    { .name = NULL,        .has_arg = 0,                 .flag = NULL, .val = 0 }
};

/* Write the command-line help to stdout: the usage line, a description of
 * every long option, and the name of each entry in benchmark_list.
 * Takes no arguments and returns nothing.
 */
void print_usage(void) {
    int idx = 0;

    fputs("Usage: dmabench [options] <benchmark>\n", stdout);

    fputs("Perform DMA benchmark. Supported options include:\n", stdout);

    /* --affinity */
    fputs("    --affinity     Specifies logical affinity should be used to ensure\n", stdout);
    fputs("                   the threads are scheduled on the SPEs with close proximity\n", stdout);
    fputs("                   to the SPEs in which they communicate. This option only\n", stdout);
    fputs("                   affects benchmarks that target the local store of another \n", stdout);
    fputs("                   SPE. The default is no affinity.\n", stdout);

    /* --entrysize (the default value is substituted into the text) */
    fputs("    --entrysize n  Specifies n bytes as the size of the data transferred\n", stdout);
    fputs("                   for each DMA list entry in the dmalist benchmark. Default\n", stdout);
    printf("                   is %d bytes. Valid values are from 8 bytes to 16K.\n", DEFAULT_ENTRY_SIZE);

    /* --help */
    fputs("    --help         Display this help message and exit.\n", stdout);

    /* --maxsize */
    fputs("    --maxsize n    Specifies n bytes as the largest DMA transfer to be\n", stdout);
    fputs("                   performed in the execution of the benchmark. The default\n", stdout);
    fputs("                   value is 16K bytes for the sequential DMA benchmarks and the\n", stdout);
    fputs("                   size of 2048 list entries for the DMA list benchmarks. Valid\n", stdout);
    fputs("                   values are from 8 to 16K.\n", stdout);

    /* --minsize */
    fputs("    --minsize n    Specifies n bytes as the smallest DMA transfer to be\n", stdout);
    fputs("                   performed in the execution of the benchmark. The default\n", stdout);
    fputs("                   value is 8 bytes for the sequential DMA benchmarks and the\n", stdout);
    fputs("                   size of one list entry for the DMA list benchmarks. Valid\n", stdout);
    fputs("                   values are from 8 to 16K.\n", stdout);

    /* --numreqs */
    fputs("    --numreqs n    Specifies the number of requests issued in sequence within\n", stdout);
    fputs("                   the timing window. The SPEs wait for DMA completion only\n", stdout);
    fputs("                   after issuing all the requests. The default is a single\n", stdout);
    fputs("                   request. Valid values are from 1 to 32 requests.\n", stdout);

    /* --numspes */
    fputs("    --numspes n    Specifies that n SPEs should concurrently execute the\n", stdout);
    fputs("                   benchmark. The default is to execute the benchmark on a\n", stdout);
    fputs("                   single SPE. When the benchmark is executed on more than one\n", stdout);
    fputs("                   SPE, the SPEs are synchronized so that the benchmark code\n", stdout);
    fputs("                   starts at roughly the same time on all SPEs\n", stdout);

    /* --offset */
    fputs("    --offset       Specifies that the starting address for system memory buffers\n", stdout);
    fputs("                   should be offset by 128 bytes (one cache line) to distribute\n", stdout);
    fputs("                   accesses across memory banks.\n", stdout);

    /* One line per known benchmark name. */
    fputs("Supported DMA benchmarks include:\n", stdout);
    while (idx < NUM_BENCHMARKS) {
        printf("    %s\n", benchmark_list[idx].name);
        idx++;
    }
}

/* Host-side driver for the DMA benchmark.
 *
 * Steps:
 *   1. Parse the command line and look the benchmark name up in
 *      benchmark_list.
 *   2. Create one SPE context per requested SPE (plus one passive extra
 *      context when a local-store benchmark is run with a single SPE),
 *      load the SPE program and start one pthread per context.
 *   3. Fill in parms[] for the active SPEs, wait for each SPE to post a
 *      start-up mailbox message, then send each active SPE the address of
 *      its parms entry to start the measurement.
 *   4. Join the SPE threads, tear down the contexts and print the
 *      per-transfer-size results averaged over iterations and SPEs.
 *
 * Returns 0 when the run completes (SPE-side failures are only reported on
 * stdout), and -1 on a usage error (including --help) or when a libspe2 or
 * pthread call fails.
 */
int main(int argc, char *argv[])
{
    spe_gang_context_ptr_t gang = NULL;
    spe_context_ptr_t ctx[MAX_SPES];
    void *ls[MAX_SPES];
    pthread_t thread[MAX_SPES];
    pthread_attr_t pthread_attr;
    char *benchmark;
    spe_program_handle_t *spe_pgm;
    int rc;
    int test_code;
    int local;
    int affinity = 0;
    int offset = 0;
    int num_spes = 1;
    int extra_spe;
    int num_iter = NUM_ITER;
    int num_spes_failed = 0;
    int num_reqs = 1;
    int min_dma_size = 8;
    int max_dma_size = 128*2048;
    int entry_size = DEFAULT_ENTRY_SIZE;
    double tbfreq, pclk_ratio;
    int i, j;
    int c;


    /* Every long option has a matching short option; 'o' is listed here so
     * that -o is accepted just like --offset.
     */
    while ((c = getopt_long(argc, argv, "haos:S:e:n:r:", longopts, NULL)) != -1) {
        switch (c) {
        case 'a':            /* affinity */
            affinity = 1;
            break;
        case 's':            /* minsize */
            min_dma_size = atoi(optarg);
            break;
        case 'S':            /* maxsize */
            max_dma_size = atoi(optarg);
            break;
        case 'e':            /* entrysize */
            entry_size = atoi(optarg);
            break;
        case 'n':            /* numspes */
            num_spes = atoi(optarg);
            num_spes = MIN(num_spes, MAX_SPES);
            break;
        case 'r':            /* numreqs */
            num_reqs = atoi(optarg);
            break;
        case 'o':            /* offset */
            offset = 1;
            break;
        case 'h':            /* help */
        default:             /* unknown option or missing argument */
            print_usage();
            return(-1);
        }
    }

    /* A zero, negative or non-numeric SPE count would run nothing and
     * still report success, so reject it up front.
     */
    if (num_spes < 1) {
        fprintf(stderr, "Invalid number of SPEs; numspes must be at least 1\n");
        print_usage();
        return(-1);
    }

    /* The benchmark name is the first non-option argument. */
    if (optind >= argc) {
        print_usage();
        return(-1);
    }

    benchmark = argv[optind];

    for (i=0; i<NUM_BENCHMARKS; i++) {
        if (strcmp(benchmark, benchmark_list[i].name) == 0)
            break;
    }

    if (i>=NUM_BENCHMARKS) {
        print_usage();
        return(-1);
    }

    spe_pgm = benchmark_list[i].spe_pgm;
    test_code = benchmark_list[i].test_code;
    local = IS_LOCAL(test_code);

    /* Check size of dmabench_parms */
    if ((sizeof(dmabench_parms) & (16-1)) != 0) {
        fprintf(stderr, "Warning: dmabench_parms size of %zu is not a multiple of 16\n", sizeof(dmabench_parms));
    }

    tbfreq = get_timebase_frequency();
    printf("Time base frequency = %6.3f MHz\n", tbfreq/1.0E6);

    /* Processor clocks per time base tick, used to convert ticks below. */
    pclk_ratio = get_cpu_frequency()/tbfreq;

    /* Initialize main buffer */
    for (i=0; i<MAX_SPES; ++i)
        for (j=0; j<(NUM_ITER*NREQS*BUFSIZE); ++j) tgt_buf[i][j] = j;

    /* Create the SPE threads. One additional SPE thread is created
     * as a source/destination DMA target, if the DMA is local and
     * only one SPE thread was requested.
     */
    extra_spe = (local && num_spes==1) ? 1 : 0;
    if (affinity) {
        if ((gang = spe_gang_context_create(0)) == NULL) {
            perror("Failed spe_gang_context_create");
            return(-1);
        }
    }

    if ((rc = pthread_attr_init(&pthread_attr))) {
        fprintf(stderr, "Failed pthread_attr_init: %s\n", strerror(rc));
        return (-1);
    }
    if ((rc = pthread_attr_setschedpolicy(&pthread_attr, SCHED_RR))) {
        fprintf(stderr, "Failed pthread_attr_setschedpolicy: %s\n", strerror(rc));
        return (-1);
    }


    /* With --affinity each context is created in the gang with the
     * previously created context as its affinity neighbour.
     */
    for (i=0; i<num_spes+extra_spe; i++) {
        if (affinity) {
            if ((ctx[i] = spe_context_create_affinity(0, (i==0) ? NULL : ctx[i-1], gang)) == NULL) {
                perror("Failed spe_context_create_affinity");
                return(-1);
            }
        } else {
            if ((ctx[i] = spe_context_create(0, NULL)) == NULL) {
                perror("Failed spe_context_create");
                return(-1);
            }
        }
    }

    for (i=0; i<num_spes+extra_spe; i++) {
        if (spe_program_load(ctx[i], spe_pgm)) {
            perror("Failed spe_program_load");
            return(-1);
        }
        /* Remember the local store mapping; it is the DMA target for the
         * local-store benchmarks.
         */
        if ((ls[i] = spe_ls_area_get(ctx[i])) == NULL) {
            perror("Failed spe_ls_area_get");
            return(-1);
        }

        /* Launch a separate execution thread for each of the SPE contexts. */
        if ((rc = pthread_create(&thread[i], &pthread_attr, &spe_thread_function, &ctx[i]))) {
            fprintf(stderr, "Failed pthread_create: %s\n", strerror(rc));
            return(-1);
        }
    }

    /* Initialize control block structures. */
    for (i=0; i<num_spes; i++) {
        parms[i].id = i;
        parms[i].barptr = &bar;
        if (local) {
            /* Target the local store of the next context (wrapping around). */
            parms[i].tgt_start_addr = ls[(i+1)%(num_spes+extra_spe)];
        } else {
            parms[i].tgt_start_addr = &tgt_buf[i];
            if (offset) {
                /* --offset: advance SPE i's start address by
                 * i*CACHE_LINE_SIZE. NOTE(review): the unit of this
                 * increment depends on the type of tgt_start_addr, which
                 * is declared in dmabench.h - confirm it is bytes.
                 */
                parms[i].tgt_start_addr += i*CACHE_LINE_SIZE;
            }
        }
        parms[i].test_code = test_code;
        parms[i].nprocs = num_spes;
        parms[i].num_iter = num_iter;
        parms[i].nreqs = num_reqs;
        parms[i].min_dma_size = min_dma_size;
        parms[i].max_dma_size = max_dma_size;
        parms[i].entry_size = entry_size;
    }

    /* Wait for an indication that all SPEs have started. The outbound
     * mailbox of each SPE is polled once per second; the message content
     * itself is discarded.
     */
    for (i=0; i<num_spes+extra_spe; i++) {
        unsigned int data;
        while (spe_out_mbox_status(ctx[i])==0) {sleep(1);}
        (void)spe_out_mbox_read(ctx[i], &data, 1);
    }

    /* Now provide the SPEs their parms, which starts the computation.
     * The extra SPE (if any) is never sent a message. A mailbox message is
     * an unsigned int, so the parms address is narrowed to that width.
     */
    for (i=0; i<num_spes; i++) {
        unsigned int data;
        data = (unsigned int)(uintptr_t)(&parms[i]);
        if (spe_in_mbox_write(ctx[i], &data, 1, SPE_MBOX_ALL_BLOCKING) < 0) {
            /* Without its parms the SPE would never finish and the join
             * below would block forever.
             */
            perror("Failed spe_in_mbox_write");
            return(-1);
        }
    }

    /* Now wait for all SPEs to finish */
    for (i=0; i<num_spes; ++i) {
        void *status;
        /* pthread functions return the error number instead of setting
         * errno, so report it with strerror().
         */
        if ((rc = pthread_join(thread[i], &status))) {
            fprintf(stderr, "Failed pthread_join: %s\n", strerror(rc));
            return(-1);
        }

        /* status points at no_spe_error (0) or spe_error (1). */
        if (*((int *)status)) {
            num_spes_failed++;
        }

        if (spe_context_destroy(ctx[i])) {
            perror("Failed spe_context_destroy");
            return(-1);
        }
    }

    /* If we created an extra SPE used for ls to ls tests, kill it off.
     * NOTE(review): the cancelled thread is not joined before its context
     * is destroyed.
     */
    if (extra_spe) {
        if ((rc = pthread_cancel(thread[num_spes]))) {
            fprintf(stderr, "Failed pthread_cancel: %s\n", strerror(rc));
            return(-1);
        }
        if (spe_context_destroy(ctx[num_spes])) {
            perror("Failed spe_context_destroy");
            return(-1);
        }
    }

    /* Destroy the gang if one was created */
    if (gang) {
        if (spe_gang_context_destroy(gang)) {
            perror("Failed spe_gang_context_destroy");
        }
    }


    if (num_spes_failed == 0) {
        fprintf(stdout, "All SPEs completed successfully!\n"); fflush(stdout);
    } else {
        fprintf(stdout, "%d SPEs failed!\n", num_spes_failed); fflush(stdout);
    }

    /* Re-read the request count from the control block of SPE 0.
     * NOTE(review): presumably the SPE program may adjust nreqs - confirm.
     */
    num_reqs = parms[0].nreqs;

    printf("dmabench results: %s numspes=%d numreqs=%d entrysize=%d\n", benchmark, num_spes, num_reqs, parms[0].entry_size);
    printf("%12s  %12s  %12s  %12s  %12s\n", "   dma_size ", "      ticks ", "    pclocks ", " microsecs ", "aggr GB/s ");
    printf("%12s  %12s  %12s  %12s  %12s\n", "------------", "------------", "------------", "-----------", "------------");

    /* Roll up and print results. The result fields of parms[] are not
     * written anywhere in this file; they are expected to have been filled
     * in by the SPE programs. Row count and sizes are taken from SPE 0.
     */
    for (j=0; j<parms[0].result_count; ++j) {
        double tot = 0.0;
        for (i=0; i<num_spes; ++i) {
            tot += (double)parms[i].result[j];
        }

        /* Average ticks per iteration per SPE. */
        tot = tot / (double)num_iter / (double)num_spes;

        printf("% 11d   % 11.1f   % 11lld   % 11.2f   % 11.4f\n",
               parms[0].result_size[j], tot,
               (long long)(tot*pclk_ratio),
               tot*(1.0E6/tbfreq),
               (num_spes * parms[0].result_size[j] * num_reqs * tbfreq) / (tot * 1.0E9));
        fflush(stdout);
    }

    return 0;
}
