/* --------------------------------------------------------------  */
/* (C)Copyright 2006,2007,                                         */
/* International Business Machines Corporation                     */
/* All Rights Reserved.                                            */
/*                                                                 */
/* Redistribution and use in source and binary forms, with or      */
/* without modification, are permitted provided that the           */
/* following conditions are met:                                   */
/*                                                                 */
/* - Redistributions of source code must retain the above copyright*/
/*   notice, this list of conditions and the following disclaimer. */
/*                                                                 */
/* - Redistributions in binary form must reproduce the above       */
/*   copyright notice, this list of conditions and the following   */
/*   disclaimer in the documentation and/or other materials        */
/*   provided with the distribution.                               */
/*                                                                 */
/* - Neither the name of IBM Corporation nor the names of its      */
/*   contributors may be used to endorse or promote products       */
/*   derived from this software without specific prior written     */
/*   permission.                                                   */
/*                                                                 */
/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          */
/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        */
/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            */
/* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    */
/* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT    */
/* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;    */
/* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)        */
/* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN       */
/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR    */
/* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  */
/* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              */
/* --------------------------------------------------------------  */
/* PROLOG END TAG zYx                                              */
/* ********************************************************
 *
 * Filename : seqdma.c
 *
 * Purpose  : Benchmarks to measure performance of sequential DMAs
 *
 ******************************************************** */

#include <spu_intrinsics.h>
#include <spu_mfcio.h>
#include <stdint.h>

#include "dmabench.h"
#include "barrier_heavy.h"
#include "utils.h"

/* Minimum stride, in bytes, between consecutive target addresses: DMAs
 * smaller than this are still spaced this far apart in the target region. */
#define CACHE_LINE_SIZE 128

/* Upper bound, in bytes, applied to the DMA size requested by the caller. */
#define MAX_DMASIZE 16384

/* Upper bound applied to the number of requests issued per iteration
 * (doubled internally for the read/write tests). */
#define MAX_REQS 32

/* Local-store copy of the benchmark control block.  It is fetched by DMA at
 * start-up, updated with the results and written back by DMA at the end. */
static dmabench_parms parms __attribute__ ((aligned (128)));

/* 128-byte scratch area handed to _barrier_heavy(). */
static char lsbuf[128] __attribute__ ((aligned (128)));

/* Local-store data buffer used as the source/destination of every benchmark
 * DMA.  BUFSIZE is a count of uint64_t elements (not bytes), so the buffer
 * occupies BUFSIZE * 8 bytes; it is split into one section per request. */
#define BUFSIZE 16384
static uint64_t buffer[BUFSIZE] __attribute__ ((aligned (128)));

/*
 * SPU entry point for the sequential-DMA benchmark.
 *
 * Sequence of operations:
 *   1. Write 1 to the outbound mailbox, then read the effective address of
 *      the control block from the inbound mailbox and DMA it into `parms`.
 *   2. Clamp the request count and DMA size range, and split `buffer` into
 *      one 128-byte-aligned section per request.
 *   3. Issue one small get/put per 4 KiB step of the target region to warm
 *      up address translation.
 *   4. For each DMA size (doubling from the minimum to the maximum), meet
 *      the other SPEs at a barrier and run `num_iter` iterations of `nreqs`
 *      gets, puts, or alternating get/put pairs, depending on
 *      parms.test_code.  The DMA size and the value of `counter` (declared
 *      outside this file; presumably maintained by clear_counter/
 *      start_counter/stop_counter) are appended to parms.result_size[] and
 *      parms.result[].
 *   5. Meet at a final barrier and DMA `parms` back to system memory.
 *
 * The three parameters are unused; all configuration arrives via the mailbox.
 * Always returns 0.
 *
 * NOTE(review): parms.result_count is incremented once per DMA size with no
 * bound check here; the capacity of the result arrays is defined in
 * dmabench.h and should be confirmed against the largest size range.
 */
int main(int speid __attribute__((unused)), uint64_t argp __attribute__((unused)), uint64_t envp __attribute__((unused)))
{
    int nreqs, nprocs, id;
    int i, j;
    void *barptr;
    int dma_size; /* size of dma in bytes */
    int num_iter;
    int min_dma_size, max_dma_size;
    int req_buf_size; /* size of one request section, in buffer elements */
    unsigned int tag = 31, tid = 0, rid = 0;
    /* Unsigned literal: shifting a signed 1 left by 31 is undefined behaviour. */
    const unsigned int tag_mask = 1u << tag;
    /* MFC transfers must be 1, 2, 4, 8 or a multiple of 16 bytes, so the
     * control block is moved in both directions with its size rounded up. */
    const unsigned int parms_dma_size = (sizeof(parms)+15)&~0xF;
    unsigned long tgt_start_addr, tgt_addr;
    unsigned int tgt_incr;
    unsigned int parm_ptr;
    uint64_t *req_buf;

    /* Signal the PPE to indicate that the SPE is ready to start */
    spu_write_out_mbox(1);

    /* Get parameter pointer from inbound mailbox */
    parm_ptr = spu_read_in_mbox();

    /* DMA control block information from system memory. */
    mfc_get((void*) &parms, parm_ptr, parms_dma_size, tag, tid, rid);
    mfc_write_tag_mask(tag_mask);
    mfc_read_tag_status_all();   /* Wait for DMA to complete */

    barptr = parms.barptr;
    nprocs = parms.nprocs;
    id = parms.id;
    num_iter = parms.num_iter;

    /* Clamp the request count to [1, MAX_REQS]; a count below 1 would make
     * the buffer partitioning below divide by zero.  The clamped value is
     * stored back so it is reported with the results. */
    nreqs = MAX( 1, MIN( MAX_REQS, parms.nreqs ) );
    parms.nreqs = nreqs;

    /* For the read/write tests, multiply NREQS by 2 and divide the number of
     * iterations by 2.
     */
    if (parms.test_code == READWRITE || parms.test_code == LOCAL_READWRITE) {
        nreqs *= 2;
        num_iter /= 2;
    }

    /* Force the min size to at least 8. If not 8, then it must be a multiple of 16. */
    min_dma_size = MAX(8, parms.min_dma_size);
    min_dma_size = (min_dma_size < 16) ? 8 : (min_dma_size & ~(16-1));
    max_dma_size = MIN( MAX_DMASIZE, parms.max_dma_size );

    /* Divide buffer into sections for each request.  Note that if the number of
     * requests is large, this can limit the size of largest DMA performed.
     * The section size is rounded DOWN to a multiple of 16 elements (128
     * bytes) so that nreqs sections never extend past the end of the buffer. */
    req_buf_size = (BUFSIZE / nreqs) & ~15;
    if (max_dma_size > (req_buf_size*(int)sizeof(buffer[0])))
        max_dma_size = req_buf_size*sizeof(buffer[0]);

    tgt_start_addr = (unsigned long) parms.tgt_start_addr;

    /* For LS-to-LS transfers, adjust the target address to point to the buffer in target SPU */
    if (IS_LOCAL(parms.test_code)) {
        tgt_start_addr = tgt_start_addr + (unsigned long)buffer;
    }

    /* Initialize main buffer */
    for (j=0; j<BUFSIZE; ++j) buffer[j] = j;

    /* Touch each page of the target buffer so that the page tables and TLBs are all loaded up.
     * Each step reads 128 bytes and writes the same bytes straight back. */
    tgt_addr = tgt_start_addr;

    tgt_incr = 4096;       /* one page */

    for (i=0; i<(int)sizeof(buffer); i+=tgt_incr) {
        mfc_get((void *) buffer, tgt_addr, 128, tag, tid, rid);
        mfc_write_tag_mask(tag_mask);
        mfc_read_tag_status_all();   /* Wait for DMA to complete */
        mfc_put((void *) buffer, tgt_addr, 128, tag, tid, rid);
        mfc_write_tag_mask(tag_mask);
        mfc_read_tag_status_all();   /* Wait for DMA to complete */
        tgt_addr += tgt_incr;
    }

    for (dma_size = min_dma_size; dma_size <= max_dma_size; dma_size *= 2) {

        /* Consecutive requests are spaced at least CACHE_LINE_SIZE bytes apart in the target. */
        tgt_incr = (dma_size < CACHE_LINE_SIZE) ? CACHE_LINE_SIZE : dma_size;

        /* wait until all SPEs are ready to start, then let them go all at once. */
        _barrier_heavy((unsigned int)barptr, id, lsbuf, nprocs);

        if (parms.test_code == READ || parms.test_code == LOCAL_READ) {

            clear_counter();           /* clear performance info */
            start_counter();           /* start recording performance info */

            for (i=0; i<num_iter; i++) {

                tgt_addr = tgt_start_addr;
                req_buf = buffer;
                for (j=0; j<nreqs; j++) {
                    mfc_get((void *) req_buf, tgt_addr, dma_size, tag, tid, rid);
                    tgt_addr += tgt_incr;
                    req_buf += req_buf_size;
                }
                mfc_write_tag_mask(tag_mask);
                mfc_read_tag_status_all();   /* Wait for DMA to complete */
            }

            stop_counter();         /* stop recording performance info */
        }
        else if (parms.test_code == WRITE || parms.test_code == LOCAL_WRITE) {

            clear_counter();           /* clear performance info */
            start_counter();           /* start recording performance info */

            for (i=0; i<num_iter; i++) {

                tgt_addr = tgt_start_addr;
                req_buf = buffer;
                for (j=0; j<nreqs; j++) {
                    mfc_put((void *) req_buf, tgt_addr, dma_size, tag, tid, rid);
                    tgt_addr += tgt_incr;
                    req_buf += req_buf_size;
                }
                mfc_write_tag_mask(tag_mask);
                mfc_read_tag_status_all();   /* Wait for DMA to complete */
            }

            stop_counter();         /* stop recording performance info */
        }
        else if (parms.test_code == READWRITE || parms.test_code == LOCAL_READWRITE) {

            clear_counter();           /* clear performance info */
            start_counter();           /* start recording performance info */

            for (i=0; i<num_iter; i++) {

                tgt_addr = tgt_start_addr;
                req_buf = buffer;
                /* Each pass issues one get and one put, consuming two
                 * request slots; nreqs was doubled above, so it is even. */
                for (j=0; j<nreqs; j++) {
                    mfc_get((void *) req_buf, tgt_addr, dma_size, tag, tid, rid);
                    tgt_addr += tgt_incr;
                    req_buf += req_buf_size;
                    j++;
                    mfc_put((void *) req_buf, tgt_addr, dma_size, tag, tid, rid);
                    tgt_addr += tgt_incr;
                    req_buf += req_buf_size;
                }
                mfc_write_tag_mask(tag_mask);
                mfc_read_tag_status_all();   /* Wait for DMA to complete */
            }

            stop_counter();         /* stop recording performance info */
        }

        /* Record this size and the measured counter value. */
        parms.result_size[parms.result_count] = dma_size;
        parms.result[parms.result_count] = counter;
        parms.result_count++;
    }

    /* wait until all SPEs are done. */
    _barrier_heavy((unsigned int)barptr, id, lsbuf, nprocs);

    /* DMA results back to system memory, using the same rounded size as the
     * initial fetch so the transfer length is always valid. */
    mfc_put((void*) &parms, parm_ptr, parms_dma_size, tag, tid, rid);
    mfc_write_tag_mask(tag_mask);
    mfc_read_tag_status_all();   /* Wait for DMA to complete */

    return 0;
}
