/* ********************************************************
 *
 * Filename : dmalist.c
 *
 * Purpose  : Benchmarks to measure performance of list-form DMAs
 *
 ******************************************************** */

#include <spu_intrinsics.h>
#include <spu_mfcio.h>
#include <stdint.h>

#include "dmabench.h"
#include "barrier_heavy.h"
#include "utils.h"

/* Minimum stride, in bytes, between consecutive list-element target
 * addresses; main() uses it whenever entry_size is smaller than this. */
#define CACHE_LINE_SIZE 128

/* Upper bound, in bytes, that main() applies to the per-element transfer
 * size (entry_size). */
#define MAX_DMASIZE 16384

/* Upper bound on the number of requests taken from parms.nreqs.  main()
 * doubles the clamped value for the read/write tests, so req_list[] is
 * NOTE(review): sized for MAX_REQS entries only -- confirm that the doubled
 * count can never exceed it for the configurations actually run. */
#define MAX_REQS 32

/* Total number of list elements available in dma_list[], shared by all
 * requests. */
#define MAX_LIST_SIZE 2048

/* Control block exchanged with the PPE: DMA'd in at start-up and DMA'd back
 * out, with results filled in, at the end of main(). */
static dmabench_parms parms __attribute__ ((aligned (128)));

/* Scratch buffer handed to _barrier_heavy(). */
static char lsbuf[128] __attribute__ ((aligned (128)));

/* Data buffer used as the local side of every list DMA.  BUFSIZE counts
 * uint64_t elements, not bytes, so the array occupies BUFSIZE * 8 bytes. */
#define BUFSIZE 16384
uint64_t buffer[BUFSIZE] __attribute__ ((aligned (128)));

/* One element of a DMA list: a transfer length and the low word of the
 * effective address that the element targets. */
typedef struct dma_list_entry {
    unsigned int length;    /* bit 0 is stall bit, bits 1:16 are reserved */
    unsigned int address;   /* Low Word of 64-bit effective address */
} dma_list_entry;

/* Data storage for DMA List command */
/* dma_list holds the elements for all requests back to back; req_list[i]
 * points at the first element belonging to request i. */
dma_list_entry dma_list[MAX_LIST_SIZE] __attribute__ ((aligned (128)));
dma_list_entry *req_list[MAX_REQS];

/*
 * SPU entry point for the list-form DMA benchmark.
 *
 * Flow:
 *   1. Signal the PPE through the outbound mailbox, read the address of the
 *      control block from the inbound mailbox and DMA it into `parms`.
 *   2. Clamp the request count, DMA sizes and list sizes so that every
 *      request fits in its slice of `buffer` and of `dma_list`.
 *   3. Touch the target region once per 4 KB, across sizeof(buffer) bytes.
 *   4. For each list size (doubling from min_list_size to max_list_size):
 *      build one DMA list per request, synchronise with the other SPEs, time
 *      num_iter rounds of list DMAs and record the counter in `parms`.
 *   5. Synchronise again and DMA `parms` back to system memory.
 *
 * The three arguments are unused.  Always returns 0.
 */
int main(int speid __attribute__((unused)), uint64_t argp __attribute__((unused)), uint64_t envp __attribute__((unused)))
{
    int nreqs, nprocs, id;
    int i, j;
    void *barptr;
    int entry_size; /* size of dma in bytes */
    int num_iter;
    int min_dma_size, max_dma_size;
    int list_size, min_list_size, max_list_size;
    int req_buf_size; /* per-request slice of buffer[], in uint64_t elements */
    unsigned int tag = 31, tid = 0, rid = 0;
    unsigned long tgt_start_addr, tgt_addr;
    unsigned int tgt_incr;
    unsigned int parm_ptr;
    uint64_t *req_buf;

    /* Signal the PPE to indicate that the SPE is ready to start */
    spu_write_out_mbox(1);

    /* Get parameter pointer from inbound mailbox */
    parm_ptr = spu_read_in_mbox();

    /* DMA control block information from system memory.  The size is rounded
     * up to a multiple of 16 bytes. */
    mfc_get((void*) &parms, parm_ptr, (sizeof(parms)+15)&~0xF, tag, tid, rid);
    mfc_write_tag_mask(1<<tag);
    mfc_read_tag_status_all();   /* Wait for DMA to complete */

    barptr = parms.barptr;
    nprocs = parms.nprocs;
    id = parms.id;
    num_iter = parms.num_iter;

    /* Clamp the request count to [1, MAX_REQS].  The lower bound matters:
     * nreqs is used as a divisor below, so a zero or negative value from the
     * control block would otherwise divide by zero. */
    nreqs = MAX( 1, MIN( MAX_REQS, parms.nreqs ) );
    parms.nreqs = nreqs;
    /* For the read/write tests, multiply NREQS by 2 (one get plus one put per
     * pair) and halve the iteration count. */
    if (parms.test_code == READWRITE || parms.test_code == LOCAL_READWRITE) {
        nreqs *= 2;
        num_iter /= 2;
    }

    min_dma_size = MAX( 8, parms.min_dma_size );
    max_dma_size = parms.max_dma_size;

    /* Divide buffer into sections for each request.  Note that if the number
     * of requests is large, this can limit the size of largest DMA performed.
     *
     * The slice size is rounded DOWN to a multiple of 16 elements (128
     * bytes).  Rounding up would make nreqs * req_buf_size exceed BUFSIZE
     * whenever nreqs does not divide BUFSIZE evenly, letting the last
     * request run past the end of buffer[].
     */
    req_buf_size = (BUFSIZE / nreqs) & ~15;
    if (max_dma_size > (req_buf_size*(int)sizeof(buffer[0])))
        max_dma_size = req_buf_size*sizeof(buffer[0]);

    /* Make sure the data for each request will fit in the buffer */
    entry_size = MIN( MAX_DMASIZE, MAX( 8, parms.entry_size ) );
    min_list_size = MAX( 1, (min_dma_size/entry_size) );
    max_list_size = MIN( MAX_LIST_SIZE, (max_dma_size/entry_size) );

    parms.entry_size = entry_size;

    /* Make sure we don't overflow the list: dma_list[] is shared by all
     * requests, so each one gets at most MAX_LIST_SIZE / nreqs elements. */
    if (max_list_size > (MAX_LIST_SIZE / nreqs))
        max_list_size = (MAX_LIST_SIZE / nreqs);

    tgt_start_addr = (unsigned long) parms.tgt_start_addr;

    /* For LS-to-LS transfers, adjust the target address to point to the
     * buffer in the target SPU (offset by this program's own buffer address). */
    if (IS_LOCAL(parms.test_code)) {
        tgt_start_addr = tgt_start_addr + (unsigned long)buffer;
    }

    /* Initialize main buffer */
    for (j=0; j<BUFSIZE; ++j) buffer[j] = j;

    /* Touch each page of the target buffer so that the page tables and TLBs
     * are all loaded up.  Each step reads 128 bytes into the start of
     * buffer[] and writes the same bytes straight back. */
    tgt_addr = tgt_start_addr;

    tgt_incr = 4096;       /* one page */

    for (i=0; i<(int)sizeof(buffer); i+=tgt_incr) {

        mfc_get((void *) buffer, tgt_addr, 128, tag, tid, rid);
        mfc_write_tag_mask(1<<tag);
        mfc_read_tag_status_all();   /* Wait for DMA to complete */
        mfc_put((void *) buffer, tgt_addr, 128, tag, tid, rid);
        mfc_write_tag_mask(1<<tag);
        mfc_read_tag_status_all();   /* Wait for DMA to complete */
        tgt_addr += tgt_incr;

    }

    /* One measurement per list size; the size doubles on every pass. */
    for (list_size = min_list_size; list_size <= max_list_size; list_size *= 2) {

        tgt_addr = tgt_start_addr;

        /* Space list elements at least one cache line apart in the target. */
        tgt_incr = (entry_size < CACHE_LINE_SIZE) ? CACHE_LINE_SIZE : entry_size;

        /* Create the DMA lists.  Request i owns max_list_size consecutive
         * slots of dma_list[] and fills the first list_size of them; target
         * addresses keep advancing across requests. */
        for (i=0; i<nreqs; i++) {
            req_list[i] = &dma_list[i*max_list_size];
            for (j=0; j<list_size; j++) {
                req_list[i][j].length = entry_size;
                req_list[i][j].address = tgt_addr;
                tgt_addr += tgt_incr;
            }
        }

        /* wait until all SPEs are ready to start, then let them go all at once. */
        _barrier_heavy((unsigned int)barptr, id, lsbuf, nprocs);

        /* In every variant below, all nreqs list commands of one iteration
         * are queued on the same tag and then waited for together.  The
         * effective-address argument is derived from tgt_addr with
         * mfc_ea2h(); the low words come from the list elements. */
        if (parms.test_code == READ || parms.test_code == LOCAL_READ) {

            clear_counter();           // clear performance info
            start_counter();           // start recording performance info

            for (i=0; i<num_iter; i++) {

                req_buf = buffer;
                for (j=0; j<nreqs; j++) {
                    mfc_getl((void *) req_buf, mfc_ea2h(tgt_addr), (void *) req_list[j], sizeof(dma_list_entry)*list_size, tag, tid, rid);
                    req_buf += req_buf_size;
                }
                mfc_write_tag_mask(1<<tag);
                mfc_read_tag_status_all();   /* Wait for DMA to complete */
            }

            stop_counter();         // stop recording performance info
        }
        else if (parms.test_code == WRITE || parms.test_code == LOCAL_WRITE) {

            clear_counter();           // clear performance info
            start_counter();           // start recording performance info

            for (i=0; i<num_iter; i++) {

                req_buf = buffer;
                for (j=0; j<nreqs; j++) {
                    mfc_putl((void *) req_buf, mfc_ea2h(tgt_addr), (void *) req_list[j], sizeof(dma_list_entry)*list_size, tag, tid, rid);
                    req_buf += req_buf_size;
                }
                mfc_write_tag_mask(1<<tag);
                mfc_read_tag_status_all();   /* Wait for DMA to complete */
            }

            stop_counter();         // stop recording performance info
        }
        else if (parms.test_code == READWRITE || parms.test_code == LOCAL_READWRITE) {

            clear_counter();           // clear performance info
            start_counter();           // start recording performance info

            for (i=0; i<num_iter; i++) {

                req_buf = buffer;
                /* nreqs was doubled above, so it is even here: each pass of
                 * this loop consumes two requests, a get followed by a put. */
                for (j=0; j<nreqs; j++) {
                    mfc_getl((void *) req_buf, mfc_ea2h(tgt_addr), (void *) req_list[j], sizeof(dma_list_entry)*list_size, tag, tid, rid);
                    req_buf += req_buf_size;
                    j++;
                    mfc_putl((void *) req_buf, mfc_ea2h(tgt_addr), (void *) req_list[j], sizeof(dma_list_entry)*list_size, tag, tid, rid);
                    req_buf += req_buf_size;
                }
                mfc_write_tag_mask(1<<tag);
                mfc_read_tag_status_all();   /* Wait for DMA to complete */
            }

            stop_counter();         // stop recording performance info
        }

        /* Record bytes per request and the measured counter for this size.
         * NOTE(review): result_count comes from the control block and is not
         * bounds-checked here; confirm the capacity of result[] and
         * result_size[] in dmabench.h covers every list size. */
        parms.result_size[parms.result_count] = entry_size*list_size;
        parms.result[parms.result_count] = counter;
        parms.result_count++;
    }

    /* wait until all SPEs are done. */
    _barrier_heavy((unsigned int)barptr, id, lsbuf, nprocs);


    /* DMA results back to system memory.
     * NOTE(review): this uses sizeof(parms) unrounded, whereas the initial
     * mfc_get rounds up to 16 bytes.  Presumably sizeof(dmabench_parms) is
     * already a multiple of 16 -- confirm, since the two sizes differ
     * otherwise. */
    mfc_put((void*) &parms, parm_ptr, sizeof(parms), tag, tid, rid);
    mfc_write_tag_mask(1<<tag);
    mfc_read_tag_status_all();   /* Wait for DMA to complete */

    /* signal done */

    return 0;
}
