// -------------------------------------------------------------- 
// (C)Copyright 2007,                                         
// International Business Machines Corporation, 
// All Rights Reserved.
// -------------------------------------------------------------- 

#include <stdio.h>
#include <spu_intrinsics.h>
#include <spu_mfcio.h>
#include "common.h"

// Local store structures and buffers.
// Parameter context; main() fills it by DMA from the address passed in argv.
volatile parm_context ctx __attribute__ ((aligned(16)));

// Input and output blocks, indexed by buffer slot (0 or 1) so that one slot
// can be the target of a DMA transfer while the other slot is being processed.
volatile uint32_t ls_in_data[2][ELEMENTS_PER_BLOCK] __attribute__ ((aligned(128)));
volatile uint32_t ls_out_data[2][ELEMENTS_PER_BLOCK] __attribute__ ((aligned(128)));

// Completion word; main() sets it to STATUS_DONE and DMAs it to ctx.status.
volatile uint32_t status __attribute__ ((aligned(128)));

// MFC tag IDs used for the DMA transfers, one per buffer slot.
uint32_t tag_id[2];
	
int main(unsigned long long spu_id, unsigned long long argv)
{
	int buf, nxt_buf, cnt, nxt_cnt, left, i;
	volatile uint32_t *in_data, *nxt_in_data, *out_data, *nxt_out_data;

	//printf("<SPE: start\n");
	
	tag_id[1] = mfc_tag_reserve();

	// Input parameter argv is a pointer to the SPE context - fetch the parameter context, waiting for it to complete.
	mfc_get((void *)(&ctx), (uint32_t)argv, sizeof(parm_context), tag_id[0], 0, 0);
	mfc_write_tag_mask(1<<tag_id[0]);
	mfc_read_tag_status_all(); 

	// For each double buffered block of elements
	in_data  = ctx.in_data;
	out_data = ctx.out_data;
	left     = ctx.size;

	//printf("in=0x%x, out=0x%x, size=%d\n",(uint32_t)ctx.in_data,(uint32_t)ctx.out_data, ctx.size);
	
	cnt = (left < ELEMENTS_PER_BLOCK) ? left : ELEMENTS_PER_BLOCK;

	// Prefetch first buffer of input data.
	buf = 0;
	mfc_getb((void *)(ls_in_data), (uint32_t)(in_data), cnt*sizeof(uint32_t), tag_id[0], 0, 0);

	while (cnt < left) {
		left -= cnt;

		nxt_in_data  = in_data + cnt;
		nxt_out_data = out_data + cnt;
		nxt_cnt = (left < ELEMENTS_PER_BLOCK) ? left : ELEMENTS_PER_BLOCK;

		// Prefetch next buffer so the data is available for computation on next loop iteration.
		// The first DMA is barriered so that we don't GET data before the previous iteration's data is PUT.
		nxt_buf = buf^1;

		mfc_getb((void *)(&ls_in_data[nxt_buf][0]), (uint32_t)(nxt_in_data), nxt_cnt * sizeof(uint32_t), tag_id[nxt_buf], 0, 0);	  
			
		// Wait for previously prefetched data
		mfc_write_tag_mask(1<<tag_id[buf]);
		mfc_read_tag_status_all();  

		for (i=0; i<ELEMENTS_PER_BLOCK; i++){
			ls_out_data[buf][i] = ~(ls_in_data[buf][i]);
		}

		// Put the buffer's position data back into system address space
		mfc_put((void*)(&ls_out_data[buf][0]), (uint32_t)(out_data), cnt * sizeof(uint32_t),tag_id[buf],0,0);

		//printf("out: (first,last)=0x(%x,%x)\n",~ls_out_data[buf][0],~ls_out_data[buf][ELEMENTS_PER_BLOCK-1]);
		in_data  = nxt_in_data;
		out_data = nxt_out_data;

		buf = nxt_buf;
		cnt = nxt_cnt;		  
    }

    // Wait for previously prefetched data
    mfc_write_tag_mask(1<<tag_id[buf]);
    mfc_read_tag_status_all();  

    // process_buffer
	for (i=0; i<ELEMENTS_PER_BLOCK; i++){
		ls_out_data[buf][i] = ~(ls_in_data[buf][i]);
	}

    // Put the buffer's position data back into system address space
	// Put barrier to ensure all data i written to memory before writing status
    mfc_putb((void*)(&ls_out_data[buf][0]), (uint32_t)(out_data), cnt * sizeof(uint32_t), tag_id[buf],0,0);
	//printf("out: (first,last)=0x(%x,%x)\n",~ls_out_data[buf][0],~ls_out_data[buf][ELEMENTS_PER_BLOCK-1]);
	
    // Wait for DMAs to complete before starting the next step in time.
    mfc_write_tag_mask(1<<tag_id[buf]);
    mfc_read_tag_status_all();  

// update the status so PPE knows that all data is in place
	status = STATUS_DONE;
	
    mfc_put((void*)&status, (uint32_t)(ctx.status), sizeof(uint32_t), tag_id[buf],0,0);
	mfc_write_tag_mask(1<<tag_id[buf]);
    mfc_read_tag_status_all();  
	
	mfc_tag_release(tag_id[0]);
	mfc_tag_release(tag_id[1]);
  
	//printf("<SPE: end\n");
	return (0);

}
