
// -------------------------------------------------------------- 
// (C)Copyright 2007,                                         
// International Business Machines Corporation, 
// All Rights Reserved.
// Author: Eitan Peri, eitanp@il.ibm.com
// -------------------------------------------------------------- 

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <libspe2.h>
#include <cbe_mfc.h>
#include <pthread.h>

#include <unistd.h>

#include "common.h"

// Default number of iterations, used by main() when no count is given on the
// command line.
#define NUM_ITER 30

// Handle of the SPE program that main() loads into every SPE context.
// It is defined outside this file.
extern spe_program_handle_t spu;


// Data structures to work with the SPE
//============================================================================
// NOTE(review): program[] is not referenced anywhere in this file.
spe_program_handle_t *program[2];

// Per-SPE parameter block. main() fills in num, ea_status and ea_base and
// passes &ctx[n] as the argument pointer of SPE n.
volatile parm_context ctx[NUM_SPES] __attribute__ ((aligned(16)));

// ea_ls_base[n]: address returned by spe_ls_area_get() for SPE n.
// ls_offset[n] : word read from the outbound mailbox of SPE n.
// ls_addr_my[] : the two local-store addresses in the last SPE that main()
//                uses as the source of its spe_mfcio_put() transfers.
uint64_t ea_ls_base[NUM_SPES];
uint32_t ls_offset[NUM_SPES];
uint32_t ls_addr_my[2];

// Per-SPE status word. main() hands its address to the SPE (ea_status) and
// busy-waits on it; presumably the SPE updates it after each iteration.
volatile status_s status[NUM_SPES] __attribute__ ((aligned(128)));

// Two buffers whose address is given to the first SPE as its ea_base.
// main() refills one of them on every iteration.
volatile uint32_t data_other[2][BUFF_SIZE] __attribute__ ((aligned(128)));

// Two buffers that receive the data main() transfers out of the last SPE's
// local store, plus the addresses of those buffers as used for the transfer.
volatile uint32_t data_my[2][BUFF_SIZE] __attribute__ ((aligned(128)));
volatile uint64_t data_my_ptr[2];
// Data structure for running an SPE thread
//============================================================================
typedef struct spu_data {
  spe_context_ptr_t spe_ctx;  // SPE context created and loaded by main()
  pthread_t pthread;          // PPE thread that runs the context (spu_pthread)
  void *argp;                 // argument pointer handed to spe_context_run()
} spu_data_t;

// One entry per SPE, indexed by SPE number.
spu_data_t data[NUM_SPES];

//============================================================================
// Thread entry point: run one SPE context until its program stops.
//
// arg points to the spu_data_t describing the context to run and the
// argument pointer to pass to it. The thread terminates through
// pthread_exit(NULL) once the context has finished; if the context cannot
// be run, the whole process exits with status 1.
//============================================================================
void *spu_pthread(void *arg)
{
	spu_data_t *self = (spu_data_t *)arg;
	uint32_t entry_point = SPE_DEFAULT_ENTRY;
	int rc;

	// Blocks until the SPE program stops.
	rc = spe_context_run(self->spe_ctx, &entry_point, 0, self->argp, NULL, NULL);

	if (rc >= 0) {
		pthread_exit(NULL);
	}

	perror ("Failed running context");
	exit (1);
}

int main(int argc, char *argv[])
{
	int i, num, ret, tag=5;
	uint32_t tmp_off, zero=0, tag_status;
	uint32_t iter, next_iter, num_iter=0, total_num_iter;

	if (argc > 1){
		total_num_iter = atoi(argv[1]);
	}else{
		total_num_iter = NUM_ITER;
	}
	
	for( i=0; i<BUFF_SIZE; i++){
		data_my[0][i]=0;	data_my[1][i]=0;
		data_other[0][i]=1;	data_other[1][i]=0;	
	}
	
	printf(")PPE: ---> Load SPE programs SPEs\n" );
	for( num=0; num<NUM_SPES; num++){

		// create SPE context
		if ((data[num].spe_ctx = spe_context_create (0, NULL)) == NULL) {
			perror("Failed creating context"); exit(1);
		}

		// load SPE program into the SPE context
		if (spe_program_load ( data[num].spe_ctx, &spu)) {
			perror("Failed loading program"); exit(1);
		}		
	}

	//printf(")PPE: data_my  = 0x%llx\n", (uint64_t)data_my);
	//printf(")PPE: data_other = 0x%llx\n", (uint64_t)data_other);
	
	// update the parameters of each SPE
	for( num=0; num<NUM_SPES; num++){
		if( (ea_ls_base[num] = (uint64_t)(spe_ls_area_get(data[num].spe_ctx)))==NULL){
			perror("Failed map LS to main storage"); exit(1);
		}
		//ea_ls_base[num]=ea_ls_base[num]&0xffffffff; // TBD - 64b to 32b
		//printf(")PPE: ea_ls_base[%d] = 0x%llx\n",num, ea_ls_base[num]);
	}
	
	// initiate SPE parameters
	printf(")PPE: ---> send EAs to SPEs\n" );
	for( num=0; num<NUM_SPES; num++){
		
		status[num].d=0;
		
		ctx[num].num = num;
		ctx[num].ea_status = (uint64_t)(&status[num].d);

		//printf(")PPE: SPE %d: ea_status=0x%llx\n",num, ctx[num].ea_status);
		
		if(num==0){
			ctx[num].ea_base = (uint64_t)data_other;
		}else{
			ctx[num].ea_base = ea_ls_base[num-1];
		}

		data[num].argp = (void*)&ctx[num];	
			
		//printf(")PPE: SPE %d: ea_base=0x%llx, \n",num,ctx[num].ea_base,);
	}
			
	// create SPE pthreads
	for( num=0; num<NUM_SPES; num++){
		if (pthread_create (&data[num].pthread, NULL, &spu_pthread, &data[num])) {
			perror("Failed creating thread");  exit(1);
		}   
		
	}
		
	// collect the offsets from all the SPEs
	printf(")PPE: ---> collect offsets from SPEs\n" );
	for( num=0; num<NUM_SPES; num++){
		while(!spe_out_mbox_status(data[num].spe_ctx));
		spe_out_mbox_read(data[num].spe_ctx, &ls_offset[num], 1);	
	}

	// intialize my offsets
	ls_addr_my[0]  = ls_offset[NUM_SPES-1];
	ls_addr_my[1]  = ls_addr_my[0] + BUFF_SIZE*sizeof(uint32_t);
	data_my_ptr[0] = ((uint64_t)(&data_my[0][0]))&0xffffffff;
	data_my_ptr[1] = ((uint64_t)(&data_my[1][0]))&0xffffffff;
	
	// tell the SPE what offset they should work on  -also a sign to start working
	printf(")PPE: ---> send offsets to SPEs\n" );
	for( num=0; num<NUM_SPES; num++){
		if(num==0){
			tmp_off = zero;
		}else{
			tmp_off = ls_offset[num-1];
		}
		spe_in_mbox_write(data[num].spe_ctx, &tmp_off,1,1);
		
		//printf(")PPE: SPE %d: offset=0x%x \n",num,tmp_off);
	}

	iter = 0;
	
	for(num_iter=0; num_iter<=total_num_iter; num_iter=num_iter){

		next_iter = iter^1;
		
		// wait for SPE data to be written into memory
		//printf(")PPE: Wait for SPEs to be ready\n");
		for( num=0; num<NUM_SPES; num++){
			while (status[num].d <= num_iter);
			//printf(")PPE: SPE %d is ready for iter %u\n", num, num_iter);
		}	
		
		num_iter++;
		
		//printf(")PPE: All SPE are ready for iter %u\n", num_iter);
		for( num=0; num<NUM_SPES; num++){
			spe_in_mbox_write(data[num].spe_ctx, &num_iter,1,1);
		}
		//printf(")PPE: All SPE should start iter %u\n",num_iter);
			
		//printf(")PPE: Get data from 0x%x to 0x%llx\n", ls_addr_my[iter], (uint64_t)(data_my_ptr[next_iter]));
		do{
			ret=spe_mfcio_put(data[NUM_SPES-1].spe_ctx, ls_addr_my[iter], (void*)data_my_ptr[next_iter],BUFF_SIZE*sizeof(uint32_t),tag,0,0);	
		}while( ret!=0);
		
		for( i=0; i<BUFF_SIZE; i++){
			data_other[next_iter][i]=num_iter+1;
		}

		__lwsync(); // make sure that writing to LS is complete before writing the mailbox notification of next iteration
		
		
		// wait for completion of the put command
		//ret = spe_mfcio_tag_status_read(data[NUM_SPES-1].spe_ctx, (1<<tag), SPE_TAG_ALL, &tag_status);
		ret = spe_mfcio_tag_status_read(data[NUM_SPES-1].spe_ctx, 0, SPE_TAG_ALL, &tag_status);

		if(ret!=0){
			perror ("Failed execute mfcio_put command"); exit (1);
		}

		//printf(")PPE: process: [%d,%d]->[%d,%d]\n", data_my[next_iter][0],data_my[next_iter][BUFF_SIZE-1],data_other[next_iter][0],data_other[next_iter][BUFF_SIZE-1]);	
		
		if(num_iter>2*NUM_SPES){ // perform check
			if( ((data_my[next_iter][0]+NUM_SPES)!=num_iter) ||
				((data_my[next_iter][BUFF_SIZE-1]+NUM_SPES)!=num_iter) ){
				printf("PPE: ERROR - fail in iteration %d, data=(%d,%d)\n",num_iter, data_my[next_iter][0],data_my[next_iter][BUFF_SIZE-1]);
				break;
				//perror("Fail in transferring the data"); exit (1);
			}
		}
		
		iter = next_iter;		
	}
	
	for( num=0; num<NUM_SPES; num++){
		while (status[num].d <= num_iter);
		//printf(")PPE: SPE %d is ready for iter %u\n", num, num_iter);
	}	
		
	printf(")PPE: Tell all SPEs that we're done\n");
	num_iter = DATA_DONE;
	for( num=0; num<NUM_SPES; num++){
		spe_in_mbox_write(data[num].spe_ctx, &num_iter,1,1);
	}
	
	for( num=0; num<NUM_SPES; num++){
		// wait for all the SPE pthread to complete
		if (pthread_join (data[num].pthread, NULL)) {
			perror("Failed joining thread"); exit (1);
		}

		// destroy the SPE contexts
		if (spe_context_destroy( data[num].spe_ctx   )) {
			perror("Failed spe_context_destroy"); exit(1);
		}
	}

	printf(")PPE:) Complete %d iterations with great success\n", total_num_iter);
	
	return (0);
}

