// -------------------------------------------------------------- 
// (C)Copyright 2007,                                         
// International Business Machines Corporation, 
// All Rights Reserved.
// -------------------------------------------------------------- 

#include <stdio.h>
#include <stdlib.h>
#include <spu_mfcio.h>

// Record shared between the SPEs through main memory; main() reads and
// writes it with mfc_getllar/mfc_putllc.
//
// Those atomic commands always move one complete 128-byte line, while the
// members below occupy fewer bytes than that. The type is therefore aligned
// to 128 bytes, which also rounds sizeof(SharedData_s) up to a whole line:
// no unrelated object can share the line that gets overwritten on every
// fetch and copied out on every store. Member offsets are not affected.
typedef struct {
	int			processingStep; // Overall workload processing step, shared by all SPEs
	int 		exitSignal; // Broadcast signal telling every SPE to stop processing
	
	uint64_t 	accumulatedTime[8]; // Per-SPU accumulated decrementer ticks spent in the workload
	int			accumulatedSteps[8]; // Per-SPU number of workload iterations accumulated so far
} __attribute__ ((aligned(128))) SharedData_s;

// Local copy of the shared structure; main() passes its address to
// mfc_getllar/mfc_putllc as the local buffer of the atomic update.
// The object is aligned to 128 bytes, which the original author describes as
// a single cache line.
// NOTE(review): confirm that the storage reserved for this object spans the
// whole line those commands transfer.
static volatile SharedData_s SharedData __attribute__ ((aligned(128)));

// Effective address of the shared structure in main memory (assigned from argp in main)
uint64_t SharedData_ea;

// SPU entry point: repeatedly runs the fake workload, measures it with the
// decrementer and publishes the result in the shared structure.
//
// spuid - not used by this program
// argp  - effective address pointer to the shared structure in main memory
// envp  - spu id of the spu; selects this SPU's slot in the statistics arrays
//
// Returns 0 once the exit signal has been observed, or 1 when envp does not
// name a valid statistics slot.
int main( uint64_t spuid , uint64_t argp, uint64_t envp )
{
	// Number of per-SPU slots available in the statistics arrays
	const unsigned int numSlots = sizeof(SharedData.accumulatedTime) / sizeof(SharedData.accumulatedTime[0]);

	int i;
	int workLimit;
	int currentStep;
	unsigned int status;
	unsigned int t_start, t_spu;
	int exitFlag = 0;
	int spuNum;

	(void)spuid;

	// Reject ids that would index past the end of the statistics arrays
	if( envp >= numSlots )
	{
		printf("SPU id %llu is out of range (0..%u).\n", (unsigned long long) envp, numSlots - 1);
		return 1;
	}

	spuNum = (int) envp;
	SharedData_ea = argp;

	printf("SPU %d start.\n", spuNum);
	
	// Initialize random number generator for fake workload example
	srand( spu_read_decrementer() );

	// Different SPEs process a different amount of data to generate different execution time statistics
	workLimit = (spuNum * 10) + 10;
	
	do
	{
		exitFlag = 0;
		
		// Start performance profile information collection.
		// The decrementer counts down, so it is reloaded with a large value first.
		spu_write_decrementer(0x7fffffff);
		t_start = spu_read_decrementer();
			
		// Data processing here		
		// ...
		// Fake example workload:
		// 1) The first random number <= 100 ends the first step of the process
		// 2) The first random number <= 10 ends the second step of the process
		//
		// The processingStep variable is shared, so all the SPEs will process the same step until one encounters the desired result
		// Multiple SPEs can reach the desired result, but the first one to reach it will trigger the advancement of the processing step
		
		// Remember which step this iteration works on: the local copy of the
		// shared structure is refreshed by the atomic update further down.
		currentStep = SharedData.processingStep;
		
		switch( currentStep )
		{
			case 0:
				for( i = 0 ; i < workLimit ; ++i )
				{
					if( rand() <= 100 )
					{
						printf("SPU %d found the first result.\n", spuNum);
						exitFlag = 1;				
						break;
					}
				}
			break;

			case 1:
				for( i = 0 ; i < workLimit ; ++i )
				{
					if( rand() <= 10 )
					{
						printf("SPU %d found the second result.\n", spuNum);
						exitFlag = 1;				
						break;
					}
				}
			break;

			default:
				// No workload is defined for any other step
			break;
		}
		
		// End performance profile information collection
		// (start value minus current value, because the decrementer counts down)
		t_spu = t_start - spu_read_decrementer();
	
		// ...
		// Because we have statistics on all the SPEs average workload time
		// we can have some inter-SPE dynamic load balancing, especially for
		// workloads that operate in pipelined fashion using multiple SPEs
		// ...
	
		
		// Atomic read-modify-write of the shared structure: retried until the
		// conditional store reports success.
		do 
		{
			// get and lock the cache line of the shared structure
			mfc_getllar((void*)&SharedData, SharedData_ea, 0, 0);
			(void)mfc_read_atomic_status();
			
			// Update shared structure
			SharedData.accumulatedTime[spuNum] += (uint64_t) t_spu;
			SharedData.accumulatedSteps[spuNum]++;

			// Advance the step only if nobody else advanced it since this SPE
			// started working on it; otherwise two SPEs that both find a
			// result for the same step would skip the following step.
			if( exitFlag && SharedData.processingStep == currentStep ) 
			{
				SharedData.processingStep++;
				if(SharedData.processingStep > 1) SharedData.exitSignal = 1;
			}
			
			// Conditionally store the line back; a non-zero status means the
			// store did not succeed and the update has to be repeated.
			mfc_putllc((void*)&SharedData, SharedData_ea, 0, 0);
			status = mfc_read_atomic_status() & MFC_PUTLLC_STATUS;
			
		} while (status);
		
	} while (SharedData.exitSignal == 0);

	return 0;
}

