
#include <spu_intrinsics.h>
#include <stdio.h>

// Scalar element-wise product.
// Stores in1[k] * in2[k] into out[k] for every k in [0, N), one float at a
// time and in increasing index order. Nothing is written when N <= 0.
void mult_(float *in1, float *in2, float *out, int N){
	const float *lhs = in1;
	const float *rhs = in2;
	float *dst = out;
	int remaining;

	// Walk the three arrays in lock-step instead of indexing them.
	for (remaining = N; remaining > 0; remaining--){
		*dst = *lhs * *rhs;
		lhs++;
		rhs++;
		dst++;
	}
}

// vectorized multiply
// Stores in1[i] * in2[i] into out[i] for every i in [0, N).
// The arrays are assumed to be quadword aligned, because they are accessed
// through vec_float4 pointers. N does not have to be divisible by 4: whole
// groups of four floats go through spu_mul, and the remaining N % 4 elements
// are multiplied one at a time. Nothing is written when N <= 0.
void vmult_(float *in1, float *in2, float *out, int N){
	int i, Nv;

	// Guard first so that no shift or multiply is ever applied to a negative count.
	if (N <= 0){
		return;
	}

	Nv = N>>2; // number of complete quadwords (N / 4)

	vec_float4 *vin1 = (vec_float4*)in1, *vin2 = (vec_float4*)in2;
	vec_float4 *vout = (vec_float4*)out;

	for (i=0; i<Nv; i++){
		vout[i] = spu_mul( vin1[i], vin2[i]);
	}

	// Scalar tail: the vector loop stops at element Nv*4, so finish the
	// last N % 4 elements here instead of leaving them unwritten.
	for (i=Nv*4; i<N; i++){
		out[i] = in1[i] * in2[i];
	}
}

#define NN 100

// Prints vals four elements per line, each line labelled with the index
// range it covers. Every line reads vals[base..base+3], so count is expected
// to be a multiple of 4 (NN is).
static void print_quads(const float *vals, int count)
{
	int base = 0;

	while (base < count){
		printf("out[%d..%d]=[%f,%f,%f,%f]\n", base, base+3,
		       vals[base], vals[base+1], vals[base+2], vals[base+3]);
		base += 4;
	}
}

// Demo driver: runs the scalar multiply and then the vectorized multiply
// over NN-element arrays and prints each result.
int main( )
{
	// 16-byte alignment so the buffers can be handed to vmult_.
	float in1[NN] __attribute__((aligned (16)));
	float in2[NN] __attribute__((aligned (16)));
	float out[NN] __attribute__((aligned (16)));

	int k;

	// First pass: in1 = 0,1,2,...  in2 = 1,2,3,...  multiplied by mult_.
	k = 0;
	while (k < NN){
		in1[k] = k;
		in2[k] = k+1;
		k++;
	}
	mult_(in1, in2, out, NN);
	print_quads(out, NN);

	// Second pass: the same pattern scaled by 10, multiplied by vmult_.
	k = 0;
	while (k < NN){
		in1[k] = k*10;
		in2[k] = (k+1)*10;
		k++;
	}
	vmult_(in1, in2, out, NN);
	print_quads(out, NN);

	return 0;
}


