Dot Product

This code sample demonstrates how to use C, MMX™ technology, and Streaming SIMD Extensions 3 (SSE3) intrinsics to calculate the dot product of two vectors. The following outputs are typical of this code when computed by C or SSE3 intrinsics: 506.000000 and when computed by MMX intrinsics: 506. Output may vary depending on your compiler version and the components of your computing platform.

SSE3 intrinsics do not run on processors from the Pentium® III family or earlier.

* The information and source code contained herein is the exclusive property

* of Intel Corporation and may not be disclosed, examined, or reproduced in

* whole or in part without explicit written authorization from the Company.

* [Description]

* This code sample demonstrates how to use C, MMX, and SSE3

* intrinsics to calculate the dot product of two vectors.

* [Compile]

* icc dot_product.c (linux) | icl dot_product.c (windows)

* [Output]

* Dot Product computed by C: 506.000000

* Dot Product computed by SSE2 intrinsics: 506.000000

* Dot Product computed by MMX intrinsics: 506

#include <stdio.h>

#include <pmmintrin.h>

#define SIZE 12 //assumes size is a multiple of 4 because MMX and SSE

//registers will store 4 elements.

//Computes dot product using C

float dot_product(float *a, float *b);

//Computes dot product using SSE3 intrinsics

float dot_product_intrin(float *a, float *b);

//Computes dot product using MMX intrinsics

short MMX_dot_product(short *a, short *b);

//Program entry point.
//
//Fills two float vectors and two short vectors so that element k holds the
//value k, then prints the dot product computed by the plain C routine.
//When __INTEL_COMPILER is defined to a non-zero value, the intrinsic-based
//routines are run and printed as well; otherwise a hint is printed instead.
//Always returns 0.
int main()
{
    float fvec_x[SIZE], fvec_y[SIZE];   //inputs for the float routines
    short svec_a[SIZE], svec_b[SIZE];   //inputs for the MMX routine
    float product;
    short mmx_product;
    int idx = 0;

    //Element k of every vector is initialised to k.
    while (idx < SIZE) {
        fvec_x[idx] = idx;
        fvec_y[idx] = idx;
        svec_a[idx] = idx;
        svec_b[idx] = idx;
        idx++;
    }

    product = dot_product(fvec_x, fvec_y);
    printf("Dot Product computed by C: %f\n", product);

#if __INTEL_COMPILER
    product = dot_product_intrin(fvec_x, fvec_y);
    printf("Dot Product computed by SSE2 intrinsics: %f\n", product);

    mmx_product = MMX_dot_product(svec_a, svec_b);
    printf("Dot Product computed by MMX intrinsics: %d\n", mmx_product);
#else
    printf("Use INTEL compiler in order to calculate dot product\n");
    printf("usng intrinsics\n");
#endif

    return 0;
}

//Computes the dot product of two float vectors in plain C.
//
//a, b: arrays holding at least SIZE elements each.
//Returns the sum of a[i]*b[i] for i in [0, SIZE).
float dot_product(float *a, float *b)
{
    int i;

    //Accumulate in float. An integer accumulator would truncate the running
    //sum after every addition, dropping fractional parts of the products.
    float sum = 0.0f;

    for (i = 0; i < SIZE; i++) {
        sum += a[i] * b[i];
    }

    return sum;
}

#if __INTEL_COMPILER

//Computes the dot product of two float vectors using SSE3 intrinsics.
//
//a, b: arrays holding at least SIZE floats each. SIZE is assumed to be a
//      multiple of 4 because every iteration consumes four elements.
//      Unaligned loads are used, so no particular alignment is required.
//Returns the sum of a[i]*b[i] for i in [0, SIZE).
float dot_product_intrin(float *a, float *b)
{
    float total;
    int i;
    __m128 num1, num2, num3, num4;

    num4 = _mm_setzero_ps(); //running sum, starts at zero

    for (i = 0; i < SIZE; i += 4)
    {
        num1 = _mm_loadu_ps(a + i); //num1 = a[i+3] a[i+2] a[i+1] a[i]
        num2 = _mm_loadu_ps(b + i); //num2 = b[i+3] b[i+2] b[i+1] b[i]
        num3 = _mm_mul_ps(num1, num2); //element-wise products p3 p2 p1 p0

        //horizontal addition of adjacent pairs:
        //num3 = p3+p2  p1+p0  p3+p2  p1+p0
        num3 = _mm_hadd_ps(num3, num3);

        num4 = _mm_add_ps(num4, num3); //vertical addition into the running sum
    }

    //num4 holds the two partial sums repeated twice; one more horizontal
    //addition leaves their total in every element.
    num4 = _mm_hadd_ps(num4, num4);

    _mm_store_ss(&total, num4); //stores the lowest element
    return total;
}

//MMX technology cannot handle single precision floats

//Computes the dot product of two vectors of shorts using MMX intrinsics.
//
//a, b: arrays holding at least SIZE shorts each. SIZE is assumed to be a
//      multiple of 4 because every iteration consumes four elements.
//Returns the dot product narrowed to short. Only the low 16 bits of each
//partial sum are kept, so totals outside the range of short do not survive
//the narrowing (NOTE(review): the exact out-of-range result depends on the
//compiler's int-to-short conversion).
short MMX_dot_product(short *a, short *b)
{
    int i;
    short result, data;
    __m64 num3, sum;
    __m64 *ptr1, *ptr2;

    sum = _mm_setzero_si64(); //sets sum to zero

    for (i = 0; i < SIZE; i += 4) {
        //View four consecutive shorts of each array as one __m64 value.
        ptr1 = (__m64*)&a[i];
        ptr2 = (__m64*)&b[i];

        //Multiplies the four 16-bit pairs and adds adjacent products,
        //giving two 32-bit sums: a[i+3]*b[i+3]+a[i+2]*b[i+2] in the high
        //half and a[i+1]*b[i+1]+a[i]*b[i] in the low half.
        num3 = _m_pmaddwd(*ptr1, *ptr2);

        //Accumulate as two 32-bit lanes to match the layout produced by
        //_m_pmaddwd; a 16-bit add would drop carries between the halves
        //of each lane.
        sum = _m_paddd(sum, num3);
    }

    data = (short)_m_to_int(sum);   //low 32-bit partial sum
    sum = _m_psrlqi(sum, 32);       //moves the high partial sum down
    result = (short)_m_to_int(sum); //high 32-bit partial sum
    result = result + data;

    _m_empty(); //clears the MMX registers and MMX state.
    return result;
}

#endif