This code sample demonstrates how to use C, MMX™ technology, and Streaming SIMD Extensions 3 (SSE3) intrinsics to calculate the dot product of two vectors. The following outputs are typical of this code when computed by C or SSE3 intrinsics: 506.000000 and when computed by MMX intrinsics: 506. Output may vary depending on your compiler version and the components of your computing platform.
SSE3 intrinsics do not run on processors from the Pentium® III family or earlier.
/*
* Copyright (C) 2006 Intel Corporation. All rights reserved.
*
* The information and source code contained herein is the exclusive property
* of Intel Corporation and may not be disclosed, examined, or reproduced in
* whole or in part without explicit written authorization from the Company.
*
* [Description]
* This code sample demonstrates how to use C, MMX, and SSE3
* instrinsics to calculate the dot product of two vectors.
*
* [Compile]
* icc dot_prodcut.c (linux) | icl dot_product.c (windows)
*
* [Output]
* Dot Product computed by C: 506.000000
* Dot Product computed by SSE2 intrinsics: 506.000000
* Dot Product computed by MMX intrinsics: 506
*/
#include <stdio.h>
#include <pmmintrin.h>
#define SIZE 12 //assumes size is a multiple of 4 because MMX and SSE
//registers will store 4 elements.
//Computes dot product using C
float dot_product(float *a, float *b);
//Computes dot product using intrinsics
float dot_product_intrin(float *a, float *b);
//Computes dot product using MMX intrinsics
short MMX_dot_product(short *a, short *b);
int main()
{
float x[SIZE], y[SIZE];
short a[SIZE], b[SIZE];
int i;
float product;
short mmx_product;
for(i=0; i<SIZE; i++)
{
x[i]=i;
y[i]=i;
a[i]=i;
b[i]=i;
}
product= dot_product(x, y);
printf("Dot Product computed by C: %f\n", product);
#if __INTEL_COMPILER
product =dot_product_intrin(x,y);
printf("Dot Product computed by SSE2 intrinsics: %f\n", product);
mmx_product =MMX_dot_product(a,b);
printf("Dot Product computed by MMX intrinsics: %d\n", mmx_product);
#else
printf("Use INTEL compiler in order to calculate dot product\n");
printf("usng intrinsics\n");
#endif
return 0;
}
float dot_product(float *a, float *b)
{
int i;
int sum=0;
for(i=0; i<SIZE; i++)
{
sum += a[i]*b[i];
}
return sum;
}
#if __INTEL_COMPILER
float dot_product_intrin(float *a, float *b)
{
float arr[4];
float total;
int i;
__m128 num1, num2, num3, num4;
num4= _mm_setzero_ps(); //sets sum to zero
for(i=0; i<SIZE; i+=4)
{
num1 = _mm_loadu_ps(a+i); //loads unaligned array a into num1 num1= a[3] a[2] a[1] a[0]
num2 = _mm_loadu_ps(b+i); //loads unaligned array b into num2 num2= b[3] b[2] b[1] b[0]
num3 = _mm_mul_ps(num1, num2); //performs multiplication num3 = a[3]*b[3] a[2]*b[2] a[1]*b[1] a[0]*b[0]
num3 = _mm_hadd_ps(num3, num3); //performs horizontal addition
//num3= a[3]*b[3]+ a[2]*b[2] a[1]*b[1]+a[0]*b[0] a[3]*b[3]+ a[2]*b[2] a[1]*b[1]+a[0]*b[0]
num4 = _mm_add_ps(num4, num3); //performs vertical addition
}
num4= _mm_hadd_ps(num4, num4);
_mm_store_ss(&total,num4);
return total;
}
//MMX technology cannot handle single precision floats
short MMX_dot_product(short *a, short *b)
{
int i;
short result, data;
__m64 num3, sum;
__m64 *ptr1, *ptr2;
sum = _mm_setzero_si64(); //sets sum to zero
for(i=0; i<SIZE; i+=4){
ptr1 = (__m64*)&a[i]; //Converts array a to a pointer of type
//__m64 and stores four elements into MMX
//registers
ptr2 = (__m64*)&b[i];
num3 = _m_pmaddwd(*ptr1, *ptr2); //multiplies elements and adds lower
//elements with lower element and
//higher elements with higher
sum = _m_paddw(sum, num3);
}
data = _m_to_int(sum); //converts __m64 data type to an int
sum= _m_psrlqi(sum,32); //shifts sum
result = _m_to_int(sum);
result= result+data;
_m_empty(); //clears the MMX registers and MMX state.
return result;
}
#endif