Matrix & Vector Operations using CUDA
Title:
Design parallel algorithms to:
1. Add two large vectors
2. Multiply a matrix by a vector
3. Multiply two N × N arrays using N² processors (one processor/thread per output element)
Source Code:
- Vector Addition
- Matrix Multiplication
- Matrix-Vector Multiplication
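All three programs follow the same host-side pattern: allocate host and device buffers, copy the inputs to the device, launch the kernel, copy the result back, and check it against a serial reference. The listings omit error checking for brevity; a small wrapper like the sketch below (the CUDA_CHECK name is illustrative and is not used in the assignment code) can be placed around every CUDA runtime call to surface failures early.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Sketch only: CUDA_CHECK is an illustrative helper, not part of the listings below.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err = (call);                                            \
        if (err != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",                     \
                    cudaGetErrorString(err), __FILE__, __LINE__);            \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Example: CUDA_CHECK(cudaMalloc(&a_dev, size));
//          CUDA_CHECK(cudaMemcpy(a_dev, a, size, cudaMemcpyHostToDevice));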
Vector Addition:
#include<iostream>
#include<cstdlib>
using namespace std;

// Vector addition kernel: each thread adds one pair of elements
__global__ void vectorAdd(int *a, int *b, int *result, int n)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n)
    {
        result[tid] = a[tid] + b[tid];
    }
}

int main()
{
    int *a, *b, *c;
    int *a_dev, *b_dev, *c_dev;
    int n = 1 << 24;

    a = new int[n];
    b = new int[n];
    c = new int[n];
    int *d = new int[n];
    int size = n * sizeof(int);

    cudaMalloc(&a_dev, size);
    cudaMalloc(&b_dev, size);
    cudaMalloc(&c_dev, size);

    // Array initialization; rand() could be used to assign random values
    for (int i = 0; i < n; i++)
    {
        a[i] = 1;
        b[i] = 2;
        d[i] = a[i] + b[i];   // serial addition, used as the reference result
    }

    cudaEvent_t start, end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);

    cudaMemcpy(a_dev, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(b_dev, b, size, cudaMemcpyHostToDevice);

    int threads = 1024;
    int blocks = (n + threads - 1) / threads;

    cudaEventRecord(start);
    // Parallel addition on the GPU
    vectorAdd<<<blocks, threads>>>(a_dev, b_dev, c_dev, n);
    cudaEventRecord(end);
    cudaEventSynchronize(end);

    float time = 0.0;
    cudaEventElapsedTime(&time, start, end);

    cudaMemcpy(c, c_dev, size, cudaMemcpyDeviceToHost);

    // Compare the GPU result against the serial reference
    int error = 0;
    for (int i = 0; i < n; i++) {
        error += d[i] - c[i];
        //cout << " gpu " << c[i] << " CPU " << d[i];
    }
    cout << "Error : " << error;
    cout << "\nTime Elapsed (ms): " << time;
    return 0;
}
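One optional simplification, not used in the listing above, is unified (managed) memory: cudaMallocManaged returns a pointer that is valid on both the host and the device, so the explicit cudaMemcpy calls disappear. A minimal sketch, assuming the same sizes and values as above (timing omitted):

#include <iostream>
#include <cuda_runtime.h>

// Sketch only: unified-memory variant of the vector addition program above.
__global__ void vectorAdd(int *a, int *b, int *result, int n)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n)
        result[tid] = a[tid] + b[tid];
}

int main()
{
    int n = 1 << 24;
    int *a, *b, *c;
    cudaMallocManaged(&a, n * sizeof(int));   // accessible from both CPU and GPU
    cudaMallocManaged(&b, n * sizeof(int));
    cudaMallocManaged(&c, n * sizeof(int));

    for (int i = 0; i < n; i++) { a[i] = 1; b[i] = 2; }

    int threads = 1024;
    int blocks = (n + threads - 1) / threads;
    vectorAdd<<<blocks, threads>>>(a, b, c, n);
    cudaDeviceSynchronize();                  // wait for the kernel before reading c on the host

    std::cout << "c[0] = " << c[0] << ", c[n-1] = " << c[n - 1] << std::endl;

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return 0;
}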
Matrix Multiplication:
#include<iostream>
#include<cstdlib>
#include<cmath>
using namespace std;

// Matrix multiplication kernel: each thread computes one element of C = A * B
__global__ void matrixMultiplication(int *a, int *b, int *c, int n)
{
    int row = threadIdx.y + blockDim.y * blockIdx.y;
    int col = threadIdx.x + blockDim.x * blockIdx.x;
    int sum = 0;
    if (row < n && col < n)
    {
        for (int j = 0; j < n; j++)
        {
            sum = sum + a[row * n + j] * b[j * n + col];
        }
        c[n * row + col] = sum;   // guarded write: only threads inside the matrix store a result
    }
}

int main()
{
    int *a, *b, *c;
    int *a_dev, *b_dev, *c_dev;
    int n = 3;

    a = new int[n * n];
    b = new int[n * n];
    c = new int[n * n];
    int *d = new int[n * n];            // CPU reference result
    int size = n * n * sizeof(int);

    cudaMalloc(&a_dev, size);
    cudaMalloc(&b_dev, size);
    cudaMalloc(&c_dev, size);

    // Array initialization; rand()%n could be used instead of constants
    for (int i = 0; i < n * n; i++)
    {
        a[i] = 2;
        b[i] = 1;
    }

    cudaEvent_t start, end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);

    cudaMemcpy(a_dev, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(b_dev, b, size, cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(n, n);
    dim3 blocksPerGrid(1, 1);
    if (n * n > 1024)                   // a block cannot exceed 1024 threads, so tile larger matrices
    {
        threadsPerBlock.x = 32;
        threadsPerBlock.y = 32;
        blocksPerGrid.x = ceil((double)n / (double)threadsPerBlock.x);
        blocksPerGrid.y = ceil((double)n / (double)threadsPerBlock.y);
    }

    // GPU multiplication
    cudaEventRecord(start);
    matrixMultiplication<<<blocksPerGrid, threadsPerBlock>>>(a_dev, b_dev, c_dev, n);
    cudaEventRecord(end);
    cudaEventSynchronize(end);

    float time = 0.0;
    cudaEventElapsedTime(&time, start, end);

    cudaMemcpy(c, c_dev, size, cudaMemcpyDeviceToHost);

    // CPU matrix multiplication (reference)
    int sum = 0;
    for (int row = 0; row < n; row++)
    {
        for (int col = 0; col < n; col++)
        {
            sum = 0;
            for (int k = 0; k < n; k++)
                sum = sum + a[row * n + k] * b[k * n + col];
            d[row * n + col] = sum;
        }
    }

    // Compare the GPU result against the CPU reference
    int error = 0;
    for (int i = 0; i < n * n; i++) {
        error += d[i] - c[i];
        //cout << " gpu " << c[i] << " CPU " << d[i] << endl;
    }
    cout << "Error : " << error;
    cout << "\nTime Elapsed (ms): " << time;
    return 0;
}
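The kernel above reads every element of A and B from global memory n times. A common optimization, not part of the original listing, is to stage TILE × TILE sub-blocks of A and B in shared memory and reuse them; the kernel name matrixMultiplicationTiled and the tile size of 16 below are illustrative choices, and the block dimensions must match the tile size. A minimal sketch:

#define TILE 16

// Sketch only: shared-memory tiled variant of the matrix multiplication kernel above.
__global__ void matrixMultiplicationTiled(const int *a, const int *b, int *c, int n)
{
    __shared__ int tileA[TILE][TILE];
    __shared__ int tileB[TILE][TILE];

    int row = blockIdx.y * TILE + threadIdx.y;
    int col = blockIdx.x * TILE + threadIdx.x;
    int sum = 0;

    // Walk across the tiles that cover one row of A and one column of B.
    for (int t = 0; t < (n + TILE - 1) / TILE; t++)
    {
        int aCol = t * TILE + threadIdx.x;
        int bRow = t * TILE + threadIdx.y;
        tileA[threadIdx.y][threadIdx.x] = (row < n && aCol < n) ? a[row * n + aCol] : 0;
        tileB[threadIdx.y][threadIdx.x] = (bRow < n && col < n) ? b[bRow * n + col] : 0;
        __syncthreads();                       // tile fully loaded before use

        for (int k = 0; k < TILE; k++)
            sum += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
        __syncthreads();                       // finish using the tile before overwriting it
    }

    if (row < n && col < n)
        c[row * n + col] = sum;
}

// Launch with TILE x TILE blocks, e.g.:
//   dim3 threadsPerBlock(TILE, TILE);
//   dim3 blocksPerGrid((n + TILE - 1) / TILE, (n + TILE - 1) / TILE);
//   matrixMultiplicationTiled<<<blocksPerGrid, threadsPerBlock>>>(a_dev, b_dev, c_dev, n);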
Matrix-Vector Multiplication:
#include<iostream>
#include<cstdlib>
#include<ctime>
using namespace std;

// Matrix-vector multiplication kernel: each thread computes one element of c = A * b
__global__ void matrixVectorMultiplication(int *a, int *b, int *c, int n)
{
    int row = threadIdx.x + blockDim.x * blockIdx.x;
    int sum = 0;
    if (row < n)
    {
        for (int j = 0; j < n; j++)
        {
            sum = sum + a[row * n + j] * b[j];
        }
        c[row] = sum;   // guarded write: only rows inside the matrix store a result
    }
}

int main()
{
    int *a, *b, *c;
    int *a_dev, *b_dev, *c_dev;
    int n = 32;

    a = new int[n * n];
    b = new int[n];
    c = new int[n];
    int *d = new int[n];                    // CPU reference result
    int sizeVec = n * sizeof(int);          // bytes in a vector
    int sizeMat = n * n * sizeof(int);      // bytes in the matrix

    cudaMalloc(&a_dev, sizeMat);
    cudaMalloc(&b_dev, sizeVec);
    cudaMalloc(&c_dev, sizeVec);

    // Array initialization; rand()%n could be used instead of the running values
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
        {
            a[i * n + j] = i * n + j + 1;
        }
        b[i] = i + 1;
    }

    cudaEvent_t start, end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);

    cudaMemcpy(a_dev, a, sizeMat, cudaMemcpyHostToDevice);
    cudaMemcpy(b_dev, b, sizeVec, cudaMemcpyHostToDevice);

    // 1D launch: the kernel only uses threadIdx.x and blockIdx.x
    int threads = 256;
    int blocks = (n + threads - 1) / threads;

    cudaEventRecord(start);
    matrixVectorMultiplication<<<blocks, threads>>>(a_dev, b_dev, c_dev, n);
    cudaEventRecord(end);
    cudaEventSynchronize(end);

    float time = 0.0;
    cudaEventElapsedTime(&time, start, end);

    cudaMemcpy(c, c_dev, sizeVec, cudaMemcpyDeviceToHost);
    cout << "\nGPU Time Elapsed (ms): " << time;

    // CPU matrix-vector multiplication (reference)
    clock_t t = clock();
    int sum = 0;
    for (int row = 0; row < n; row++)
    {
        sum = 0;
        for (int col = 0; col < n; col++)
        {
            sum = sum + a[row * n + col] * b[col];
        }
        d[row] = sum;
    }
    t = clock() - t;
    cout << "\nCPU Time Elapsed (ms): " << 1000.0 * (double)t / CLOCKS_PER_SEC;

    // Compare the GPU result against the CPU reference
    int error = 0;
    for (int i = 0; i < n; i++) {
        error += d[i] - c[i];
        // cout << " gpu " << c[i] << " CPU " << d[i] << endl;
    }
    cout << "Error : " << error;
    return 0;
}
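None of the three listings release their resources before returning; the operating system reclaims everything at process exit, so the results are unaffected, but freeing explicitly is tidier and becomes necessary if the allocations ever move inside a loop. A sketch of the cleanup lines that could be added just before return 0 in each program (variable names as in the listings above):

    cudaFree(a_dev);          // device buffers
    cudaFree(b_dev);
    cudaFree(c_dev);
    cudaEventDestroy(start);  // timing events
    cudaEventDestroy(end);
    delete[] a;               // host buffers
    delete[] b;
    delete[] c;
    delete[] d;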