Thursday, April 10 2025

Header Ads

Matrix & Vector Operations using CUDA

Title:

Design parallel algorithm to
1. Add two large vectors
2. Multiply Vector and Matrix
3. Multiply two N × N matrices using n² processors

Source Code:

  1. Vector Addition
  2. #include<iostream>
    #include<cstdlib>
    using namespace std;
    //VectorAdd parallel function
    __global__ void vectorAdd(int *a, int *b, int *result, int n)
    {
    int tid=threadIdx.x+blockIdx.x*blockDim.x;
    if(tid<n)
    {
    result[tid]=a[tid]+b[tid];
    }
    }
    int main()
    {
    int *a,*b,*c;
    int *a_dev,*b_dev,*c_dev;
    int n=1<<24;
    a=new int[n];
    b=new int[n];
    c=new int[n];
    int *d=new int[n];
    int size=n*sizeof(int);
    cudaMalloc(&a_dev,size);
    cudaMalloc(&b_dev,size);
    cudaMalloc(&c_dev,size);
    //Array initialization..You can use Randon function to assign values
    for(int i=0;i<n;i++)
    {
    a[i]=1;
    b[i]=2;
    d[i]=a[i]+b[i]; //calculating serial addition
    }
    cudaEvent_t start,end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);
    cudaMemcpy(a_dev,a,size,cudaMemcpyHostToDevice);
    cudaMemcpy(b_dev,b,size,cudaMemcpyHostToDevice);
    int threads=1024;
    int blocks=(n+threads-1)/threads;
    cudaEventRecord(start);
    //Parallel addition program
    vectorAdd<<<blocks,threads>>>(a_dev,b_dev,c_dev,n);
    cudaEventRecord(end);
    cudaEventSynchronize(end);
    float time=0.0;
    cudaEventElapsedTime(&time,start,end);
    cudaMemcpy(c,c_dev,size,cudaMemcpyDeviceToHost);
    //Calculate the error term.
    int error=0;
    for(int i=0;i<n;i++){
    error+=d[i]-c[i];
    //cout<<" gpu "<<c[i]<<" CPU "<<d[i];
    }
    cout<<"Error : "<<error;
    cout<<"\nTime Elapsed: "<<time;
    return 0;
    }
  3. Matrix Multiplication
  4. #include<iostream>
    #include<cstdlib>
    #include<cmath>
    using namespace std;
    //Matrix multiplication Cuda
    __global__ void matrixMultiplication(int *a, int *b, int *c, int n)
    {
    int row=threadIdx.y+blockDim.y*blockIdx.y;
    int col=threadIdx.x+blockDim.x*blockIdx.x;
    int sum=0;
    if(row<n && col<n)
    for(int j=0;j<n;j++)
    {
    sum=sum+a[row*n+j]*b[j*n+col];
    }
    c[n*row+col]=sum;
    }
    int main()
    {
    int *a,*b,*c;
    int *a_dev,*b_dev,*c_dev;
    int n=3;
    a=new int[n*n];
    b=new int[n*n];
    c=new int[n*n];
    int *d=new int[n*n];
    int size=n*n*sizeof(int);
    cudaMalloc(&a_dev,size);
    cudaMalloc(&b_dev,size);
    cudaMalloc(&c_dev,size);
    //Array initialization
    for(int i=0;i<n*n;i++)
    {
    a[i]=2; //rand()%n;
    b[i]=1;//rand()%n;
    // d[i]=a[i]+b[i];
    }
    cudaEvent_t start,end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);
    cudaMemcpy(a_dev,a,size,cudaMemcpyHostToDevice);
    cudaMemcpy(b_dev,b,size,cudaMemcpyHostToDevice);
    dim3 threadsPerBlock(n, n);
    dim3 blocksPerGrid(1, 1);
    if(n*n>512){
    threadsPerBlock.x=512;
    threadsPerBlock.y=512;
    blocksPerGrid.x=ceil((double)n/(double)threadsPerBlock.x);
    blocksPerGrid.y=ceil((double)n/(double)threadsPerBlock.y);
    }
    //GPU Multiplication
    cudaEventRecord(start);
    matrixMultiplication<<<blocksPerGrid,threadsPerBlock>>>(a_dev,b_dev,c_dev,n);
    cudaEventRecord(end);
    cudaEventSynchronize(end);
    float time=0.0;
    cudaEventElapsedTime(&time,start,end);
    cudaMemcpy(c,c_dev,size,cudaMemcpyDeviceToHost);
    //CPU matrix multiplication
    int sum=0;
    for(int row=0;row<n;row++)
    {
    for(int col=0;col<n;col++)
    {
    sum=0;
    for(int k=0;k<n;k++)
    sum=sum+a[row*n+k]*b[k*n+col];
    d[row*n+col]=sum;
    }
    }
    int error=0;
    for(int i=0;i<n*n;i++){
    error+=d[i]-c[i];
    //cout<<" gpu "<<c[i]<<" CPU "<<d[i]<<endl;
    }
    cout<<"Error : "<<error;
    cout<<"\nTime Elapsed: "<<time;
    return 0;
    }
  5. Matrix vector Multiplication
  6. #include<iostream>
    #include<cstdlib>
    #include<cmath>
    #include<time.h>
    using namespace std;
    __global__ void matrixVectorMultiplication(int *a, int *b, int *c, int n)
    {
    int row=threadIdx.x+blockDim.x*blockIdx.x;
    int sum=0;
    if(row<n)
    for(int j=0;j<n;j++)
    {
    sum=sum+a[row*n+j]*b[j];
    }
    c[row]=sum;
    }
    int main()
    {
    int *a,*b,*c;
    int *a_dev,*b_dev,*c_dev;
    int n=32;
    a=new int[n*n];
    b=new int[n];
    c=new int[n];
    int *d=new int[n];
    int size=n*sizeof(int);
    cudaMalloc(&a_dev,size*size);
    cudaMalloc(&b_dev,size);
    cudaMalloc(&c_dev,size);
    for(int i=0;i<n;i++)
    {
    for(int j=0;j<n;j++)
    {
    a[i*n+j]= i*n+j+1; //rand()%n;
    }
    b[i]= i+1; //rand()%n;
    //cout<<a[i]<<" ";
    // d[i]=a[i]+b[i];
    }
    cudaEvent_t start,end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);
    cudaMemcpy(a_dev,a,size*size,cudaMemcpyHostToDevice);
    cudaMemcpy(b_dev,b,size,cudaMemcpyHostToDevice);
    dim3 threadsPerBlock(n, n);
    dim3 blocksPerGrid(1, 1);
    if(n*n>512){
    threadsPerBlock.x=512;
    threadsPerBlock.y=512;
    blocksPerGrid.x=ceil((double)n/(double)threadsPerBlock.x);
    blocksPerGrid.y=ceil((double)n/(double)threadsPerBlock.y);
    }
    cudaEventRecord(start);
    matrixVectorMultiplication<<<blocksPerGrid,threadsPerBlock>>>(a_dev,b_dev,c_dev,n);
    cudaEventRecord(end);
    cudaEventSynchronize(end);
    float time=0.0;
    cudaEventElapsedTime(&time,start,end);
    cudaMemcpy(c,c_dev,size,cudaMemcpyDeviceToHost);
    cout<<"\nGPU Time Elapsed: "<<time;
    //CPU matrixVector multiplication
    clock_t t=clock();
    int sum=0;
    for(int row=0;row<n;row++)
    {
    sum=0;
    for(int col=0;col<n;col++)
    {
    sum=sum+a[row*n+col]*b[col];
    }
    d[row]=sum;
    }
    t=clock()-t;
    cout<<"\nCPU Time Elapsed: "<<((double)t); //((double)t)/CLOCKS_PER_SEC;
    int error=0;
    for(int i=0;i<n;i++){
    error+=d[i]-c[i];
    // cout<<" gpu "<<c[i]<<" CPU "<<d[i]<<endl;
    }
    cout<<"Error : "<<error;
    return 0;
    }
    view raw matVecMul.cu hosted with ❤ by GitHub

No comments:

Powered by Blogger.