介绍忙等待,互斥量,信号量,条件变量:以估计π为例(并行计算,用Pthread库)

首先是一个 Hello World 程序,创建多个线程并打印语句,主要是想说明这三个函数:

int pthread_create(pthread_t* thread,const pthread_attr_t* attr,void* (*start_routine)(void*),void* arg);//创建线程并关联运行函数

int pthread_join(pthread_t thread, void **retval);//等待指定的线程结束,并回收该线程占用的资源

void free(void* ptr);//手动释放程序通过malloc()等分配的空间,例如存放pthread_t的数组

下面是代码(gcc -g -Wall -o helloworld helloworld.c -lpthread ; ./helloworld 4,其中4是要创建的线程数):

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
/*

   int pthread_create(pthread_t* thread,const pthread_attr_t* attr,void* (*start_routine)(void*),void* arg);

   int pthread_join(pthread_t thread, void **retval);

*/

//Number of worker threads; assigned in main from the first command-line argument
int thread_count;

//Thread start routine: prints a greeting containing the rank passed in through the void* argument
void *Hello(void* rank);

/*
 * Program entry point.
 *
 * Reads the number of threads from argv[1], starts that many threads running
 * Hello (each receives its rank, cast to void*, as the argument), prints a
 * greeting from the main thread, then joins every thread and frees the
 * handle array.
 *
 * Returns 0 on success and 1 on a usage error, an allocation failure or a
 * failed pthread_create.
 */
int main(int argc,char* argv[]){
	//Use long so the rank survives the round trip through void* on 64-bit systems
	long thread;
	long requested;
	char* end;
	pthread_t* thread_handles;

	//Get number of threads from command line
	if(argc < 2){
		fprintf(stderr,"usage: %s <number of threads>\n",argc > 0 ? argv[0] : "program");
		return 1;
	}
	requested = strtol(argv[1],&end,10);
	//Reject non-numeric input, counts below 1 and values that do not fit in an int
	if(end == argv[1] || *end != '\0' || requested <= 0 || requested != (long)(int)requested){
		fprintf(stderr,"invalid number of threads: %s\n",argv[1]);
		return 1;
	}
	thread_count = (int)requested;

	thread_handles = malloc((size_t)thread_count*sizeof *thread_handles);
	if(thread_handles == NULL){
		fprintf(stderr,"out of memory\n");
		return 1;
	}

	for(thread = 0;thread < thread_count;thread++){
		//Create threads
		int rc = pthread_create(&thread_handles[thread],NULL,Hello,(void*)thread);
		if(rc != 0){
			//Returning from main ends the process, including threads already started
			fprintf(stderr,"pthread_create failed for thread %ld (error %d)\n",thread,rc);
			free(thread_handles);
			return 1;
		}
	}
	printf("Hello from the main thread\n");

	for(thread = 0;thread < thread_count;thread++){
		//Wait until thread_handles[thread] completes
		pthread_join(thread_handles[thread],NULL);
	}
	free(thread_handles);
	return 0;
}//main

/*
 * Thread start routine: prints one greeting line.
 *
 * rank carries the thread's index, passed through the void* argument as a
 * long. The line also shows the global thread_count. Always returns NULL.
 */
void *Hello(void *rank){
	const long id = (long)rank;

	printf("Hello from thread %ld of %d\n",id,thread_count);

	return NULL;
}//Hello

熟悉了基本的代码(Hello函数可以被你想让线程执行的函数所替代)了之后,来看用并行的方法来估计π的大小。

公式是这样的:pi=4(1-1/3+1/5-1/7+...+((-1)^n)*(1/(2n+1))+...)

我们用多个线程来处理这些计算,每个线程处理一部分。把所有计算都加起来,就是π的值了。

这就涉及到一个问题了:假设变量sum为全局变量,那么多个线程同时执行修改它的代码(即临界区)时,一个线程的更新可能被另一个线程覆盖,导致结果出错。

下面我分别用忙等待,互斥量,信号量来解决临界区问题。

/

忙等待:

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
/*
   pi=4(1-1/3+1/5-1/7+...+((-1)^n)*(1/(2n+1))+...)

   critical_sections problem
   method:Busy-Waiting
       but it will continually use the CPU accomplishing nothing
*/

int thread_count;            //number of worker threads, set in main from argv[1]
int n = 1000000;             //number of series terms to sum (10^6)
double sum = 0.0;            //shared accumulator; each thread adds its partial sum to it
int flag = 0;                //rank of the thread whose turn it is to update sum (busy-waiting turn marker)

//Thread start routine: sums this thread's share of the series and adds it to sum
void *Thread_sum(void* rank);

/*
 * Program entry point (busy-waiting version).
 *
 * Reads the number of threads from argv[1], starts that many threads running
 * Thread_sum, joins them all and prints 4*sum, the estimate of pi.
 *
 * Returns 0 on success and 1 on a usage error, an allocation failure or a
 * failed pthread_create.
 */
int main(int argc,char* argv[]){
	//Use long so the rank survives the round trip through void* on 64-bit systems
	long thread;
	long requested;
	char* end;
	pthread_t* thread_handles;

	//Get number of threads from command line
	if(argc < 2){
		fprintf(stderr,"usage: %s <number of threads>\n",argc > 0 ? argv[0] : "program");
		return 1;
	}
	requested = strtol(argv[1],&end,10);
	//Reject non-numeric input, counts below 1 and values that do not fit in an int
	if(end == argv[1] || *end != '\0' || requested <= 0 || requested != (long)(int)requested){
		fprintf(stderr,"invalid number of threads: %s\n",argv[1]);
		return 1;
	}
	thread_count = (int)requested;

	thread_handles = malloc((size_t)thread_count*sizeof *thread_handles);
	if(thread_handles == NULL){
		fprintf(stderr,"out of memory\n");
		return 1;
	}

	for(thread = 0;thread < thread_count;thread++){
		//Create threads
		int rc = pthread_create(&thread_handles[thread],NULL,Thread_sum,(void*)thread);
		if(rc != 0){
			//A partial set of threads would give a partial sum, so give up;
			//returning from main ends the process and the threads already started
			fprintf(stderr,"pthread_create failed for thread %ld (error %d)\n",thread,rc);
			free(thread_handles);
			return 1;
		}
	}

	printf("Hello from the main thread\n");

	for(thread = 0;thread < thread_count;thread++){
		//Wait until thread_handles[thread] completes
		pthread_join(thread_handles[thread],NULL);
	}

	free(thread_handles);
	//sum holds the series value; pi is four times that
	printf("%f",4*sum);
	return 0;
}//main

/*
 * Thread start routine (busy-waiting version).
 *
 * rank is the thread's index passed through the void* argument. The thread
 * sums the terms factor/(2*i+1) for its contiguous range of i, then waits for
 * its turn (flag == rank) before adding the partial result to the global sum.
 * Always returns NULL.
 */
void *Thread_sum(void *rank){
	long my_rank=(long)rank;
	double factor,my_sum = 0.0;
	long long i;
	long long my_n = n/thread_count;
	long long my_first_i = my_n*my_rank;
	//n/thread_count truncates, so the last thread also takes the n % thread_count leftover terms
	long long my_last_i = (my_rank == thread_count-1) ? n : my_first_i + my_n;

	//Term i has sign (-1)^i
	if(my_first_i % 2 == 0)
		factor = 1.0;
	else
		factor = -1.0;

	for(i = my_first_i;i < my_last_i;i++,factor = -factor){
		my_sum += factor/(2*i+1);
	}

	//Use Busy-Waiting to solve critical sections after loop:
	//spin until flag equals this thread's rank, so threads update sum in rank order.
	//NOTE(review): flag is an ordinary int, so nothing forces the compiler to
	//re-read it on every iteration; with optimization enabled this loop may
	//never observe another thread's update.
	while(flag != my_rank);
	sum += my_sum;
	//Hand the turn to the next rank
	flag = (flag+1) % thread_count;
	return NULL;
}//Thread_sum

缺点:

1.这种方法会持续占用CPU资源(线程在while循环里空转),而且各线程只能按照编号顺序依次执行相加。

2.如果打开编译器优化,它也可能是不可靠的:flag只是普通变量,编译器可能把对flag的读取优化到循环之外,或者调整语句的执行顺序。

/

互斥量:

#include <stdio.h>

#include <stdlib.h>

#include <pthread.h>

/*

   pi=4(1-1/3+1/5-1/7+...+((-1)^n)*(1/(2n+1))+...)



   critical_sections problem

   method:Mutexes

*/



int thread_count;            //number of worker threads, set in main from argv[1]

int n = 1000000;             //number of series terms to sum (10^6)

double sum = 0.0;            //shared accumulator; each thread adds its partial sum to it

int flag = 0;                //not referenced by the visible code of this version; appears to be left over from the busy-waiting version

pthread_mutex_t mutex;       //guards updates to sum; initialized and destroyed in main



//Thread start routine: sums this thread's share of the series and adds it to sum
void *Thread_sum(void* rank);



/*
 * Program entry point (mutex version).
 *
 * Reads the number of threads from argv[1], initializes the mutex, starts
 * that many threads running Thread_sum, joins them all, destroys the mutex
 * and prints 4*sum, the estimate of pi.
 *
 * Returns 0 on success and 1 on a usage error, an allocation failure, or a
 * failed pthread_mutex_init / pthread_create.
 */
int main(int argc,char* argv[]){
	//Use long so the rank survives the round trip through void* on 64-bit systems
	long thread;
	long requested;
	char* end;
	pthread_t* thread_handles;

	//Get number of threads from command line
	if(argc < 2){
		fprintf(stderr,"usage: %s <number of threads>\n",argc > 0 ? argv[0] : "program");
		return 1;
	}
	requested = strtol(argv[1],&end,10);
	//Reject non-numeric input, counts below 1 and values that do not fit in an int
	if(end == argv[1] || *end != '\0' || requested <= 0 || requested != (long)(int)requested){
		fprintf(stderr,"invalid number of threads: %s\n",argv[1]);
		return 1;
	}
	thread_count = (int)requested;

	thread_handles = malloc((size_t)thread_count*sizeof *thread_handles);
	if(thread_handles == NULL){
		fprintf(stderr,"out of memory\n");
		return 1;
	}

	//initialize Mutex
	if(pthread_mutex_init(&mutex,NULL) != 0){
		fprintf(stderr,"pthread_mutex_init failed\n");
		free(thread_handles);
		return 1;
	}

	for(thread = 0;thread < thread_count;thread++){
		//Create threads
		int rc = pthread_create(&thread_handles[thread],NULL,Thread_sum,(void*)thread);
		if(rc != 0){
			//A partial set of threads would give a partial sum, so give up;
			//returning from main ends the process and the threads already started
			fprintf(stderr,"pthread_create failed for thread %ld (error %d)\n",thread,rc);
			free(thread_handles);
			return 1;
		}
	}

	printf("Hello from the main thread\n");

	for(thread = 0;thread < thread_count;thread++){
		//Wait until thread_handles[thread] completes
		pthread_join(thread_handles[thread],NULL);
	}

	free(thread_handles);
	//All threads have been joined, so nobody can still hold the mutex
	pthread_mutex_destroy(&mutex);
	//sum holds the series value; pi is four times that
	printf("%f",4*sum);
	return 0;
}//main



/*
 * Thread start routine (mutex version).
 *
 * rank is the thread's index passed through the void* argument. The thread
 * sums the terms factor/(2*i+1) for its contiguous range of i, then adds the
 * partial result to the global sum while holding mutex.
 * Always returns NULL.
 */
void *Thread_sum(void *rank){
	long my_rank=(long)rank;
	double factor,my_sum = 0.0;
	long long i;
	long long my_n = n/thread_count;
	long long my_first_i = my_n*my_rank;
	//n/thread_count truncates, so the last thread also takes the n % thread_count leftover terms
	long long my_last_i = (my_rank == thread_count-1) ? n : my_first_i + my_n;

	//Term i has sign (-1)^i
	if(my_first_i % 2 == 0)
		factor = 1.0;
	else
		factor = -1.0;

	for(i = my_first_i;i < my_last_i;i++,factor = -factor){
		my_sum += factor/(2*i+1);
	}

	//Use Mutexes to solve critical sections after loop:
	//only the single shared update is done under the lock
	pthread_mutex_lock(&mutex);
	sum += my_sum;
	pthread_mutex_unlock(&mutex);

	return NULL;
}//Thread_sum

在这个例子里通过对临界区加锁(pthread_mutex_lock())和解锁(pthread_mutex_unlock())来限制每次只能有一个线程访问临界区。所以互斥量可以用来避免对关键部分的冲突访问

///

信号量:

#include <stdio.h>

#include <stdlib.h>

#include <pthread.h>

#include <semaphore.h>

/*

   pi=4(1-1/3+1/5-1/7+...+((-1)^n)*(1/(2n+1))+...)



   critical_sections problem

   method:semaphore

*/



int thread_count;            //number of worker threads, set in main from argv[1]

int n = 1000000;             //number of series terms to sum (10^6)

double sum = 0.0;            //shared accumulator; each thread adds its partial sum to it

int flag = 0;                //not referenced by the visible code of this version; appears to be left over from the busy-waiting version

sem_t sem;                   //guards updates to sum; main initializes it to 1 and destroys it



//Thread start routine: sums this thread's share of the series and adds it to sum
void *Thread_sum(void* rank);



/*
 * Program entry point (semaphore version).
 *
 * Reads the number of threads from argv[1], initializes the semaphore to 1,
 * starts that many threads running Thread_sum, joins them all, destroys the
 * semaphore and prints 4*sum, the estimate of pi.
 *
 * Returns 0 on success and 1 on a usage error, an allocation failure, or a
 * failed sem_init / pthread_create.
 */
int main(int argc,char* argv[]){
	//Use long so the rank survives the round trip through void* on 64-bit systems
	long thread;
	long requested;
	char* end;
	pthread_t* thread_handles;

	//Get number of threads from command line
	if(argc < 2){
		fprintf(stderr,"usage: %s <number of threads>\n",argc > 0 ? argv[0] : "program");
		return 1;
	}
	requested = strtol(argv[1],&end,10);
	//Reject non-numeric input, counts below 1 and values that do not fit in an int
	if(end == argv[1] || *end != '\0' || requested <= 0 || requested != (long)(int)requested){
		fprintf(stderr,"invalid number of threads: %s\n",argv[1]);
		return 1;
	}
	thread_count = (int)requested;

	thread_handles = malloc((size_t)thread_count*sizeof *thread_handles);
	if(thread_handles == NULL){
		fprintf(stderr,"out of memory\n");
		return 1;
	}

	//initialize semaphore: not shared between processes (0), initial value 1
	if(sem_init(&sem,0,1) != 0){
		perror("sem_init");
		free(thread_handles);
		return 1;
	}

	for(thread = 0;thread < thread_count;thread++){
		//Create threads
		int rc = pthread_create(&thread_handles[thread],NULL,Thread_sum,(void*)thread);
		if(rc != 0){
			//A partial set of threads would give a partial sum, so give up;
			//returning from main ends the process and the threads already started
			fprintf(stderr,"pthread_create failed for thread %ld (error %d)\n",thread,rc);
			free(thread_handles);
			return 1;
		}
	}

	printf("Hello from the main thread\n");

	for(thread = 0;thread < thread_count;thread++){
		//Wait until thread_handles[thread] completes
		pthread_join(thread_handles[thread],NULL);
	}

	free(thread_handles);
	//All threads have been joined, so nobody can still be waiting on the semaphore
	sem_destroy(&sem);
	//sum holds the series value; pi is four times that
	printf("%f",4*sum);
	return 0;
}//main



/*
 * Thread start routine (semaphore version).
 *
 * rank is the thread's index passed through the void* argument. The thread
 * sums the terms factor/(2*i+1) for its contiguous range of i, then adds the
 * partial result to the global sum between sem_wait and sem_post.
 * Always returns NULL.
 */
void *Thread_sum(void *rank){
	long my_rank=(long)rank;
	double factor,my_sum = 0.0;
	long long i;
	long long my_n = n/thread_count;
	long long my_first_i = my_n*my_rank;
	//n/thread_count truncates, so the last thread also takes the n % thread_count leftover terms
	long long my_last_i = (my_rank == thread_count-1) ? n : my_first_i + my_n;

	//Term i has sign (-1)^i
	if(my_first_i % 2 == 0)
		factor = 1.0;
	else
		factor = -1.0;

	for(i = my_first_i;i < my_last_i;i++,factor = -factor){
		my_sum += factor/(2*i+1);
	}

	//Use semaphore to solve critical sections after loop:
	//sem_wait takes the semaphore before the shared update, sem_post gives it back
	sem_wait(&sem);
	sum += my_sum;
	sem_post(&sem);

	return NULL;
}//Thread_sum

这里初始化设定sem=1,所以第一个要访问临界区的线程运行sem_wait(&sem)后,sem减为0,并可以访问临界区。而其他线程执行sem_wait(&sem)时,因sem=0,所以处于等待的状态,直到前一个线程调用sem_post(&sem)把sem加回1。

优点:信号量比互斥量更强大,因为它们可以初始化为任何非负的值

/

barrier是程序中的一个点,在这个点上,线程阻塞,直到所有线程都到达它为止。也可以理解为多线程同步问题。

下面用条件变量结合互斥量来模拟这一过程:

#include <stdio.h>

#include <stdlib.h>

#include <pthread.h>

/*

   pi=4(1-1/3+1/5-1/7+...+((-1)^n)*(1/(2n+1))+...)



   critical_sections problem

   method:Mutexes

   barrier problem

   method:condition variables

*/



int thread_count;            //number of worker threads, set in main from argv[1]

int n = 1000000;             //number of series terms to sum (10^6)

double sum = 0.0;            //shared accumulator; each thread adds its partial sum to it

int flag = 0;                //not referenced by the visible code of this version; appears to be left over from the busy-waiting version

int count = 0;         //number of threads that have reached the barrier so far; the last arrival resets it to 0

pthread_mutex_t mutex;       //guards sum and count; also the mutex passed to pthread_cond_wait

pthread_cond_t cond_var;     //early arrivals wait on it; the last arrival broadcasts on it



//Thread start routine: sums this thread's share of the series, adds it to sum, then waits at the barrier
void *Thread_sum(void* rank);



int main(int argc,char* argv[]){

	//Use long in case of 64-bit system

	long thread;

	pthread_t* thread_handles;

	

	//Get number of threads from command line

	thread_count = strtol(argv[1],NULL,10);

	thread_handles = malloc(thread_count*sizeof(pthread_t));

	//initialize Mutex

	pthread_mutex_init(&mutex,NULL);



	for(thread = 0;thread < thread_count;thread++){

        	//Create threads

		pthread_create(&thread_handles[thread],NULL,Thread_sum,(void*)thread);

	}

	printf("Hello from the main thread\n");

	for(thread = 0;thread < thread_count;thread++){

                //Wait util thread_handles[thread] complete

		pthread_join(thread_handles[thread],NULL);

	}



	free(thread_handles);

	pthread_mutex_destroy(&mutex);	

	printf("%f",4*sum);

	return 0;

}//main



/*
 * Thread start routine (mutex + condition-variable barrier version).
 *
 * rank is the thread's index passed through the void* argument. The thread
 * sums the terms factor/(2*i+1) for its contiguous range of i, adds the
 * partial result to the global sum under mutex, and then blocks at a barrier
 * until all thread_count threads have arrived.
 * Always returns NULL.
 */
void *Thread_sum(void *rank){
	//Number of completed barrier rounds; only read or written while mutex is held
	static unsigned long barrier_round = 0;
	long my_rank=(long)rank;
	double factor,my_sum = 0.0;
	long long i;
	long long my_n = n/thread_count;
	long long my_first_i = my_n*my_rank;
	//n/thread_count truncates, so the last thread also takes the n % thread_count leftover terms
	long long my_last_i = (my_rank == thread_count-1) ? n : my_first_i + my_n;

	//Term i has sign (-1)^i
	if(my_first_i % 2 == 0)
		factor = 1.0;
	else
		factor = -1.0;

	for(i = my_first_i;i < my_last_i;i++,factor = -factor){
		my_sum += factor/(2*i+1);
	}

	//Use Mutexes to solve critical sections after loop
	//Use condition variables to solve barrier problem
	pthread_mutex_lock(&mutex);
	sum += my_sum;
	count++;
	if(count == thread_count){
		//Last arrival: reset the counter, start a new round and release everyone
		count = 0;
		barrier_round++;
		pthread_cond_broadcast(&cond_var);
		printf("%ld(the last thread) has arrive at barrier\n",my_rank);
	}else{
		//pthread_cond_wait may return without a broadcast (spurious wakeup),
		//so keep waiting until the round number has actually changed
		unsigned long my_round = barrier_round;
		while(barrier_round == my_round){
			pthread_cond_wait(&cond_var,&mutex);
		}
		printf("%ld wake up\n",my_rank);
	}
	pthread_mutex_unlock(&mutex);

	return NULL;
}//Thread_sum

这里的pthread_cond_wait()执行的机制是:

1,线程放在等待队列上,解锁

2,等待 pthread_cond_signal或者pthread_cond_broadcast信号之后去竞争锁

3,若竞争到互斥锁则加锁,然后pthread_cond_wait()返回。

这样就可以用修改临界区变量count(计数器)来解决barrier问题了。

 

 

你可能感兴趣的:(并行计算)