首先是一个helloworld程序,创建多个线程并打印语句,主要是想说明这三个函数:
int pthread_create(pthread_t* thread,const pthread_attr_t* attr,void* (*start_routine)(void*),void* arg);//创建线程并关联运行函数
int pthread_join(pthread_t thread, void **retval);//等待指定的线程结束,这里的结束也意味着该线程所占资源的释放
void free(void* ptr);//手动释放程序分配的空间,如malloc()分配的内存
下面是代码(gcc -g -Wall -o helloworld helloworld.c -lpthread ; ./helloworld <线程数>):
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
/*
 * pthread entry points used by this program:
 *   int pthread_create(pthread_t *thread, const pthread_attr_t *attr,
 *                      void *(*start_routine)(void *), void *arg);
 *   int pthread_join(pthread_t thread, void **retval);
 */

/* Thread entry point; defined after main. */
void *Hello(void *rank);

/* Number of threads to start; main sets it from the command line. */
int thread_count;
/*
 * Starts the number of threads given in argv[1], each running Hello(), and
 * waits for all of them to finish.
 *
 * Returns 0 on success and 1 when the thread count is missing or invalid or
 * the handle array cannot be allocated. Exits with EXIT_FAILURE if a thread
 * cannot be created.
 */
int main(int argc,char* argv[]){
    //Use long so the rank survives the round-trip through void* on 64-bit systems
    long thread;
    long requested;
    char* end;
    int err;
    pthread_t* thread_handles;

    //Get number of threads from command line; argv[1] is absent when argc < 2
    if(argc < 2){
        fprintf(stderr,"usage: %s <number of threads>\n",argc > 0 ? argv[0] : "helloworld");
        return 1;
    }
    requested = strtol(argv[1],&end,10);
    //Reject non-numeric input, non-positive counts and values that do not fit in an int
    if(end == argv[1] || *end != '\0' || requested <= 0 || requested != (long)(int)requested){
        fprintf(stderr,"invalid thread count: %s\n",argv[1]);
        return 1;
    }
    thread_count = (int)requested;

    thread_handles = malloc((size_t)thread_count * sizeof *thread_handles);
    if(thread_handles == NULL){
        perror("malloc");
        return 1;
    }

    for(thread = 0;thread < thread_count;thread++){
        //Create threads; pthread_create returns an error number instead of setting errno
        err = pthread_create(&thread_handles[thread],NULL,Hello,(void*)thread);
        if(err != 0){
            fprintf(stderr,"pthread_create failed for thread %ld (error %d)\n",thread,err);
            exit(EXIT_FAILURE);
        }
    }

    printf("Hello from the main thread\n");

    for(thread = 0;thread < thread_count;thread++){
        //Wait until thread_handles[thread] completes
        pthread_join(thread_handles[thread],NULL);
    }

    free(thread_handles);
    return 0;
}//main
/*
 * Thread entry point: prints a greeting containing this thread's rank and
 * the total thread count.
 *
 * rank: the thread's rank, passed as an integer value cast to void*.
 * Always returns NULL.
 */
void *Hello(void *rank){
    const long id = (long)rank;   /* recover the integer rank from the pointer value */

    printf("Hello from thread %ld of %d\n", id, thread_count);
    return NULL;
}//Hello
熟悉了基本的代码(Hello函数可以被你想让线程执行的函数所替代)了之后,来看用并行的方法来估计π的大小。
公式是这样的:pi=4(1-1/3+1/5-1/7+...+((-1)^n)*(1/(2n+1))+...)
我们用多个线程来处理这些计算,每个线程处理一部分。把所有计算都加起来,就是π的值了。
这就涉及到一个问题了,假设变量sum为全局变量,那么多个线程对它(临界区)进行修改的时候,会产生覆盖的情况。
下面我分别用忙等待,互斥量,信号量来解决临界区问题。
/
忙等待:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
/*
 * Estimate pi with the series
 *   pi=4(1-1/3+1/5-1/7+...+((-1)^n)*(1/(2n+1))+...)
 * Critical-section problem, solved by busy-waiting.
 * Drawback: a waiting thread keeps using the CPU while accomplishing nothing.
 */
int thread_count;       /* number of worker threads */
int n = 1000000;        /* number of series terms: 10^6 */
double sum = 0.0;       /* shared accumulator; workers add their partial sums */

/*
 * Rank of the thread currently allowed to update sum. Threads spin on this
 * value and write it from different threads, so it must be an atomic object:
 * with a plain int the accesses are a data race and an optimizing compiler
 * may read it only once, leaving the spin loop stuck.
 */
_Atomic int flag = 0;

void *Thread_sum(void *rank);
/*
 * Starts the number of threads given in argv[1], each running Thread_sum(),
 * waits for all of them and prints 4*sum, the estimate of pi.
 *
 * Returns 0 on success and 1 when the thread count is missing or invalid or
 * the handle array cannot be allocated. Exits with EXIT_FAILURE if a thread
 * cannot be created, since the sum would then be incomplete.
 */
int main(int argc,char* argv[]){
    //Use long so the rank survives the round-trip through void* on 64-bit systems
    long thread;
    long requested;
    char* end;
    int err;
    pthread_t* thread_handles;

    //Get number of threads from command line; argv[1] is absent when argc < 2
    if(argc < 2){
        fprintf(stderr,"usage: %s <number of threads>\n",argc > 0 ? argv[0] : "pi");
        return 1;
    }
    requested = strtol(argv[1],&end,10);
    //Reject non-numeric input, non-positive counts (Thread_sum divides by
    //thread_count) and values that do not fit in an int
    if(end == argv[1] || *end != '\0' || requested <= 0 || requested != (long)(int)requested){
        fprintf(stderr,"invalid thread count: %s\n",argv[1]);
        return 1;
    }
    thread_count = (int)requested;

    thread_handles = malloc((size_t)thread_count * sizeof *thread_handles);
    if(thread_handles == NULL){
        perror("malloc");
        return 1;
    }

    for(thread = 0;thread < thread_count;thread++){
        //Create threads; pthread_create returns an error number instead of setting errno
        err = pthread_create(&thread_handles[thread],NULL,Thread_sum,(void*)thread);
        if(err != 0){
            fprintf(stderr,"pthread_create failed for thread %ld (error %d)\n",thread,err);
            exit(EXIT_FAILURE);
        }
    }

    printf("Hello from the main thread\n");

    for(thread = 0;thread < thread_count;thread++){
        //Wait until thread_handles[thread] completes
        pthread_join(thread_handles[thread],NULL);
    }

    free(thread_handles);
    //All workers have been joined, so sum is complete here
    printf("%f",4*sum);
    return 0;
}//main
/*
 * Thread entry point: sums this thread's slice of the series
 * 1 - 1/3 + 1/5 - ... and adds the partial result to the global sum,
 * using busy-waiting on flag so that threads update sum one at a time,
 * in rank order.
 *
 * rank: the thread's rank (0..thread_count-1), passed as an integer value
 *       cast to void*.
 * Always returns NULL.
 */
void *Thread_sum(void *rank){
    long my_rank=(long)rank;
    double factor,my_sum = 0.0;
    long long i;
    long long my_n = n/thread_count;
    long long my_first_i = my_n*my_rank;
    //The last thread also takes the n % thread_count terms left over by the
    //integer division; otherwise they would be dropped from the sum
    long long my_last_i = (my_rank == thread_count-1) ? n : my_first_i + my_n;

    //Even-indexed terms are positive, odd-indexed terms negative
    if(my_first_i % 2 == 0)
        factor = 1.0;
    else
        factor = -1.0;

    for(i = my_first_i;i < my_last_i;i++,factor = -factor){
        my_sum += factor/(2*i+1);
    }

    //Use Busy-Waiting to solve critical sections after loop: spin until it
    //is this thread's turn.
    //NOTE(review): this is only reliable when flag is an atomic object; with
    //a plain int it is a data race and an optimizer may hoist the load out
    //of the loop.
    while(flag != my_rank);
    sum += my_sum;
    //Hand the turn to the next rank
    flag = (flag+1) % thread_count;
    return NULL;
}//Thread_sum
缺点:
1.这种方法会持续占用CPU资源(因为等待的线程一直在while循环里空转),而且各线程必须严格按照编号顺序执行相加。
2.如果打开编译器优化,它也可能是不可靠的:flag只是普通的int变量,编译器可能只读取一次flag,或者调整相关语句的顺序,导致循环永远不结束或者sum被同时修改。
/
互斥量:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
/*
 * Estimate pi with the series
 *   pi=4(1-1/3+1/5-1/7+...+((-1)^n)*(1/(2n+1))+...)
 * Critical-section problem, solved with a mutex.
 */
int thread_count;       /* number of worker threads */
int n = 1000000;        /* number of series terms: 10^6 */
double sum = 0.0;       /* shared accumulator; workers add their partial sums */
pthread_mutex_t mutex;  /* guards sum; initialized in main */
/* The busy-wait flag from the previous version is not used here and has been removed. */
void *Thread_sum(void *rank);
/*
 * Starts the number of threads given in argv[1], each running Thread_sum(),
 * waits for all of them and prints 4*sum, the estimate of pi.
 *
 * Returns 0 on success and 1 when the thread count is missing or invalid,
 * the handle array cannot be allocated, or the mutex cannot be initialized.
 * Exits with EXIT_FAILURE if a thread cannot be created, since the sum would
 * then be incomplete.
 */
int main(int argc,char* argv[]){
    //Use long so the rank survives the round-trip through void* on 64-bit systems
    long thread;
    long requested;
    char* end;
    int err;
    pthread_t* thread_handles;

    //Get number of threads from command line; argv[1] is absent when argc < 2
    if(argc < 2){
        fprintf(stderr,"usage: %s <number of threads>\n",argc > 0 ? argv[0] : "pi");
        return 1;
    }
    requested = strtol(argv[1],&end,10);
    //Reject non-numeric input, non-positive counts (Thread_sum divides by
    //thread_count) and values that do not fit in an int
    if(end == argv[1] || *end != '\0' || requested <= 0 || requested != (long)(int)requested){
        fprintf(stderr,"invalid thread count: %s\n",argv[1]);
        return 1;
    }
    thread_count = (int)requested;

    thread_handles = malloc((size_t)thread_count * sizeof *thread_handles);
    if(thread_handles == NULL){
        perror("malloc");
        return 1;
    }

    //initialize Mutex before any worker can use it
    err = pthread_mutex_init(&mutex,NULL);
    if(err != 0){
        fprintf(stderr,"pthread_mutex_init failed (error %d)\n",err);
        free(thread_handles);
        return 1;
    }

    for(thread = 0;thread < thread_count;thread++){
        //Create threads; pthread_create returns an error number instead of setting errno
        err = pthread_create(&thread_handles[thread],NULL,Thread_sum,(void*)thread);
        if(err != 0){
            fprintf(stderr,"pthread_create failed for thread %ld (error %d)\n",thread,err);
            exit(EXIT_FAILURE);
        }
    }

    printf("Hello from the main thread\n");

    for(thread = 0;thread < thread_count;thread++){
        //Wait until thread_handles[thread] completes
        pthread_join(thread_handles[thread],NULL);
    }

    free(thread_handles);
    //No worker is left to use the mutex once every thread has been joined
    pthread_mutex_destroy(&mutex);
    printf("%f",4*sum);
    return 0;
}//main
/*
 * Thread entry point: sums this thread's slice of the series
 * 1 - 1/3 + 1/5 - ... and adds the partial result to the global sum while
 * holding the mutex.
 *
 * rank: the thread's rank (0..thread_count-1), passed as an integer value
 *       cast to void*.
 * Always returns NULL.
 */
void *Thread_sum(void *rank){
    long my_rank=(long)rank;
    double factor,my_sum = 0.0;
    long long i;
    long long my_n = n/thread_count;
    long long my_first_i = my_n*my_rank;
    //The last thread also takes the n % thread_count terms left over by the
    //integer division; otherwise they would be dropped from the sum
    long long my_last_i = (my_rank == thread_count-1) ? n : my_first_i + my_n;

    //Even-indexed terms are positive, odd-indexed terms negative
    if(my_first_i % 2 == 0)
        factor = 1.0;
    else
        factor = -1.0;

    for(i = my_first_i;i < my_last_i;i++,factor = -factor){
        my_sum += factor/(2*i+1);
    }

    //Use Mutexes to solve critical sections after loop: only the update of
    //the shared sum is serialized, the loop above runs in parallel
    pthread_mutex_lock(&mutex);
    sum += my_sum;
    pthread_mutex_unlock(&mutex);
    return NULL;
}//Thread_sum
在这个例子里通过对临界区加锁(pthread_mutex_lock())和解锁(pthread_mutex_unlock())来限制每次只能有一个线程访问临界区。所以互斥量可以用来避免对关键部分的冲突访问
///
信号量:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <semaphore.h>
/*
 * Estimate pi with the series
 *   pi=4(1-1/3+1/5-1/7+...+((-1)^n)*(1/(2n+1))+...)
 * Critical-section problem, solved with a semaphore.
 */
int thread_count;       /* number of worker threads */
int n = 1000000;        /* number of series terms: 10^6 */
double sum = 0.0;       /* shared accumulator; workers add their partial sums */
sem_t sem;              /* guards sum; main initializes it to 1 */
/* The busy-wait flag from the earlier version is not used here and has been removed. */
void *Thread_sum(void *rank);
/*
 * Starts the number of threads given in argv[1], each running Thread_sum(),
 * waits for all of them and prints 4*sum, the estimate of pi.
 *
 * Returns 0 on success and 1 when the thread count is missing or invalid,
 * the handle array cannot be allocated, or the semaphore cannot be
 * initialized. Exits with EXIT_FAILURE if a thread cannot be created, since
 * the sum would then be incomplete.
 */
int main(int argc,char* argv[]){
    //Use long so the rank survives the round-trip through void* on 64-bit systems
    long thread;
    long requested;
    char* end;
    int err;
    pthread_t* thread_handles;

    //Get number of threads from command line; argv[1] is absent when argc < 2
    if(argc < 2){
        fprintf(stderr,"usage: %s <number of threads>\n",argc > 0 ? argv[0] : "pi");
        return 1;
    }
    requested = strtol(argv[1],&end,10);
    //Reject non-numeric input, non-positive counts (Thread_sum divides by
    //thread_count) and values that do not fit in an int
    if(end == argv[1] || *end != '\0' || requested <= 0 || requested != (long)(int)requested){
        fprintf(stderr,"invalid thread count: %s\n",argv[1]);
        return 1;
    }
    thread_count = (int)requested;

    thread_handles = malloc((size_t)thread_count * sizeof *thread_handles);
    if(thread_handles == NULL){
        perror("malloc");
        return 1;
    }

    //initialize semaphore to 1 so that one thread at a time passes sem_wait;
    //the 0 makes it shared between threads of this process only
    if(sem_init(&sem,0,1) != 0){
        perror("sem_init");
        free(thread_handles);
        return 1;
    }

    for(thread = 0;thread < thread_count;thread++){
        //Create threads; pthread_create returns an error number instead of setting errno
        err = pthread_create(&thread_handles[thread],NULL,Thread_sum,(void*)thread);
        if(err != 0){
            fprintf(stderr,"pthread_create failed for thread %ld (error %d)\n",thread,err);
            exit(EXIT_FAILURE);
        }
    }

    printf("Hello from the main thread\n");

    for(thread = 0;thread < thread_count;thread++){
        //Wait until thread_handles[thread] completes
        pthread_join(thread_handles[thread],NULL);
    }

    free(thread_handles);
    //No worker is left to use the semaphore once every thread has been joined
    sem_destroy(&sem);
    printf("%f",4*sum);
    return 0;
}//main
/*
 * Thread entry point: sums this thread's slice of the series
 * 1 - 1/3 + 1/5 - ... and adds the partial result to the global sum between
 * sem_wait() and sem_post().
 *
 * rank: the thread's rank (0..thread_count-1), passed as an integer value
 *       cast to void*.
 * Always returns NULL.
 */
void *Thread_sum(void *rank){
    long my_rank=(long)rank;
    double factor,my_sum = 0.0;
    long long i;
    long long my_n = n/thread_count;
    long long my_first_i = my_n*my_rank;
    //The last thread also takes the n % thread_count terms left over by the
    //integer division; otherwise they would be dropped from the sum
    long long my_last_i = (my_rank == thread_count-1) ? n : my_first_i + my_n;

    //Even-indexed terms are positive, odd-indexed terms negative
    if(my_first_i % 2 == 0)
        factor = 1.0;
    else
        factor = -1.0;

    for(i = my_first_i;i < my_last_i;i++,factor = -factor){
        my_sum += factor/(2*i+1);
    }

    //Use semaphore to solve critical sections after loop: sem_wait takes the
    //single permit, sem_post gives it back for the next thread
    sem_wait(&sem);
    sum += my_sum;
    sem_post(&sem);
    return NULL;
}//Thread_sum
这里初始化设定sem=1,所以第一个要访问临界区的线程运行sem_wait(&sem)时,sem减一变为0,并可以访问临界区。而其他线程执行sem_wait(&sem)时,因sem=0,所以处于等待的状态,直到临界区内的线程调用sem_post(&sem)把sem加回1。
优点:信号量比互斥量更强大,因为它们可以初始化为任何非负的值
/
barrier是程序中的一个点,在这个点上,线程阻塞,直到所有线程都到达它为止。也可以理解为多线程同步问题。
下面用条件变量结合互斥量来模拟这一过程:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
/*
 * Estimate pi with the series
 *   pi=4(1-1/3+1/5-1/7+...+((-1)^n)*(1/(2n+1))+...)
 * Critical-section problem, solved with a mutex.
 * Barrier problem, solved with a condition variable.
 */
int thread_count;        /* number of worker threads */
int n = 1000000;         /* number of series terms: 10^6 */
double sum = 0.0;        /* shared accumulator; workers add their partial sums */
int count = 0;           /* threads that have reached the barrier; changed only while holding mutex */
pthread_mutex_t mutex;   /* guards sum and count */
pthread_cond_t cond_var; /* waited on by threads that reach the barrier early */
/* The busy-wait flag from the earlier version is not used here and has been removed. */
void *Thread_sum(void *rank);
int main(int argc,char* argv[]){
//Use long in case of 64-bit system
long thread;
pthread_t* thread_handles;
//Get number of threads from command line
thread_count = strtol(argv[1],NULL,10);
thread_handles = malloc(thread_count*sizeof(pthread_t));
//initialize Mutex
pthread_mutex_init(&mutex,NULL);
for(thread = 0;thread < thread_count;thread++){
//Create threads
pthread_create(&thread_handles[thread],NULL,Thread_sum,(void*)thread);
}
printf("Hello from the main thread\n");
for(thread = 0;thread < thread_count;thread++){
//Wait util thread_handles[thread] complete
pthread_join(thread_handles[thread],NULL);
}
free(thread_handles);
pthread_mutex_destroy(&mutex);
printf("%f",4*sum);
return 0;
}//main
/*
 * Thread entry point: sums this thread's slice of the series
 * 1 - 1/3 + 1/5 - ... and adds the partial result to the global sum under
 * the mutex, then waits at a barrier until every thread has done the same.
 *
 * rank: the thread's rank (0..thread_count-1), passed as an integer value
 *       cast to void*.
 * Always returns NULL.
 */
void *Thread_sum(void *rank){
    //Incremented under the mutex each time the barrier opens. Waiters use it
    //as their wait predicate, so a wakeup that was not caused by the
    //broadcast does not let a thread through early.
    static unsigned long barrier_generation = 0;

    long my_rank=(long)rank;
    double factor,my_sum = 0.0;
    long long i;
    long long my_n = n/thread_count;
    long long my_first_i = my_n*my_rank;
    //The last thread also takes the n % thread_count terms left over by the
    //integer division; otherwise they would be dropped from the sum
    long long my_last_i = (my_rank == thread_count-1) ? n : my_first_i + my_n;

    //Even-indexed terms are positive, odd-indexed terms negative
    if(my_first_i % 2 == 0)
        factor = 1.0;
    else
        factor = -1.0;

    for(i = my_first_i;i < my_last_i;i++,factor = -factor){
        my_sum += factor/(2*i+1);
    }

    //Use Mutexes to solve critical sections after loop
    //Use condition variables to solve barrier problem
    pthread_mutex_lock(&mutex);
    sum += my_sum;
    count++;
    if(count == thread_count){
        //Last arrival: reset the counter, open the barrier and wake everyone
        count = 0;
        barrier_generation++;
        pthread_cond_broadcast(&cond_var);
        printf("%ld(the last thread) has arrive at barrier\n",my_rank);
    }else{
        const unsigned long my_generation = barrier_generation;
        //pthread_cond_wait may return 0 without a broadcast, so re-check the
        //predicate instead of trusting the return value
        while(barrier_generation == my_generation){
            int err = pthread_cond_wait(&cond_var,&mutex);
            if(err != 0){
                //Waiting is impossible; give up rather than spin on the error
                fprintf(stderr,"pthread_cond_wait failed (error %d)\n",err);
                pthread_mutex_unlock(&mutex);
                return NULL;
            }
        }
        printf("%ld wake up\n",my_rank);
    }
    pthread_mutex_unlock(&mutex);
    return NULL;
}//Thread_sum
这里的pthread_cond_wait()执行的机制是:
1,线程放在等待队列上,解锁
2,等待 pthread_cond_signal或者pthread_cond_broadcast信号之后去竞争锁
3,若竞争到互斥锁则加锁。
这样就可以用修改临界区变量count(计数器)来解决barrier问题了。