/* Minimal `parallel for` demo: prints every loop index exactly once. */
int main(int argc, char* argv[])
{
    /* The ten iterations are divided among the threads of the team, so the
       order of the output lines is not guaranteed. */
    #pragma omp parallel for
    for (int i = 0; i < 10; i++)
    {
        /* "\n" is the newline escape; the former "/n" printed the two
           characters '/' and 'n' and never ended the line. */
        printf("i = %d\n", i);
    }
    return 0;
}
/* Runs a 100,000,000-iteration busy loop and prints how many clock() ticks
   it took. Used as a stand-in for a long-running operation. */
void test()
{
    int a = 0;
    clock_t t1 = clock();
    for (int i = 0; i < 100000000; i++)
    {
        a = i + 1;
    }
    clock_t t2 = clock();
    /* clock_t is not guaranteed to be int, so cast to long and print with
       %ld; "\n" replaces the literal "/n". */
    printf("Time = %ld\n", (long)(t2 - t1));
}
/* Times two calls to test() issued from a `parallel for`, then runs test()
   once more on its own so the two timings can be compared. */
int main(int argc, char* argv[])
{
    clock_t t1 = clock();
    #pragma omp parallel for
    for (int j = 0; j < 2; j++) {
        test();
    }
    clock_t t2 = clock();
    /* NOTE(review): the C standard defines clock() as processor time, so on
       some platforms this total may add up the CPU time of all threads
       instead of elapsed wall-clock time — confirm on the target platform. */
    printf("Total time = %ld\n", (long)(t2 - t1));
    test();
    return 0;
}
在test()函数中,执行了1亿次循环,主要是用来执行一个长时间的操作。
fork/join并行执行模式的概念
/* Fork/join illustration: the statements after the `parallel for` loop run
   only once the loop has finished. */
int main(int argc, char* argv[])
{
    clock_t t1 = clock();
    /* Fork: a team of threads shares the two iterations. */
    #pragma omp parallel for
    for (int j = 0; j < 2; j++) {
        test();
    }
    /* Join: the construct ends with an implicit barrier, so t2 is taken
       after both test() calls have returned. */
    clock_t t2 = clock();
    printf("Total time = %ld\n", (long)(t2 - t1));
    test();
    return 0;
}
在没有执行完for循环中的代码之前,后面的clock_t t2 = clock();这行代码是不会执行的,如果和调用线程创建函数相比,它相当于先创建线程,并等待线程执行完,所以这种并行模式中在主线程里创建的线程并没有和主线程并行运行。
OpenMP指令和库函数介绍
parallel 指令的用法
#pragma omp parallel [for | sections] [子句[子句]…]
{
//代码
}
parallel语句后面要跟一个大括号对将要并行执行的代码括起来。
/* Prints "Hello, World!" once from every thread of the parallel region. */
int main(int argc, char *argv[]) {
    #pragma omp parallel
    {
        /* Plain ASCII quotes and a real "\n" escape: the typographic quotes
           and "/n" of the earlier text do not form a valid string literal. */
        printf("Hello, World!\n");
    }
    return 0;  /* main returns int in standard C */
}
/* Prints a greeting plus the thread id from each of the eight threads
   requested through the num_threads clause. */
int main(int argc, char *argv[]) {
    #pragma omp parallel num_threads(8)
    {
        printf("Hello, World!, ThreadId=%d\n", omp_get_thread_num());
    }
    return 0;  /* main returns int in standard C */
}
for指令的使用方法
int j = 0;
/* `omp for` on its own, outside any parallel region: no team of threads is
   created here, so all four iterations run on the encountering thread. */
#pragma omp for
for (j = 0; j < 4; j++) {
    printf("j = %d, ThreadId = %d\n", j, omp_get_thread_num());
}
执行以上代码后打印出以下结果
int j = 0;
/* Combined `parallel for`: creates the team and splits the four iterations
   among its threads in a single directive. */
#pragma omp parallel for
for (j = 0; j < 4; j++) {
    printf("j = %d, ThreadId = %d\n", j, omp_get_thread_num());
}
执行后会打印出以下结果:
int j = 0;
/* Same work as the combined `parallel for`, written as a parallel region
   that contains a separate `omp for` work-sharing loop. */
#pragma omp parallel
{
    #pragma omp for
    for (j = 0; j < 4; j++) {
        printf("j = %d, ThreadId = %d\n", j, omp_get_thread_num());
    }
}
执行以上代码会打印出以下结果:
/* Schematic only (loop bodies are elided): a single parallel region that
   contains more than one `omp for` work-sharing loop. */
int j;
#pragma omp parallel
{
/* First work-sharing loop inside the region. */
#pragma omp for
for ( j = 0; j < 100; j++ ){
…
}
/* Second work-sharing loop inside the same region. */
#pragma omp for
for ( j = 0; j < 100; j++ ){
…
}
…
}
使用for指令时,for循环语句必须按照一定的规范来书写,即for循环小括号内的三条语句都要符合规定的形式:
for( i=start; i < end; i++)
i=start; 是for循环里的第一条语句,必须写成 “变量=初值” 的方式。如 i=0
sections和section指令的用法
/* Runs four independent sections; each section is executed once by some
   thread of the team and prints that thread's id. */
int main(int argc, char *argv[])
{
    /* The opening brace goes on the line after the directive: a pragma
       line takes only the directive and its clauses. */
    #pragma omp parallel sections
    {
        #pragma omp section
        printf("section 1 ThreadId = %d\n", omp_get_thread_num());
        #pragma omp section
        printf("section 2 ThreadId = %d\n", omp_get_thread_num());
        #pragma omp section
        printf("section 3 ThreadId = %d\n", omp_get_thread_num());
        #pragma omp section
        printf("section 4 ThreadId = %d\n", omp_get_thread_num());
    }
    return 0;
}
/* One parallel region holding two `sections` constructs. The second
   construct starts only after the first has completed, because each
   `sections` construct ends with an implicit barrier. */
int main(int argc, char *argv[])
{
    /* The opening brace goes on the line after the directive. */
    #pragma omp parallel
    {
        #pragma omp sections
        {
            #pragma omp section
            printf("section 1 ThreadId = %d\n", omp_get_thread_num());
            #pragma omp section
            printf("section 2 ThreadId = %d\n", omp_get_thread_num());
        }
        #pragma omp sections
        {
            #pragma omp section
            printf("section 3 ThreadId = %d\n", omp_get_thread_num());
            #pragma omp section
            printf("section 4 ThreadId = %d\n", omp_get_thread_num());
        }
    }
    return 0;
}
执行后将打印出以下结果:
OpenMP中的数据处理子句
相关文档链接:
private子句
int k = 100;
/* private(k): every thread works on its own copy of k inside the loop. */
#pragma omp parallel for private(k)
for (k = 0; k < 10; k++)
{
    printf("k=%d\n", k);
}
/* Printed after the parallel loop, from the enclosing (serial) code. */
printf("last k=%d\n", k);
firstprivate子句
int i;        /* loop index; it was used without a declaration before */
int k = 100;
/* firstprivate(k): each thread's private k starts out as a copy of the
   value k has before the loop (100). */
#pragma omp parallel for firstprivate(k)
for (i = 0; i < 4; i++)
{
    k += i;
    printf("k=%d\n", k);
}
printf("last k=%d\n", k);
lastprivate子句
int i;        /* loop index; it was used without a declaration before */
int k = 100;
/* firstprivate seeds every private k with 100; lastprivate copies the k of
   the sequentially last iteration (i == 3) back to the outer k. */
#pragma omp parallel for firstprivate(k),lastprivate(k)
for (i = 0; i < 4; i++)
{
    k += i;
    printf("k=%d\n", k);
}
printf("last k=%d\n", k);
上面代码执行后的打印结果如下:
threadprivate子句
/* File-scope counter; the threadprivate directive gives each thread its
   own copy of it. */
int counter = 0;
#pragma omp threadprivate(counter)

/* Adds one to the calling thread's counter and returns the new value. */
int increment_counter()
{
    return ++counter;
}
/* Same idea as a file-scope counter, but the counter is a function-local
   static that the directive below makes threadprivate. Returns the value
   after incrementing. */
int increment_counter2()
{
    static int counter = 0;
    #pragma omp threadprivate(counter)

    return ++counter;
}
shared子句
default子句
reduction子句
Operator    Initialization value
--------    --------------------
+           0
*           1
-           0
&           ~0
|           0
^           0
&&          1
||          0
int i, sum = 100;
/* reduction(+: sum): each thread accumulates into a private sum that
   starts at 0; the partial sums are added to the original value (100)
   when the loop ends. */
#pragma omp parallel for reduction(+: sum)
for (i = 0; i < 1000; i++)
{
    sum += i;
}
/* sum is an int, so the conversion is %d (was %ld). */
printf("sum = %d\n", sum);
copyin子句
/* copyin demo: on entry to the parallel region every thread's threadprivate
   `counter` is initialised from the master thread's copy; each section then
   bumps its own thread's counter through increment_counter(). */
int main(int argc, char* argv[])
{
    #pragma omp parallel sections copyin(counter)
    {
        #pragma omp section
        {
            int count1 = 0;
            /* The loop index is local to the section. A single index
               declared outside the region would be shared by both
               sections and raced on when they run concurrently. */
            for (int iterator = 0; iterator < 100; iterator++)
            {
                count1 = increment_counter();
            }
            printf("count1 = %d\n", count1);
        }
        #pragma omp section
        {
            int count2 = 0;
            for (int iterator = 0; iterator < 200; iterator++)
            {
                count2 = increment_counter();
            }
            printf("count2 = %d\n", count2);
        }
    }
    /* Printed by the thread that continues after the region, so this is
       that thread's copy of counter. */
    printf("counter = %d\n", counter);
    return 0;
}
打印结果如下:
copyprivate子句
/* Counter with one copy per thread (see the threadprivate directive). */
int counter = 0;
#pragma omp threadprivate(counter)

/* Increments the calling thread's counter; returns the updated value. */
int increment_counter()
{
    counter += 1;
    return counter;
}
#pragma omp parallel
{
    int count;
    /* Exactly one thread executes the single block; copyprivate then
       broadcasts that thread's value of counter to the counter copies of
       all other threads in the team. */
    #pragma omp single copyprivate(counter)
    {
        counter = 50;
    }
    count = increment_counter();
    /* Both arguments are int, so the conversions are %d (were %ld). */
    printf("ThreadId: %d, count = %d\n", omp_get_thread_num(), count);
}
Ananth Grama, Anshul Gupta,“并行计算导论”,张武等译,机械工业出版社,2005.01
Michael J. Quinn, “MPI与OpenMP并行程序设计”,陈文光等译,清华大学出版社,2004.10
OpenMP中的任务调度
/* Fills the upper triangle (j >= i) of a 100x100 matrix with i*j.
   The inner loop starts at j = i, so row i costs 100 - i iterations:
   the amount of work shrinks as i grows. */
int i, j;
int a[100][100] = {0};
for ( i =0; i < 100; i++)
{
/* Inner trip count depends on i (100 iterations for i = 0, 1 for i = 99). */
for( j = i; j < 100; j++ )
{
a[i][j] = i*j;
}
}
如果将最外层循环并行化的话,比如使用4个线程,如果给每个线程平均分配25次循环迭代计算的话,显然i=0和i=99的计算量相差了100倍,那么各个线程间可能出现较大的负载不平衡情况。为了解决这些问题,OpenMP中提供了几种对for循环并行化的任务调度方案。
schedule子句用法
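schedule子句的使用格式为:schedule(type[, size])。type参数表示调度类型,有static、dynamic、guided、runtime四种。size参数表示每次分配给线程的循环迭代次数(即块大小),size参数必须是整数。static、dynamic、guided三种调度方式都可以使用size参数,也可以不使用size参数。当type参数类型为runtime时,size参数是非法的(不需要使用,如果使用的话编译器会报错)。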
当parallel for编译指导语句没有带schedule子句时,大部分系统中默认采用static调度方式,这种调度方式非常简单。假设有n次循环迭代,t个线程,那么给每个线程静态分配大约n/t次迭代计算。这里为什么说大约分配n/t次呢?因为n/t不一定是整数,因此实际分配的迭代次数可能存在差1的情况,如果指定了size参数的话,那么可能相差一个size。
/* Static scheduling without a chunk size: the ten iterations are split
   into roughly equal contiguous blocks, one per thread. */
#pragma omp parallel for schedule(static)
for (i = 0; i < 10; i++)
{
    printf("i=%d, thread_id=%d\n", i, omp_get_thread_num());
}
/* Static scheduling with chunk size 2: chunks of two consecutive
   iterations are handed to the threads in round-robin order. */
#pragma omp parallel for schedule(static, 2)
for (i = 0; i < 10; i++)
{
    printf("i=%d, thread_id=%d\n", i, omp_get_thread_num());
}
/* Dynamic scheduling without a chunk size: a thread fetches one iteration
   at a time whenever it becomes idle, so the iteration-to-thread mapping
   can change from run to run. */
#pragma omp parallel for schedule(dynamic)
for (i = 0; i < 10; i++)
{
    printf("i=%d, thread_id=%d\n", i, omp_get_thread_num());
}
打印结果如下:
/* Dynamic scheduling with chunk size 2: idle threads fetch two
   consecutive iterations at a time. */
#pragma omp parallel for schedule(dynamic, 2)
for (i = 0; i < 10; i++)
{
    printf("i=%d, thread_id=%d\n", i, omp_get_thread_num());
}
打印结果如下:
/* Guided scheduling with a minimum chunk size of 2: chunks start large and
   shrink as the remaining work decreases. */
#pragma omp parallel for schedule(guided,2)
for (i = 0; i < 10; i++)
{
    printf("i=%d, thread_id=%d\n", i, omp_get_thread_num());
}
OpenMP创建线程中的锁及原子操作性能比较
// TestLock.cpp : OpenMP任务中的原子操作和锁性能测试程序。
//
#include <windows.h>
#include <time.h>
#include <process.h>
#include <omp.h>
#include <stdio.h>
/* Times 2,000,000 InterlockedIncrement calls on one variable, first in a
   plain loop and then in an OpenMP `parallel for`, printing the counter
   and the elapsed clock() ticks after each run. */
void TestAtomic()
{
    clock_t t1, t2;
    int i = 0;
    volatile LONG a = 0;

    /* Single-threaded baseline. */
    t1 = clock();
    for (i = 0; i < 2000000; i++)
    {
        InterlockedIncrement(&a);
    }
    t2 = clock();
    printf("SingleThread, InterlockedIncrement 2,000,000: a = %ld, time = %ld\n",
           a, (long)(t2 - t1));

    /* Multi-threaded run. `a` is not reset, so the value printed below is
       the total of both runs. */
    t1 = clock();
    #pragma omp parallel for
    for (i = 0; i < 2000000; i++)
    {
        InterlockedIncrement(&a);
    }
    t2 = clock();
    printf("MultiThread, InterlockedIncrement 2,000,000: a = %ld, time = %ld\n",
           a, (long)(t2 - t1));
}
/* Times 2,000,000 increments guarded by an omp_lock_t, first in a plain
   loop and then in an OpenMP `parallel for`, printing the counter and the
   elapsed clock() ticks after each run. */
void TestOmpLock()
{
    clock_t t1, t2;
    int i;
    int a = 0;
    omp_lock_t mylock;

    omp_init_lock(&mylock);

    /* Single-threaded baseline: the lock is never contended. */
    t1 = clock();
    for (i = 0; i < 2000000; i++)
    {
        omp_set_lock(&mylock);
        a += 1;
        omp_unset_lock(&mylock);
    }
    t2 = clock();
    /* a is an int, so it is printed with %d (was %ld). */
    printf("SingleThread,omp_lock 2,000,000:a = %d, time = %ld\n",
           a, (long)(t2 - t1));

    /* Multi-threaded run. `a` is not reset, so the value printed below is
       the total of both runs. */
    t1 = clock();
    #pragma omp parallel for
    for (i = 0; i < 2000000; i++)
    {
        omp_set_lock(&mylock);
        a += 1;
        omp_unset_lock(&mylock);
    }
    t2 = clock();
    printf("MultiThread,omp_lock 2,000,000:a = %d, time = %ld\n",
           a, (long)(t2 - t1));

    omp_destroy_lock(&mylock);
}
/* Times 2,000,000 increments guarded by a Win32 CRITICAL_SECTION, first in
   a plain loop and then in an OpenMP `parallel for`, printing the counter
   and the elapsed clock() ticks after each run. */
void TestCriticalSection()
{
    clock_t t1, t2;
    int i;
    int a = 0;
    CRITICAL_SECTION cs;

    InitializeCriticalSection(&cs);

    /* Single-threaded baseline: the critical section is never contended. */
    t1 = clock();
    for (i = 0; i < 2000000; i++)
    {
        EnterCriticalSection(&cs);
        a += 1;
        LeaveCriticalSection(&cs);
    }
    t2 = clock();
    /* a is an int, so it is printed with %d (was %ld). */
    printf("SingleThread, Critical_Section 2,000,000:a = %d, time = %ld\n",
           a, (long)(t2 - t1));

    /* Multi-threaded run. `a` is not reset, so the value printed below is
       the total of both runs. */
    t1 = clock();
    #pragma omp parallel for
    for (i = 0; i < 2000000; i++)
    {
        EnterCriticalSection(&cs);
        a += 1;
        LeaveCriticalSection(&cs);
    }
    t2 = clock();
    printf("MultiThread, Critical_Section, 2,000,000:a = %d, time = %ld\n",
           a, (long)(t2 - t1));

    DeleteCriticalSection(&cs);
}
/* Benchmark driver: runs the three synchronisation tests back to back. */
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    TestAtomic();            /* InterlockedIncrement  */
    TestCriticalSection();   /* Win32 CRITICAL_SECTION */
    TestOmpLock();           /* omp_lock_t             */

    return 0;
}
OpenMP程序设计的两个小技巧
动态设置并行循环的线程数量
const int MIN_ITERATOR_NUM = 4;
int ncore = omp_get_num_procs(); //获取执行核的数量
int max_tn = n / MIN_ITERATOR_NUM;
int tn = max_tn > 2 * ncore ? 2 * ncore : max_tn; //tn表示要设置的线程数量
#pragma omp parallel for if( tn > 1) num_threads(tn)
for ( i = 0; i < n; i++ )
{
printf("Thread Id = %ld/n", omp_get_thread_num());
//Do some work here
}
const int g_ncore = omp_get_num_procs(); //获取执行核的数量
/** Compute the number of threads to use for a loop.
    The count is derived from the iteration count, the number of
    processors (g_ncore) and the minimum number of iterations one thread
    should receive; it never exceeds g_ncore and is never below 1.
@param int n - number of loop iterations
@param int min_n - minimum number of iterations per thread; a value
                   below 1 is invalid and yields a single thread
@return int - number of threads (>= 1)
*/
int dtn(int n, int min_n)
{
    if ( min_n < 1 )
    {
        /* Avoids dividing by zero below; with no usable minimum the loop
           is simply run on one thread. */
        return 1;
    }
    int max_tn = n / min_n;
    int tn = max_tn > g_ncore ? g_ncore : max_tn; /* requested thread count */
    if ( tn < 1 )
    {
        tn = 1;
    }
    return tn;
}
/* The thread count is computed by dtn() from the iteration count. */
#pragma omp parallel for num_threads(dtn(n, MIN_ITERATOR_NUM))
for (i = 0; i < n; i++)
{
    /* omp_get_thread_num() returns int, so the conversion is %d (was %ld). */
    printf("Thread Id = %d\n", omp_get_thread_num());
    //Do some work here
}
/** Serial matrix multiplication: c = a * b.
    All matrices are stored row-major in flat int arrays. The function
    returns without touching c when the inner dimensions differ
    (col_a != row_b) or when c_size is smaller than row_a * col_b.
@param int *a - pointer to the first matrix
@param int row_a - number of rows of a
@param int col_a - number of columns of a
@param int *b - pointer to the second matrix
@param int row_b - number of rows of b
@param int col_b - number of columns of b
@param int *c - pointer to the result matrix
@param int c_size - capacity of c (total number of elements)
@return void - none
*/
void Matrix_Multiply(int *a, int row_a, int col_a,
                     int *b, int row_b,int col_b,
                     int *c, int c_size)
{
    if ( col_a != row_b )
    {
        return;
    }
    if ( c_size < row_a * col_b )
    {
        return;
    }

    for ( int r = 0; r < row_a; r++ )
    {
        const int a_off = r * col_a;   /* start of row r in a */
        const int c_off = r * col_b;   /* start of row r in c */

        for ( int col = 0; col < col_b; col++ )
        {
            /* Accumulate directly in the output cell. */
            int *cell = &c[c_off + col];
            *cell = 0;
            for ( int m = 0; m < row_b; m++ )
            {
                *cell += a[a_off + m] * b[m * col_b + col];
            }
        }
    }
}
/** Parallel matrix multiplication: c = a * b.
    The two outer loops of the serial algorithm are merged into one loop
    over all row_a * col_b output elements, and that loop is divided among
    the threads. All matrices are stored row-major in flat int arrays.
    Returns without touching c when col_a != row_b or when c_size is
    smaller than row_a * col_b (same checks as Matrix_Multiply).
@param int *a - pointer to the first matrix
@param int row_a - number of rows of a
@param int col_a - number of columns of a
@param int *b - pointer to the second matrix
@param int row_b - number of rows of b
@param int col_b - number of columns of b
@param int *c - pointer to the result matrix
@param int c_size - capacity of c (total number of elements)
@return void - none
*/
void Parallel_Matrix_Multiply(int *a, int row_a, int col_a,
                              int *b, int row_b,int col_b,
                              int *c, int c_size )
{
    if ( col_a != row_b || c_size < row_a * col_b )
    {
        return;
    }

    int index;
    int border = row_a * col_b;   /* total number of output elements */

    /* `parallel for` (not just `parallel`): the iterations are shared out
       among the threads. With a bare `parallel`, every thread would run
       the whole loop and all of them would race on the shared index. */
    #pragma omp parallel for num_threads(dtn(border, 1))
    for ( index = 0; index < border; index++ )
    {
        /* Recover the 2-D position from the flat index. These locals are
           declared inside the loop body, so each thread has its own. */
        int i = index / col_b;
        int j = index % col_b;
        int row_i = i * col_a;
        int row_c = i * col_b;

        c[row_c + j] = 0;
        for ( int k = 0; k < row_b; k++ )
        {
            c[row_c + j] += a[row_i + k] * b[k * col_b + j];
        }
    }
}
/* Manual row/column bookkeeping: once the column counter j reaches col_b,
   wrap to column 0 and advance to the next row. */
if ( j == col_b )
{
j = 0;
i++;
}
// ... placeholder for the actual matrix multiplication code
j++;