#include <omp.h>
を入れるのはそもそも論。ループをはじめとする分割箇所では、分割の頭にディレクティブとかいうものを突っ込むことで実現できる。
#pragma omp parallel for // ←コレ
for( i = 0 ; i < 1000 ; i++ ) a[i] += a[i] * a[i];
ディレクティブの必須項目はomp。OpenMP の指示文だということを示すんだから当然。他は適当にくっつけよう。
あとはOpenMP対応の関数がある。各スレッドのIDや全スレッド数を知るのに使うらしい。
全スレッドで共有する変数と各スレッドがバラバラに持つ(けれどもプログラム内での宣言は1つという)変数があるのは当然として、その区別法について。(原稿途中)
setenv OMP_NUM_THREADS n
環境変数を設定する。シェルによって書き方は異なる。(この例はcsh式)
#pragma omp parallel num_threads(n)
プログラムで設定する方法その1。プログラム内で静的に変更。
omp_set_num_threads(n);
プログラムで設定する方法その2。動的に変更できるはず。
意外と奥が深い for の分割。単純な例としては
#pragma omp parallel for
for( i = 0 ; i < 1000 ; i++ ) a[i] += a[i] * a[i];
だけですむ。4 スレッドを実行する環境だと
for( i = 0 ; i < 250 ; i++ ) a[i] += a[i] * a[i]; // スレッド0 の仕事
for( i = 250 ; i < 500 ; i++ ) a[i] += a[i] * a[i]; // スレッド1 の仕事
for( i = 500 ; i < 750 ; i++ ) a[i] += a[i] * a[i]; // スレッド2 の仕事
for( i = 750 ; i < 1000 ; i++ ) a[i] += a[i] * a[i]; // スレッド3 の仕事
の様に分割される。つまりブロック分割される、ということになる。
説明は面倒なので省略。指定の仕方は以下の通り。
#pragma omp parallel for schedule(割り当て方式, 割り当て幅)
for( i = 0 ; i < 1000 ; i++ ) function(i);
まず割り当て幅が簡単なので説明しておくと、1つのスレッドに割り当てるループの数を決める。例えばここに 10 を指定すると、
for( i = 0 ; i < 10 ; i++ ) function(i);
for( i = 10 ; i < 20 ; i++ ) function(i);
for( i = 20 ; i < 30 ; i++ ) function(i);
for( i = 30 ; i < 40 ; i++ ) function(i);
for( i = 40 ; i < 50 ; i++ ) function(i);
:
のようにループが分割される。で、この沢山のループをどのスレッドに任せるか、ということを決めるのが割り当て方式だ。これには主に static か dynamic の2種類が指定できる(OpenMP の仕様にはほかに guided や runtime などもあるが、ここでは扱わない)。
まずstaticを指定すると、ブロックサイクリック分割される。つまり 3 スレッドの環境では割り当てが
for( i = 0 ; i < 10 ; i++ ) function(i); // スレッド0 の仕事
for( i = 10 ; i < 20 ; i++ ) function(i); // スレッド1 の仕事
for( i = 20 ; i < 30 ; i++ ) function(i); // スレッド2 の仕事
for( i = 30 ; i < 40 ; i++ ) function(i); // スレッド0 の仕事
for( i = 40 ; i < 50 ; i++ ) function(i); // スレッド1 の仕事
:
のように、0→1→2→0→1→2→…と順番に振られていく。一方、dynamicを指定した場合は、処理が終わった順に振られていく。つまり(という程分かりやすくなっているかがわからないが)待ち行列(キュー)に分割されたループが(順番に)入っていて、処理が終わって暇をしているスレッドがいればそこへ割り当てられる、ということだ。
int t[] = { 3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5, 8, 9, 7, 9};
#pragma omp parallel for schedule(dynamic, 1)
for( i = 0 ; i < sizeof(t) / sizeof(t[0]) ; i++ ) {
printf("%dth job is on CPU[%d]\n", i, omp_get_thread_num());
sleep(t[i]);
printf("%dth job took %d sec.\n", i, t[i]);
}
なんかを実行してみればどうなってるか分かりやすいかもしれない。参考までに、上記のプログラムに必要なものをつけて動かした結果は
8スレッド環境
0th job is on CPU[4]
1th job is on CPU[1]
6th job is on CPU[2]
7th job is on CPU[5]
5th job is on CPU[0]
4th job is on CPU[6]
2th job is on CPU[3]
3th job is on CPU[7]
1th job took 1 sec.
8th job is on CPU[1]
3th job took 1 sec.
9th job is on CPU[7]
6th job took 2 sec.
10th job is on CPU[2]
0th job took 3 sec.
11th job is on CPU[4]
2th job took 4 sec.
12th job is on CPU[3]
9th job took 3 sec.
13th job is on CPU[7]
4th job took 5 sec.
14th job is on CPU[6]
8th job took 5 sec.
7th job took 6 sec.
10th job took 5 sec.
5th job took 9 sec.
11th job took 8 sec.
13th job took 7 sec.
12th job took 9 sec.
14th job took 9 sec.
6スレッド環境
0th job is on CPU[0]
1th job is on CPU[4]
4th job is on CPU[5]
2th job is on CPU[3]
5th job is on CPU[2]
3th job is on CPU[1]
1th job took 1 sec.
6th job is on CPU[4]
3th job took 1 sec.
7th job is on CPU[1]
0th job took 3 sec.
8th job is on CPU[0]
6th job took 2 sec.
9th job is on CPU[4]
2th job took 4 sec.
10th job is on CPU[3]
4th job took 5 sec.
11th job is on CPU[5]
9th job took 3 sec.
12th job is on CPU[4]
7th job took 6 sec.
13th job is on CPU[1]
8th job took 5 sec.
14th job is on CPU[0]
10th job took 5 sec.
5th job took 9 sec.
11th job took 8 sec.
13th job took 7 sec.
12th job took 9 sec.
14th job took 9 sec.
4スレッド環境
0th job is on CPU[0]
3th job is on CPU[2]
1th job is on CPU[1]
2th job is on CPU[3]
3th job took 1 sec.
4th job is on CPU[2]
1th job took 1 sec.
5th job is on CPU[1]
0th job took 3 sec.
6th job is on CPU[0]
2th job took 4 sec.
7th job is on CPU[3]
6th job took 2 sec.
8th job is on CPU[0]
4th job took 5 sec.
9th job is on CPU[2]
9th job took 3 sec.
10th job is on CPU[2]
8th job took 5 sec.
11th job is on CPU[0]
5th job took 9 sec.
12th job is on CPU[1]
7th job took 6 sec.
13th job is on CPU[3]
10th job took 5 sec.
14th job is on CPU[2]
13th job took 7 sec.
11th job took 8 sec.
12th job took 9 sec.
14th job took 9 sec.
とまぁ、こんな感じになる。
ちなみに、ループの中身の処理時間を系統立てて表すことができる(つまり事前に見積もれる)場合には、この dynamic 割り当てはおすすめできない。特にその時間がループカウンタ i について単調増加ならなおさら。