本文内容遵从CC版权协议, 可以随意转载, 但必须以超链接形式标明文章原始出处和作者信息及版权声明
网址: http://www.penglixun.com/tech/database/mysql_outer_join_analyse.html 上次讨论了MySQL INNER JOIN算法的效率,怪自己没看仔细官方文档,实际上MySQL对内联查询采用了“下推”的方法,见官方文档。
理论上下推也是可以用到外联接上的,没看懂官方的那段伪代码,根据自己的想法写了一段测试代码,就是昨天代码的改进。
下面是官方给出的采用下推的算法:
FOR each row t1 in T1 such that C1(t1) { BOOL f1:=FALSE; FOR each row t2 in T2 such that P1(t1,t2) AND (f1?C2(t2):TRUE) { BOOL f2:=FALSE; FOR each row t3 in T3 such that P2(t2,t3) AND (f1&&f2?C3(t3):TRUE) { IF (f1&&f2?TRUE:(C2(t2) AND C3(t3))) { t:=t1||t2||t3; OUTPUT t; } f2=TRUE; f1=TRUE; } IF (!f2) { IF (f1?TRUE:C2(t2) && P(t1,t2,NULL)) { t:=t1||t2||NULL; OUTPUT t; } f1=TRUE; } } IF (!f1 && P(t1,NULL,NULL)) { t:=t1||NULL||NULL; OUTPUT t; } } |
下面是我写的测试,包括内联查询和左联查询的测试:
#include <iostream> #include <cstdlib> #include <time.h> #define MAXN 10000 #define LIMIT 500 using namespace std; //计时器 class Timer { public : //构造函数 Timer (); //析构函数 ~Timer (); //开始计时 void begin(); //计时结束 void end(); //获取时间 double get_time(); private : clock_t start, finish; double time; }; Timer::Timer () { start = 0; finish = 0; } Timer::~Timer () { start = 0; finish = 0; } void Timer::begin () { start = clock(); } void Timer::end () { finish = clock(); } double Timer::get_time() { time = (double)(finish-start)/CLOCKS_PER_SEC; return time; } int a[MAXN]; int b[MAXN]; int c[MAXN]; int d[MAXN]; int p[4][2]; //初始化测试数据 void init () { srand(time(0)); //参与关键查询的数据 //cout << "a\tb\tc\td" << endl; for(int i=0; i<MAXN; ++i) { a[i] = i; b[i] = rand()%MAXN; c[i] = rand()%MAXN; d[i] = rand()%MAXN; //cout << a[i] << "\t"<< b[i] << "\t" << c[i] << "\t" << d[i] << endl; } //查询的限制条件 for (int i=0; i<4; ++i) { cout << i << ": "; for (int j=0; j<2; ++j) { //p[i][0]随机一个小于MAXN的1/2的数,p[i][1]随机一个大于MAXN的1/2的数 p[i][j] = rand()%(int)(MAXN/2) + (int)(MAXN/2)*j; } cout << p[i][0] << ", " << p[i][1] << endl; } return ; } //格式化打印 void print(int &cnt,int x, int y, int z) { cnt++; if (cnt%LIMIT == 0) { cout << "Row " << cnt << ": "; if (z!=-1) { cout << x << ',' << y << ',' << z << endl; } else { if (y!=-1) { cout << x << ',' << y << ',' << "N" << endl; } else { cout << x << ',' << "N" << ',' << "N" << endl; } } } return ; } //内联查询测试 //SELECT * //FROM a INNER JOIN b ON a.id=b.id //INNER JOIN c ON b.id=c.id //WHERE a.id BETWEEN p00 AND p01 //AND b.id BETWEEN p10 AND p11 //AND c.id BETWEEN p20 AND p11 void innerJoin () { int count1, count2; Timer timer; double time1, time2; cout << "====Inner Join Test====" << endl; //直接JOIN再判断筛选条件 cout << "Test1:" << endl; count1 = 0; timer.begin (); for(int i=0; i<MAXN; ++i) { for(int j=0; j<MAXN; ++j) { if (a[i]==b[j]) { for(int k=0; k<MAXN; ++k) { if(b[j]==c[k]) { if (a[i]>p[0][0] && a[i]<p[0][1] \ && b[j]>p[1][0] && b[j]<p[1][1] \ && c[k]>p[2][0] && c[k]<p[2][1]) { print(count1, a[i], b[j], c[k]); } } } } } } timer.end (); time1 = timer.get_time(); //先判断筛选条件再JOIN cout << "Test2:" << endl; count2 = 0; timer.begin (); for(int i=0; i<MAXN; ++i) { if (a[i]>p[0][0] && a[i]<p[0][1]) { for(int j=0; j<MAXN; ++j) { if (a[i]==b[j] && b[j]>p[1][0] && b[j]<p[1][1]) { for(int k=0; k<MAXN; ++k) { if(b[j]==c[k] && c[k]>p[2][0] && c[k]<p[2][1]) { print(count2, a[i], b[j], c[k]); } } } } } } timer.end (); time2 = timer.get_time(); //校验数据的正确性并输出 if(count1 == count2) { cout << endl; cout << count1 << " Rows: "; cout << time1 << " VS " << time2 << endl; } return ; } //外联查询测试 //SELECT * //FROM a LEFT JOIN //(b LEFT JOIN c ON b.id=c.id) //ON a.id=b.id //WHERE a.id BETWEEN p00 AND p01 //AND b.id BETWEEN p10 AND p11 //AND c.id BETWEEN p20 AND p11 void leftJoin () { bool flag1, flag2; int count1, count2, count3; Timer timer; double time1, time2, time3; cout << "====Left Join Test====" << endl; //先JOIN再判断条件 cout << "Test1:" << endl; count1 = 0; timer.begin (); for(int i=0; i<MAXN; ++i) { flag1 = false; //标记a,b表是否有匹配行 for(int j=0; j<MAXN; ++j) { if (a[i]==b[j]) { flag2 = false; //标记b,c表是否有匹配行 for(int k=0; k<MAXN; ++k) { if(b[j]==c[k]) { if (a[i]>p[0][0] && a[i]<p[0][1] \ && b[j]>p[1][0] && b[j]<p[1][1] \ && c[k]>p[2][0] && c[k]<p[2][1]) { print(count1, a[i], b[j], c[k]); flag2 = true; flag1 = true; } } } if(!flag2) { if(a[i]>p[0][0] && a[i]<p[0][1] \ && b[j]>p[1][0] && b[j]<p[1][1]) { print(count1, a[i], b[j], -1); flag1 = true; } } } } if(!flag1) { if(a[i]>p[0][0] && a[i]<p[0][1]) { print(count1, a[i], -1, -1); } } } timer.end (); time1 = timer.get_time(); //先判断条件再JOIN cout << "Test2:" << endl; count2 = 0; timer.begin (); for(int i=0; i<MAXN; ++i) { flag1 = false; if (a[i]>p[0][0] && a[i]<p[0][1]) { for(int j=0; j<MAXN; ++j) { if (a[i]==b[j] && b[j]>p[1][0] && b[j]<p[1][1]) { flag2 = false; for(int k=0; k<MAXN; ++k) { if(b[j]==c[k] && c[k]>p[2][0] && c[k]<p[2][1]) { print(count2, a[i], b[j], c[k]); flag1 = true; flag2 = true; } } if(!flag2) { print(count2, a[i], b[j], -1); flag1 = true; } } } if(!flag1) { print(count2, a[i], -1, -1); } } } timer.end (); time2 = timer.get_time(); //先判断选择条件再判断关联条件再JOIN,其实跟上面基本是一样的 cout << "Test3:" << endl; count3 = 0; timer.begin (); for(int i=0; i<MAXN; ++i) { if (a[i]>p[0][0] && a[i]<p[0][1]) { flag1 = false; for(int j=0; j<MAXN; ++j) { if (b[j]>p[1][0] && b[j]<p[1][1]) { if(a[i]==b[j]) { flag2 = false; for(int k=0; k<MAXN; ++k) { if(c[k]>p[2][0] && c[k]<p[2][1]) { if(b[j]==c[k]) { print(count3, a[i], b[j], c[k]); flag1 = true; flag2 = true; } } } if(!flag2) { print(count3, a[i], b[j], -1); flag1 = true; } } } } if(!flag1) { print(count3, a[i], -1, -1); } } } timer.end (); time3 = timer.get_time(); //校验数据的正确性并输出 if(count1==count2 && count2==count3) { cout << endl; cout << "Fetch Rows: " << count1 << endl; cout << time1 << " VS " << time2 << " VS " << time3 << endl; } else { cout << "Error: " << count1 << " <> " << count2 << " <> " << count3 << endl; } return ; } int main() { init(); innerJoin(); leftJoin(); return 0; } |
对于左联查询,我的测试结果是,Test2的方法好于Test1,Test3经常还不如Test1,虽然加入了选择条件判断。
主要原因应该是,C++判断AND条件只要有一个不满足就判定为false,于是if越少判断越快。
欢迎探讨MySQL的算法实现和效率。