MySQL LEFT/RIGHT JOIN算法效率分析

本文内容遵从CC版权协议, 可以随意转载, 但必须以超链接形式标明文章原始出处和作者信息及版权声明
网址: http://www.penglixun.com/tech/database/mysql_outer_join_analyse.html
上次讨论了MySQL INNER JOIN算法的效率,怪自己没仔细看官方文档:实际上MySQL对内联查询采用了“下推”(pushdown)的方法,详见官方文档。
理论上,下推也可以用到外联接上。我没看懂官方的那段伪代码,于是根据自己的想法写了一段测试代码,是在昨天代码基础上的改进。

 

下面是官方给出的采用下推的算法:

FOR each row t1 in T1 such that C1(t1) {
  BOOL f1:=FALSE;
  FOR each row t2 in T2
      such that P1(t1,t2) AND (f1?C2(t2):TRUE) {
    BOOL f2:=FALSE;
    FOR each row t3 in T3
        such that P2(t2,t3) AND (f1&&f2?C3(t3):TRUE) {
      IF (f1&&f2?TRUE:(C2(t2) AND C3(t3))) {
        t:=t1||t2||t3; OUTPUT t;
      }
      f2=TRUE;
      f1=TRUE;
    }
    IF (!f2) {
      IF (f1?TRUE:C2(t2) && P(t1,t2,NULL)) {
        t:=t1||t2||NULL; OUTPUT t;
      }
      f1=TRUE;
    }
  }
  IF (!f1 && P(t1,NULL,NULL)) {
      t:=t1||NULL||NULL; OUTPUT t;
  }
}

下面是我写的测试,包括内联查询和左联查询的测试:

#include <iostream>
#include <cstdlib>
#include <time.h>
// Number of rows in each simulated table (the size of arrays a, b, c, d);
// also the exclusive upper bound of the random values generated in init().
#define MAXN 10000
// print() writes only every LIMIT-th result row to stdout.
#define LIMIT 500
 
using namespace std;
 
// Minimal stopwatch built on clock().
// Usage: call begin(), run the work, call end(), then read get_time().
class Timer {
public :
	// Creates a timer with both marks at zero, so get_time() returns 0
	// until begin()/end() have been called.
	Timer () : start(0), finish(0) {}

	// Records the current clock() value as the start mark.
	void begin() { start = clock(); }

	// Records the current clock() value as the end mark.
	void end() { finish = clock(); }

	// Returns the time between the last begin() and end(), in seconds
	// (clock ticks divided by CLOCKS_PER_SEC).
	// The value is computed on demand, so no scratch member is needed and
	// the call is valid on a const Timer.
	double get_time() const {
		return static_cast<double>(finish - start) / CLOCKS_PER_SEC;
	}

private :
	clock_t start, finish;
};
 
// Simulated single-column tables, filled by init().
// a holds the values 0..MAXN-1 in order (a[i] == i).
int a[MAXN];
// b, c and d hold random values in [0, MAXN).
int b[MAXN];
int c[MAXN];
// d is populated by init() but not read by the join tests in this file.
int d[MAXN];
// Per-table filter ranges: p[t][0] is the lower bound and p[t][1] the upper
// bound. The join tests use rows 0..2 (for a, b, c); row 3 is generated only.
int p[4][2];
 
//初始化测试数据
void init () {
	srand(time(0));
	//参与关键查询的数据
	//cout << "a\tb\tc\td" << endl;
	for(int i=0; i<MAXN; ++i) {
		a[i] = i;
		b[i] = rand()%MAXN;
		c[i] = rand()%MAXN;
		d[i] = rand()%MAXN;
		//cout << a[i] << "\t"<< b[i] << "\t" << c[i] << "\t" << d[i] << endl;
	}
 
	//查询的限制条件
	for (int i=0; i<4; ++i) {
		cout << i << ": ";
		for (int j=0; j<2; ++j) {
			//p[i][0]随机一个小于MAXN的1/2的数,p[i][1]随机一个大于MAXN的1/2的数
			p[i][j] = rand()%(int)(MAXN/2) + (int)(MAXN/2)*j;
		}
		cout << p[i][0] << ", " << p[i][1] << endl;
	}
 
	return ;
}
 
// Counts one result row and prints a sample of the output.
//   cnt     - running row counter, incremented on every call
//   x, y, z - column values from tables a, b, c; -1 stands for SQL NULL
// Only every LIMIT-th row is written, as "Row <cnt>: x,y,z", with "N" shown
// for a NULL c column, or for NULL b and c columns.
void print(int &cnt,int x, int y, int z) {
	++cnt;
	if (cnt % LIMIT != 0) {
		return;
	}

	cout << "Row " << cnt << ": " << x << ',';
	if (z != -1) {
		cout << y << ',' << z << endl;
	} else if (y != -1) {
		cout << y << ',' << "N" << endl;
	} else {
		cout << "N" << ',' << "N" << endl;
	}
}
 
//内联查询测试
//SELECT * 
//FROM a INNER JOIN b ON a.id=b.id 
//INNER JOIN c ON b.id=c.id
//WHERE a.id BETWEEN p00 AND p01
//AND b.id BETWEEN p10 AND p11
//AND c.id BETWEEN p20 AND p11
void innerJoin () {
	int count1, count2;
	Timer timer;
	double time1, time2;
 
	cout << "====Inner Join Test====" << endl;
 
	//直接JOIN再判断筛选条件
	cout << "Test1:" << endl;
	count1 = 0;
	timer.begin ();
	for(int i=0; i<MAXN; ++i) {
		for(int j=0; j<MAXN; ++j) {
			if (a[i]==b[j]) {
				for(int k=0; k<MAXN; ++k) {
					if(b[j]==c[k]) {
						if (a[i]>p[0][0] && a[i]<p[0][1] \
						&& b[j]>p[1][0] && b[j]<p[1][1] \
						&& c[k]>p[2][0] && c[k]<p[2][1]) {
							print(count1, a[i], b[j], c[k]);
						}
					}
				}
			}
		}
	}
	timer.end ();
	time1 = timer.get_time();
 
	//先判断筛选条件再JOIN
	cout << "Test2:" << endl;
	count2 = 0;
	timer.begin ();
	for(int i=0; i<MAXN; ++i) {
		if (a[i]>p[0][0] && a[i]<p[0][1]) {
			for(int j=0; j<MAXN; ++j) {
				if (a[i]==b[j] && b[j]>p[1][0] && b[j]<p[1][1]) {
					for(int k=0; k<MAXN; ++k) {
						if(b[j]==c[k] && c[k]>p[2][0] && c[k]<p[2][1]) {
							print(count2, a[i], b[j], c[k]);
						}
					}
				}
			}
		}
	}
	timer.end ();
	time2 = timer.get_time();
 
	//校验数据的正确性并输出
	if(count1 == count2) {
		cout << endl;
		cout << count1 << " Rows: ";
		cout << time1 << " VS " << time2 << endl;
	}
 
	return ;
}
 
// Outer-join benchmark. Modelled on
//   SELECT *
//   FROM a LEFT JOIN
//   (b LEFT JOIN c ON b.id=c.id)
//   ON a.id=b.id
// with the range filters a.id in (p00, p01), b.id in (p10, p11) and
// c.id in (p20, p21); the bounds are exclusive in the code below.
// Rows without a match are emitted with -1 standing for NULL.
// Three nested-loop variants are timed; their row counts are compared at the
// end, and either the timings or an error line is printed.
// NOTE: the shape and order of the conditionals in each variant is what is
// being measured, so keep them as they are.
void leftJoin () {
	bool flag1, flag2;
	int count1, count2, count3;
	Timer timer;
	double time1, time2, time3;
 
	cout << "====Left Join Test====" << endl;
 
	// Test1: match on equality first, apply the range filters afterwards.
	cout << "Test1:" << endl;
	count1 = 0;
	timer.begin ();
	for(int i=0; i<MAXN; ++i) {
		flag1 = false; // set once any row has been output for a[i]
		for(int j=0; j<MAXN; ++j) {
			if (a[i]==b[j]) {
				flag2 = false; // set once a c row has been output for this (a[i], b[j]) pair
				for(int k=0; k<MAXN; ++k) {
					if(b[j]==c[k]) {
						if (a[i]>p[0][0] && a[i]<p[0][1] \
						&& b[j]>p[1][0] && b[j]<p[1][1] \
						&& c[k]>p[2][0] && c[k]<p[2][1]) {
							print(count1, a[i], b[j], c[k]);
							flag2 = true;
							flag1 = true;
						}
					}
				}
				// No c row was output for this pair: emit (a, b, NULL) if a and b pass their filters.
				if(!flag2) {
					if(a[i]>p[0][0] && a[i]<p[0][1] \
					&& b[j]>p[1][0] && b[j]<p[1][1]) {
						print(count1, a[i], b[j], -1);
						flag1 = true;
					}
				}
			}
		}
		// Nothing was output for a[i]: emit (a, NULL, NULL) if a passes its filter.
		if(!flag1) {
			if(a[i]>p[0][0] && a[i]<p[0][1]) {
				print(count1, a[i], -1, -1);
			}
		}	
	}
 
	timer.end ();
	time1 = timer.get_time();
 
	// Test2: check each table's range filter at the loop level where that
	// table is scanned, in the same `if` as the equality test.
	cout << "Test2:" << endl;
	count2 = 0;
	timer.begin ();
	for(int i=0; i<MAXN; ++i) {
		flag1 = false;
		if (a[i]>p[0][0] && a[i]<p[0][1]) {
			for(int j=0; j<MAXN; ++j) {
				if (a[i]==b[j] && b[j]>p[1][0] && b[j]<p[1][1]) {
					flag2 = false;
					for(int k=0; k<MAXN; ++k) {
						if(b[j]==c[k] && c[k]>p[2][0] && c[k]<p[2][1]) {
							print(count2, a[i], b[j], c[k]);
							flag1 = true;
							flag2 = true;
						}
					}
					// a and b already passed their filters here, so no re-check is needed.
					if(!flag2) {
						print(count2, a[i], b[j], -1);
						flag1 = true;
					}
				}
			}
			if(!flag1) {
				print(count2, a[i], -1, -1);
			}
		}
	}
	timer.end ();
	time2 = timer.get_time();
 
	// Test3: test the range filter first and the equality second, in separate
	// nested `if`s; otherwise essentially the same as Test2.
	cout << "Test3:" << endl;
	count3 = 0;
	timer.begin ();
	for(int i=0; i<MAXN; ++i) {
		if (a[i]>p[0][0] && a[i]<p[0][1]) {
			flag1 = false;
			for(int j=0; j<MAXN; ++j) {
				if (b[j]>p[1][0] && b[j]<p[1][1]) {
					if(a[i]==b[j]) {
						flag2 = false;
						for(int k=0; k<MAXN; ++k) {
							if(c[k]>p[2][0] && c[k]<p[2][1]) {
								if(b[j]==c[k]) {
									print(count3, a[i], b[j], c[k]);
									flag1 = true;
									flag2 = true;
								}
							}
						}
						if(!flag2) {
							print(count3, a[i], b[j], -1);
							flag1 = true;
						}
					}
				}
			}
			if(!flag1) {
				print(count3, a[i], -1, -1);
			}
		}
	}
	timer.end ();
	time3 = timer.get_time();
 
	// Sanity check: all three variants must return the same number of rows.
	if(count1==count2 && count2==count3) {
		cout << endl;
		cout << "Fetch Rows: " << count1 << endl;
		cout << time1 << " VS " << time2 << " VS " << time3 << endl;
	}
	else {
		cout << "Error: " << count1 << " <> " << count2 << " <> " << count3 << endl;
	}
 
	return ;
}
 
// Entry point: generate the random data set, then run the inner-join and
// left-join benchmarks in that order.
int main() {
	init();
	innerJoin();
	leftJoin();
}

对于左联查询,我的测试结果是:Test2的方法好于Test1;Test3虽然也加入了选择条件判断,却经常还不如Test1。
主要原因应该是:C++的&&是短路求值,AND条件中只要有一个不满足就立即判定为false,不再计算后面的条件;所以把条件拆成多层if并不会减少比较次数,if越少判断越快。
欢迎探讨MySQL的算法实现和效率。

你可能感兴趣的:(mysql,效率,分析,right join,Outer Join)