[Open Source] Hive Data Warehouse Data Lineage Analysis Tool: SQL Parsing

The code is as follows: the LineParser class. (The Block, ColLine and QueryTree beans it depends on are not included in this excerpt; inferred sketches of them follow the listing.)

package com.xiaoju.products.parse;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Stack;

import org.antlr.runtime.tree.Tree;
import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer;
import org.apache.hadoop.hive.ql.parse.HiveParser;
import org.apache.hadoop.hive.ql.parse.ParseDriver;

import com.xiaoju.products.bean.Block;
import com.xiaoju.products.bean.ColLine;
import com.xiaoju.products.bean.QueryTree;
import com.xiaoju.products.exception.SQLParseException;
import com.xiaoju.products.exception.UnSupportedException;
import com.xiaoju.products.util.Check;
import com.xiaoju.products.util.MetaCache;
import com.xiaoju.products.util.NumberUtil;
import com.xiaoju.products.util.ParseUtil;
import com.xiaoju.products.util.PropertyFileUtil;

/**
 * Hive SQL parser.
 *
 * Goal: parse HQL statements and extract the input/output tables, the columns and the
 * conditions applied to them, as the foundation for column-level data lineage.
 * Focus: capture the tables and columns touched by SELECT operations; other operations
 * are likewise resolved down to the column level.
 * Approach: depth-first traversal of the AST. When an operation token is met, the current
 * operation is identified; when a sub-clause is met, the current state is pushed onto a
 * stack while the sub-clause is processed, then popped afterwards. While processing a
 * clause, each subquery's information is saved and related to its parent query, eventually
 * forming a tree; columns and conditions encountered along the way are collected into
 * Block objects via nested calls.
 *
 * Key points:
 * 1. TOK_TAB / TOK_TABREF identify the table of the current operation.
 * 2. Stack state decides whether we are inside a join and what the join condition is.
 * 3. The Block structure collects the columns and conditions found under WHERE/SELECT/JOIN.
 * 4. The ColLine structure: on TOK_SUBQUERY the current subquery info is saved for use by the parent query.
 * 5. The ColLine structure: when a TOK_UNION ends, the column info of both branches is merged and truncated.
 * 6. For SELECT * or columns without an explicit table, table metadata is consulted.
 * 7. The parse result is validated afterwards.
 *
 * Scope:
 * 1. Standard SQL is supported.
 * 2. TRANSFORM ... USING script is not supported.
 *
 * @author yangyangthomas
 */
public class LineParser {

    private static final String SPLIT_DOT = ".";
    private static final String SPLIT_COMMA = ",";
    private static final String SPLIT_AND = "&";
    private static final String TOK_EOF = "<EOF>"; // the "<EOF>" literal is stripped in this copy of the source
    private static final String CON_WHERE = "WHERE:";
    private static final String TOK_TMP_FILE = "TOK_TMP_FILE";

    private Map<String, List<String>> dbMap = new HashMap<String, List<String>>();
    private List<QueryTree> queryTreeList = new ArrayList<QueryTree>(); // tree of subquery relations
    private Stack<Set<String>> conditionsStack = new Stack<Set<String>>();
    private Stack<List<ColLine>> colsStack = new Stack<List<ColLine>>();
    private Map<String, List<ColLine>> resultQueryMap = new HashMap<String, List<ColLine>>();
    private Set<String> conditions = new HashSet<String>(); // cache of where/join conditions
    private List<ColLine> cols = new ArrayList<ColLine>();  // cache of columns within one subquery

    private Stack<String> tableNameStack = new Stack<String>();
    private Stack<Boolean> joinStack = new Stack<Boolean>();
    private Stack<ASTNode> joinOnStack = new Stack<ASTNode>();

    private Map<String, QueryTree> queryMap = new HashMap<String, QueryTree>();
    private boolean joinClause = false;
    private ASTNode joinOn = null;
    private String nowQueryDB = "default"; // Hive's default database
    private boolean isCreateTable = false;

    // results
    private List<ColLine> colLines = new ArrayList<ColLine>();
    private Set<String> outputTables = new HashSet<String>();
    private Set<String> inputTables = new HashSet<String>();

    private List<ColLine> tmpColLines = new ArrayList<ColLine>();
    private Set<String> tmpOutputTables = new HashSet<String>();
    private Set<String> tmpInputTables = new HashSet<String>();

    public List<ColLine> getColLines() {
        return colLines;
    }
    public Set<String> getOutputTables() {
        return outputTables;
    }
    public Set<String> getInputTables() {
        return inputTables;
    }

    private void parseIteral(ASTNode ast) {
        prepareToParseCurrentNodeAndChilds(ast);
        parseChildNodes(ast);
        parseCurrentNode(ast);
        endParseCurrentNode(ast);
    }

    /**
     * Parse the current node.
     * @param ast
     */
    private void parseCurrentNode(ASTNode ast) {
        if (ast.getToken() != null) {
            switch (ast.getToken().getType()) {
            case HiveParser.TOK_CREATETABLE: // output table
                isCreateTable = true;
                String tableOut = fillDB(BaseSemanticAnalyzer.getUnescapedName((ASTNode) ast.getChild(0)));
                tmpOutputTables.add(tableOut);
                MetaCache.getInstance().init(tableOut); // preload metadata for later use
                break;
            case HiveParser.TOK_TAB: // output table
                String tableTab = BaseSemanticAnalyzer.getUnescapedName((ASTNode) ast.getChild(0));
                String tableOut2 = fillDB(tableTab);
                tmpOutputTables.add(tableOut2);
                MetaCache.getInstance().init(tableOut2); // preload metadata for later use
                break;
            case HiveParser.TOK_TABREF: // input table
                ASTNode tabTree = (ASTNode) ast.getChild(0);
                String tableInFull = fillDB((tabTree.getChildCount() == 1)
                        ? BaseSemanticAnalyzer.getUnescapedName((ASTNode) tabTree.getChild(0))
                        : BaseSemanticAnalyzer.getUnescapedName((ASTNode) tabTree.getChild(0))
                            + SPLIT_DOT
                            + BaseSemanticAnalyzer.getUnescapedName((ASTNode) tabTree.getChild(1)));
                String tableIn = tableInFull.substring(tableInFull.indexOf(SPLIT_DOT) + 1);
                tmpInputTables.add(tableInFull);
                MetaCache.getInstance().init(tableInFull); // preload metadata for later use
                queryMap.clear();
                String alia = null;
                if (ast.getChild(1) != null) { // (TOK_TABREF (TOK_TABNAME detail usersequence_client) c)
                    alia = ast.getChild(1).getText().toLowerCase();
                    QueryTree qt = new QueryTree();
                    qt.setCurrent(alia);
                    qt.getTableSet().add(tableInFull);
                    QueryTree pTree = getSubQueryParent(ast);
                    qt.setpId(pTree.getpId());
                    qt.setParent(pTree.getParent());
                    queryTreeList.add(qt);
                    if (joinClause && ast.getParent() == joinOn) {
                        // TOK_SUBQUERY join TOK_TABREF: the TOK_SUBQUERY info must not be cleared here
                        for (QueryTree entry : queryTreeList) { // current query scope
                            if (qt.getParent().equals(entry.getParent())) {
                                queryMap.put(entry.getCurrent(), entry);
                            }
                        }
                    } else {
                        queryMap.put(qt.getCurrent(), qt);
                    }
                } else {
                    alia = tableIn.toLowerCase();
                    QueryTree qt = new QueryTree();
                    qt.setCurrent(alia);
                    qt.getTableSet().add(tableInFull);
                    QueryTree pTree = getSubQueryParent(ast);
                    qt.setpId(pTree.getpId());
                    qt.setParent(pTree.getParent());
                    queryTreeList.add(qt);
                    if (joinClause && ast.getParent() == joinOn) {
                        for (QueryTree entry : queryTreeList) {
                            if (qt.getParent().equals(entry.getParent())) {
                                queryMap.put(entry.getCurrent(), entry);
                            }
                        }
                    } else {
                        queryMap.put(qt.getCurrent(), qt);
                        // handles queries like: select app.t1.c1, t1.c1 from t1
                        queryMap.put(tableInFull.toLowerCase(), qt);
                    }
                }
                break;
            case HiveParser.TOK_SUBQUERY:
                if (ast.getChildCount() == 2) {
                    String tableAlias = BaseSemanticAnalyzer.unescapeIdentifier(ast.getChild(1).getText());
                    String aliaReal = "";
                    if (aliaReal.length() != 0) {
                        aliaReal = aliaReal.substring(0, aliaReal.length() - 1);
                    }
                    QueryTree qt = new QueryTree();
                    qt.setCurrent(tableAlias.toLowerCase());
                    qt.setColLineList(generateColLineList(cols, conditions));
                    QueryTree pTree = getSubQueryParent(ast);
                    qt.setId(generateTreeId(ast));
                    qt.setpId(pTree.getpId());
                    qt.setParent(pTree.getParent());
                    qt.setChildList(getSubQueryChilds(qt.getId()));
                    if (Check.notEmpty(qt.getChildList())) {
                        for (QueryTree cqt : qt.getChildList()) {
                            qt.getTableSet().addAll(cqt.getTableSet());
                            queryTreeList.remove(cqt); // drop the child node info
                        }
                    }
                    queryTreeList.add(qt);
                    cols.clear();
                    queryMap.clear();
                    for (QueryTree _qt : queryTreeList) {
                        if (qt.getParent().equals(_qt.getParent())) { // keep only the current subquery
                            queryMap.put(_qt.getCurrent(), _qt);
                        }
                    }
                }
                break;
            case HiveParser.TOK_SELEXPR: // input/output column handling
                /*
                 * (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE))
                 * (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))
                 *
                 * (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE))
                 * (TOK_SELECT
                 *   (TOK_SELEXPR (. (TOK_TABLE_OR_COL p) datekey) datekey)
                 *   (TOK_SELEXPR (TOK_TABLE_OR_COL datekey))
                 *   (TOK_SELEXPR (TOK_FUNCTIONDI count (. (TOK_TABLE_OR_COL base) userid)) buyer_count))
                 *   (TOK_SELEXPR (TOK_FUNCTION when (> (. (TOK_TABLE_OR_COL base) userid) 5)
                 *     (. (TOK_TABLE_OR_COL base) clienttype) (> (. (TOK_TABLE_OR_COL base) userid) 1)
                 *     (+ (. (TOK_TABLE_OR_COL base) datekey) 5)
                 *     (+ (. (TOK_TABLE_OR_COL base) clienttype) 1)) bbbaaa)
                 */
                // resolve the table being inserted into
                Tree tok_insert = ast.getParent().getParent();
                Tree child = tok_insert.getChild(0).getChild(0);
                String tName = BaseSemanticAnalyzer.getUnescapedName((ASTNode) child.getChild(0));
                String destTable = TOK_TMP_FILE.equals(tName) ? TOK_TMP_FILE : fillDB(tName);

                // handles "select a.*, * from t1" and "select * from (select c1 as a, c2 from t1) t"
                if (ast.getChild(0).getType() == HiveParser.TOK_ALLCOLREF) {
                    String tableOrAlias = "";
                    if (ast.getChild(0).getChild(0) != null) {
                        tableOrAlias = ast.getChild(0).getChild(0).getChild(0).getText();
                    }
                    String[] result = getTableAndAlia(tableOrAlias);
                    String _alia = result[1];

                    boolean isSub = false; // the nested "select *" case
                    if (Check.notEmpty(_alia)) {
                        for (String string : _alia.split(SPLIT_AND)) { // look up while iterating
                            QueryTree qt = queryMap.get(string.toLowerCase());
                            if (null != qt) {
                                List<ColLine> colLineList = qt.getColLineList();
                                if (Check.notEmpty(colLineList)) {
                                    isSub = true;
                                    for (ColLine colLine : colLineList) {
                                        cols.add(colLine);
                                    }
                                }
                            }
                        }
                    }

                    if (!isSub) { // the plain "select *" case
                        String nowTable = result[0];
                        String[] tableArr = nowTable.split(SPLIT_AND); // fact.test&test2
                        for (String tables : tableArr) {
                            String[] split = tables.split("\\.");
                            if (split.length > 2) {
                                throw new SQLParseException("parse table:" + nowTable);
                            }
                            List<String> colByTab = MetaCache.getInstance().getColumnByDBAndTable(tables);
                            for (String column : colByTab) {
                                Set<String> fromNameSet = new LinkedHashSet<String>();
                                fromNameSet.add(tables + SPLIT_DOT + column);
                                ColLine cl = new ColLine(column, tables + SPLIT_DOT + column,
                                        fromNameSet, new LinkedHashSet<String>(), destTable, column);
                                cols.add(cl);
                            }
                        }
                    }
                } else {
                    Block bk = getBlockIteral((ASTNode) ast.getChild(0));
                    String toNameParse = getToNameParse(ast, bk);
                    Set<String> fromNameSet = filterData(bk.getColSet());
                    ColLine cl = new ColLine(toNameParse, bk.getCondition(), fromNameSet,
                            new LinkedHashSet<String>(), destTable, "");
                    cols.add(cl);
                }
                break;
            case HiveParser.TOK_WHERE: // 3. filter conditions: the WHERE (select) case
                conditions.add(CON_WHERE + getBlockIteral((ASTNode) ast.getChild(0)).getCondition());
                break;
            default:
                /*
                 * (or
                 *   (> (. (TOK_TABLE_OR_COL p) orderid) (. (TOK_TABLE_OR_COL c) orderid))
                 *   (and (= (. (TOK_TABLE_OR_COL p) a) (. (TOK_TABLE_OR_COL c) b))
                 *        (= (. (TOK_TABLE_OR_COL p) aaa) (. (TOK_TABLE_OR_COL c) bbb))))
                 */
                // 1. filter conditions: the JOIN case
                if (joinOn != null && joinOn.getTokenStartIndex() == ast.getTokenStartIndex()
                        && joinOn.getTokenStopIndex() == ast.getTokenStopIndex()) {
                    ASTNode astCon = (ASTNode) ast.getChild(2);
                    conditions.add(ast.getText().substring(4) + ":" + getBlockIteral(astCon).getCondition());
                    break;
                }
            }
        }
    }

    /*
     * Find the parent subquery node of the current node.
     * @param ast
     */
    private QueryTree getSubQueryParent(Tree ast) {
        Tree _tree = ast;
        QueryTree qt = new QueryTree();
        while (!(_tree = _tree.getParent()).isNil()) {
            if (_tree.getType() == HiveParser.TOK_SUBQUERY) {
                qt.setpId(generateTreeId(_tree));
                qt.setParent(BaseSemanticAnalyzer.getUnescapedName((ASTNode) _tree.getChild(1)));
                return qt;
            }
        }
        qt.setpId(-1);
        qt.setParent("NIL");
        return qt;
    }

    private int generateTreeId(Tree tree) {
        return tree.getTokenStartIndex() + tree.getTokenStopIndex();
    }

    /*
     * Find the child subquery nodes of the current node (by index).
     * @param id
     */
    private List<QueryTree> getSubQueryChilds(int id) {
        List<QueryTree> list = new ArrayList<QueryTree>();
        for (int i = 0; i < queryTreeList.size(); i++) {
            QueryTree qt = queryTreeList.get(i);
            if (id == qt.getpId()) {
                list.add(qt);
            }
        }
        return list;
    }

    /*
     * Get the output name to parse.
     * @param ast
     * @param bk
     * @return
     */
    private String getToNameParse(ASTNode ast, Block bk) {
        String alia = "";
        Tree child = ast.getChild(0);
        if (ast.getChild(1) != null) { // with alias: ip as alia
            alia = ast.getChild(1).getText();
        } else if (child.getType() == HiveParser.DOT // no alias: a.ip
                && child.getChild(0).getType() == HiveParser.TOK_TABLE_OR_COL
                && child.getChild(0).getChildCount() == 1
                && child.getChild(1).getType() == HiveParser.Identifier) {
            alia = BaseSemanticAnalyzer.unescapeIdentifier(child.getChild(1).getText());
        } else if (child.getType() == HiveParser.TOK_TABLE_OR_COL // no alias: ip
                && child.getChildCount() == 1
                && child.getChild(0).getType() == HiveParser.Identifier) {
            alia = BaseSemanticAnalyzer.unescapeIdentifier(child.getChild(0).getText());
        }
        return alia;
    }

    /*
     * Build the parse block; used mainly for the WHERE, JOIN and SELECT parts, e.g.:
     *   where a=1
     *   t1 join t2 on t1.col1=t2.col1 and t1.col2=123
     *   select count(distinct col1) from t1
     * @param ast
     * @return
     */
    private Block getBlockIteral(ASTNode ast) {
        if (ast.getType() == HiveParser.KW_OR || ast.getType() == HiveParser.KW_AND) {
            Block bk1 = getBlockIteral((ASTNode) ast.getChild(0));
            Block bk2 = getBlockIteral((ASTNode) ast.getChild(1));
            bk1.getColSet().addAll(bk2.getColSet());
            bk1.setCondition("(" + bk1.getCondition() + " " + ast.getText() + " " + bk2.getCondition() + ")");
            return bk1;
        } else if (ast.getType() == HiveParser.NOTEQUAL // comparison operators: > < like in ...
                || ast.getType() == HiveParser.EQUAL
                || ast.getType() == HiveParser.LESSTHAN
                || ast.getType() == HiveParser.LESSTHANOREQUALTO
                || ast.getType() == HiveParser.GREATERTHAN
                || ast.getType() == HiveParser.GREATERTHANOREQUALTO
                || ast.getType() == HiveParser.KW_LIKE
                || ast.getType() == HiveParser.DIVIDE
                || ast.getType() == HiveParser.PLUS
                || ast.getType() == HiveParser.MINUS
                || ast.getType() == HiveParser.STAR
                || ast.getType() == HiveParser.MOD
                || ast.getType() == HiveParser.AMPERSAND
                || ast.getType() == HiveParser.TILDE
                || ast.getType() == HiveParser.BITWISEOR
                || ast.getType() == HiveParser.BITWISEXOR) {
            Block bk1 = getBlockIteral((ASTNode) ast.getChild(0));
            if (ast.getChild(1) == null) { // e.g. unary minus: -1
                bk1.setCondition(ast.getText() + bk1.getCondition());
            } else {
                Block bk2 = getBlockIteral((ASTNode) ast.getChild(1));
                bk1.getColSet().addAll(bk2.getColSet());
                bk1.setCondition(bk1.getCondition() + " " + ast.getText() + " " + bk2.getCondition());
            }
            return bk1;
        } else if (ast.getType() == HiveParser.TOK_FUNCTIONDI) {
            Block col = getBlockIteral((ASTNode) ast.getChild(1));
            String condition = ast.getChild(0).getText();
            col.setCondition(condition + "(distinct (" + col.getCondition() + "))");
            return col;
        } else if (ast.getType() == HiveParser.TOK_FUNCTION) {
            String fun = ast.getChild(0).getText();
            Block col = ast.getChild(1) == null ? new Block() : getBlockIteral((ASTNode) ast.getChild(1));
            if ("when".equalsIgnoreCase(fun)) {
                col.setCondition(getWhenCondition(ast));
                Set<Block> processChilds = processChilds(ast, 1);
                col.getColSet().addAll(bkToCols(col, processChilds));
                return col;
            } else if ("IN".equalsIgnoreCase(fun)) {
                col.setCondition(col.getCondition() + " in (" + blockCondToString(processChilds(ast, 2)) + ")");
                return col;
            } else if ("TOK_ISNOTNULL".equalsIgnoreCase(fun) // is null / is not null
                    || "TOK_ISNULL".equalsIgnoreCase(fun)) {
                col.setCondition(col.getCondition() + " " + fun.toLowerCase().substring(4));
                return col;
            } else if ("BETWEEN".equalsIgnoreCase(fun)) {
                col.setCondition(getBlockIteral((ASTNode) ast.getChild(2)).getCondition()
                        + " between " + getBlockIteral((ASTNode) ast.getChild(3)).getCondition()
                        + " and " + getBlockIteral((ASTNode) ast.getChild(4)).getCondition());
                return col;
            }
            Set<Block> processChilds = processChilds(ast, 1);
            col.getColSet().addAll(bkToCols(col, processChilds));
            col.setCondition(fun + "(" + blockCondToString(processChilds) + ")");
            return col;
        } else if (ast.getType() == HiveParser.LSQUARE) { // map, array
            Block column = getBlockIteral((ASTNode) ast.getChild(0));
            Block key = getBlockIteral((ASTNode) ast.getChild(1));
            column.setCondition(column.getCondition() + "[" + key.getCondition() + "]");
            return column;
        } else {
            return parseBlock(ast);
        }
    }

    private Set<String> bkToCols(Block col, Set<Block> processChilds) {
        Set<String> set = new LinkedHashSet<String>(processChilds.size());
        for (Block colLine : processChilds) {
            if (Check.notEmpty(colLine.getColSet())) {
                set.addAll(colLine.getColSet());
            }
        }
        return set;
    }

    private String blockCondToString(Set<Block> processChilds) {
        StringBuilder sb = new StringBuilder();
        for (Block colLine : processChilds) {
            sb.append(colLine.getCondition()).append(SPLIT_COMMA);
        }
        if (sb.length() > 0) {
            sb.setLength(sb.length() - 1);
        }
        return sb.toString();
    }

    /*
     * Parse a WHEN condition.
     * @param ast
     * @return e.g. case when c1>100 then col1 when c1>0 then col2 else col3 end
     */
    private String getWhenCondition(ASTNode ast) {
        int cnt = ast.getChildCount();
        StringBuilder sb = new StringBuilder();
        for (int i = 1; i < cnt; i++) {
            String condition = getBlockIteral((ASTNode) ast.getChild(i)).getCondition();
            if (i == 1) {
                sb.append("(case when " + condition);
            } else if (i == cnt - 1) { // else
                sb.append(" else " + condition + " end)");
            } else if (i % 2 == 0) { // then
                sb.append(" then " + condition);
            } else {
                sb.append(" when " + condition);
            }
        }
        return sb.toString();
    }

    /*
     * Save a subquery's alias and column info.
     * @param sqlIndex
     * @param tableAlias
     */
    private void putResultQueryMap(int sqlIndex, String tableAlias) {
        List<ColLine> list = generateColLineList(cols, conditions);
        String key = sqlIndex == 0 ? tableAlias : tableAlias + sqlIndex; // no suffix needed when there is no name clash
        resultQueryMap.put(key, list);
    }

    private List<ColLine> generateColLineList(List<ColLine> cols, Set<String> conditions) {
        List<ColLine> list = new ArrayList<ColLine>();
        for (ColLine entry : cols) {
            entry.getConditionSet().addAll(conditions);
            list.add(ParseUtil.cloneColLine(entry));
        }
        return list;
    }

    /*
     * Check whether a column name is abnormal.
     * Normal:   a as col, a
     * Abnormal: 1, 'a'   (numbers or string literals used as column names)
     */
    private boolean notNormalCol(String column) {
        return Check.isEmpty(column) || NumberUtil.isNumeric(column)
                || (column.startsWith("\"") && column.endsWith("\""))
                || (column.startsWith("'") && column.endsWith("'"));
    }

    /*
     * Parse the subtrees starting from the given child index.
     * @param ast
     * @param startIndex the starting index
     * @return
     */
    private Set<Block> processChilds(ASTNode ast, int startIndex) {
        int cnt = ast.getChildCount();
        Set<Block> set = new LinkedHashSet<Block>();
        for (int i = startIndex; i < cnt; i++) {
            Block bk = getBlockIteral((ASTNode) ast.getChild(i));
            if (Check.notEmpty(bk.getCondition()) || Check.notEmpty(bk.getColSet())) {
                set.add(bk);
            }
        }
        return set;
    }

    /*
     * Extract the column name (or number/string literal) and its condition.
     * @param ast
     * @return
     */
    private Block parseBlock(ASTNode ast) {
        if (ast.getType() == HiveParser.DOT
                && ast.getChild(0).getType() == HiveParser.TOK_TABLE_OR_COL
                && ast.getChild(0).getChildCount() == 1
                && ast.getChild(1).getType() == HiveParser.Identifier) {
            String column = BaseSemanticAnalyzer.unescapeIdentifier(ast.getChild(1).getText());
            String alia = BaseSemanticAnalyzer.unescapeIdentifier(ast.getChild(0).getChild(0).getText());
            return getBlock(column, alia);
        } else if (ast.getType() == HiveParser.TOK_TABLE_OR_COL
                && ast.getChildCount() == 1
                && ast.getChild(0).getType() == HiveParser.Identifier) {
            String column = ast.getChild(0).getText();
            return getBlock(column, null);
        } else if (ast.getType() == HiveParser.Number
                || ast.getType() == HiveParser.StringLiteral
                || ast.getType() == HiveParser.Identifier) {
            Block bk = new Block();
            bk.setCondition(ast.getText());
            bk.getColSet().add(ast.getText());
            return bk;
        }
        return new Block();
    }

    /*
     * Build the block from a column name and an alias.
     * @param column
     * @param alia  aliases defined within the current subquery may be used
     * @return
     */
    private Block getBlock(String column, String alia) {
        String[] result = getTableAndAlia(alia);
        String tableArray = result[0];
        String _alia = result[1];

        for (String string : _alia.split(SPLIT_AND)) { // look up while iterating
            QueryTree qt = queryMap.get(string.toLowerCase());
            if (Check.notEmpty(column)) {
                for (ColLine colLine : qt.getColLineList()) {
                    if (column.equalsIgnoreCase(colLine.getToNameParse())) {
                        Block bk = new Block();
                        bk.setCondition(colLine.getColCondition());
                        bk.setColSet(ParseUtil.cloneSet(colLine.getFromNameSet()));
                        return bk;
                    }
                }
            }
        }

        String _realTable = tableArray;
        int cnt = 0; // number of matches against metadata columns; more than one means the SQL is ambiguous
        for (String tables : tableArray.split(SPLIT_AND)) { // consult the table metadata loaded at init time
            String[] split = tables.split("\\.");
            if (split.length > 2) {
                throw new SQLParseException("parse table:" + tables);
            }
            List<String> colByTab = MetaCache.getInstance().getColumnByDBAndTable(tables);
            for (String col : colByTab) {
                if (column.equalsIgnoreCase(col)) {
                    _realTable = tables;
                    cnt++;
                }
            }
        }
        // if (cnt == 0): the "not found" case is checked in the Validater class
        if (cnt > 1) { // ambiguity check
            throw new SQLParseException("SQL is ambiguity, column: " + column + " tables:" + tableArray);
        }

        Block bk = new Block();
        bk.setCondition(_realTable + SPLIT_DOT + column);
        bk.getColSet().add(_realTable + SPLIT_DOT + column);
        return bk;
    }

    /*
     * Filter out useless columns, e.g. col1,123,'2013',col2 ==>> col1,col2
     * @param colSet
     * @return
     */
    private Set<String> filterData(Set<String> colSet) {
        Set<String> set = new LinkedHashSet<String>();
        for (String string : colSet) {
            if (!notNormalCol(string)) {
                set.add(string);
            }
        }
        return set;
    }

    /*
     * Parse all child nodes.
     * @param ast
     */
    private void parseChildNodes(ASTNode ast) {
        int numCh = ast.getChildCount();
        if (numCh > 0) {
            for (int num = 0; num < numCh; num++) {
                ASTNode child = (ASTNode) ast.getChild(num);
                parseIteral(child);
            }
        }
    }

    /*
     * Prepare to parse the current node.
     * @param ast
     */
    private void prepareToParseCurrentNodeAndChilds(ASTNode ast) {
        if (ast.getToken() != null) {
            switch (ast.getToken().getType()) {
            case HiveParser.TOK_SWITCHDATABASE:
                System.out.println("nowQueryDB changed " + nowQueryDB + " to " + ast.getChild(0).getText());
                nowQueryDB = ast.getChild(0).getText();
                break;
            case HiveParser.TOK_TRANSFORM:
                throw new UnSupportedException("no support transform using clause");
            case HiveParser.TOK_RIGHTOUTERJOIN:
            case HiveParser.TOK_LEFTOUTERJOIN:
            case HiveParser.TOK_JOIN:
            case HiveParser.TOK_LEFTSEMIJOIN:
            case HiveParser.TOK_MAPJOIN:
            case HiveParser.TOK_FULLOUTERJOIN:
            case HiveParser.TOK_UNIQUEJOIN:
                joinStack.push(joinClause);
                joinClause = true;
                joinOnStack.push(joinOn);
                joinOn = ast;
                break;
            }
        }
    }

    /*
     * Finish parsing the current node.
     * @param ast
     */
    private void endParseCurrentNode(ASTNode ast) {
        if (ast.getToken() != null) {
            Tree parent = ast.getParent();
            switch (ast.getToken().getType()) {
            // the join clause ends: leave the join
            case HiveParser.TOK_RIGHTOUTERJOIN:
            case HiveParser.TOK_LEFTOUTERJOIN:
            case HiveParser.TOK_JOIN:
            case HiveParser.TOK_LEFTSEMIJOIN:
            case HiveParser.TOK_MAPJOIN:
            case HiveParser.TOK_FULLOUTERJOIN:
            case HiveParser.TOK_UNIQUEJOIN:
                joinClause = joinStack.pop();
                joinOn = joinOnStack.pop();
                break;
            case HiveParser.TOK_QUERY:
                processUnionStack(ast, parent); // child node of a union
            case HiveParser.TOK_INSERT:
            case HiveParser.TOK_SELECT:
                break;
            case HiveParser.TOK_UNION: // merge the union's column info
                mergeUnionCols();
                processUnionStack(ast, parent); // child node of a union
                break;
            }
        }
    }

    private void mergeUnionCols() {
        validateUnion(cols);
        int size = cols.size();
        int colNum = size / 2;
        List<ColLine> list = new ArrayList<ColLine>(colNum);
        for (int i = 0; i < colNum; i++) { // merge the columns pairwise
            ColLine col = cols.get(i);
            for (int j = i + colNum; j < size; j = j + colNum) {
                ColLine col2 = cols.get(j);
                list.add(col2);
                if (notNormalCol(col.getToNameParse()) && !notNormalCol(col2.getToNameParse())) {
                    col.setToNameParse(col2.getToNameParse());
                }
                col.getFromNameSet().addAll(col2.getFromNameSet());
                col.setColCondition(col.getColCondition() + SPLIT_AND + col2.getColCondition());
                Set<String> conditionSet = ParseUtil.cloneSet(col.getConditionSet());
                conditionSet.addAll(col2.getConditionSet());
                conditionSet.addAll(conditions);
                col.getConditionSet().addAll(conditionSet);
            }
        }
        cols.removeAll(list); // remove the merged entries
    }

    private void processUnionStack(ASTNode ast, Tree parent) {
        boolean isNeedAdd = parent.getType() == HiveParser.TOK_UNION;
        if (isNeedAdd) {
            if (parent.getChild(0) == ast && parent.getChild(1) != null) { // has a sibling (this is the first branch)
                // push
                conditionsStack.push(ParseUtil.cloneSet(conditions));
                conditions.clear();
                colsStack.push(ParseUtil.cloneList(cols));
                cols.clear();
            } else { // no sibling (this is the second branch)
                // pop
                if (!conditionsStack.isEmpty()) {
                    conditions.addAll(conditionsStack.pop());
                }
                if (!colsStack.isEmpty()) {
                    cols.addAll(0, colsStack.pop());
                }
            }
        }
    }

    private void parseAST(ASTNode ast) {
        parseIteral(ast);
    }

    public void parse(String sqlAll) throws Exception {
        if (Check.isEmpty(sqlAll)) {
            return;
        }
        startParseAll(); // clear the final result set
        int i = 0;       // index of the current sql
        // for (String sql : sqlAll.split("(?   [truncated in the source: the split regex
        // and the per-statement loop body, which parse each statement and fill
        // resultQueryMap, are missing from this copy]
    }

    // The source resumes here in the middle of a later method whose header was lost;
    // it assembles the final colLines from resultQueryMap. Placeholder signature:
    private void putColLineList() {
        Map<String, List<ColLine>> map = new HashMap<String, List<ColLine>>();
        for (Entry<String, List<ColLine>> entry : resultQueryMap.entrySet()) {
            if (entry.getKey().startsWith(TOK_EOF)) {
                List<ColLine> value = entry.getValue();
                for (ColLine colLine : value) {
                    List<ColLine> list = map.get(colLine.getToTable());
                    if (Check.isEmpty(list)) {
                        list = new ArrayList<ColLine>();
                        map.put(colLine.getToTable(), list);
                    }
                    list.add(colLine);
                }
            }
        }
        for (Entry<String, List<ColLine>> entry : map.entrySet()) {
            String table = entry.getKey();
            List<ColLine> pList = entry.getValue();
            List<String> dList = dbMap.get(table);
            int metaSize = Check.isEmpty(dList) ? 0 : dList.size();
            for (int i = 0; i < pList.size(); i++) { // insert the columns in order
                ColLine clp = pList.get(i);
                String colName = null;
                if (i < metaSize) {
                    colName = table + SPLIT_DOT + dList.get(i);
                }
                if (isCreateTable && TOK_TMP_FILE.equals(table)) {
                    for (String string : tmpOutputTables) {
                        table = string;
                    }
                }
                ColLine colLine = new ColLine(clp.getToNameParse(), clp.getColCondition(),
                        clp.getFromNameSet(), clp.getConditionSet(), table, colName);
                colLines.add(colLine);
            }
        }
    }

    /**
     * Record the final input and output table sets.
     */
    private void setOutInputTableSet() {
        outputTables.addAll(ParseUtil.cloneSet(tmpOutputTables));
        inputTables.addAll(ParseUtil.cloneSet(tmpInputTables));
    }

    private void putDBMap() {
        for (String table : tmpOutputTables) {
            List<String> list = MetaCache.getInstance().getColumnByDBAndTable(table);
            dbMap.put(table, list);
        }
    }

    /*
     * Complete the db part of a table name:
     *   table1      ==>> db1.table1
     *   db1.table1  ==>> db1.table1
     *   db2.t1&t2   ==>> db2.t1&db1.t2
     * @param nowTable
     */
    private String fillDB(String nowTable) {
        if (Check.isEmpty(nowTable)) {
            return nowTable;
        }
        StringBuilder sb = new StringBuilder();
        String[] tableArr = nowTable.split(SPLIT_AND); // fact.test&test2&test3
        for (String tables : tableArr) {
            String[] split = tables.split("\\" + SPLIT_DOT);
            if (split.length > 2) {
                System.out.println(tables);
                throw new SQLParseException("parse table:" + nowTable);
            }
            String db = split.length == 2 ? split[0] : nowQueryDB;
            String table = split.length == 2 ? split[1] : split[0];
            sb.append(db).append(SPLIT_DOT).append(table).append(SPLIT_AND);
        }
        if (sb.length() > 0) {
            sb.setLength(sb.length() - 1);
        }
        return sb.toString();
    }

    /*
     * Resolve the table name(s) behind an alias.
     * @param alia
     * @return
     */
    private String[] getTableAndAlia(String alia) {
        String _alia = Check.notEmpty(alia) ? alia
                : ParseUtil.collectionToString(queryMap.keySet(), SPLIT_AND, true);
        String[] result = { "", _alia };
        Set<String> tableSet = new HashSet<String>();
        if (Check.notEmpty(_alia)) {
            String[] split = _alia.split(SPLIT_AND);
            for (String string : split) {
                // an "alias" is either a user-defined alias or the table name itself,
                // e.g. select a.col, table_name.col from table_name a
                if (tmpInputTables.contains(string) || tmpInputTables.contains(fillDB(string))) {
                    tableSet.add(fillDB(string));
                } else if (queryMap.containsKey(string.toLowerCase())) {
                    tableSet.addAll(queryMap.get(string.toLowerCase()).getTableSet());
                }
            }
            result[0] = ParseUtil.collectionToString(tableSet, SPLIT_AND, true);
            result[1] = _alia;
        }
        return result;
    }

    /*
     * Validate a union.
     * @param list
     */
    private void validateUnion(List<ColLine> list) {
        int size = list.size();
        if (size % 2 == 1) {
            throw new SQLParseException("union column number are different, size=" + size);
        }
        int colNum = size / 2;
        checkUnion(list, 0, colNum);
        checkUnion(list, colNum, size);
    }

    private void checkUnion(List<ColLine> list, int start, int end) {
        String tmp = null;
        for (int i = start; i < end; i++) { // compare the columns to be merged
            ColLine col = list.get(i);
            if (Check.isEmpty(tmp)) {
                tmp = col.getToTable();
            } else if (!tmp.equals(col.getToTable())) {
                throw new SQLParseException("union column number/types are different, table1="
                        + tmp + ", table2=" + col.getToTable());
            }
        }
    }
}
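
The listing references several beans that are not included in this excerpt. From the way getBlockIteral and parseBlock use it, a minimal sketch of the Block bean could look like the following; the fields are inferred from the getters and setters above, so the real class in the project may differ.

package com.xiaoju.products.bean;

import java.util.LinkedHashSet;
import java.util.Set;

/**
 * A parse block: the rendered condition text of a WHERE/JOIN/SELECT fragment
 * plus the set of columns it touches. Sketched from usage in LineParser.
 */
public class Block {
    private String condition = "";                             // rendered expression, e.g. "(db.t1.a = db.t2.b)"
    private Set<String> colSet = new LinkedHashSet<String>();  // referenced columns, e.g. "db.t1.a"

    public String getCondition() { return condition; }
    public void setCondition(String condition) { this.condition = condition; }
    public Set<String> getColSet() { return colSet; }
    public void setColSet(Set<String> colSet) { this.colSet = colSet; }
}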
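
Likewise, ColLine can be sketched from its constructor calls and the getters used in mergeUnionCols and the result-assembly code; again this is an inferred outline, not the project's actual source.

package com.xiaoju.products.bean;

import java.util.Set;

/**
 * One column-lineage record: an output column, the source columns it derives
 * from, and the conditions applied along the way. Sketched from usage in LineParser.
 */
public class ColLine {
    private String toNameParse;       // output column name or alias
    private String colCondition;      // the expression that produces the column
    private Set<String> fromNameSet;  // source columns, e.g. "db.t1.c1"
    private Set<String> conditionSet; // where/join conditions that applied
    private String toTable;           // destination table
    private String toName;            // destination column, e.g. "db.t2.c1"

    public ColLine(String toNameParse, String colCondition, Set<String> fromNameSet,
                   Set<String> conditionSet, String toTable, String toName) {
        this.toNameParse = toNameParse;
        this.colCondition = colCondition;
        this.fromNameSet = fromNameSet;
        this.conditionSet = conditionSet;
        this.toTable = toTable;
        this.toName = toName;
    }

    public String getToNameParse() { return toNameParse; }
    public void setToNameParse(String toNameParse) { this.toNameParse = toNameParse; }
    public String getColCondition() { return colCondition; }
    public void setColCondition(String colCondition) { this.colCondition = colCondition; }
    public Set<String> getFromNameSet() { return fromNameSet; }
    public Set<String> getConditionSet() { return conditionSet; }
    public String getToTable() { return toTable; }
}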
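
The QueryTree bean, which holds the subquery tree, can be outlined the same way from the setters and getters LineParser calls; the parent/child semantics below are inferred from getSubQueryParent and getSubQueryChilds.

package com.xiaoju.products.bean;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * One node in the subquery tree. Sketched from usage in LineParser.
 */
public class QueryTree {
    private int id;          // token-index-based id of this subquery
    private int pId;         // id of the parent subquery (-1 for top level)
    private String parent;   // alias of the parent subquery ("NIL" for top level)
    private String current;  // alias of this subquery or table
    private Set<String> tableSet = new HashSet<String>();         // physical tables under this node
    private List<ColLine> colLineList = new ArrayList<ColLine>(); // columns this subquery exposes
    private List<QueryTree> childList = new ArrayList<QueryTree>();

    public int getId() { return id; }
    public void setId(int id) { this.id = id; }
    public int getpId() { return pId; }
    public void setpId(int pId) { this.pId = pId; }
    public String getParent() { return parent; }
    public void setParent(String parent) { this.parent = parent; }
    public String getCurrent() { return current; }
    public void setCurrent(String current) { this.current = current; }
    public Set<String> getTableSet() { return tableSet; }
    public List<ColLine> getColLineList() { return colLineList; }
    public void setColLineList(List<ColLine> colLineList) { this.colLineList = colLineList; }
    public List<QueryTree> getChildList() { return childList; }
    public void setChildList(List<QueryTree> childList) { this.childList = childList; }
}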
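
To see the token structure the parser walks (TOK_QUERY, TOK_TABREF, TOK_SELEXPR and so on), Hive's own ParseDriver can print the AST of a statement. This small standalone example uses only the public Hive parser API; the SQL text is just an illustration:

import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.ParseDriver;

public class AstDump {
    public static void main(String[] args) throws Exception {
        ParseDriver pd = new ParseDriver();
        ASTNode ast = pd.parse("INSERT OVERWRITE TABLE dw.t_out "
                + "SELECT a.col1, count(distinct a.col2) FROM dw.t_in a "
                + "WHERE a.dt = '20140101' GROUP BY a.col1");
        // toStringTree() prints the nested token structure, e.g.
        // (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME dw t_in) a)) (TOK_INSERT ...))
        System.out.println(ast.toStringTree());
    }
}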
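
Putting it together, typical usage would look like the sketch below. It assumes the truncated parse() loop above drives the per-statement traversal and that the MetaCache behind the parser can resolve the referenced tables; the table names are illustrative only.

import com.xiaoju.products.bean.ColLine;
import com.xiaoju.products.parse.LineParser;

public class LineParserDemo {
    public static void main(String[] args) throws Exception {
        LineParser parser = new LineParser();
        parser.parse("insert overwrite table dw.t_out "
                + "select a.c1, b.c2 from dw.t1 a join dw.t2 b on a.id = b.id "
                + "where a.dt = '20140101'");
        System.out.println("input tables:  " + parser.getInputTables());  // e.g. [dw.t1, dw.t2]
        System.out.println("output tables: " + parser.getOutputTables()); // e.g. [dw.t_out]
        for (ColLine line : parser.getColLines()) {
            // each ColLine ties one output column to its source columns and conditions
            System.out.println(line.getToTable() + " <= " + line.getFromNameSet()
                    + ", conditions=" + line.getConditionSet());
        }
    }
}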
