A Data Lineage Analysis Tool for the Hive Data Warehouse: SQL Parsing

I. Problems a data warehouse team runs into regularly:
1. Two reports are compared and the results differ widely. Someone has to manually trace the dimensions behind each metric, working out where the data came from and what processing conditions were applied, before the cause of the discrepancy can be identified.
2. A base table needs a column changed for some reason, and the impact on the rest of the warehouse must be assessed before a migration plan can be drawn up; doing this by hand is slow and laborious.

II. Problem analysis:
Data travels a long way, passing through many processing steps and components before it reaches business users, so tracing it back to its sources is genuinely hard. Yet metadata tracing matters for effective decision making, strategy design, and difference analysis. Both problems above are data lineage problems: the first is data tracing, and the second is impact analysis, the inverse of data tracing.

III. Solution:
I implemented a data lineage analysis tool for our Hive data warehouse. It works out the relationships between tables and between columns, which addresses both problems above.

  • Main goal: parse the HQL statements in the computation scripts, extract the input/output tables, the input/output columns, and the associated processing conditions, and present the analysis.
  • Approach: depth-first traversal of the AST. On an operation token, record the current operation; on a clause, push the current state onto a stack, process the clause, then pop. While processing a clause, each subquery's information is saved and its relationship to its parent query recorded, eventually forming a tree; column and condition information is collected into Block structures via nested calls.
  • Key points (a minimal traversal sketch follows this list):
    1. On TOK_TAB or TOK_TABREF, determine the table being operated on.
    2. Push the join state onto a stack and work out the join condition.
    3. Data structure Block: in where/select/join clauses, collect the columns and conditions underneath into a Block.
    4. Data structure ColLine: on TOK_SUBQUERY, save the subquery's information for use by its parent query.
    5. Data structure ColLine: when a TOK_UNION ends, merge and truncate the current column information.
    6. On select * or a column without an explicit table, consult the metadata to resolve it.
    7. Validate the parse results.
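
To make the traversal concrete, here is a minimal, self-contained sketch (illustrative only, not the tool itself; the class name and sample SQL are made up) that uses Hive's ParseDriver to build the AST and walks it depth-first, reporting tables at TOK_TAB/TOK_TABREF nodes:

import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer;
import org.apache.hadoop.hive.ql.parse.HiveParser;
import org.apache.hadoop.hive.ql.parse.ParseDriver;

// Minimal sketch of the DFS idea: find the input/output tables of one statement.
public class TableWalkSketch {
    public static void main(String[] args) throws Exception {
        String sql = "insert overwrite table dw.t_out select a.c1 from dw.t_in a";
        ASTNode ast = new ParseDriver().parse(sql); // build the AST
        walk(ast);
    }

    private static void walk(ASTNode node) {
        if (node.getToken() != null) {
            switch (node.getToken().getType()) {
            case HiveParser.TOK_TAB: // destination of an INSERT -> output table
                System.out.println("output: "
                        + BaseSemanticAnalyzer.getUnescapedName((ASTNode) node.getChild(0)));
                break;
            case HiveParser.TOK_TABREF: // table in FROM/JOIN -> input table
                System.out.println("input:  "
                        + BaseSemanticAnalyzer.getUnescapedName((ASTNode) node.getChild(0)));
                break;
            }
        }
        for (int i = 0; i < node.getChildCount(); i++) {
            walk((ASTNode) node.getChild(i)); // depth-first into every child
        }
    }
}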

The code is as follows.
The LineParser class (the Block, ColLine, and QueryTree beans it references are simple data holders and are not shown here):

package com.xiaoju.products.parse;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import java.util.Map.Entry;
import java.util.LinkedHashSet;

import org.antlr.runtime.tree.Tree;
import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer;
import org.apache.hadoop.hive.ql.parse.HiveParser;
import org.apache.hadoop.hive.ql.parse.ParseDriver;

import com.xiaoju.products.bean.Block;
import com.xiaoju.products.bean.ColLine;
import com.xiaoju.products.bean.QueryTree;
import com.xiaoju.products.exception.SQLParseException;
import com.xiaoju.products.exception.UnSupportedException;
import com.xiaoju.products.util.Check;
import com.xiaoju.products.util.MetaCache;
import com.xiaoju.products.util.NumberUtil;
import com.xiaoju.products.util.ParseUtil;
import com.xiaoju.products.util.PropertyFileUtil;

/**
 * Hive SQL parser.
 *
 * Goal: parse HQL statements and extract the input/output tables, columns, and the
 * associated processing conditions, as the basis for column-level data lineage.
 * Focus: the table and column operations inside SELECT statements; other operations
 * are not analyzed down to the column level.
 * Approach: depth-first traversal of the AST. On an operation token, record the current
 * operation; on a clause, push the current state onto a stack, process the clause, then pop.
 * While processing a clause, each subquery's information is saved and related to its parent
 * query, eventually forming a tree; columns and conditions are collected into Blocks via
 * nested calls.
 * Key points:
 * 1. On TOK_TAB or TOK_TABREF, determine the table being operated on.
 * 2. Push the join state onto a stack and work out the join condition.
 * 3. Data structure Block: in where/select/join clauses, collect the columns and conditions underneath.
 * 4. Data structure ColLine: on TOK_SUBQUERY, save the subquery's info for use by its parent query.
 * 5. Data structure ColLine: when a TOK_UNION ends, merge and truncate the current column info.
 * 6. On select * or a column without an explicit table, consult the metadata to resolve it.
 * 7. Validate the parse results.
 * Scope:
 * 1. Standard SQL is supported.
 * 2. transform using script is not supported.
 *
 * @author yangyangthomas
 */
public class LineParser {

    private static final String SPLIT_DOT = ".";
    private static final String SPLIT_COMMA = ",";
    private static final String SPLIT_AND = "&";
    private static final String TOK_EOF = "<EOF>";
    private static final String CON_WHERE = "WHERE:";
    private static final String TOK_TMP_FILE = "TOK_TMP_FILE";

    private Map<String /*table*/, List<String/*column*/>> dbMap = new HashMap<String, List<String>>();
    private List<QueryTree> queryTreeList = new ArrayList<QueryTree>(); // tree of subquery relationships

    private Stack<Set<String>> conditionsStack = new Stack<Set<String>>();
    private Stack<List<ColLine>> colsStack = new Stack<List<ColLine>>();

    private Map<String, List<ColLine>> resultQueryMap = new HashMap<String,  List<ColLine>>();
    private Set<String> conditions = new HashSet<String>(); // where/join condition cache
    private List<ColLine> cols = new ArrayList<ColLine>(); // column cache within one subquery

    private Stack<String> tableNameStack = new Stack<String>();
    private Stack<Boolean> joinStack = new Stack<Boolean>();
    private Stack<ASTNode> joinOnStack = new Stack<ASTNode>();

    private Map<String, QueryTree> queryMap = new HashMap<String, QueryTree>();
    private boolean joinClause = false;
    private ASTNode joinOn = null;
    private String nowQueryDB = "default"; // hive's default database
    private boolean isCreateTable = false;

    // final results
    private List<ColLine> colLines = new ArrayList<ColLine>();  
    private Set<String> outputTables = new HashSet<String>();
    private Set<String> inputTables = new HashSet<String>();

    private List<ColLine> tmpColLines = new ArrayList<ColLine>();  
    private Set<String> tmpOutputTables = new HashSet<String>();
    private Set<String> tmpInputTables = new HashSet<String>();

    public List<ColLine> getColLines() {
        return colLines;
    }
    public Set<String> getOutputTables() {
        return outputTables;
    }
    public Set<String> getInputTables() {
        return inputTables;
    }

    private void parseIteral(ASTNode ast) {
        prepareToParseCurrentNodeAndChilds(ast);
        parseChildNodes(ast);
        parseCurrentNode(ast);
        endParseCurrentNode(ast);
    }

    /**
     * Parse the current node.
     * @param ast the current AST node
     */
    private void parseCurrentNode(ASTNode ast){
        if (ast.getToken() != null) {
            switch (ast.getToken().getType()) {
            case HiveParser.TOK_CREATETABLE: //outputtable
                isCreateTable = true;
                String tableOut = fillDB(BaseSemanticAnalyzer.getUnescapedName((ASTNode) ast.getChild(0)));
                tmpOutputTables.add(tableOut);
                MetaCache.getInstance().init(tableOut); // warm the metadata cache for later use
                break;
            case HiveParser.TOK_TAB:// outputTable
                String tableTab = BaseSemanticAnalyzer.getUnescapedName((ASTNode) ast.getChild(0));
                String tableOut2 = fillDB(tableTab);
                tmpOutputTables.add(tableOut2);
                MetaCache.getInstance().init(tableOut2); // warm the metadata cache for later use
                break;
            case HiveParser.TOK_TABREF:// inputTable
                ASTNode tabTree = (ASTNode) ast.getChild(0);
                String tableInFull = fillDB((tabTree.getChildCount() == 1) ?  
                        BaseSemanticAnalyzer.getUnescapedName((ASTNode) tabTree.getChild(0))
                        : BaseSemanticAnalyzer.getUnescapedName((ASTNode) tabTree.getChild(0))
                        + SPLIT_DOT + BaseSemanticAnalyzer.getUnescapedName((ASTNode) tabTree.getChild(1))
                        );
                String tableIn = tableInFull.substring(tableInFull.indexOf(SPLIT_DOT) + 1); 
                tmpInputTables.add(tableInFull);
                MetaCache.getInstance().init(tableInFull); // warm the metadata cache for later use
                queryMap.clear();
                String alia = null;
                if (ast.getChild(1) != null) { // e.g. (TOK_TABREF (TOK_TABNAME detail usersequence_client) c)
                    alia = ast.getChild(1).getText().toLowerCase();
                    QueryTree qt = new QueryTree();
                    qt.setCurrent(alia);
                    qt.getTableSet().add(tableInFull);
                    QueryTree pTree = getSubQueryParent(ast);
                    qt.setpId(pTree.getpId());
                    qt.setParent(pTree.getParent());
                    queryTreeList.add(qt);
                    if (joinClause && ast.getParent() == joinOn) { // TOK_SUBQUERY join TOK_TABREF: the TOK_SUBQUERY info must not be cleared here
                        for (QueryTree entry : queryTreeList) { // current query scope
                            if (qt.getParent().equals(entry.getParent())) {
                                queryMap.put(entry.getCurrent(), entry);
                            }
                        }
                    } else {
                        queryMap.put(qt.getCurrent(), qt);
                    }
                } else {
                    alia = tableIn.toLowerCase();
                    QueryTree qt = new QueryTree();
                    qt.setCurrent(alia);
                    qt.getTableSet().add(tableInFull);
                    QueryTree pTree = getSubQueryParent(ast);
                    qt.setpId(pTree.getpId());
                    qt.setParent(pTree.getParent());
                    queryTreeList.add(qt);

                    if (joinClause && ast.getParent() == joinOn) {
                        for (QueryTree entry : queryTreeList) {
                            if (qt.getParent().equals(entry.getParent())) {
                                queryMap.put(entry.getCurrent(), entry);
                            }
                        }
                    } else {
                        queryMap.put(qt.getCurrent(), qt);
                        // handle queries like: select app.t1.c1, t1.c1 from t1
                        queryMap.put(tableInFull.toLowerCase(), qt);
                    }
                }
                break;
            case HiveParser.TOK_SUBQUERY: 
                if (ast.getChildCount() == 2) {
                    String tableAlias = BaseSemanticAnalyzer.unescapeIdentifier(ast.getChild(1).getText());

                    QueryTree qt = new QueryTree();
                    qt.setCurrent(tableAlias.toLowerCase());
                    qt.setColLineList(generateColLineList(cols, conditions));
                    QueryTree pTree = getSubQueryParent(ast);
                    qt.setId(generateTreeId(ast));
                    qt.setpId(pTree.getpId()); 
                    qt.setParent(pTree.getParent());
                    qt.setChildList(getSubQueryChilds(qt.getId()));
                    if (Check.notEmpty(qt.getChildList())) {
                        for (QueryTree cqt : qt.getChildList()) {
                            qt.getTableSet().addAll(cqt.getTableSet());
                            queryTreeList.remove(cqt); // remove the merged child's entry
                        }
                    }
                    queryTreeList.add(qt);
                    cols.clear();

                    queryMap.clear();
                    for (QueryTree _qt : queryTreeList) {
                        if (qt.getParent().equals(_qt.getParent())) { // keep only entries in the current subquery scope
                            queryMap.put(_qt.getCurrent(), _qt);
                        }
                    }
                }
                break;
            case HiveParser.TOK_SELEXPR: // handle input/output columns
                /**
                 * (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE))
                 * (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))
                 *
                 * (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE))
                 * (TOK_SELECT
                 *     (TOK_SELEXPR (. (TOK_TABLE_OR_COL p) datekey) datekey)
                 *     (TOK_SELEXPR (TOK_TABLE_OR_COL datekey))
                 *     (TOK_SELEXPR (TOK_FUNCTIONDI count (. (TOK_TABLE_OR_COL base) userid)) buyer_count)
                 *     (TOK_SELEXPR (TOK_FUNCTION when (> (. (TOK_TABLE_OR_COL base) userid) 5) (. (TOK_TABLE_OR_COL base) clienttype) (> (. (TOK_TABLE_OR_COL base) userid) 1) (+ (. (TOK_TABLE_OR_COL base) datekey) 5) (+ (. (TOK_TABLE_OR_COL base) clienttype) 1)) bbbaaa))
                 */
                // resolve the destination table of this select
                Tree tok_insert = ast.getParent().getParent();
                Tree child = tok_insert.getChild(0).getChild(0); 
                String tName = BaseSemanticAnalyzer.getUnescapedName((ASTNode) child.getChild(0));
                String destTable = TOK_TMP_FILE.equals(tName) ? TOK_TMP_FILE : fillDB(tName); 

                // handles: select a.*,* from t1  and  select * from (select c1 as a, c2 from t1) t
                if (ast.getChild(0).getType() == HiveParser.TOK_ALLCOLREF) { 
                    String tableOrAlias = "";
                    if (ast.getChild(0).getChild(0) != null) {
                        tableOrAlias = ast.getChild(0).getChild(0).getChild(0).getText();
                    }
                    String[] result = getTableAndAlia(tableOrAlias);
                    String _alia = result[1];

                    boolean isSub = false; // nested select * case
                    if (Check.notEmpty(_alia)) {
                        for (String string : _alia.split(SPLIT_AND)) { // look up each alias
                            QueryTree qt = queryMap.get(string.toLowerCase());
                            if (null != qt) {
                                List<ColLine> colLineList = qt.getColLineList();
                                if (Check.notEmpty(colLineList)) {
                                    isSub = true;
                                    for (ColLine colLine : colLineList) {
                                        cols.add(colLine);  
                                    }
                                }
                            }
                        }
                    }
                    if (!isSub) { // plain select * case
                        String nowTable = result[0];
                        String[] tableArr = nowTable.split(SPLIT_AND); // e.g. fact.test&test2
                        for (String tables : tableArr) {
                            String[] split = tables.split("\\.");
                            if (split.length > 2) {
                                throw new SQLParseException("parse table:" + nowTable);
                            }
                            List<String> colByTab =  MetaCache.getInstance().getColumnByDBAndTable(tables);
                            for (String column : colByTab) {
                                Set<String> fromNameSet = new LinkedHashSet<String>();
                                fromNameSet.add(tables + SPLIT_DOT + column);
                                ColLine cl = new ColLine(column, tables + SPLIT_DOT + column, fromNameSet,
                                        new LinkedHashSet<String>() , destTable, column);
                                cols.add(cl);   
                            }
                        }
                    }
                } else {
                    Block bk = getBlockIteral((ASTNode)ast.getChild(0));
                    String toNameParse = getToNameParse(ast, bk);
                    Set<String> fromNameSet  = filterData(bk.getColSet());
                    ColLine cl = new ColLine(toNameParse, bk.getCondition(), fromNameSet, new LinkedHashSet<String>() , destTable, "");
                    cols.add(cl);
                }
                break;
            case HiveParser.TOK_WHERE: // filter conditions: the WHERE (select) kind
                conditions.add(CON_WHERE + getBlockIteral((ASTNode) ast.getChild(0)).getCondition());
                break; 
            default:
                /**
                 * (or
                 *     (> (. (TOK_TABLE_OR_COL p) orderid) (. (TOK_TABLE_OR_COL c) orderid))
                 *     (and (= (. (TOK_TABLE_OR_COL p) a) (. (TOK_TABLE_OR_COL c) b))
                 *          (= (. (TOK_TABLE_OR_COL p) aaa) (. (TOK_TABLE_OR_COL c) bbb))))
                 */
                // filter conditions: the JOIN-on kind
                if (joinOn != null && joinOn.getTokenStartIndex() == ast.getTokenStartIndex()
                        && joinOn.getTokenStopIndex() == ast.getTokenStopIndex()) {
                    ASTNode astCon = (ASTNode)ast.getChild(2);
                    conditions.add(ast.getText().substring(4) + ":" + getBlockIteral(astCon).getCondition());
                    break;  
                }
            }
        }
    }

    /**
     * Find the parent subquery node of the current node.
     * @param ast
     */
    private QueryTree getSubQueryParent(Tree ast) {
        Tree _tree = ast;
        QueryTree qt = new QueryTree();
        while(!(_tree = _tree.getParent()).isNil()){
            if(_tree.getType() == HiveParser.TOK_SUBQUERY){
                qt.setpId(generateTreeId(_tree));
                qt.setParent(BaseSemanticAnalyzer.getUnescapedName((ASTNode)_tree.getChild(1)));
                return qt;
            }
        }
        qt.setpId(-1);
        qt.setParent("NIL");
        return qt; 
    }
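    /** Derive a (sub)query node id from its token positions; parent/child links are matched on this id. */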
    private int generateTreeId(Tree tree) {
        return tree.getTokenStartIndex() + tree.getTokenStopIndex();
    }


    /**
     * Find the child subqueries of the node with the given id.
     * @param id the parent id (see generateTreeId)
     */
    private List<QueryTree> getSubQueryChilds(int id) {
        List<QueryTree> list = new ArrayList<QueryTree>();
        for (int i = 0; i < queryTreeList.size(); i++) {
            QueryTree qt = queryTreeList.get(i);
            if (id == qt.getpId()) {
                list.add(qt);
            }
        }
        return list;
    }

    /**
     * Get the output name of a select expression: its alias if present, otherwise the column name.
     * @param ast
     * @param bk
     * @return
     */
    private String getToNameParse(ASTNode ast, Block bk) {
        String alia = "";
        Tree child = ast.getChild(0);
        if (ast.getChild(1) != null) { // has an alias: ip as alia
            alia = ast.getChild(1).getText();
        } else if (child.getType() == HiveParser.DOT // no alias: a.ip
                && child.getChild(0).getType() == HiveParser.TOK_TABLE_OR_COL
                && child.getChild(0).getChildCount() == 1
                && child.getChild(1).getType() == HiveParser.Identifier) {
            alia = BaseSemanticAnalyzer.unescapeIdentifier(child.getChild(1).getText());
        } else if (child.getType() == HiveParser.TOK_TABLE_OR_COL // no alias: ip
                    && child.getChildCount() == 1
                    && child.getChild(0).getType() == HiveParser.Identifier) { 
            alia = BaseSemanticAnalyzer.unescapeIdentifier(child.getChild(0).getText());
        }
        return alia;
    }

    /**
     * Parse a block; used for the WHERE, JOIN and SELECT parts, e.g.:
     * <p>where a=1
     * <p>t1 join t2 on t1.col1=t2.col1 and t1.col2=123
     * <p>select count(distinct col1) from t1
     * @param ast
     * @return
     */
    private Block getBlockIteral(ASTNode ast) {
        if (ast.getType() == HiveParser.KW_OR
            ||ast.getType() == HiveParser.KW_AND) {
            Block bk1 = getBlockIteral((ASTNode)ast.getChild(0));
            Block bk2 = getBlockIteral((ASTNode)ast.getChild(1));
            bk1.getColSet().addAll(bk2.getColSet());
            bk1.setCondition("(" +  bk1.getCondition() + " " + ast.getText() + " " + bk2.getCondition() + ")");
            return bk1;
        } else if (ast.getType() == HiveParser.NOTEQUAL // comparison/arithmetic operators: > < like etc.
            || ast.getType() == HiveParser.EQUAL
            || ast.getType() == HiveParser.LESSTHAN
            || ast.getType() == HiveParser.LESSTHANOREQUALTO
            || ast.getType() == HiveParser.GREATERTHAN
            || ast.getType() == HiveParser.GREATERTHANOREQUALTO
            || ast.getType() == HiveParser.KW_LIKE
            || ast.getType() == HiveParser.DIVIDE
            || ast.getType() == HiveParser.PLUS
            || ast.getType() == HiveParser.MINUS
            || ast.getType() == HiveParser.STAR
            || ast.getType() == HiveParser.MOD
            || ast.getType() == HiveParser.AMPERSAND
            || ast.getType() == HiveParser.TILDE
            || ast.getType() == HiveParser.BITWISEOR
            || ast.getType() == HiveParser.BITWISEXOR) {
            Block bk1 = getBlockIteral((ASTNode)ast.getChild(0));
            if (ast.getChild(1) == null) { // unary operator, e.g. -1
                bk1.setCondition(ast.getText() + bk1.getCondition());
            } else {
                Block bk2 = getBlockIteral((ASTNode)ast.getChild(1));
                bk1.getColSet().addAll(bk2.getColSet());
                bk1.setCondition(bk1.getCondition() + " " + ast.getText() + " " + bk2.getCondition());
            }
            return bk1;
        } else if (ast.getType() == HiveParser.TOK_FUNCTIONDI) {
            Block col = getBlockIteral((ASTNode) ast.getChild(1));
            String condition = ast.getChild(0).getText();
            col.setCondition(condition + "(distinct (" + col.getCondition() +"))");
            return col;
        } else if (ast.getType() == HiveParser.TOK_FUNCTION){
            String fun = ast.getChild(0).getText(); 
            Block col = ast.getChild(1) == null ? new Block() : getBlockIteral((ASTNode) ast.getChild(1));
            if ("when".equalsIgnoreCase(fun)) {
                col.setCondition(getWhenCondition(ast));
                Set<Block> processChilds = processChilds(ast, 1);
                col.getColSet().addAll(bkToCols(col, processChilds));
                return col; 
            } else if("IN".equalsIgnoreCase(fun)) {
                col.setCondition(col.getCondition() + " in (" + blockCondToString(processChilds(ast, 2)) + ")");
                return col;
            } else if("TOK_ISNOTNULL".equalsIgnoreCase(fun) //isnull isnotnull
                    || "TOK_ISNULL".equalsIgnoreCase(fun)){
                col.setCondition(col.getCondition() + " " + fun.toLowerCase().substring(4));
                return col; 
            } else if("BETWEEN".equalsIgnoreCase(fun)){
                col.setCondition(getBlockIteral((ASTNode) ast.getChild(2)).getCondition() 
                        + " between " + getBlockIteral((ASTNode) ast.getChild(3)).getCondition() 
                        + " and " + getBlockIteral((ASTNode) ast.getChild(4)).getCondition());
                return col;
            }
            Set<Block> processChilds = processChilds(ast, 1);
            col.getColSet().addAll(bkToCols(col, processChilds));
            col.setCondition(fun +"("+ blockCondToString(processChilds) + ")");
            return col;
        } else if (ast.getType() == HiveParser.LSQUARE) { // map/array element access
                Block column = getBlockIteral((ASTNode) ast.getChild(0));
                Block key = getBlockIteral((ASTNode) ast.getChild(1));
                column.setCondition(column.getCondition() +"["+ key.getCondition() + "]");
                return column;
        } else {
            return parseBlock(ast); 
        }
    }


    private Set<String> bkToCols(Block col, Set<Block> processChilds) {
        Set<String> set = new LinkedHashSet<String>(processChilds.size());
        for (Block colLine : processChilds) {
            if (Check.notEmpty(colLine.getColSet())) {
                set.addAll(colLine.getColSet());
            }
        }
        return set;
    }

    private String blockCondToString(Set<Block> processChilds) {
        StringBuilder sb = new StringBuilder();
        for (Block colLine : processChilds) {
            sb.append(colLine.getCondition()).append(SPLIT_COMMA);
        }
        if (sb.length()>0) {
            sb.setLength(sb.length()-1);
        }
        return sb.toString();
    }

    /**
     * Parse a when condition.
     * @param ast
     * @return e.g. case when c1>100 then col1 when c1>0 then col2 else col3 end
     */
    private String getWhenCondition(ASTNode ast) {
        int cnt = ast.getChildCount();
        StringBuilder sb = new StringBuilder();
        for (int i = 1; i < cnt; i++) {
            String condition = getBlockIteral((ASTNode)ast.getChild(i)).getCondition();
            if (i == 1) {
                sb.append("(case when " + condition);
            } else if (i == cnt-1) { //else
                sb.append(" else " + condition + " end)");
            } else if (i % 2 == 0){ //then
                sb.append(" then " + condition);
            } else {
                sb.append(" when " + condition);
            }
        }
        return sb.toString();
    }


    /**
     * Save the alias and column info of a subquery.
     * @param sqlIndex
     * @param tableAlias
     */
    private void putResultQueryMap(int sqlIndex, String tableAlias) {
        List<ColLine> list = generateColLineList(cols, conditions);
        String key = sqlIndex == 0 ? tableAlias : tableAlias + sqlIndex; // no suffix needed when there is no name clash
        resultQueryMap.put(key, list);
    }

    private List<ColLine> generateColLineList(List<ColLine> cols, Set<String> conditions) {
        List<ColLine> list = new ArrayList<ColLine>();
        for (ColLine entry : cols) {
            entry.getConditionSet().addAll(conditions);
            list.add(ParseUtil.cloneColLine(entry));
        }
        return list;
    }

    /**
     * Detect abnormal "columns".
     * Normal: a as col, a
     * Abnormal: 1, 'a' (number or string literals used in the column position)
     */
    private boolean notNormalCol(String column) {
        return Check.isEmpty(column) || NumberUtil.isNumeric(column) 
                || (column.startsWith("\"") && column.endsWith("\"")) 
                || (column.startsWith("\'") && column.endsWith("\'"));
    }

    /**
     * Parse the child subtrees starting at the given index.
     * @param ast
     * @param startIndex index of the first child to process
     * @return
     */
    private Set<Block> processChilds(ASTNode ast,int startIndex) {
        int cnt = ast.getChildCount();
        Set<Block> set = new LinkedHashSet<Block>();
        for (int i = startIndex; i < cnt; i++) {
            Block bk = getBlockIteral((ASTNode) ast.getChild(i));
            if (Check.notEmpty(bk.getCondition()) || Check.notEmpty(bk.getColSet())){
                set.add(bk);
            }
        }
        return set;
    }   


    /**
     * Parse a leaf: a column name or a literal, together with the accumulated condition.
     * @param ast
     * @return
     */
    private Block parseBlock(ASTNode ast) {
        if (ast.getType() == HiveParser.DOT
                && ast.getChild(0).getType() == HiveParser.TOK_TABLE_OR_COL
                && ast.getChild(0).getChildCount() == 1
                && ast.getChild(1).getType() == HiveParser.Identifier) {
            String column = BaseSemanticAnalyzer.unescapeIdentifier(ast.getChild(1).getText());
            String alia = BaseSemanticAnalyzer.unescapeIdentifier(ast.getChild(0).getChild(0).getText());
            return getBlock(column, alia);
        } else if (ast.getType() == HiveParser.TOK_TABLE_OR_COL 
                    && ast.getChildCount() == 1
                    && ast.getChild(0).getType() == HiveParser.Identifier) { 
            String column = ast.getChild(0).getText();
            return getBlock(column, null);
        } else if (ast.getType() == HiveParser.Number 
                || ast.getType() == HiveParser.StringLiteral 
                || ast.getType() == HiveParser.Identifier) {
            Block bk = new Block();
            bk.setCondition(ast.getText());
            bk.getColSet().add(ast.getText());
            return bk;
        }
        return new Block();
    }


    /**
     * Build the block info from a column name and an alias.
     * @param column
     * @param alia an alias visible in the current subquery scope
     * @return
     */
    private Block getBlock(String column, String alia) {
        String[] result = getTableAndAlia(alia);
        String tableArray = result[0];
        String _alia = result[1];

        for (String string : _alia.split(SPLIT_AND)) { // look up each alias
            QueryTree qt = queryMap.get(string.toLowerCase());
            if (Check.notEmpty(column)) {
                for (ColLine colLine : qt.getColLineList()) {
                    if (column.equalsIgnoreCase(colLine.getToNameParse())) {
                        Block bk = new Block();
                        bk.setCondition(colLine.getColCondition());
                        bk.setColSet(ParseUtil.cloneSet(colLine.getFromNameSet())); 
                        return bk;
                    }
                }
            }
        }

        String _realTable = tableArray;
        int cnt = 0; // number of metadata columns matching this column; more than one match means the SQL is ambiguous
        for (String tables : tableArray.split(SPLIT_AND)) { // fall back to the metadata of each candidate table
            String[] split = tables.split("\\.");
            if (split.length > 2) {
                throw new SQLParseException("parse table:" + tables);
            }
            List<String> colByTab =  MetaCache.getInstance().getColumnByDBAndTable(tables);
            for (String col : colByTab) {
                if (column.equalsIgnoreCase(col)) {
                    _realTable = tables;
                    cnt++;
                }
            }
        }

        // if (cnt == 0): the not-found case is handled in the Validater class
        if (cnt > 1) { // ambiguity check
            throw new SQLParseException("SQL is ambiguity, column: " + column + " tables:" + tableArray);
        }

        Block bk = new Block();
        bk.setCondition(_realTable + SPLIT_DOT + column);
        bk.getColSet().add(_realTable + SPLIT_DOT + column);
        return bk;
    }

    /**
     * Filter out useless "columns": col1,123,'2013',col2 ==>> col1,col2
     * @param colSet
     * @return
     */
    private Set<String> filterData(Set<String> colSet){
        Set<String> set  = new LinkedHashSet<String>();
        for (String string : colSet) {
            if (!notNormalCol(string)) {
                set.add(string);
            }
        }
        return set;
    }


    /**
     * Parse all child nodes.
     * @param ast
     */
    private void parseChildNodes(ASTNode ast){
        int numCh = ast.getChildCount();
        if (numCh > 0) {
            for (int num = 0; num < numCh; num++) {
                ASTNode child = (ASTNode) ast.getChild(num);
                parseIteral(child);
            }
        }
    }

    /**
     * Prepare to parse the current node and its children.
     * @param ast
     */
    private void prepareToParseCurrentNodeAndChilds(ASTNode ast){
        if (ast.getToken() != null) {
            switch (ast.getToken().getType()) {
                case HiveParser.TOK_SWITCHDATABASE:
                    System.out.println("nowQueryDB changed " + nowQueryDB+ " to " +ast.getChild(0).getText());
                    nowQueryDB = ast.getChild(0).getText();
                    break;
                case HiveParser.TOK_TRANSFORM:
                    throw new UnSupportedException("no support transform using clause");
                case HiveParser.TOK_RIGHTOUTERJOIN:
                case HiveParser.TOK_LEFTOUTERJOIN:
                case HiveParser.TOK_JOIN:
                case HiveParser.TOK_LEFTSEMIJOIN:
                case HiveParser.TOK_MAPJOIN:
                case HiveParser.TOK_FULLOUTERJOIN:
                case HiveParser.TOK_UNIQUEJOIN:
                    joinStack.push(joinClause);
                    joinClause = true;
                    joinOnStack.push(joinOn);
                    joinOn = ast;
                    break;
            }
        }
    }


    /**
     * Finish parsing the current node.
     * @param ast
     */
    private void endParseCurrentNode(ASTNode ast){
        if (ast.getToken() != null) {
            Tree parent = ast.getParent();
            switch (ast.getToken().getType()) { // a join clause ends: restore the previous join state
            case HiveParser.TOK_RIGHTOUTERJOIN:
            case HiveParser.TOK_LEFTOUTERJOIN:
            case HiveParser.TOK_JOIN:
            case HiveParser.TOK_LEFTSEMIJOIN:
            case HiveParser.TOK_MAPJOIN:
            case HiveParser.TOK_FULLOUTERJOIN:
            case HiveParser.TOK_UNIQUEJOIN:
                joinClause = joinStack.pop();
                joinOn = joinOnStack.pop();
                break;

            case HiveParser.TOK_QUERY:
                processUnionStack(ast, parent); // this query may be a branch of a union
            case HiveParser.TOK_INSERT:
            case HiveParser.TOK_SELECT:
                break;
            case HiveParser.TOK_UNION: // merge union column info
                mergeUnionCols();
                processUnionStack(ast, parent); // this union may itself be a branch of an outer union
                break;
            }
        }
    }
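    /**
     * Merge the column info of a union: by the time TOK_UNION ends, cols holds the
     * columns of both branches back to back, so entry i is merged with entry i+colNum.
     */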
    private void mergeUnionCols() {
        validateUnion(cols);
        int size = cols.size();
        int colNum = size / 2;
        List<ColLine> list = new ArrayList<ColLine>(colNum);
        for (int i = 0; i < colNum; i++) { // merge columns pairwise across the branches
            ColLine col = cols.get(i);
            for (int j = i + colNum; j < size; j = j + colNum) {
                ColLine col2 = cols.get(j);
                list.add(col2);
                if (notNormalCol(col.getToNameParse()) && !notNormalCol(col2.getToNameParse())) {
                    col.setToNameParse(col2.getToNameParse());
                }
                col.getFromNameSet().addAll(col2.getFromNameSet());

                col.setColCondition(col.getColCondition() + SPLIT_AND + col2.getColCondition());

                Set<String> conditionSet = ParseUtil.cloneSet(col.getConditionSet());
                conditionSet.addAll(col2.getConditionSet());
                conditionSet.addAll(conditions);
                col.getConditionSet().addAll(conditionSet);
            }
        }
        cols.removeAll(list); // drop the entries that were merged away
    }
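    /**
     * Maintain the stacks around the two branches of a union: the first branch pushes
     * the conditions/cols gathered so far and starts clean; when the second branch
     * finishes, the saved state is popped back so both branches sit side by side
     * for mergeUnionCols.
     */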
    private void processUnionStack(ASTNode ast, Tree parent) {
        boolean isNeedAdd = parent.getType() == HiveParser.TOK_UNION;
        if (isNeedAdd) {
            if (parent.getChild(0) == ast && parent.getChild(1) != null) { // has a following sibling: first branch of the union
                // push the state gathered so far
                conditionsStack.push(ParseUtil.cloneSet(conditions));
                conditions.clear();
                colsStack.push(ParseUtil.cloneList(cols));
                cols.clear();
            } else { // no following sibling: second branch of the union
                // pop the first branch's state back
                if (!conditionsStack.isEmpty()) {
                    conditions.addAll(conditionsStack.pop());
                }
                if (!colsStack.isEmpty()) {
                    cols.addAll(0, colsStack.pop());
                }
            }
        }
    }

    private void parseAST(ASTNode ast) {
        parseIteral(ast);
    }

    public void parse(String sqlAll) throws Exception{
        if (Check.isEmpty(sqlAll)) {
            return;
        }
        startParseAll(); // clear the final result sets
        int i = 0; // index of the current sql statement
        for (String sql : sqlAll.split("(?<!\\\\);")) {
            ParseDriver pd = new ParseDriver();
            String trim = sql.toLowerCase().trim();
            if (trim.startsWith("set") || trim.startsWith("add") || Check.isEmpty(trim)) {
                continue;
            }
            ASTNode ast = pd.parse(sql);
            if ("local".equals(PropertyFileUtil.getProperty("environment"))) {
                System.out.println(ast.toStringTree());
            }
            prepareParse();
            parseAST(ast);
            endParse(++i);
        }
    }

    /** Clear the results of the previous run. */
    private void startParseAll() {
        colLines.clear();
        outputTables.clear();
        inputTables.clear();
    }

    private void prepareParse() {
        isCreateTable = false;
        dbMap.clear();

        queryMap.clear();
        queryTreeList.clear();

        // results
        tmpColLines.clear();
        tmpOutputTables.clear();
        tmpInputTables.clear();

        conditionsStack.clear(); // where/join condition cache
        colsStack.clear(); // column cache within one subquery

        resultQueryMap.clear();
        conditions.clear(); // where/join condition cache
        cols.clear(); // column cache within one subquery

        tableNameStack.clear();
        joinStack.clear();
        joinOnStack.clear();

        joinClause = false;
        joinOn = null;
    }

    /** Post-processing after one statement has been fully parsed. */
    private void endParse(int sqlIndex) {
        putResultQueryMap(sqlIndex, TOK_EOF); 
        putDBMap();
        setColLineList();
        setOutInputTableSet();
    }

    /** Build the column mappings of the output tables. */
    private void setColLineList() {
        Map<String, List<ColLine>> map = new HashMap<String, List<ColLine>>();
        for (Entry<String, List<ColLine>> entry : resultQueryMap.entrySet()) {
            if (entry.getKey().startsWith(TOK_EOF)) {
                List<ColLine> value = entry.getValue();
                for (ColLine colLine : value) {
                    List<ColLine> list = map.get(colLine.getToTable());
                    if (Check.isEmpty(list)) {
                        list = new ArrayList<ColLine>();
                        map.put(colLine.getToTable(), list);
                    }
                    list.add(colLine);
                }
            }
        }

        for (Entry<String, List<ColLine>> entry : map.entrySet()) {
            String table = entry.getKey();
            List<ColLine> pList = entry.getValue();
            List<String> dList = dbMap.get(table);
            int metaSize = Check.isEmpty(dList) ? 0 : dList.size();
            for (int i = 0; i < pList.size(); i++) { // map columns positionally
                ColLine clp = pList.get(i);
                String colName = null;
                if (i < metaSize) { 
                    colName = table + SPLIT_DOT + dList.get(i); 
                } 
                if (isCreateTable && TOK_TMP_FILE.equals(table)) {
                    for (String string : tmpOutputTables) {
                        table = string;
                    }
                }
                ColLine colLine = new ColLine(clp.getToNameParse(), clp.getColCondition(),
                        clp.getFromNameSet(), clp.getConditionSet(), table, colName);
                colLines.add(colLine);
            }
        }
    }

    /** Publish the input/output table sets of this statement. */
    private void setOutInputTableSet() {
        outputTables.addAll(ParseUtil.cloneSet(tmpOutputTables));
        inputTables.addAll(ParseUtil.cloneSet(tmpInputTables));
    }

    private void putDBMap() {
        for (String table : tmpOutputTables) {
            List<String> list =  MetaCache.getInstance().getColumnByDBAndTable(table);
            dbMap.put(table, list);
        }
    }

    /**
     * Fill in the database name:
     * table1 ==>> db1.table1
     * db1.table1 ==>> db1.table1
     * db2.t1&t2 ==>> db2.t1&db1.t2
     * @param nowTable
     */
    private String fillDB(String nowTable) {
        if (Check.isEmpty(nowTable)) {
            return nowTable;
        }
        StringBuilder sb = new StringBuilder();
        String[] tableArr = nowTable.split(SPLIT_AND); // e.g. fact.test&test2&test3
        for (String tables : tableArr) {
            String[] split = tables.split("\\" + SPLIT_DOT);
            if (split.length > 2) {
                System.out.println(tables);
                throw new SQLParseException("parse table:" + nowTable);
            }
            String db = split.length == 2 ? split[0] : nowQueryDB ;
            String table = split.length == 2 ? split[1] : split[0] ;
            sb.append(db).append(SPLIT_DOT).append(table).append(SPLIT_AND);
        }
        if (sb.length()>0) {
            sb.setLength(sb.length()-1);
        }
        return sb.toString();
    }


    /**
     * Look up the table name(s) behind an alias.
     * @param alia
     * @return {table names joined by &, aliases joined by &}
     */
    private String[] getTableAndAlia(String alia) {
        String _alia = Check.notEmpty(alia) ? alia  : 
                ParseUtil.collectionToString(queryMap.keySet(), SPLIT_AND, true) ;
        String[] result = {"" , _alia};
        Set<String> tableSet = new HashSet<String>();
        if (Check.notEmpty(_alia)) { 
            String[] split = _alia.split(SPLIT_AND);
            for (String string : split) {
                // an "alias" may be an explicit alias or the table name itself, e.g. select a.col, table_name.col from table_name a
                if (tmpInputTables.contains(string) || tmpInputTables.contains(fillDB(string))) {
                    tableSet.add(fillDB(string));
                } else if (queryMap.containsKey(string.toLowerCase())) {
                    tableSet.addAll(queryMap.get(string.toLowerCase()).getTableSet());
                }
            }
            result[0] = ParseUtil.collectionToString(tableSet, SPLIT_AND, true);
            result[1] = _alia;
        } 
        return result;
    }

    /**
     * Validate a union.
     * @param list
     */
    private void validateUnion(List<ColLine> list){
        int size = list.size();
        if (size % 2 == 1) {
            throw new SQLParseException("union column number are different, size=" + size);
        }
        int colNum = size / 2;
        checkUnion(list, 0, colNum);
        checkUnion(list, colNum, size);
    }

    private void checkUnion(List<ColLine> list, int start, int end) {
        String tmp = null;
        for (int i = start; i < end; i++) { // every column in one branch must target the same table
            ColLine col = list.get(i);
            if (Check.isEmpty(tmp)) {
                tmp = col.getToTable();
            } else if (!tmp.equals(col.getToTable())){
                throw new SQLParseException("union column number/types are different,table1=" + tmp +",table2="+ col.getToTable());
            }
        }
    }

}
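
A hypothetical usage sketch (the table names and the sample query are made up; it assumes MetaCache and PropertyFileUtil can reach your metastore and configuration):

import com.xiaoju.products.bean.ColLine;
import com.xiaoju.products.parse.LineParser;

public class LineParserDemo {
    public static void main(String[] args) throws Exception {
        LineParser parser = new LineParser();
        parser.parse("insert overwrite table dw.t_report"
                + " select c.city_name, count(distinct o.order_id) cnt"
                + " from dw.t_order o join dim.city c on o.city_id = c.city_id"
                + " where o.pt = '2015-06-01' group by c.city_name");

        System.out.println("input tables:  " + parser.getInputTables());  // e.g. [dw.t_order, dim.city]
        System.out.println("output tables: " + parser.getOutputTables()); // e.g. [dw.t_report]
        for (ColLine line : parser.getColLines()) {
            // each ColLine links one output column to its source columns and conditions
            System.out.println(line.getToNameParse() + " <- " + line.getFromNameSet()
                    + " | " + line.getConditionSet());
        }
    }
}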


Finally, here is a small Python script that pretty-prints a Hive AST string.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
''' Created on 2012-5-20 '''
import sys

# explain select key from kv mykv join test mytest on (mykv.key == mytest.id);

original_str = """ (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME dim_city))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL city_name)) (TOK_SELEXPR (TOK_TABLE_OR_COL city_id)) (TOK_SELEXPR (TOK_TABLE_OR_COL pt))) (TOK_WHERE (AND (= (TOK_TABLE_OR_COL pt) '$yesday') (= (TOK_TABLE_OR_COL level) 2))) (TOK_GROUPBY (TOK_TABLE_OR_COL city_name) (TOK_TABLE_OR_COL city_id) (TOK_TABLE_OR_COL pt)))) b) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME dw_dri_wide_sheet))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL city_id)) (TOK_SELEXPR (TOK_TABLE_OR_COL pt)) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION when (= (TOK_FUNCTION to_date (TOK_TABLE_OR_COL last_sucgrabord_time)) '$data_desc') (TOK_TABLE_OR_COL dri_id))) last1_dri_cnt) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION when (and (> (TOK_FUNCTION to_date (TOK_TABLE_OR_COL last_sucgrabord_time)) (TOK_FUNCTION date_sub '$data_desc' 7)) (<= (TOK_FUNCTION to_date (TOK_TABLE_OR_COL last_sucgrabord_time)) '$data_desc')) (TOK_TABLE_OR_COL dri_id))) last7_dri_cnt)) (TOK_WHERE (and (= (TOK_TABLE_OR_COL pt) '$data_desc') (TOK_FUNCTION TOK_ISNOTNULL (TOK_TABLE_OR_COL last_sucgrabord_time)))) (TOK_GROUPBY (TOK_TABLE_OR_COL city_id) (TOK_TABLE_OR_COL pt)))) a) (= (. (TOK_TABLE_OR_COL a) city_id) (. (TOK_TABLE_OR_COL b) city_id)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test kd_st_kpi_dri_active_day_city_bi))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) city_name)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) city_id)) (TOK_SELEXPR (TOK_FUNCTION nvl (TOK_TABLE_OR_COL last1_dri_cnt) 0)) (TOK_SELEXPR (TOK_FUNCTION nvl (TOK_TABLE_OR_COL last7_dri_cnt) 0)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) pt))))) <EOF> """


tmp_str = original_str.strip().replace('\n', '')

def my_print(mystr):
    sys.stdout.write(mystr)

def print_indent(indent_level):
    for i in range(indent_level):
        my_print(' ' * 4)


indent_level = 0
for char in tmp_str:
    if char == '(':
        # left paren: newline first, then the current indent and the '(' itself
        my_print('\n')
        print_indent(indent_level)
        my_print(char)
        indent_level += 1
    elif char == ')':
        # right paren: print ')', then a newline and the indent of the enclosing level
        indent_level -= 1
        my_print(char)
        my_print('\n')
        print_indent(indent_level - 1)
    else:
        # anything else: print verbatim
        my_print(char)

References:
http://tech.meituan.com/hive-sql-to-mapreduce.html
http://www.cnblogs.com/drawwindows/p/4595771.html
https://cwiki.apache.org/confluence/display/Hive/LanguageManual
