hive二级分区

几年前的工作了,放在草稿箱2,3年了,整理一下吧。

内容主要是扩展了hive的分区方式,原生的hive支持多字段等值分区,对于很有用的range、hash、list等数据库传统的分区方式不支持,因此这个做了二级的多类分区的扩展。

这里只是antlr的定义,显然你还需要在查询、存储层做更多的工作来支持这种分区工作,量还是不小的,我们的工作不具参考性,这里只列一下sql解析的修改,给大家做参考。


语法定义
tablePartition
@init { msgs.push("table partition specification"); }
@after { msgs.pop(); }
    : KW_PARTITIONED KW_BY partParamList=partitionParamList
    tableSubPartition?
    partitionTemplate?
    -> ^(TOK_PARTITIONED_BY $partParamList
    tableSubPartition?
    partitionTemplate?)
    ;
    
tableSubPartition
@init { msgs.push("table subPartition specification"); }
@after { msgs.pop(); }
    :KW_SUBPARTITIONED KW_BY partParamList=partitionParamList
   subPartitionTemplate?
    -> ^(TOK_SUBPARTITIONED_BY $partParamList
    subPartitionTemplate?)
    ;
    
partitionParamList
@init { msgs.push(" partitionParamList specification"); }
@after { msgs.pop(); }
    :(LPAREN (columnNameTypeList
    	|columnNameList
   	| functionList
    ) RPAREN)  
    | (columnNameTypeList
    	|columnNameList
   	| functionList)
    ;
    
partitionTemplate
@init { msgs.push("table PartitionTemplate specification"); }
@after { msgs.pop(); }
    :partitionExper (COMMA partitionExper)*
     -> ^(TOK_PARTITION_EXPER partitionExper+)
    ;
    
subPartitionTemplate
@init { msgs.push(" subPartitionTemplate specification"); }
@after { msgs.pop(); }
    : (subPartitionExper (COMMA subPartitionExper)*)
    -> ^(TOK_SUBPARTITION_EXPER subPartitionExper+)
    ;

partitionExper
@init { msgs.push("partitionExper specification"); }
@after { msgs.pop(); }
    :KW_PARTITION partition_name=partitionName  partitionValuesExper  tableSubPartition?
    -> ^(TOK_PARTITION $partition_name partitionValuesExper tableSubPartition?)
    ;
    
partitionValuesExper
@init { msgs.push(" partitionValuesExper specification"); }
@after { msgs.pop(); }
    :KW_VALUES  KW_LESS KW_THEN LPAREN value=stringOrNumOrFunc RPAREN 
    -> ^(TOK_VALUES_LESS  $value )
    |KW_VALUES  KW_GREATER KW_THEN LPAREN value=stringOrNumOrFunc RPAREN 
     -> ^(TOK_VALUES_GREATER  $value )
    | KW_VALUES LPAREN valueList=stringOrNumOrFuncList RPAREN
     -> ^(TOK_VALUES   $valueList )
    ;
    
subPartitionExper
@init { msgs.push("subPartitionExper specification"); }
@after { msgs.pop(); }
/*
    :KW_SUBPARTITION partitionName (
     (KW_VALUES ((KW_LESS|KW_GREATER) KW_THEN) LPAREN (stringOrNumOrFunc) RPAREN)
     | KW_VALUES LPAREN (stringOrNumOrFuncList) RPAREN)
    ;
    */
      :KW_SUBPARTITION partition_name=partitionName  partitionValuesExper 
    -> ^(TOK_SUBPARTITION $partition_name partitionValuesExper)
    ;
    
stringOrNumOrFunc
 @init { msgs.push("table stringOrNumOrFunc specification"); }
@after { msgs.pop(); }
    :StringLiteral
    |Number
    |function
    ;
    
 stringOrNumOrFuncList  
@init { msgs.push("table stringOrNumOrFuncList specification"); }
@after { msgs.pop(); }
 :(stringOrNumOrFunc (COMMA stringOrNumOrFunc)*)
 -> ^(TOK_STR_OR_NUM_OR_FUNC stringOrNumOrFunc+)
    ;
      
partitionName
@init { msgs.push("table partitionName specification"); }
@after { msgs.pop(); }
    :
      Identifier
    ;
    
 datawarehouseStatement
@init { msgs.push("table datawarehouseStatement specification"); }
@after { msgs.pop(); }
    :
      KW_ALTER KW_DW  KW_DIRECT LPAREN dwNo=Number COMMA sql=StringLiteral RPAREN
       -> ^(TOK_ALTER_DW $dwNo $sql)
    ;

//added by zjw to customize table partition
functionList
@init { msgs.push("table partition function specification"); }
@after { msgs.pop(); }
    :
    function  (COMMA function)*  -> ^(TOK_TABLEPARTCOLS (function)+)
    ;	


JDO定义:
 <field name="partitionKeys" table="PARTITION_KEYS" >
        <collection element-type="MFieldSchema"/>
        <join>
          <primary-key name="PARTITION_KEY_PK">
            <column name="TBL_ID"/>
            <column name="PKEY_NAME"/>
          </primary-key>	
          <column name="TBL_ID"/>
        </join>
        <element>
          <embedded>
            <field name="name">
              <column name="PKEY_NAME" length="128" jdbc-type="VARCHAR"/>
              </field>
            <field name="type">
              <column name="PKEY_TYPE" length="767" jdbc-type="VARCHAR" allows-null="false"/>
            </field>
            <field name="comment" >
              <column name="PKEY_COMMENT" length="4000" jdbc-type="VARCHAR" allows-null="true"/>
            </field>
            <!-- added by zjw for subpartitions -->
            <field name="part_num">
              <column name="P_NUM" jdbc-type="integer"/>
            </field>
            <field name="part_level">
              <column name="P_LEVEL" jdbc-type="integer"/>
            </field>
            <field name="part_type">
              <column name="P_TYPE" length="256" jdbc-type="VARCHAR" allows-null="false"/>
            </field>
            <field name="part_type_param" >
              <column name="P_TYPE_PARAM" length="767" jdbc-type="VARCHAR" allows-null="true"/>
            </field>
            <!-- added by zjw for subpartitions -->
            <field name="subPartitionKeys" table="SUB_PARTITION_KEYS" >
		        <collection element-type="MFieldSchema"/>
		        <join>
		          <primary-key name="PARTITION_KEY_PK">
		            <column name="TBL_ID"/>
		            <column name="PKEY_NAME"/>
		          </primary-key>	
		          <column name="TBL_ID"/>
		        </join>
		        <element>
		          <embedded>
		            <field name="name">
		              <column name="PKEY_NAME" length="128" jdbc-type="VARCHAR"/>
		              </field>
		            <field name="type">
		              <column name="PKEY_TYPE" length="767" jdbc-type="VARCHAR" allows-null="false"/>
		            </field>
		            <field name="comment" >
		              <column name="PKEY_COMMENT" length="4000" jdbc-type="VARCHAR" allows-null="true"/>
		            </field>
		            <field name="part_num">
		              <column name="P_NUM" jdbc-type="integer"/>
		              </field>
		            <field name="part_type">
		              <column name="P_TYPE" length="256" jdbc-type="VARCHAR" allows-null="false"/>
		            </field>
		            <field name="part_type_param" >
		              <column name="P_TYPE_PARAM" length="767" jdbc-type="VARCHAR" allows-null="true"/>
		            </field>
		          </embedded>
		        </element>
		      </field>
          </embedded>
        </element>
      </field>


分区key定义用途有2个方面:

    1 加载需要确定如何划分加载的文件个数和数据路由
    2 查询如何按照分区方式,优化查询计划


p_key tbl_id p_type integer_idx


p_key tbl_id p_type integer_idx

你可能感兴趣的:(分区,hive,list,hash,range)