hive建表插入元数据表过程

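# table_name: name of the Hive table to create
# column: JSON-encoded array of [column name, column type] pairs
# location: storage location of the table data
# partitionColumn: JSON-encoded array of [partition name, partition type] pairs
# field_delimit: field (column) delimiter
# is_parquet_type: whether the table is stored as Parquet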
def _parse_name_type_pairs(raw, arg_name):
    """Decode a JSON string into an ordered list of (name, type) tuples.

    Accepts either a JSON array of [name, type] pairs or a JSON object
    mapping name -> type (key order is preserved).

    Raises ValueError if the text is not valid JSON or if an entry is not a
    sequence of at least two items.
    """
    # object_pairs_hook=list turns a JSON object into its ordered list of
    # (key, value) pairs, so both accepted shapes end up as a list of pairs.
    decoded = json.loads(raw, object_pairs_hook=list)
    if not isinstance(decoded, list):
        raise ValueError("%s: expected a JSON array or object, got %r" % (arg_name, decoded))
    pairs = []
    for entry in decoded:
        if not isinstance(entry, (list, tuple)) or len(entry) < 2:
            raise ValueError("%s: expected [name, type] pairs, got %r" % (arg_name, entry))
        pairs.append((entry[0], entry[1]))
    return pairs


def _next_id(cursor, sql):
    """Run a 'highest id' query and return that id + 1 (1 when the table is empty)."""
    cursor.execute(sql)
    row = cursor.fetchone()
    return 1 if row is None else row[0] + 1


def _run(cursor, sql, params):
    """Echo a statement and its parameters to stdout, then execute it with bound parameters."""
    print("%s %r" % (sql, params))
    cursor.execute(sql, params)


def _run_many(cursor, sql, rows):
    """Echo a multi-row statement to stdout, then execute it once per row of bound parameters."""
    print("%s %r" % (sql, rows))
    cursor.executemany(sql, rows)


def createTable(table_name, column, location, partitionColumn, field_delimit, is_parquet_type = False):
    """Create an external Hive table by inserting rows directly into the metastore MySQL tables.

    Rows are written, in one transaction, to CDS, SERDES, SERDE_PARAMS, SDS,
    TBLS, TABLE_PARAMS, COLUMNS_V2 and (when partition keys are given)
    PARTITION_KEYS.

    Parameters:
        table_name: name of the table to create.
        column: JSON string describing the columns, either an array of
            [name, type] pairs or an object mapping name -> type.
        location: storage location of the table data.
        partitionColumn: JSON string describing the partition keys, same
            shape as ``column``; may be empty ("[]") for an unpartitioned table.
        field_delimit: field delimiter, stored as both 'field.delim' and
            'serialization.format'.
        is_parquet_type: when true, the Parquet SerDe and Parquet
            input/output formats are registered; otherwise LazySimpleSerDe
            with the text input/output formats.

    Raises:
        ValueError: if ``column`` / ``partitionColumn`` is not valid JSON of
            the expected shape, or ``column`` describes no columns.
        Database errors from MySQLdb propagate unchanged; nothing is
        committed in that case.
    """
    # Validate all inputs before opening a connection so bad arguments never
    # leave a half-written transaction behind.
    columns = _parse_name_type_pairs(column, 'column')
    if not columns:
        raise ValueError("column must describe at least one column")
    partition_keys = _parse_name_type_pairs(partitionColumn, 'partitionColumn')

    # The storage formats must match the SerDe: the Parquet SerDe cannot read
    # records produced by the text input format.
    if is_parquet_type:
        serde_lib = 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
        input_format = 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
        output_format = 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
    else:
        serde_lib = 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
        input_format = 'org.apache.hadoop.mapred.TextInputFormat'
        output_format = 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'

    create_time = int(time.time())
    db_id = 1
    last_access_time = 0
    owner = 'root'
    retention = 0
    tbl_type = 'EXTERNAL_TABLE'

    # NOTE(review): host/port/user/passwd/database are redacted placeholders
    # in this file and must be replaced with real connection settings.
    conn = MySQLdb.connect(host='xxx', port=xxx, user='xxx', passwd='xxx')
    try:
        conn.select_db('xxx')
        cursor = conn.cursor()
        try:
            # NOTE(review): "highest id + 1" is not safe against concurrent
            # writers, and TBL_ID / SD_ID are reused as CD_ID / SERDE_ID
            # below, which presumes those id spaces never diverge - confirm.
            tbl_id = _next_id(cursor, "select TBL_ID from TBLS order by TBL_ID desc limit 1")
            sd_id = _next_id(cursor, "select SD_ID from SDS order by SD_ID desc limit 1")
            print("%s %s" % (tbl_id, sd_id))

            _run(cursor, "insert into CDS(`CD_ID`) values (%s)", (tbl_id,))

            _run(cursor,
                 "insert into SERDES (`SERDE_ID`,`NAME`,`SLIB`) VALUES (%s,%s,%s)",
                 (sd_id, None, serde_lib))

            _run_many(cursor,
                      "insert into SERDE_PARAMS (`SERDE_ID`,`PARAM_KEY`,`PARAM_VALUE`) values (%s,%s,%s)",
                      [(sd_id, 'field.delim', field_delimit),
                       (sd_id, 'serialization.format', field_delimit)])

            # NOTE(review): the two '' values for IS_COMPRESSED and
            # IS_STOREDASSUBDIRECTORIES are kept as they were; they look like
            # boolean flags - confirm the intended value.
            _run(cursor,
                 "insert into SDS(`SD_ID`,`CD_ID`,`INPUT_FORMAT`,`IS_COMPRESSED`,`IS_STOREDASSUBDIRECTORIES`,"
                 "`LOCATION`,`NUM_BUCKETS`,`OUTPUT_FORMAT`,`SERDE_ID`) values (%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                 (sd_id, tbl_id, input_format, '', '', location, -1, output_format, sd_id))

            _run(cursor,
                 "insert into TBLS(`TBL_ID`,`CREATE_TIME`,`DB_ID`,`LAST_ACCESS_TIME`,`OWNER`,`RETENTION`,"
                 "`SD_ID`,`TBL_NAME`,`TBL_TYPE`,`VIEW_EXPANDED_TEXT`,`VIEW_ORIGINAL_TEXT`) "
                 "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                 (tbl_id, create_time, db_id, last_access_time, owner, retention,
                  sd_id, table_name, tbl_type, None, None))

            _run_many(cursor,
                      "insert into TABLE_PARAMS (`TBL_ID`,`PARAM_KEY`,`PARAM_VALUE`) values (%s,%s,%s)",
                      [(tbl_id, 'EXTERNAL', 'TRUE'),
                       (tbl_id, 'transient_lastDdlTime', str(create_time))])

            _run_many(cursor,
                      "insert into COLUMNS_V2 (`CD_ID`,`COMMENT`,`COLUMN_NAME`,`TYPE_NAME`,`INTEGER_IDX`) "
                      "values (%s,%s,%s,%s,%s)",
                      [(tbl_id, None, name, type_name, idx)
                       for idx, (name, type_name) in enumerate(columns)])

            # An unpartitioned table has no partition keys; skip the insert
            # instead of sending a VALUES clause with no rows.
            if partition_keys:
                _run_many(cursor,
                          "insert into PARTITION_KEYS(`TBL_ID`,`PKEY_COMMENT`,`PKEY_NAME`,`PKEY_TYPE`,`INTEGER_IDX`) "
                          "values (%s,%s,%s,%s,%s)",
                          [(tbl_id, None, name, type_name, idx)
                           for idx, (name, type_name) in enumerate(partition_keys)])

            conn.commit()
        finally:
            cursor.close()
    finally:
        # Closing without a commit discards the uncommitted inserts, so a
        # failure part-way through leaves no partial table definition.
        conn.close()

 

你可能感兴趣的:(hive)