Hive table stored as Parquet: Spark SQL reads some fields as null

Hive version 2.1.0; both tables below point at the same data files on HDFS.

(1) Table created with Hive: Hive reads values for every field, but Spark SQL returns null for some of them.

CREATE EXTERNAL TABLE business_log.rule_business_log2 (
  id bigint,
  businessCode String,
  businessDesc String,
  comment String,
  orderNo String,
  idCard String,
  result String,
  mobile String,
  departId bigint,
  createTime String,
  tpFlag String)
COMMENT 'rule_business_log2 table'
PARTITIONED BY (day string)
STORED AS PARQUET
LOCATION 'hdfs://namenode:8020/user/business_log/test/rule_business_log';

ALTER TABLE rule_business_log2 ADD PARTITION (day='2017-09-27');
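
A minimal reproduction, assuming the query below (the original post does not show the exact statement used); run it once in the hive CLI and once in spark-sql:

-- Hypothetical query; Hive returns values for every column here,
-- while spark-sql returns null for the mixed-case ones.
SELECT * FROM business_log.rule_business_log2 WHERE day = '2017-09-27' LIMIT 2;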


Spark SQL output (columns in declared order: id, businessCode, businessDesc, comment, orderNo, idCard, result, mobile, departId, createTime, tpFlag, day):

0 null  null  null  null  null  STRATEGY_REFUSE 15294342222 null  null  null  2017-09-27
0 null  null  null  null  null  0.8 15138888808 null  null  null  2017-09-27

Note that every column declared with a mixed-case name (businessCode, businessDesc, orderNo, idCard, departId, createTime, tpFlag) comes back null, while the all-lowercase id, result, and mobile read fine (comment is also lowercase; presumably it is simply null in the data).
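
One way to confirm what Spark sees in the files themselves is to bypass the metastore and query the Parquet directory directly, using Spark SQL's run-SQL-on-files syntax. A sketch, assuming the partition directory layout below:

-- Reading the files directly uses the schema in the Parquet footers
-- (the original camelCase names), so all columns should return values.
SELECT * FROM parquet.`hdfs://namenode:8020/user/business_log/test/rule_business_log/day=2017-09-27` LIMIT 2;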


(2) Table created with Spark SQL: both Hive and Spark SQL read every field correctly.

CREATE EXTERNAL TABLE business_log.rule_business_log3 (
  id bigint,
  businessCode String,
  businessDesc String,
  comment String,
  orderNo String,
  idCard String,
  result String,
  mobile String,
  departId bigint,
  createTime String,
  tpFlag String)
COMMENT 'rule_business_log3 table'
PARTITIONED BY (day string)
STORED AS PARQUET
LOCATION 'hdfs://namenode:8020/user/business_log/test/rule_business_log';

ALTER TABLE rule_business_log3 ADD PARTITION (day='2017-09-27');


Inspecting both tables with the Hive command SHOW CREATE TABLE <tablename> shows that the two definitions differ:

(1) 

CREATE EXTERNAL TABLE `rule_business_log2`(
  `id` bigint, 
  `businesscode` string, 
  `businessdesc` string, 
  `comment` string, 
  `orderno` string, 
  `idcard` string, 
  `result` string, 
  `mobile` string, 
  `departid` bigint, 
  `createtime` string, 
  `tpflag` string)
COMMENT 'rule_business_log2 table'
PARTITIONED BY ( 
  `day` string)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  'hdfs://namenode:8020/user/business_log/test/rule_business_log'
TBLPROPERTIES (
  'transient_lastDdlTime'='1506506398')

(2)

CREATE EXTERNAL TABLE `rule_business_log3`(
  `id` bigint, 
  `businesscode` string, 
  `businessdesc` string, 
  `comment` string, 
  `orderno` string, 
  `idcard` string, 
  `result` string, 
  `mobile` string, 
  `departid` bigint, 
  `createtime` string, 
  `tpflag` string)
COMMENT 'rule_business_log3 table'
PARTITIONED BY ( 
  `day` string)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  'hdfs://namenode:8020/user/business_log/test/rule_business_log'
TBLPROPERTIES (
  'spark.sql.sources.schema.numPartCols'='1', 
  'spark.sql.sources.schema.numParts'='1', 
  'spark.sql.sources.schema.part.0'='{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"businessCode\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"businessDesc\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"comment\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"orderNo\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"idCard\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"result\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"mobile\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"departId\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"createTime\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"tpFlag\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"day\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}', 
  'spark.sql.sources.schema.partCol.0'='day', 
  'transient_lastDdlTime'='1506505821')
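
The visible difference is the spark.sql.sources.schema.* properties on the Spark-created table, which preserve the original camelCase column names. A hedged retrofit for the Hive-created table is to copy them over so spark-sql can restore the case-preserving schema (a sketch; whether Spark honors these properties on a Hive-created table may depend on the Spark version):

ALTER TABLE business_log.rule_business_log2 SET TBLPROPERTIES (
  'spark.sql.sources.schema.numParts'='1',
  -- paste the full JSON string shown for rule_business_log3 above:
  'spark.sql.sources.schema.part.0'='<schema JSON as shown above>',
  'spark.sql.sources.schema.numPartCols'='1',
  'spark.sql.sources.schema.partCol.0'='day'
);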


Conclusion (speculation): when Spark SQL creates the table, it records the original case-preserving schema in the spark.sql.sources.schema.* table properties and reads it back from there. The Hive-created table only has the metastore schema (kept in an external MySQL or similar database), and the Hive metastore stores all column names in lowercase. Hive's ParquetHiveSerDe resolves columns case-insensitively, so businesscode still matches businessCode in the files; Spark SQL's native Parquet reader resolves names case-sensitively, finds no lowercase businesscode in the Parquet footers, and returns null. So the failure is a column-name case mismatch, not (as I first guessed) a problem parsing particular characters such as Chinese text.
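
Another commonly used per-session workaround, assuming a Spark 2.x spark-sql shell: disable Spark's built-in Parquet reader for Hive tables, so reads go through Hive's ParquetHiveSerDe, which resolves column names case-insensitively:

-- Fall back to Hive's case-insensitive SerDe for this session.
SET spark.sql.hive.convertMetastoreParquet=false;
SELECT * FROM business_log.rule_business_log2 WHERE day = '2017-09-27' LIMIT 2;

In newer Spark releases (2.2+, as far as I know), spark.sql.hive.caseSensitiveInferenceMode=INFER_AND_SAVE is also worth checking: it reportedly makes Spark infer the case-sensitive schema from the Parquet files and save it back into the table properties.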
