Hive 版本 2.1.0。下面两种建表方式指向 HDFS 上的同一份 Parquet 数据。
(1) 用 Hive 建表:部分字段 Hive 读取有值,Spark SQL 读取值为 null。
-- Case (1): external table defined through Hive over the existing Parquet files.
-- Column names are declared in mixed case here; the `show create table` output
-- recorded later in this note reports all of them in lowercase.
CREATE EXTERNAL TABLE business_log.rule_business_log2 (
    id bigint,
    businessCode string,
    businessDesc string,
    comment string,
    orderNo string,
    idCard string,
    result string,
    mobile string,
    departId bigint,
    createTime string,
    tpFlag string
)
COMMENT 'rule_business_log2 table'
PARTITIONED BY (day string)
STORED AS PARQUET
LOCATION 'hdfs://namenode:8020/user/business_log/test/rule_business_log';

-- Register the partition to read. The table name is qualified with its database
-- so the statement targets the table created above regardless of the session's
-- current database; IF NOT EXISTS makes the statement safe to re-run.
ALTER TABLE business_log.rule_business_log2 ADD IF NOT EXISTS PARTITION (day = '2017-09-27');
Spark SQL 读取结果(列顺序与建表字段一致,最后一列为分区字段 day):
0 null null null null null STRATEGY_REFUSE 15294342222 null null null 2017-09-27
0 null null null null null 0.8 15138888808 null null null 2017-09-27
(2) 用 Spark SQL 建表:Hive 读取有值,Spark SQL 读取也正常。
通过 Hive 命令 `show create table <表名>` 查看两张表,可以看到两种表的结构(主要是 TBLPROPERTIES)有差别:
(1)
-- Recorded `show create table` output for the table created through Hive (case 1).
-- Every column name is reported in lowercase, although the original DDL declared
-- mixed-case names such as businessCode and orderNo.
-- TBLPROPERTIES contains only transient_lastDdlTime; there are no
-- spark.sql.sources.schema.* entries.
-- NOTE(review): the COMMENT text below says rule_business_log3 while this table is
-- rule_business_log2 and was created with the comment 'rule_business_log2 table';
-- the COMMENT lines of the two outputs may have been swapped when pasted — confirm.
CREATE EXTERNAL TABLE `rule_business_log2`(
`id` bigint,
`businesscode` string,
`businessdesc` string,
`comment` string,
`orderno` string,
`idcard` string,
`result` string,
`mobile` string,
`departid` bigint,
`createtime` string,
`tpflag` string)
COMMENT 'rule_business_log3 table'
PARTITIONED BY (
`day` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
'hdfs://namenode:8020/user/business_log/test/rule_business_log'
TBLPROPERTIES (
'transient_lastDdlTime'='1506506398')
(2)
-- Recorded `show create table` output for the table created through Spark SQL (case 2).
-- The column list is reported in lowercase, but TBLPROPERTIES additionally carries
-- spark.sql.sources.schema.* entries: schema.part.0 is a JSON schema whose field
-- names keep their mixed case (businessCode, orderNo, idCard, ...), and
-- schema.partCol.0 names the partition column `day`.
-- NOTE(review): the COMMENT text below says rule_business_log2 while this table is
-- rule_business_log3; the COMMENT lines of the two outputs may have been swapped
-- when pasted — confirm.
CREATE EXTERNAL TABLE `rule_business_log3`(
`id` bigint,
`businesscode` string,
`businessdesc` string,
`comment` string,
`orderno` string,
`idcard` string,
`result` string,
`mobile` string,
`departid` bigint,
`createtime` string,
`tpflag` string)
COMMENT 'rule_business_log2 table'
PARTITIONED BY (
`day` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
'hdfs://namenode:8020/user/business_log/test/rule_business_log'
TBLPROPERTIES (
'spark.sql.sources.schema.numPartCols'='1',
'spark.sql.sources.schema.numParts'='1',
'spark.sql.sources.schema.part.0'='{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"businessCode\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"businessDesc\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"comment\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"orderNo\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"idCard\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"result\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"mobile\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"departId\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"createTime\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"tpFlag\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"day\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}',
'spark.sql.sources.schema.partCol.0'='day',
'transient_lastDdlTime'='1506505821')
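推测:Spark SQL 建表时会自动把 schema(保留字段名原有的大小写)记录到表属性 spark.sql.sources.schema.* 中;Hive 表的 schema/metadata 保存在外部 MySQL 或其他系统里,字段名在其中全部是小写。Spark SQL 解析 Hive 创建的表时,解析部分字符(最初怀疑是汉字)出错。补充观察:含大写字母的字段(businessCode、businessDesc、orderNo、idCard、departId、createTime、tpFlag)全部读出 null,而全小写的 result、mobile 读取正常,因此更可能的原因是 Parquet 文件中的字段名区分大小写,与 Hive 元数据中的小写字段名匹配不上,而不是汉字解析问题。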