{
"bibliographic-data":{
"classification-data":{
"classification-ipcr":{
"further-classification":[
{
"main-group":"11",
"class":"01",
"subgroup":"11",
"subclass":"L",
"section":"H"
},
{
"main-group":"11",
"class":"02",
"subgroup":"34",
"subclass":"S",
"section":"H"
}
],
"main-classification":{
"main-group":"31",
"class":"01",
"subgroup":"05",
"subclass":"L",
"section":"H"
}
}
},
"invention-title":[
{
"data-format":"original",
"text":发呆发翻腾而过热狗",
"lang":"CN"
}
],
"parties":{
"agency":[
{
"data-format":"original",
"sequence":1,
"rep-type":"agency",
"name":"务所",
"lang":"CN"
}
],
"inventors":[
{
"data-format":"original",
"sequence":1,
"name":"发的发电",
"lang":"CN"
},
{
"data-format":"original",
"sequence":2,
"name":"都发大水",
"lang":"CN"
},
{
"data-format":"original",
"sequence":3,
"name":"地区",
"lang":"CN"
}
],
"agents":[
{
"data-format":"original",
"sequence":1,
"rep-type":"agent",
"name":"发动v法",
"lang":"CN"
}
]
},
"application-reference":{
"date":20200619,
"country":"CN",
"doc-number":"202021161647.X"
},
"patent-type":"熬到VVT",
"publication-reference":{
"date":20210323,
"country":"CN",
"kind":"U",
"doc-number":"3479"
}
},
"update-ts":20210831,
"abstract":[
{
"data-format":"original",
"text":"fdfdfdfd",
"lang":"CN"
}
],
"patent-legal":{
"legal-update-date":20210323,
"legal-status":[
"3"
]
}
}
可以看出上述json十分复杂,如果函数对JSON字符串进行解析,灵活性比较高,但是对于如果整个文件就是一个JSON文件,在使用起来就相对比较麻烦。Hive中为了简化对于JSON文件的处理,内置了一种专门用于解析JSON文件的Serde解析器, 在创建表时,只要指定使用 JSONSerde 解析表的文件,就会自动将 JSON 文件中的每一列进行解析。
1.github地址
GitHub - rcongiu/Hive-JSON-Serde: Read - Write JSON SerDe for Apache Hive.
2.jar包下载地址
Index of /hive-json-serde (congiu.net)
将 JAR 放入 Hive/lib 或者在 Hive 中使用 ADD JAR
hive -e "add jar /app/hive/lib/json-serde-1.3.8-jar-with-dependencies.jar"
hive -e "add jar /app/hive/lib/json-udf-1.3.8-jar-with-dependencies.jar"
虽然可以使用Serde进行自动解析,但是如此复杂的json字符串,定义表结构是个很麻烦的时,而且容易出错,所以用程序自动化创建再好不过了。
GitHub - quux00/hive-json-schema: Tool to generate a Hive schema from a JSON example doc
使用maven打包
mvn package
执行如下命令
java -jar target/json-hive-schema-1.0-jar-with-dependencies.jar F:\大数据\数据源\json\data\data.json mytable
生成create table 命令
CREATE TABLE mytable (
abstract array<struct<data-format:string, lang:string, text:string>>,
bibliographic-data struct<application-reference:struct<country:string, date:int, doc-number:string>, classification-data:struct<classification-ipcr:struct<further-classification:array<struct<class:string, main-group:string, sectio
n:string, subclass:string, subgroup:string>>, main-classification:struct<class:string, main-group:string, section:string, subclass:string, subgroup:string>>>, invention-title:array<struct<data-format:string, lang:string, text:string
>>, parties:struct<agency:array<struct<data-format:string, lang:string, name:string, rep-type:string, sequence:int>>, agents:array<struct<data-format:string, lang:string, name:string, rep-type:string, sequence:int>>, inventors:array
<struct<data-format:string, lang:string, name:string, sequence:int>>>, patent-type:string, publication-reference:struct<country:string, date:int, doc-number:string, kind:string>>,
patent-legal struct<legal-status:array<string>, legal-update-date:int>,
update-ts int)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe';
可以使用``反引号引起来,因为在hive -e 命令中所以使用`进行转义
hive -e "use mydb;CREATE TABLE mytable (abstract array>,bibliographic_data struct`date\` :int, doc_number:string>, classification_data:struct>, main_classification:struct>>, invention_title:array>, parties:struct>, agents:array>, inventors:array>>, patent_type:string, publication_reference:struct`date\`:int, doc_number:string, kind:string>>, patent_legal struct, legal_update_date:int>,update_ts int)ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe' with serdeproperties('mapping.data_format' = 'data-format','mapping.bibliographic_data' = 'bibliographic-data','mapping.application_reference' = 'application-reference','mapping.doc_number' = 'doc-number','mapping.classification_data' = 'classification-data','mapping.classification_ipcr' = 'classification-ipcr','mapping.further_classification' = 'further-classification','mapping.main_group' = 'main-group','mapping.main_classification' = 'main-classification','mapping.main_group' = 'main-group','mapping.invention_title' = 'invention-title','mapping.data_format' = 'data-format','mapping.rep_type' = 'rep-type','mapping.patent_type' = 'patent-type','mapping.publication_reference' = 'publication-reference','mapping.update_ts' = 'update-ts','mapping.patent_legal' = 'patent-legal','mapping.legal_status' = 'legal-status','mapping.legal_update_date' = 'legal-update-date','mapping.lang_r' = 'lang') STORED AS TEXTFILE;"
with serdeproperties('mapping.data_format' = 'data-format','mapping.bibliographic_data' = 'bibliographic-data','mapping.application_reference' = 'application-reference','mapping.doc_number' = 'doc-number','mapping.classification_data' = 'classification-data','mapping.classification_ipcr' = 'classification-ipcr','mapping.further_classification' = 'further-classification','mapping.main_group' = 'main-group','mapping.main_classification' = 'main-classification','mapping.main_group' = 'main-group','mapping.invention_title' = 'invention-title','mapping.data_format' = 'data-format','mapping.rep_type' = 'rep-type','mapping.patent_type' = 'patent-type','mapping.publication_reference' = 'publication-reference','mapping.update_ts' = 'update-ts','mapping.patent_legal' = 'patent-legal','mapping.legal_status' = 'legal-status','mapping.legal_update_date' = 'legal-update-date','mapping.lang_r' = 'lang')
使用serdeproperties对字段名进行映射
3.加载数据
hive -e "use mydb;load data local inpath '/app/shell/datasource/json/mydata/mydata20210331_bakfile/outFile/data.json' into table mytable"
4.查看数据
hive -e "use mydb;select * from mytable limit 1;"