把 MongoDB 数据导入 Hive

hive映射mongoDB表

mongodb基本脚本

## 创建一个集合(collection,相当于关系型数据库中的表)
// Explicitly create the collection "company_info_mapping" in the database
// currently selected in the mongo shell (the shell's `db` handle).
db.createCollection("company_info_mapping");

##插入数据
// Insert one sample document into "company_info_mapping".
// insertOne() is used instead of the deprecated insert() shell helper.
// hid and uid are deliberately kept as strings: the 19-digit values are larger
// than what a JavaScript number can represent exactly.
db.getCollection("company_info_mapping").insertOne({
    companyName: "test",
    licenseNumber: "test",
    socialCreditCode: "test",
    hid: "7752395865026566931",
    uid: "1396656205685932577"
});

hive表映射mongodb数据表

## 添加jar包(可在相应资料里面下载)
-- Register the jars needed by the MongoDB storage handler for this Hive session:
-- the mongo-hadoop core library, its Hive integration, and the MongoDB Java driver.
-- NOTE(review): the paths below are assumed to exist on the host running Hive - confirm.
ADD JAR /var/lib/hadoop-hdfs/bin/hive_mongoDB/mongo-hadoop-core-2.0.2.jar;
ADD JAR /var/lib/hadoop-hdfs/bin/hive_mongoDB/mongo-hadoop-hive-2.0.2.jar;
ADD JAR /var/lib/hadoop-hdfs/bin/hive_mongoDB/mongo-java-driver-3.12.8.jar;

-- Hive external table backed by a MongoDB collection through the mongo-hadoop
-- storage handler. No data is copied at creation time.
--   mongo.columns.mapping : JSON object mapping each Hive column name to the
--                           MongoDB document field it is read from.
--   mongo.uri             : connection string of the form
--                           mongodb://host:port/database.collection
--                           (here database "patent", collection "patent2020").
--   serialization.format  : carried over unchanged from the original DDL.
CREATE EXTERNAL TABLE mongodb_patent2020 (
    ZhuanLiXinXiInfo STRING COMMENT 'from deserializer'
)
ROW FORMAT SERDE 'com.mongodb.hadoop.hive.BSONSerDe'
STORED BY 'com.mongodb.hadoop.hive.MongoStorageHandler'
WITH SERDEPROPERTIES (
    'mongo.columns.mapping' = '{"ZhuanLiXinXiInfo":"ZhuanLiXinXiInfo"}',
    'serialization.format' = '1'
)
TBLPROPERTIES (
    'mongo.uri' = 'mongodb://172.16.98.159:21000/patent.patent2020'
);


## mongodb的地址
mongo.uri

## mongodb表映射的字段
mongo.columns.mapping

## 如果查询映射表时报错(例如与 input split 相关的错误),可在查询前先执行如下设置
-- Tell the mongo-hadoop connector not to compute input splits, so the whole
-- collection is read as a single split. This avoids split-calculation errors
-- at the cost of read parallelism.
set mongo.input.split.create_input_splits=false;

开始把mongodb数据导出

## 开始把mongodb数据导出
-- Rebuild hive_patent2020_pre_name from hive_patent2020_pre, producing one
-- output row per applicant parsed out of the applicantname column.
--
-- How applicantname is parsed (innermost step first):
--   1. substr(applicantname, 2, length - 2) drops the first and last character
--      (presumably the brackets of a serialized list - TODO confirm).
--   2. trim() strips surrounding whitespace.
--   3. Three nested replace() calls. The outermost removes double quotes.
--   4. split on comma, then explode() emits one row per element, which is
--      trimmed again in the select list.
--
-- NOTE(review): as written, the two inner replace() calls substitute a
-- parenthesis with the very same character, so they change nothing. They look
-- like a full-width / half-width parenthesis normalization whose literals were
-- flattened when the snippet was copied. Confirm the intended characters.
--
-- key is the second-to-last character of _id prepended to _id
-- (presumably a distribution prefix for a downstream store - TODO confirm).
INSERT OVERWRITE TABLE hive_patent2020_pre_name
SELECT
    concat(substr(`_id`, length(`_id`) - 1, 1), `_id`) AS key,
    `_id` AS id,
    patentname,
    patenttype,
    applicationpublishtime,
    applicationnum,
    applicationtime,
    publishnum,
    applicantname,
    address,
    agency,
    agent,
    abstracts,
    status,
    type,
    trim(companyname)
FROM hive_patent2020_pre
LATERAL VIEW explode(
    split(
        replace(
            replace(
                replace(
                    trim(substr(applicantname, 2, length(applicantname) - 2)),
                    '(', '('
                ),
                ')', ')'
            ),
            '"', ''
        ),
        ','
    )
) tmp AS companyname;

注意事项:如果报错信息与 split 有关,可设置如下参数:
-- Disable input-split calculation in the mongo-hadoop connector. The entire
-- collection is then treated as one split, which sidesteps split-related
-- errors but removes read parallelism.
set mongo.input.split.create_input_splits=false;

你可能感兴趣的:(mongdb,hive,mongodb,hive,大数据)