Reference: https://www.cnblogs.com/jonban/p/10779938.html
Start Hive: hive
Exit: hive> quit;
show databases;
show tables;
desc tab_name; -- show the table's structure and its storage path
show partitions fact_measured_cft_hive; -- list the partitions of table fact_measured_cft_hive
1. Create a table (create-table.sql)
create table if not exists db_hive.tb_user (
    id int,
    username string comment 'user name',
    age int comment 'age',
    address string comment 'address'
)
comment 'user table'
row format delimited fields terminated by ','
stored as textfile
location '/user/hive/warehouse/db_hive.db/db_user';
2. Run the script to create the table
hive -f 'create-table.sql'
3. Load data into the tb_user table
Data file: /root/files/tb_user.txt
1001,Logan,16,shenzhen
1002,Herry potter,12,Magic school
1003,孙悟空,500,花果山
Run the load command in the Hive interactive shell, as shown below:
hive (db_hive)> load data local inpath '/root/files/tb_user.txt' into table db_hive.tb_user;
To overwrite the old data, add overwrite, as shown below:
hive (db_hive)> load data local inpath '/root/files/tb_user.txt' overwrite into table db_hive.tb_user;
4. Query data
hive -e "select id,username from db_hive.tb_user"
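Given the three sample rows loaded above, this should print something like the following (a sketch; the exact formatting varies with the Hive version and settings):
1001	Logan
1002	Herry potter
1003	孙悟空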
5. Create a sub-table containing only some of the columns of an existing table
create table if not exists db_hive.tb_user_sub as select id,username from db_hive.tb_user;
6. Create a table with like
create table if not exists db_hive.tb_user_like like db_hive.tb_user;
Insert data:
insert into table db_hive.tb_user_like select * from db_hive.tb_user;
7. Rename a table
alter table tb_user_like rename to tb_user_rename;
8. Create an external table; dropping it removes only the metadata, not the table data
create external table if not exists db_hive.tb_ext(id string);
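A minimal sketch of that behavior (the table name tb_ext_demo and the path /user/hive/ext/tb_ext_demo are assumptions for illustration):
hive (db_hive)> create external table if not exists tb_ext_demo(id string) location '/user/hive/ext/tb_ext_demo';
hive (db_hive)> drop table tb_ext_demo;
hive (db_hive)> dfs -ls /user/hive/ext/tb_ext_demo;
After the drop, the dfs -ls should still list the data files, since only the metastore entry was removed.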
9. Create a partitioned table
create table if not exists db_hive.tb_logs (
    ip string,
    text string,
    log_time string
)
partitioned by (month string)
row format delimited fields terminated by "\t";
Data file: /root/files/tb_logs.txt
192.168.32.100	login	20190429072650
192.168.32.100	order	20190429072730
192.168.32.101	browse	20190429072812
Load the data:
load data local inpath '/root/files/tb_logs.txt' into table db_hive.tb_logs partition (month = '201904');
Query the data:
select ip,text,log_time from tb_logs where month = '201904';
10. Manually create partition data and repair the partitioned table
Create the partition directory:
hdfs dfs -mkdir -p /user/hive/warehouse/db_hive.db/tb_logs/month=201905
Upload the data file to the partition directory:
hdfs dfs -put /root/files/tb_logs.txt /user/hive/warehouse/db_hive.db/tb_logs/month=201905
Now run a query:
select count(distinct ip) from db_hive.tb_logs where month = '201905';
The query returns 0.
[Cause] The data has not been registered as a partition in the metastore. Check the metadata in the configured MySQL database:
mysql> use hive_metastore;
mysql> select * from PARTITIONS;
In this example the Hive metastore is stored in the hive_metastore database in MySQL.
Querying the PARTITIONS table shows only one record:
+---------+-------------+------------------+--------------+-------+--------+
| PART_ID | CREATE_TIME | LAST_ACCESS_TIME | PART_NAME    | SD_ID | TBL_ID |
+---------+-------------+------------------+--------------+-------+--------+
|       1 |  1556494255 |                0 | month=201904 |    29 |     28 |
+---------+-------------+------------------+--------------+-------+--------+
[Fix 1] Run the repair command directly:
msck repair table tb_logs;
The PARTITIONS table now contains:
+---------+-------------+------------------+--------------+-------+--------+
| PART_ID | CREATE_TIME | LAST_ACCESS_TIME | PART_NAME    | SD_ID | TBL_ID |
+---------+-------------+------------------+--------------+-------+--------+
|       1 |  1556494255 |                0 | month=201904 |    29 |     28 |
|       2 |  1556495227 |                0 | month=201905 |    30 |     28 |
+---------+-------------+------------------+--------------+-------+--------+
Run the query again:
select count(distinct ip) from db_hive.tb_logs where month = '201905';
The result is 2; the data is now visible in the partition.
[Fix 2] Use the add partition command
Steps: create the new partition directory and upload the data file, as follows:
hive (db_hive)> dfs -mkdir -p /user/hive/warehouse/db_hive.db/tb_logs/month=201906;
hive (db_hive)> dfs -put /root/files/tb_logs.txt /user/hive/warehouse/db_hive.db/tb_logs/month=201906;
Run the add partition command:
alter table tb_logs add partition(month = '201906');
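Hive also accepts an explicit location clause on add partition, useful when the directory does not follow the default month=<value> layout; a sketch using the directory created above:
alter table tb_logs add partition(month = '201906') location '/user/hive/warehouse/db_hive.db/tb_logs/month=201906';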
Query the data; the result is correct.
The metastore PARTITIONS table now contains:
+---------+-------------+------------------+--------------+-------+--------+
| PART_ID | CREATE_TIME | LAST_ACCESS_TIME | PART_NAME    | SD_ID | TBL_ID |
+---------+-------------+------------------+--------------+-------+--------+
|       1 |  1556494255 |                0 | month=201904 |    29 |     28 |
|       2 |  1556495227 |                0 | month=201905 |    30 |     28 |
|       3 |  1556495635 |                0 | month=201906 |    31 |     28 |
+---------+-------------+------------------+--------------+-------+--------+
Command to view a table's partitions:
show partitions db_hive.tb_logs;
11. Export table data
export table db_hive.tb_logs to '/user/hive/warehouse/export/db_hive/tb_logs';
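The export should write a _metadata file plus the table data under the target directory (one subdirectory per partition for a partitioned table); a sketch of inspecting it from the Hive shell:
hive (db_hive)> dfs -ls -R /user/hive/warehouse/export/db_hive/tb_logs;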
12. Import table data
Create the table:
create table tb_logs_like like tb_logs;
Import the data:
import table tb_logs_like from '/user/hive/warehouse/export/db_hive/tb_logs';
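To verify that the import restored the partitions as well, a quick check (a sketch):
show partitions tb_logs_like;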
13. Export data to a local file
insert overwrite local directory '/root/files/hive_out'
row format delimited fields terminated by '\t'
collection items terminated by '\n'
select * from db_hive.tb_logs;
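The output lands under /root/files/hive_out as one or more data files (typically named 000000_0 and so on, depending on the number of reducers; the exact names here are an assumption) and can be checked with, for example:
cat /root/files/hive_out/000000_0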
14. Query Hive data via Presto from Python
#!/usr/bin/python
# -*- coding: utf-8 -*-
from sqlalchemy.engine import create_engine
from sqlalchemy import text
import pandas as pd
import datetime

starttime = datetime.datetime.now()

sql = """
select *
from fact_five_data_cft
where source = 'classics'
  and wfid = '320924'
  and yyyy = '2019'
limit 10
"""

# Connect to the hive catalog / cnyb schema through Presto
engine = create_engine('presto://node1:8085/hive/cnyb')
df = pd.read_sql(text(sql), engine)
print(df)

endtime = datetime.datetime.now()
runtime = endtime - starttime
print("presto run time --> " + str(runtime))
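Note: the presto:// SQLAlchemy dialect used above is provided by the PyHive package (installable with pip install 'pyhive[presto]'); node1:8085 is assumed to be the Presto coordinator address, with hive/cnyb as the catalog and schema.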