LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename;
filepath is the path of the data to be moved. It can point to a file (in which case Hive moves that single file into the table) or to a directory (in which case Hive moves every file under that directory into the table).
filepath supports three forms, namely a relative path, an absolute path, and a complete URI with scheme, and it has to be read together with the LOCAL keyword:
With LOCAL, the path is resolved against the local file system, and "local" here does not mean the client machine: it is the local Linux file system of the machine where the HiveServer2 service runs. Without LOCAL, if filepath is a complete URI, Hive uses that URI directly; if no scheme is given either, Hive falls back to the default file system specified by fs.default.name in the Hadoop configuration.
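A hedged sketch of the three forms (the table name t, the paths, and the node1:8020 address are placeholders, not objects from this walkthrough):
--relative path: with LOCAL, resolved against the current working directory
LOAD DATA LOCAL INPATH 'hivedata/students.txt' INTO TABLE t;
--absolute path without a scheme: resolved via fs.default.name, typically HDFS
LOAD DATA INPATH '/data/students.txt' INTO TABLE t;
--complete URI with scheme: used exactly as written
LOAD DATA INPATH 'hdfs://node1:8020/data/students.txt' INTO TABLE t;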
--Create table student_local to demonstrate loading data from the local file system
CREATE TABLE student_local(
num int,
name string,
sex string,
age int,
dept string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
--Create table student_hdfs to demonstrate loading data from HDFS
CREATE TABLE student_hdfs(
num int,
name string,
sex string,
age int,
dept string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
--Load from the local file system; the data lives on the HiveServer2 (node1) machine. Under the hood this is a hadoop fs -put upload.
LOAD DATA LOCAL INPATH '/root/hivedata/students.txt' INTO TABLE student_local;
SELECT * FROM student_local;
--Load from HDFS; the data lives in the /cauchy directory on HDFS. Under the hood this is a hadoop fs -mv move, so the source file is relocated rather than copied.
--First upload the data to HDFS: hadoop fs -put /root/hivedata/students.txt /cauchy/
LOAD DATA INPATH '/cauchy/students.txt' INTO TABLE student_hdfs;
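The OVERWRITE option from the syntax above replaces whatever the table already holds instead of appending to it. A minimal sketch reusing student_local from above:
--reload the same file, discarding any rows loaded previously
LOAD DATA LOCAL INPATH '/root/hivedata/students.txt' OVERWRITE INTO TABLE student_local;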
--INSERT ... SELECT populates a table with the result of a query against another table
INSERT INTO TABLE tablename select_statement1 FROM from_statement;
CREATE TABLE student(
num int,
name string,
sex string,
age int,
dept string)
ROW FORMAT delimited fields terminated BY ',';
LOAD DATA LOCAL INPATH '/root/hivedata/students.txt' INTO TABLE student;
CREATE TABLE student_insert(sno int, sname string);
INSERT INTO TABLE student_insert SELECT num, name FROM student;
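The FROM clause in the syntax above can also be written first, which lets a single scan of the source feed several inserts (Hive's multi-table insert). A hedged sketch; the table student_age is invented for illustration and is not part of this walkthrough:
CREATE TABLE student_age(sno int, sage int);
--one scan of student, two inserts
FROM student
INSERT INTO TABLE student_insert SELECT num, name
INSERT INTO TABLE student_age SELECT num, age;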
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list]
[HAVING having_condition]
[ORDER BY col_list]
[LIMIT [offset,] rows];
--Create table t_usa_covid19
CREATE TABLE t_usa_covid19(
count_date string,
county string,
state string,
fips int,
cases int,
deaths int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
--Load the data file into the directory backing table t_usa_covid19
LOAD DATA LOCAL INPATH '/root/hivedata/us-covid19-counties.dat' INTO TABLE t_usa_covid19;
--Query all columns or only the specified columns
select * from t_usa_covid19;
select county, cases, deaths from t_usa_covid19;
--Query the current database
select current_database(); --the FROM clause can be omitted here
--ALL returns every matching row; the two queries below are equivalent
select state from t_usa_covid19;
select all state from t_usa_covid19;
--Return all matching rows with duplicate results removed
select distinct state from t_usa_covid19;
--with multiple columns, DISTINCT deduplicates on the column combination as a whole
select distinct county,state from t_usa_covid19;
--Find the epidemic records from the state of California
select * from t_usa_covid19 where state = 'California';
--Use a function in the WHERE condition: find the states whose names are longer than 10 characters
select * from t_usa_covid19 where length(state) > 10;
--Count how many counties there are in the US in total
--AS gives the returned result an alias
select count(county) as county_cnts from t_usa_covid19;
--deduplicate with DISTINCT before counting
select count(distinct county) as county_cnts from t_usa_covid19;
--Count how many counties California has
select count(county) as total_county_California from t_usa_covid19 where state = "California";
--Total number of deaths in Texas
select sum(deaths) as total_deaths_Texas from t_usa_covid19 where state = "Texas";
--Find the highest confirmed case count in the US (note: this returns the number itself, not the county it belongs to)
select max(cases) as max_cases from t_usa_covid19;
--More advanced: use a subquery to find which county that maximum belongs to
select * from t_usa_covid19 where cases = (select max(cases) from t_usa_covid19);
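An equivalent and arguably simpler way to get the top county is to sort and keep one row; a minimal sketch:
select county, state, cases from t_usa_covid19 order by cases desc limit 1;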
--Group by state and count how many counties each state has
select state, count(county) as total_county from t_usa_covid19 group by state;
--Group by state, count the counties per state, and also sum the deaths
select state,count(county) as total_county, sum(deaths) as total_deaths from t_usa_covid19 group by state;
--Find the states whose death toll on 2021-01-28 exceeded 10,000
--WHERE filters rows before grouping; GROUP BY then forms the groups; once each group's result set is fixed, HAVING filters on the grouped results
select state,sum(deaths) as deaths_cnts from t_usa_covid19 where count_date = "2021-01-28" group by state having sum(deaths) > 10000;
--This version is better: the aggregate has already been computed during GROUP BY, so HAVING can reference the alias directly instead of computing the sum a second time
select state,sum(deaths) as deaths_cnts from t_usa_covid19 where count_date = "2021-01-28" group by state having deaths_cnts> 10000;
--Sort the results by confirmed cases in ascending order (ASC is the default)
select * from t_usa_covid19 order by cases;
select * from t_usa_covid19 order by cases asc;
--Sort in descending order by confirmed cases, returning the records of every California county
select * from t_usa_covid19 where state = "California" order by cases desc;
--Instead of returning all of California's records for 2021-01-28, return only the first 5 rows of the result set
select * from t_usa_covid19 where count_date = "2021-01-28" and state ="California" limit 5;
--Return 3 rows starting from the 3rd row of the result set (the offset of 2 skips the first two rows)
select * from t_usa_covid19 where count_date = "2021-01-28" and state ="California" limit 2,3;
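Putting the clauses from the SELECT syntax together, a sketch that ranks states by death toll on 2021-01-28 and keeps the top three (the HAVING threshold of 1000 is an arbitrary illustration value):
select state, sum(deaths) as total_deaths
from t_usa_covid19
where count_date = "2021-01-28"
group by state
having total_deaths > 1000
order by total_deaths desc
limit 3;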