create table users(userid BigInt, gender String, age Int, occupation String, zipcode String)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s %5$s')
stored as textfile;
load data local inpath '/home/hadoop/moviedata/users.dat' INTO TABLE users;
select * from users limit 5;
create table movies(movieid BigInt, name String, type String)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s')
stored as textfile;
load data local inpath '/home/hadoop/moviedata/movies.dat' INTO TABLE movies;
select * from movies limit 5;
create table ratings(userid BigInt, movieid BigInt, rate Double, ts String)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s')
stored as textfile;
load data local inpath '/home/hadoop/moviedata/ratings.dat' INTO TABLE ratings;
select * from ratings limit 5;
问题1:求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数)
思路:
要求的字段: 电影名称 movies.name 评分次数 ratings count
核心:按照电影进行分组,求出每组的总评分次数。
select a.movieid, b.name, count(a.userid) as total
from ratings a join movies b on a.movieid = b.movieid
group by a.movieid, b.name
order by total desc
limit 10;
问题2:分别求男性,女性当中评分最高的10部电影(性别,电影名,影评分)
思路:
要查询的字段: 性别 users.gender 电影名 movies.name 影评分 ratings.rate avg( )
分别求男性和女性评分最高的10部电影
select m.gender,m.name,avg(r.rate) as avgrte,count(r.rate) as count
from ratings r
join users u on r.userid = u.userid
join movies m on r.movieid = m.movieid
where u.gender = 'F'
group by m.movieid,m.name
having count >= 50
order by avgrate desc limit 10;
select m.gender,m.name,avg(r.rate) as avgrte,count(r.rate) as count
from ratings r
join users u on r.userid = u.userid
join movies m on r.movieid = m.movieid
where u.gender = 'M'
group by m.movieid,m.name
having count >= 50
order by avgrate desc limit 10;
问题3:求movieid = 2116这部电影各年龄段(因为年龄就只有7个,就按这个7个分就好了)的平均影评(年龄段,影评分)
思路:
要查询的字段:年龄段 users.age 影评分 ratings.rate
select b.age as age , avg(a.rate) as avgrate
from ratings a join users b on a.userid = b.userid
where a.movieid = 2116
group by b.age;
问题4:求最喜欢看电影(影评次数最多)的那位女性评最高分的10部电影的平均影评分(观影者,电影名,影评分)思路
思路:
要查询的字段: 观影者:最喜欢看电影的那个女性;电影名称 movies.name ; 影评分 ratings.rate
核心要点:1、求出最喜欢看电影的女性的ID;
2、求出该女性最喜欢看的10部电影;
3、求出这10部电影的平均影评分;
第一个步骤:要查询的字段users.userid count(ratings.rate)
核心要点:按照userid分组 构建中间表供后续使用。
create table answer4_1 as
select b.userid, count(a.movieid) as total
from ratings a join users b on a.userid = b.userid
where b.gender = "F"
group by b.userid
order by total desc limit 1;
第二个步骤:要查询的字段movies.movieid
核心条件是userid = answer4_1.userid 将查询结果作为中间表 answer4_2
create table answer4_2 as
select a.movieid,a.rate from ratings a join answer4_1 t on a.userid = t.userid
order by a.rate desc limit 10;
第三个步骤:要查询的字段 电影ID answer4_2 .movieid 影评平均分 avg(ratings.rate)
create table answer4_3 as
select a.movieid, c.name, avg(a.rate) as avgrate
from ratings a join answer5_2 b on a.movieid = b.movieid
join movies c on c.movieid = a.movieid
group by a.movieid, c.name;
问题5:求好片(评分>=4.0)最多的那个年份的最好看的10部电影
首先构建一个movie_avgrate_year表,该表中存储所有的movieid, avg(rate), year
create table movie_rate_year as
select a.movieid as movieid, avg(a.rate) as avgrate, substring(b.name, -5, 4) as year
from ratings a join movies b on a.movieid = b.movieid
group by a.movieid, substring(b.name, -5, 4);
第一步:求出好片最多的那个年份;
核心要点:按照好片计数:count(movieid) as total where avgrate >= 4.0
按照年份分组:group by year
按照 total 排序降序
取好片最多的年份,就是total 最大的那个year limit 1 获取这个电影大年 : max_year
create table max_year as
select year, count(movieid) as total
from movie_rate_year
where avgrate >= 4
group by year
order by total desc
limit 1;
第二步:求出电影大年中最好看的10部电影
核心要点:
过滤条件:where year = max_year.year
按照avgrate排降序
limit 10
select a.movieid,a.avgrate
from movie_rate_year a
join max_year b on a.year = b.year
order by a.avgrate desc limit 10;
需求中的异常:
这里的排名如果有需要可以参考下面这三个函数的用法:
RANK()
DENSE_RANK()
ROW_NUMBER()