现有如此三份数据:
1、users.dat 数据格式为: 2::M::56::16::70072
对应字段为:UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String
对应字段中文解释:用户id,性别,年龄,职业,邮政编码
2、movies.dat 数据格式为: 2::Jumanji (1995)::Adventure|Children's|Fantasy
对应字段为:MovieID BigInt, Title String, Genres String
对应字段中文解释:电影ID,电影名字,电影类型
3、ratings.dat 数据格式为: 1::1193::5::978300760
对应字段为:UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
对应字段中文解释:用户ID,电影ID,评分,评分时间戳
题目要求:
数据要求:
(1)写shell脚本清洗数据。
(2)使用Hive能解析的方式进行
Hive要求:
(1)正确建表,导入数据(三张表,三份数据),并验证是否正确
思路:重点在于要把 ‘::’ 给替换掉
第一种方法,使用sed 进行文本处理:
#!/bin/bash
# Rewrite the two-character '::' separator of each raw MovieLens file as a
# single '%' so the files can be loaded into tables declared with
# "fields terminated by '%'".
# Reads  : users.dat, movies.dat, ratings.dat (current directory)
# Writes : newusers.dat, newmovies.dat, newratings.dat (current directory)
set -euo pipefail   # stop on the first failing sed / missing input file

for name in users movies ratings; do
    # '>' truncates the target; the earlier '>>' appended, so running the
    # script twice doubled every row in the cleaned file.
    sed 's/::/%/g' "${name}.dat" > "new${name}.dat"
done
-- Method 1 tables: plain text files whose fields are separated by '%'.
CREATE TABLE users (
    userid     BIGINT,
    gender     STRING,
    age        INT,
    occupation STRING,
    zipcode    STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '%'
STORED AS TEXTFILE;

CREATE TABLE movies (
    movieID BIGINT,
    title   STRING,
    genres  STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '%'
STORED AS TEXTFILE;

CREATE TABLE ratings (
    userID      BIGINT,
    movieID     BIGINT,
    rating      DOUBLE,
    timestamped STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '%'
STORED AS TEXTFILE;
-- Load the three cleaned files from the local filesystem into their tables.
LOAD DATA LOCAL INPATH '/home/user/newusers.dat'   INTO TABLE users;
LOAD DATA LOCAL INPATH '/home/user/newmovies.dat'  INTO TABLE movies;
LOAD DATA LOCAL INPATH '/home/user/newratings.dat' INTO TABLE ratings;

-- Spot-check the first rows of every table.
SELECT * FROM users   LIMIT 10;
SELECT * FROM movies  LIMIT 10;
SELECT * FROM ratings LIMIT 10;
第二种方法
-- Method 2 (users): parse the raw '::'-separated file directly with RegexSerDe,
-- one capture group per column, so no sed preprocessing is needed.
-- Method 1 already creates a table called users; without the DROP the
-- CREATE TABLE below fails when both methods are run in the same database.
drop table if exists users;
create table users(userID BigInt, gender String, age Int, occupation String, zipcode String)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s %5$s')
stored as textfile;
load data local inpath '/home/user/users.dat' INTO TABLE users;
-- Spot-check the parsed rows.
select * from users limit 5;
-- Method 2 (movies): parse the raw '::'-separated file directly with RegexSerDe,
-- one capture group per column.
-- Method 1 already creates a table called movies; without the DROP the
-- CREATE TABLE below fails when both methods are run in the same database.
drop table if exists movies;
create table movies(movieID BigInt, title String, genres String)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s')
stored as textfile;
load data local inpath '/home/user/movies.dat' INTO TABLE movies;
-- Spot-check the parsed rows.
select * from movies limit 5;
-- Method 2 (ratings): parse the raw '::'-separated file directly with RegexSerDe,
-- one capture group per column.
-- Method 1 already creates a table called ratings; without the DROP the
-- CREATE TABLE below fails when both methods are run in the same database.
drop table if exists ratings;
create table ratings(userID BigInt, movieID BigInt, rating Double, timestamped String)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s')
stored as textfile;
load data local inpath '/home/user/ratings.dat' INTO TABLE ratings;
-- Spot-check the parsed rows.
select * from ratings limit 5;
得到数据如下:
1、users.dat 数据格式为: 2::M::56::16::70072
1 F 1 10 48067
2 M 56 16 70072
3 M 25 15 55117
2、movies.dat 数据格式为: 2::Jumanji (1995)::Adventure|Children's|Fantasy
1 Toy Story (1995) Animation|Children's|Comedy
2 Jumanji (1995) Adventure|Children's|Fantasy
3 Grumpier Old Men (1995) Comedy|Romance
3、ratings.dat 数据格式为: 1::1193::5::978300760
1 1193 5.0 978300760
1 661 3.0 978302109
1 914 3.0 978301968
(2)求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数)
-- Q2: the ten movies with the most ratings (title, number of ratings).
-- * ORDER BY replaces SORT BY in the sub-query: SORT BY is only documented
--   to order rows within each reducer, ORDER BY is the global ordering.
-- * The outer ORDER BY makes the printed result ordered; a join does not
--   promise to keep the order of its inputs.
-- * The count alias is "counts" so it no longer shadows the count() function.
select a.title as title, b.counts as counts
from movies a
join (
    select movieid as movieid, count(*) as counts
    from ratings
    group by movieid
    order by counts desc
    limit 10
) b on a.movieid = b.movieid
order by counts desc;
得到结果:
title counts
American Beauty (1999) 3428
Star Wars: Episode IV - A New Hope (1977) 2991
Star Wars: Episode V - The Empire Strikes Back (1980) 2990
Star Wars: Episode VI - Return of the Jedi (1983) 2883
Jurassic Park (1993) 2672
Saving Private Ryan (1998) 2653
Terminator 2: Judgment Day (1991) 2649
Matrix, The (1999) 2590
Back to the Future (1985) 2583
Silence of the Lambs, The (1991) 2578
-- Rating count per movie, with SORT BY count DESC and LIMIT 10.
-- NOTE(review): this line has no statement terminator and no result listed after it.
select movieid as MovieID,count(*) as count from ratings group by movieid sort by count desc limit 10
(3)分别求男性,女性当中评分最高的10部电影(性别,电影名,影评分)
求男性评分最高的10部电影
1、首先根据电影评分高低进行排序
-- Step 1: average rating of every movie, best first.
-- The earlier version grouped by (movieid, userid); that averages each
-- user's rating of a movie separately instead of averaging per movie, so
-- the grouping key is movieid alone. ORDER BY gives a global ordering and
-- the statement is now terminated.
select movieid as movieid, avg(rating) as avgrate
from ratings
group by movieid
order by avgrate desc;
2、关联 movies、ratings、users 三张表,只保留男性用户的评分,按电影分组求平均分,并过滤掉评分人数不超过70的电影:
-- Ten best-rated movies among male users (gender, title, average rating,
-- number of ratings); only movies with more than 70 such ratings qualify.
SELECT
    collect_set(u.gender)[0] AS gender,
    m.title                  AS title,
    avg(r.rating)            AS rating,
    count(r.rating)          AS counts
FROM movies m
INNER JOIN ratings r
    ON m.movieid = r.movieid
INNER JOIN users u
    ON r.userid = u.userid
WHERE u.gender = "M"
GROUP BY r.movieid, m.title
HAVING counts > 70
ORDER BY rating DESC
LIMIT 10;
得到如下结果:
gender title rating counts
M Godfather, The (1972) 4.583333333333333 1740
M Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) 4.576628352490421 522
M Shawshank Redemption, The (1994) 4.560625 1600
M Raiders of the Lost Ark (1981) 4.520597322348094 1942
M Usual Suspects, The (1995) 4.518248175182482 1370
M Star Wars: Episode IV - A New Hope (1977) 4.495307167235495 2344
M Schindler's List (1993) 4.49141503848431 1689
M Paths of Glory (1957) 4.485148514851486 202
M Wrong Trousers, The (1993) 4.478260869565218 644
M Close Shave, A (1995) 4.473794549266247 477
Time taken: 150.246 seconds, Fetched: 10 row(s)
-- Ten best-rated movies among female users (gender, title, average rating,
-- number of ratings); only movies with more than 70 such ratings qualify.
SELECT
    collect_set(u.gender)[0] AS gender,
    m.title                  AS title,
    avg(r.rating)            AS rating,
    count(r.rating)          AS counts
FROM movies m
INNER JOIN ratings r
    ON m.movieid = r.movieid
INNER JOIN users u
    ON r.userid = u.userid
WHERE u.gender = "F"
GROUP BY r.movieid, m.title
HAVING counts > 70
ORDER BY rating DESC
LIMIT 10;
得到如下结果:
gender title rating counts
F Close Shave, A (1995) 4.644444444444445 180
F Wrong Trousers, The (1993) 4.588235294117647 238
F Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 4.572649572649572 117
F Wallace & Gromit: The Best of Aardman Animation (1996) 4.563106796116505 103
F Schindler's List (1993) 4.56260162601626 615
F Shawshank Redemption, The (1994) 4.539074960127592 627
F Grand Day Out, A (1992) 4.537878787878788 132
F To Kill a Mockingbird (1962) 4.536666666666667 300
F Creature Comforts (1990) 4.513888888888889 72
F Usual Suspects, The (1995) 4.513317191283293 413
Time taken: 154.546 seconds, Fetched: 10 row(s)
将两个结果进行拼接:
-- Q3: ten best-rated movies for each gender (gender, title, average rating,
-- number of ratings), computed in a single pass.
-- The earlier version ran the same three-way join twice (once per gender)
-- and glued the halves together with UNION ALL. Here gender is part of the
-- grouping key and row_number() ranks the movies inside each gender, so the
-- join runs once and no collect_set(...)[0] workaround is needed.
select t.gender as gender, t.title as title, t.rating as rating, t.counts as counts
from (
    select
        s.gender,
        s.title,
        s.rating,
        s.counts,
        -- title is only a tie-breaker so that equal averages rank the same way on every run
        row_number() over (partition by s.gender order by s.rating desc, s.title) as rn
    from (
        select
            c.gender        as gender,
            a.title         as title,
            avg(b.rating)   as rating,
            count(b.rating) as counts
        from movies a
        join ratings b on a.movieid = b.movieid
        join users c on b.userid = c.userid
        where c.gender in ('M', 'F')
        group by c.gender, b.movieid, a.title
        -- keep only movies rated by more than 70 users of that gender
        having count(b.rating) > 70
    ) s
) t
where t.rn <= 10
-- 'M' rows first, then 'F', each block best-rated first
order by gender desc, rating desc;
***************
collect_set(去重)、collect_list(不去重)是 Hive 内置的聚合函数(UDAF),并不是用户自定义函数。
这里用 collect_set(col)[0] 是为了绕开“select 中的非聚合字段必须出现在 group by 中”的限制;注意它取到的是组内的任意一个值,只有当该字段在组内取值唯一时结果才可靠。
***************
得到如下结果:
OK
_u1.gender _u1.title _u1.rating _u1.counts
M Godfather, The (1972) 4.583333333333333 1740
M Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) 4.576628352490421 522
M Shawshank Redemption, The (1994) 4.560625 1600
M Raiders of the Lost Ark (1981) 4.520597322348094 1942
M Usual Suspects, The (1995) 4.518248175182482 1370
M Star Wars: Episode IV - A New Hope (1977) 4.495307167235495 2344
M Schindler's List (1993) 4.49141503848431 1689
M Paths of Glory (1957) 4.485148514851486 202
M Wrong Trousers, The (1993) 4.478260869565218 644
M Close Shave, A (1995) 4.473794549266247 477
F Close Shave, A (1995) 4.644444444444445 180
F Wrong Trousers, The (1993) 4.588235294117647 238
F Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 4.572649572649572 117
F Wallace & Gromit: The Best of Aardman Animation (1996) 4.563106796116505 103
F Schindler's List (1993) 4.56260162601626 615
F Shawshank Redemption, The (1994) 4.539074960127592 627
F Grand Day Out, A (1992) 4.537878787878788 132
F To Kill a Mockingbird (1962) 4.536666666666667 300
F Creature Comforts (1990) 4.513888888888889 72
F Usual Suspects, The (1995) 4.513317191283293 413
Time taken: 287.59 seconds, Fetched: 20 row(s)
(4)求movieid = 2116这部电影各年龄段(因为年龄就只有7个,就按这个7个分就好了)的平均影评(年龄段,影评分)
-- Q4: average rating of movie 2116 for every age value found in users.
SELECT
    u.age         AS age,
    avg(r.rating) AS avgrate
FROM users u
INNER JOIN ratings r
    ON u.userid = r.userid
WHERE r.movieid = 2116
GROUP BY u.age;
得到结果:
age avgrate
1 3.2941176470588234
18 3.3580246913580245
25 3.436548223350254
35 3.2278481012658227
45 2.8275862068965516
50 3.32
56 3.5
Time taken: 213.044 seconds
(5)求最喜欢看电影(影评次数最多)的那位女性评最高分的10部电影的平均影评分(观影者,电影名,影评分)
-- NOTE(review): unfinished draft. The statement stops after "join users"
-- (no ON clause, no terminator) and its select list refers to an alias c
-- that is never introduced. The step-by-step queries below supersede it.
select a.userid userid,c.title title ,avg(a.rating) rating , count(rating) counts from ratings a
join users
首先 找到影评次数最多的那位女性ID
-- Q5 step 1: id of the female user who submitted the most ratings.
-- The earlier version sorted the per-user counts inside a sub-query, joined
-- that to users and then applied "limit 1". A join does not preserve the
-- order of its inputs, so the row that survived the limit was not
-- guaranteed to be the most active user. Filtering to women first and
-- ordering in the same query block as the LIMIT fixes that.
select t.userid as userid
from (
    select b.userid as userid, count(*) as counts
    from ratings b
    join users a on a.userid = b.userid
    where a.gender = 'F'
    group by b.userid
    -- userid breaks ties so that the answer is repeatable
    order by counts desc, userid
    limit 1
) t;
得到如下结果:
a.userid
1150
Time taken: 126.843 seconds, Fetched: 1 row(s)
然后,根据userid 得到 这位女性评分最高的十部电影
-- Q5 step 2: the ten movies that the most active female rater scored highest.
-- Fixes relative to the earlier version:
-- * the "most active female" sub-query (bb) now orders and limits in the
--   same query block, after filtering to women; previously the ORDER BY sat
--   below a join, which does not preserve order, so "limit 1" could keep
--   any female user;
-- * movieid is a tie-breaker for the top-10 cut, otherwise the ten rows
--   chosen among equally rated movies can change between runs.
select dd.userid userid, dd.movieid movieid, ee.title title, dd.rating
from movies ee
join (
    select aa.userid userid, aa.movieid movieid, aa.rating rating
    from ratings aa
    join (
        select r.userid as userid, count(*) as counts
        from ratings r
        join users u on u.userid = r.userid
        where u.gender = 'F'
        group by r.userid
        order by counts desc, userid
        limit 1
    ) bb on aa.userid = bb.userid
    order by rating desc, movieid
    limit 10
) dd on dd.movieid = ee.movieid;
得到如下结果:
userid movieid title dd.rating
1150 745 Close Shave, A (1995) 5.0
1150 1279 Night on Earth (1991) 5.0
1150 1236 Trust (1990) 5.0
1150 904 Rear Window (1954) 5.0
1150 750 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963) 5.0
1150 2997 Being John Malkovich (1999) 5.0
1150 2064 Roger & Me (1989) 5.0
1150 905 It Happened One Night (1934) 5.0
1150 1094 Crying Game, The (1992) 5.0
1150 1256 Duck Soup (1933) 5.0
Time taken: 439.429 seconds, Fetched: 10 row(s)
然后,根据movieid 求每部电影的平均分
-- Average rating per movie over all users (first five rows only).
SELECT
    movieid,
    avg(rating) AS rating
FROM ratings
GROUP BY movieid
LIMIT 5;
然后用这张表关联上述结果的表
-- Q5 final: for the ten movies the most active female rater scored highest,
-- show the overall average rating of each movie (viewer, title, average).
-- Fixes relative to the earlier version:
-- * sub-query bb picks the most active woman with ORDER BY and LIMIT in the
--   same query block (previously the ORDER BY sat below a join, which does
--   not preserve order, so "limit 1" could keep any female user);
-- * movieid is a tie-breaker for the top-10 cut so the chosen movies are
--   repeatable when several share the same rating;
-- * the final result is explicitly ordered.
select q.userid userid, q.title title, w.rating rating
from (
    select dd.userid userid, dd.movieid movieid, ee.title title, dd.rating
    from movies ee
    join (
        select aa.userid userid, aa.movieid movieid, aa.rating rating
        from ratings aa
        join (
            select r.userid as userid, count(*) as counts
            from ratings r
            join users u on u.userid = r.userid
            where u.gender = 'F'
            group by r.userid
            order by counts desc, userid
            limit 1
        ) bb on aa.userid = bb.userid
        order by rating desc, movieid
        limit 10
    ) dd on dd.movieid = ee.movieid
) q
join (
    -- average over all users, per movie
    select movieid, avg(rating) rating
    from ratings
    group by movieid
) w on w.movieid = q.movieid
order by rating desc;
得到如下结果:
userid title rating
1150 Close Shave, A (1995) 4.52054794520548
1150 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963) 4.4498902706656915
1150 Rear Window (1954) 4.476190476190476
1150 It Happened One Night (1934) 4.280748663101604
1150 Crying Game, The (1992) 3.7314890154597236
1150 Trust (1990) 4.188888888888889
1150 Duck Soup (1933) 4.21043771043771
1150 Night on Earth (1991) 3.747422680412371
1150 Roger & Me (1989) 4.0739348370927315
1150 Being John Malkovich (1999) 4.125390450691656
Time taken: 441.606 seconds, Fetched: 10 row(s)
(6)求好片(评分>=4.0)最多的那个年份的最好看的10部电影
提取电影表中的年份:
-- Take four characters starting at the fifth-from-last position of the
-- title, i.e. the digits inside the trailing "(YYYY)".
SELECT substr(title, -5, 4)
FROM movies
LIMIT 1;
结果如下:
_c0
1995
***其中substr 中 -5,4 是指title 中从倒数第5个字符往后截取4位***
-- m_v: one row per rating, carrying the rated movie's id, title, genres
-- and the year sliced out of the title.
CREATE VIEW m_v AS
SELECT
    m.movieid            AS movieid,
    m.title              AS title,
    r.rating             AS rating,
    substr(title, -5, 4) AS year,
    m.genres             AS genres
FROM movies m
INNER JOIN ratings r
    ON r.movieid = m.movieid;

-- Spot-check one row of the view.
SELECT * FROM m_v LIMIT 1;
得到结果:
OK
m_v.movieid m_v.title m_v.rating m_v.year m_v.genres
1193 One Flew Over the Cuckoo's Nest (1975) 5.0 1975 Drama
Time taken: 74.408 seconds, Fetched: 1 row(s)
-- m_v2: average rating for every (title, year) pair found in m_v.
CREATE VIEW m_v2 AS
SELECT
    title       AS title,
    year        AS year,
    avg(rating) AS avgrate
FROM m_v
GROUP BY title, year;

-- Spot-check the first rows of the view.
SELECT * FROM m_v2 LIMIT 10;
得到结果如下:
m_v2.title m_v2.year m_v2.avgrate
$1,000,000 Duck (1971) 1971 3.027027027027027
'Night Mother (1986) 1986 3.3714285714285714
'Til There Was You (1997) 1997 2.6923076923076925
'burbs, The (1989) 1989 2.910891089108911
...And Justice for All (1979) 1979 3.71356783919598
1-900 (1994) 1994 2.5
10 Things I Hate About You (1999) 1999 3.422857142857143
101 Dalmatians (1961) 1961 3.5964601769911506
101 Dalmatians (1996) 1996 3.0467032967032965
12 Angry Men (1957) 1957 4.295454545454546
Time taken: 123.322 seconds, Fetched: 10 row(s)
然后,找到(评分>=4.0) 最多的那个年份
-- The year that has the most movies whose average rating is at least 4.0.
SELECT
    year,
    count(*) AS counts
FROM m_v2
WHERE avgrate >= 4.0
GROUP BY year
ORDER BY counts DESC
LIMIT 1;
得到结果如下:
year counts
1998 27
Time taken: 84.021 seconds, Fetched: 1 row(s)
根据year 求出 评分最高的10部电影
筛选出评价人数超过70人的电影
-- Titles that received more than 70 ratings, with their rating count.
SELECT
    title         AS title,
    count(rating) AS counts
FROM m_v
GROUP BY title
HAVING count(rating) > 70;
-- Q6: the ten best-rated movies released in the year that has the most
-- movies averaging 4.0 or higher; only movies with more than 70 ratings.
SELECT
    v.year         AS year,
    v.title        AS title,
    v.avgrate      AS avgrate,
    popular.counts AS counts
FROM m_v2 v
INNER JOIN (
    -- titles with more than 70 ratings
    SELECT
        title         AS title,
        count(rating) AS counts
    FROM m_v
    GROUP BY title
    HAVING count(rating) > 70
) popular
    ON v.title = popular.title
INNER JOIN (
    -- the single year with the most movies averaging >= 4.0
    SELECT
        year,
        count(*) AS counts
    FROM m_v2
    WHERE avgrate >= 4.0
    GROUP BY year
    ORDER BY counts DESC
    LIMIT 1
) best_year
    ON v.year = best_year.year
ORDER BY avgrate DESC
LIMIT 10;
得到最终结果:
year title avgrate counts
1998 Saving Private Ryan (1998) 4.337353938937053 2653
1998 Celebration, The (Festen) (1998) 4.3076923076923075 117
1998 Central Station (Central do Brasil) (1998) 4.283720930232558 215
1998 42 Up (1998) 4.2272727272727275 88
1998 American History X (1998) 4.2265625 640
1998 Run Lola Run (Lola rennt) (1998) 4.224813432835821 1072
1998 Shakespeare in Love (1998) 4.127479949345715 2369
1998 After Life (1998) 4.088235294117647 102
1998 Elizabeth (1998) 4.029850746268656 938
1998 Croupier (1998) 4.029787234042553 235
Time taken: 423.386 seconds, Fetched: 10 row(s)
(7)求1997年上映的电影中,评分最高的10部Comedy类电影
第一步:求出1997年上映的Comedy电影
-- Rating rows of movies from 1997 whose genre list contains Comedy.
SELECT *
FROM m_v
WHERE year = 1997
  AND genres LIKE '%Comedy%'
LIMIT 5;
得到以下结果:
m_v.movieid m_v.title m_v.rating m_v.year m_v.genres
1566 Hercules (1997) 4.0 1997 Adventure|Animation|Children's|Comedy|Musical
1784 As Good As It Gets (1997) 5.0 1997 Comedy|Drama
1641 Full Monty, The (1997) 2.0 1997 Comedy
1431 Beverly Hills Ninja (1997) 3.0 1997 Action|Comedy
1580 Men in Black (1997) 3.0 1997 Action|Adventure|Comedy|Sci-Fi
Time taken: 71.308 seconds, Fetched: 5 row(s)
第二步:求出1997年评分最高的十部电影
-- Q7: the ten best-rated 1997 comedies among those with more than 70 ratings.
-- year, title and genres are read through collect_set(...)[0] because the
-- grouping key is movieid only.
SELECT
    collect_set(year)[0]   AS year,
    collect_set(title)[0]  AS title,
    collect_set(genres)[0] AS genres,
    avg(rating)            AS avgrate
FROM m_v
WHERE year = 1997
  AND genres LIKE '%Comedy%'
GROUP BY movieid
HAVING count(*) > 70
ORDER BY avgrate DESC
LIMIT 10;
得到最终结果:
year title genres avgrate
1997 Life Is Beautiful (La Vita è bella) (1997) Comedy|Drama 4.329861111111111
1997 Big One, The (1997) Comedy|Documentary 4.0
1997 As Good As It Gets (1997) Comedy|Drama 3.9501404494382024
1997 Full Monty, The (1997) Comedy 3.872393661384487
1997 My Life in Pink (Ma vie en rose) (1997) Comedy|Drama 3.825870646766169
1997 Grosse Pointe Blank (1997) Comedy|Crime 3.813380281690141
1997 Men in Black (1997) Action|Adventure|Comedy|Sci-Fi 3.739952718676123
1997 Austin Powers: International Man of Mystery (1997) Comedy 3.7103734439834026
1997 Billy's Hollywood Screen Kiss (1997) Comedy|Romance 3.6710526315789473
1997 Liar Liar (1997) Comedy 3.5
Time taken: 76.294 seconds, Fetched: 10 row(s)
(8)该影评库中各种类型电影中评价最高的5部电影(类型,电影名,平均影评分)
首先创建一张 电影与种类分离的视图:
-- newmovies: one row per (movie, genre); the '|'-separated genres string
-- is split and exploded into separate rows.
CREATE VIEW newmovies AS
SELECT
    movieid,
    title,
    newgenres
FROM movies a
LATERAL VIEW explode(split(genres, '\\|')) genres AS newgenres;
-- Preview the exploded view: one row per (movie, genre) pair.
-- The earlier text grouped by newgenres and took collect_set(...)[0], which
-- returns a single arbitrary movie per genre. That does not match the
-- five-row "newmovies.*" listing recorded as the result of this step.
select * from newmovies limit 5;
结果如下:
newmovies.movieid newmovies.title newmovies.newgenres
1 Toy Story (1995) Animation
1 Toy Story (1995) Children's
1 Toy Story (1995) Comedy
2 Jumanji (1995) Adventure
2 Jumanji (1995) Children's
Time taken: 0.359 seconds, Fetched: 5 row(s)
将这张表跟之前对title分组求出avgrate 的表进行关联
-- Attach each title's average rating (from m_v2) to every (movie, genre) row.
SELECT
    g.movieid   AS movieid,
    g.title     AS title,
    s.avgrate   AS avgrate,
    g.newgenres AS newgenres
FROM newmovies g
INNER JOIN m_v2 s
    ON s.title = g.title
LIMIT 10;
movieid title avgrate a.newgenres
2031 $1,000,000 Duck (1971) 3.027027027027027 Children's
2031 $1,000,000 Duck (1971) 3.027027027027027 Comedy
3112 'Night Mother (1986) 3.3714285714285714 Drama
779 'Til There Was You (1997) 2.6923076923076925 Drama
779 'Til There Was You (1997) 2.6923076923076925 Romance
2072 'burbs, The (1989) 2.910891089108911 Comedy
3420 ...And Justice for All (1979) 3.71356783919598 Drama
3420 ...And Justice for All (1979) 3.71356783919598 Thriller
889 1-900 (1994) 2.5 Romance
2572 10 Things I Hate About You (1999) 3.422857142857143 Comedy
Time taken: 270.178 seconds, Fetched: 10 row(s)
然后把数据导入新的视图:
-- newmovies_2: every (movie, genre) row with its average rating and its
-- rank (od) inside the genre, best average first.
CREATE VIEW newmovies_2 AS
SELECT
    dd.*,
    row_number() OVER (PARTITION BY dd.newgenres ORDER BY dd.avgrate DESC) AS od
FROM (
    SELECT
        g.movieid   AS movieid,
        g.title     AS title,
        s.avgrate   AS avgrate,
        g.newgenres AS newgenres
    FROM newmovies g
    INNER JOIN m_v2 s
        ON s.title = g.title
) dd;

-- Q8: the five best-rated movies of each genre (first 30 rows).
SELECT * FROM newmovies_2 WHERE od <= 5 LIMIT 30;
得到以下结果:
newmovies_2.movieid newmovies_2.title newmovies_2.avgrate newmovies_2.newgenres newmovies_2.od
2905 Sanjuro (1962) 4.608695652173913 Action 1
2019 Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) 4.560509554140127 Action 2
858 Godfather, The (1972) 4.524966261808367 Action 3
1198 Raiders of the Lost Ark (1981) 4.477724741447892 Action 4
260 Star Wars: Episode IV - A New Hope (1977) 4.453694416583082 Action 5
3172 Ulysses (Ulisse) (1954) 5.0 Adventure 1
2905 Sanjuro (1962) 4.608695652173913 Adventure 2
1198 Raiders of the Lost Ark (1981) 4.477724741447892 Adventure 3
260 Star Wars: Episode IV - A New Hope (1977) 4.453694416583082 Adventure 4
1204 Lawrence of Arabia (1962) 4.401925391095066 Adventure 5
745 Close Shave, A (1995) 4.52054794520548 Animation 1
1148 Wrong Trousers, The (1993) 4.507936507936508 Animation 2
720 Wallace & Gromit: The Best of Aardman Animation (1996) 4.426940639269406 Animation 3
1223 Grand Day Out, A (1992) 4.361522198731501 Animation 4
3429 Creature Comforts (1990) 4.335766423357664 Animation 5
919 Wizard of Oz, The (1939) 4.247962747380675 Children's 1
3114 Toy Story 2 (1999) 4.218927444794953 Children's 2
1 Toy Story (1995) 4.146846413095811 Children's 3
2761 Iron Giant, The (1999) 4.0474777448071215 Children's 4
1023 Winnie the Pooh and the Blustery Day (1968) 3.986425339366516 Children's 5
1830 Follow the Bitch (1998) 5.0 Comedy 1
3233 Smashing Time (1967) 5.0 Comedy 2
3607 One Little Indian (1973) 5.0 Comedy 3
745 Close Shave, A (1995) 4.52054794520548 Comedy 4
1148 Wrong Trousers, The (1993) 4.507936507936508 Comedy 5
3656 Lured (1947) 5.0 Crime 1
858 Godfather, The (1972) 4.524966261808367 Crime 2
50 Usual Suspects, The (1995) 4.517106001121705 Crime 3
3517 Bells, The (1926) 4.5 Crime 4
3435 Double Indemnity (1944) 4.415607985480944 Crime 5
Time taken: 214.261 seconds, Fetched: 30 row(s)
(9)各年评分最高的电影类型(年份,类型,影评分)
首先创建一张年份 ,类型的视图(前面已经创建了newmovies_2 ,有评分及类型)
-- newmovies_3: (movie, genre) rows with the title's average rating and the
-- year sliced out of the title.
CREATE VIEW newmovies_3 AS
SELECT
    substr(g.title, -5, 4) AS year,
    g.movieid              AS movieid,
    g.title                AS title,
    s.avgrate              AS avgrate,
    g.newgenres            AS newgenres
FROM newmovies g
INNER JOIN m_v2 s
    ON s.title = g.title;

-- Spot-check the first rows of the view.
SELECT * FROM newmovies_3 LIMIT 5;
得到结果如下:
newmovies_3.year newmovies_3.movieid newmovies_3.title newmovies_3.avgrate newmovies_3.newgenres
1971 2031 $1,000,000 Duck (1971) 3.027027027027027 Children's
1971 2031 $1,000,000 Duck (1971) 3.027027027027027 Comedy
1986 3112 'Night Mother (1986) 3.3714285714285714 Drama
1997 779 'Til There Was You (1997) 2.6923076923076925 Drama
1997 779 'Til There Was You (1997) 2.6923076923076925 Romance
Time taken: 218.875 seconds, Fetched: 5 row(s)
然后根据 year 进行分组
-- Q9 step: average score of every genre within every release year.
-- Only movies with more than 70 ratings contribute; a genre's score for a
-- year is the mean of the per-movie averages of its qualifying movies.
-- The earlier version grouped by year alone and returned
-- collect_set(newgenres)[0] next to max(avgrate). Those two aggregates are
-- independent, so the genre printed was an arbitrary one from that year,
-- not the genre that owned the maximum. Grouping by (year, genre) keeps
-- the score attached to its genre. The ORDER BY inside the joined
-- sub-query had no effect on the result and is dropped.
select a.year as year, a.newgenres as genres, avg(b.avgrate) as avgrate
from newmovies_3 a
join (
    select movieid as movieid, avg(rating) as avgrate
    from m_v
    group by movieid
    having count(*) > 70
) b on a.movieid = b.movieid
group by a.year, a.newgenres;
得到以下结果:
year genres maxavgrate
1922 Horror 3.991596638655462
1925 Drama 4.189090909090909
1926 Sci-Fi 4.082474226804123
1927 Comedy 4.368932038834951
1930 War 4.1940298507462686
1931 Comedy 4.387453874538745
………………
最后,根据 类型 分组
-- Q9 final: the best-scoring genre of each release year (year, genre, score).
-- The earlier version grouped by genre and reported collect_set(year)[0]
-- with max(maxavgrate): one row per genre, with a year and a score that
-- did not belong together, rather than one row per year.
-- Here each (year, genre) pair gets the mean of the per-movie averages of
-- its movies (movies need more than 70 ratings), and row_number() keeps
-- the highest-scoring genre of every year.
select t.year as year, t.genres as genres, t.avgrate as avgrate
from (
    select
        g.year,
        g.genres,
        g.avgrate,
        -- genre name is only a tie-breaker so that the winner is repeatable
        row_number() over (partition by g.year order by g.avgrate desc, g.genres) as rn
    from (
        select a.year as year, a.newgenres as genres, avg(b.avgrate) as avgrate
        from newmovies_3 a
        join (
            select movieid as movieid, avg(rating) as avgrate
            from m_v
            group by movieid
            having count(*) > 70
        ) b on a.movieid = b.movieid
        group by a.year, a.newgenres
    ) g
) t
where t.rn = 1
order by year;
得到最终结果:
year genres maxavgrate
1922 Horror 3.991596638655462
1925 Drama 4.554557700942973
1926 Sci-Fi 4.082474226804123
1927 Comedy 4.52054794520548
1930 War 4.1940298507462686
1935 Thriller 4.147410358565737
1938 Action 4.361522198731501
1942 Animation 4.477724741447892
1943 Adventure 4.560509554140127
1955 Western 4.149643705463183
1962 Film-Noir 4.425646551724138
1972 Musical 4.524966261808367
1975 Children's 4.426940639269406
1984 Mystery 4.251808972503618
Time taken: 729.168 seconds, Fetched: 14 row(s)
(10)每个地区最高评分的电影名,把结果存入HDFS(地区,电影名,影评分)
超过70人数评价,且按照电影分组,平均分排序,由高到低
-- Movies with more than 70 ratings, best average first (top ten).
SELECT
    movieid               AS movieid,
    collect_set(title)[0] AS title,
    avg(rating)           AS avgrate
FROM m_v
GROUP BY movieid
HAVING count(*) > 70
ORDER BY avgrate DESC
LIMIT 10;
movieid title avgrate
2019 Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) 4.560509554140127
318 Shawshank Redemption, The (1994) 4.554557700942973
858 Godfather, The (1972) 4.524966261808367
745 Close Shave, A (1995) 4.52054794520548
50 Usual Suspects, The (1995) 4.517106001121705
527 Schindler's List (1993) 4.51041666666666
-- movies_1: average rating of every movie that has more than 70 ratings,
-- ordered best first.
CREATE VIEW movies_1 AS
SELECT
    movieid               AS movieid,
    collect_set(title)[0] AS title,
    avg(rating)           AS avgrate
FROM m_v
GROUP BY movieid
HAVING count(*) > 70
ORDER BY avgrate DESC;
合并rating表
-- movies_2: one row per rating of a movies_1 movie, carrying the rater's id
-- together with the movie's title and overall average.
CREATE VIEW movies_2 AS
SELECT
    r.userid  AS userid,
    m.movieid AS movieid,
    m.title   AS title,
    m.avgrate AS avgrate
FROM movies_1 m
INNER JOIN ratings r
    ON r.movieid = m.movieid;

-- Spot-check the first rows of the view.
SELECT * FROM movies_2 LIMIT 5;
movies_2.userid movies_2.movieid movies_2.title movies_2.avgrate
1 1193 One Flew Over the Cuckoo's Nest (1975) 4.390724637681159
1 661 James and the Giant Peach (1996) 3.4647619047619047
1 914 My Fair Lady (1964) 4.154088050314465
1 3408 Erin Brockovich (2000) 3.863878326996198
1 2355 Bug's Life, A (1998) 3.854374633000587
Time taken: 366.137 seconds, Fetched: 5 row(s)
再合并users表
-- movies_3: for every zipcode, the movie with the highest overall average
-- among the movies rated by users of that zipcode (zipcode, title, average).
-- The earlier version grouped by zipcode and returned collect_set(title)[0]
-- next to max(avgrate). The two aggregates are independent, so the title
-- was an arbitrary movie rated in that zipcode, not the one that owns the
-- maximum. row_number() keeps the title and its average on the same row.
-- Column names and the view's ordering are unchanged.
create view movies_3 as
select t.zipcode as zipcode, t.title as title, t.avgrate as maxavgrate
from (
    select
        d.zipcode as zipcode,
        c.title   as title,
        c.avgrate as avgrate,
        -- movieid is only a tie-breaker between equal averages
        row_number() over (partition by d.zipcode order by c.avgrate desc, c.movieid) as rn
    from movies_2 c
    join users d on d.userid = c.userid
) t
where t.rn = 1
order by maxavgrate desc;

-- Spot-check the first rows of the view.
select * from movies_3 limit 5;
得到结果:
movies_3.zipcode movies_3.title movies_3.maxavgrate
01060 Toy Story (1995) 4.560509554140127
01020 Touch of Evil (1958) 4.560509554140127
01379 Footloose (1984) 4.560509554140127
99353 Who Framed Roger Rabbit? (1988) 4.560509554140127
01002 Who Framed Roger Rabbit? (1988) 4.560509554140127
Time taken: 889.359 seconds, Fetched: 5 row(s)
将结果存入HDFS
-- Q10: write (zipcode, title, average) to HDFS.
-- INSERT OVERWRITE DIRECTORY replaces whatever is in the target directory,
-- so the target must be a dedicated output directory and never '/', the
-- HDFS root. The path below is an example; adjust it to the cluster layout.
-- Columns are listed explicitly and written tab-separated so the exported
-- file has a stable, readable layout.
insert overwrite directory '/user/hive/output/movies_3'
row format delimited fields terminated by '\t'
select zipcode, title, maxavgrate from movies_3;