一份来自于某在线考试系统的学员答题批改日志,日志中记录了日志生成时间,题目难度系数,题目所属的知识点 ID,做题的学生 ID,题目 ID 以及作答批改结果。日志的结构如下:
要求得到下表所示结果,并以此表映射的Hive表计算学生的平均成绩
建表:hbase、hive建表语句,以及之间的映射语句
导数据:insert into table TABLE_NAME(…) select … /insert overwrite table TABLE_NAME select …
函数:聚合函数collect_list,字符串函数regexp_extract、size、split,窗口函数lead
start-dfs.sh
start-yarn.sh
nohup hive --service metastore>/dev/null 2>&1 &
nohup hive --service hiveserver2>/dev/null 2>&1 &
zkServer.sh start
start-hbase.sh
jps -ml
hdfs dfs -mkdir /app
hdfs dfs -mkdir /app/data
hdfs dfs -mkdir /app/data/exam
#上面三部也可以直接递归创建:hdfs dfs -mkdir -p /app/data/exam
hdfs dfs -put /root/hadooptmp/answer_question.log /app/data/exam
根据日志数据,可以看到逗号分隔符,且逗号分隔符后的数据本题用不到
create external table ex_exam7_tmp(
req string,
resp string
)
row format delimited fields terminated by ',' location '/app/data/exam'
create external table ex_exam_record(
topic_id string,
student_id string,
question_id string,
score float
)
row format delimited fields terminated by ','
hbase(main):009:0> create_namespace 'exam'
hbase(main):010:0> create 'exam:analysis','accuracy','question'
create external table ex_exam_analysis(
student_id string,
total_score float,
question_count int,
accuracy float
)STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' WITH SERDEPROPERTIES ("hbase.columns.mapping" ":key,accuracy:total_score,accuracy:question_count,accuracy:accuracy") TBLPROPERTIES ("hbase.table.name" = "exam:analysis")
create external table ex_exam_question(
student_id string,
right string,
half string,
error string
)STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,question:right ,question:half ,question:error ") TBLPROPERTIES ("hbase.table.name" = "exam:analysis")
0: jdbc:hive2://192.168.221.140:10000> create database exam;
insert into ex_exam_record(topic_id,student_id,question_id,score)
select * from (
select
regexp_extract(req,'.*?_(.*?)_(.*?)_(.*?)r.*?',1) topic_id,
regexp_extract(req,'.*?_(.*?)_(.*?)_(.*?)r.*?',2) student_id,
regexp_extract(req,'.*?_(.*?)_(.*?)_(.*?)r.*?',3) question_id,
cast(regexp_extract(req,'.*?_(.*?)_(.*?)_(.*?)r (\\d(.\\d)?)',4) as float) score
from ex_exam7_tmp
)T
如果数据插入错误,可使用insert overwrite table TABLE_NAME select…覆盖重写
insert into ex_exam_question(student_id,error,half,right)
select student_id,concat_ws(',',error) error,
concat_ws(',',half) half,
concat_ws(',',right) right
from (
select student_id,info error,
lead(info,1,NULL) over(distribute by student_id sort by score) as half,
lead(info,2,NULL) over(distribute by student_id sort by score) as right,
row_number() over(distribute by student_id sort by score) as rn
from (
select student_id,score,collect_list(question_id) info
from ex_exam_record
group by student_id,score
)T
)T
where rn=1
insert into table ex_exam_analysis(student_id,total_score,question_count,accuracy)
select student_id,
(halfNum*0.5+rightNum) total_score,
(halfNum+rightNum+errNum) question_count, round((halfNum*0.5+rightNum)/(halfNum+rightNum+errNum),2) accuracy
from (
select student_id,size(split(error,',')) errNum,
size(split(half,',')) halfNum, size(split(right,',')) rightNum
from ex_exam_question
)T