本项目所使用的数据及代码可从 https://download.csdn.net/download/atuo200/12716083 下载。本项目采用 Scala 编写数据分析代码;若希望采用 Python 编写数据分析代码,可参考《基于Spark的音乐专辑数据分析展示》。
数据集albums.csv,包含了10万条音乐专辑的数据。本文代码用到的主要字段说明如下:genre(专辑类型)、num_of_sales(专辑销量)、num_of_tracks(专辑单曲数量)、year_of_pub(专辑发行年份)。
下面对音乐专辑数据集albums.csv进行了一系列的分析,包括:(1)统计各类型专辑的数量(只保留数量大于2000的类型);(2)统计各类型专辑的销量总数;(3)统计各年份的专辑数量和单曲数量;(4)统计专辑数量最多的5种类型在各年份的销量。
编写scala代码
mkdir musicCount
cd musicCount
mkdir -p src/main/scala
cd src/main/scala
vi musicCount.scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import java.io._
/**
 * Spark job that analyses the album data set (albums.csv) and writes four
 * JSON result files that the ECharts page reads:
 *
 *  - result1.json: number of albums per genre, for genres with more than 2000 albums
 *  - result2.json: total sales per genre
 *  - result3.json: per publication year, List(sum of track counts, number of albums)
 *  - result4.json: sales per (genre, year) for the five genres with the most albums
 *
 * Each file holds the string form of a java.util.List of JSON objects
 * ("[{...}, {...}]"), which is itself a valid JSON array.
 */
object App {

  /** Directory on the driver node that receives the result files. */
  private val OutputDir = "/usr/local/spark/data/music_diy"

  /**
   * Writes `json` to `fileName` inside [[OutputDir]].
   *
   * The writer is closed in a `finally` block so the file handle is released
   * even when the write fails.
   */
  private def writeResult(fileName: String, json: String): Unit = {
    val writer = new PrintWriter(new File(s"$OutputDir/$fileName"))
    try writer.write(json)
    finally writer.close()
  }

  /**
   * Entry point; runs the four analyses one after another.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // The session builder creates the SparkContext itself, so no separate
    // SparkConf / SparkContext construction is required.
    val spark = SparkSession.builder().appName("genreSales").getOrCreate()
    import spark.implicits._

    try {
      // The data file is read from the local file system, so albums.csv must
      // exist under the same path on every Spark node; alternatively upload it
      // to HDFS and load it from there.
      val df = spark.read
        .format("com.databricks.spark.csv")
        .option("header", "true")
        .option("inferSchema", "false")
        .option("delimiter", ",")
        .load("file:///usr/local/spark/data/mllib/albums.csv")

      // Albums per genre; reused for result1 and for picking the top genres.
      val genreCount = df.groupBy("genre").count()

      // result1: genres with more than 2000 albums.
      val popularGenres = genreCount.filter(genreCount("count") > 2000)
      writeResult("result1.json", popularGenres.toJSON.collectAsList.toString)

      // result2: total sales per genre. Summed as Long because an Int sum
      // silently wraps around once a total exceeds Int.MaxValue.
      val genreSales = df.select(df("genre"), df("num_of_sales")).rdd
        .map(row => (row(0).toString, row(1).toString.toLong))
        .reduceByKey(_ + _)
      writeResult("result2.json", genreSales.toDF().toJSON.collectAsList.toString)

      // result3: per year, List(sum of track counts, number of albums),
      // ordered by year.
      val tracksAndAlbums = df.select(df("year_of_pub"), df("num_of_tracks")).rdd
        .map(row => (row(0).toString.toInt, List(row(1).toString.toInt, 1)))
        .reduceByKey((x, y) => List(x(0) + y(0), x(1) + y(1)))
        .sortByKey()
      writeResult("result3.json", tracksAndAlbums.toDF().toJSON.collectAsList.toString)

      // result4: yearly sales of the five genres with the most albums.
      val topGenres = genreCount
        .orderBy(genreCount("count").desc)
        .rdd
        .map(row => row(0).toString)
        .take(5)
      // Sorted by (genre, year) so that each genre's points are emitted in
      // chronological order; reduceByKey alone gives no ordering guarantee.
      val genreYearSales = df.select(df("genre"), df("year_of_pub"), df("num_of_sales")).rdd
        .filter(row => topGenres.contains(row(0)))
        .map(row => ((row(0).toString, row(1).toString.toInt), row(2).toString.toLong))
        .reduceByKey(_ + _)
        .sortByKey()
      writeResult("result4.json", genreYearSales.toDF().toJSON.collectAsList.toString)
    } finally {
      // Release the session even when one of the analyses fails.
      spark.close()
    }
  }
}
编写sbt打包文件
// sbt build definition for the Spark album-analysis job.
// The project name and version determine the name of the packaged jar.
name := "peopleage project"

version := "1.0"

scalaVersion := "2.11.12"

// %% appends the Scala binary version (_2.11) to each artifact name.
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.4.5",
  "org.apache.spark" %% "spark-sql" % "2.4.5"
)
对编写的scala数据分析代码进行打包(在项目根目录musicCount下执行 sbt package),把打包完成后生成的jar包提交到spark-submit执行
cd /usr/local/spark/bin
./spark-submit --class "App" /usr/local/spark/mycode/musicCount/target/scala-2.11/peopleage-project_2.11-1.0.jar
执行完后在spark的master节点下的/usr/local/spark/data/music_diy/目录生成了相应的数据分析结果json文件
使用jquery读取json文件数据,利用echarts绘图,建立Flask应用程序渲染生成的数据图表html文件
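项目文件及结构如下:app.py 位于项目根目录;view.html 放在 templates/ 目录下(render_template 默认从该目录加载模板);Spark 生成的 result1.json~result4.json 以及背景图 bg.jpeg 需复制到 static/ 目录下,页面通过 /static/ 路径读取它们。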
app.py
from flask import render_template, Flask

# Flask application that serves the chart page.
app = Flask(__name__)


@app.route('/view')
def req_file():
    """Render the view.html template for the /view route."""
    return render_template("view.html")


if __name__ == '__main__':
    # Debug mode is switched on through run(debug=True); assigning an
    # `app.DEBUG` attribute has no effect because Flask reads `app.debug` /
    # `app.config['DEBUG']` instead.
    # NOTE(review): debug mode together with host='0.0.0.0' exposes the
    # interactive debugger to every host that can reach this port; use this
    # only on a trusted network.
    app.run(debug=True, port=5000, host='0.0.0.0')
view.html
<html>
<head>
    <meta charset="utf-8">
    <title>小坨的太空站</title>
    <script src="https://cdn.bootcdn.net/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
    <script src="https://cdn.bootcss.com/echarts/3.6.2/echarts.min.js"></script>
    <style>
        body {
            background-image: url("/static/bg.jpeg");
        }

        h1 {
            color: #fff;
        }

        /* The four chart containers share the same look. */
        #box1,
        #box2,
        #box3,
        #box4 {
            background-color: azure!important;
            display: inline-block;
        }

        /* Centres the inline-block chart containers. */
        .contain {
            text-align: center;
        }

        /* Size and frame of a single chart container. */
        .public {
            width: 600px;
            height: 500px;
            padding: 10px;
            border: 1px solid #ccc;
            box-shadow: 0 0 8px #aaa inset;
        }

        /* Vertical spacer between the heading and the charts. */
        .empty {
            height: 30px;
        }
    </style>
</head>
<body>
<h1 align="center">Spark音乐专辑分析数据图表</h1>
<div class="empty"></div>
<!-- One container per chart; each script below fills one of them. -->
<div class="contain">
    <div id="box1" class="public"></div>
    <div id="box2" class="public"></div>
    <div id="box3" class="public"></div>
    <div id="box4" class="public"></div>
</div>
<script type="text/javascript">
    // Chart 1: rose (pie) chart of the number of albums per genre.
    // result1.json is an array of objects with "genre" and "count" keys.
    $.getJSON("/static/result1.json", function(data) {
        var name = []
        var value = []
        $.each(data, function(key, val) {
            name.push(val["genre"]);
            value.push({
                "value": val["count"],
                "name": val["genre"]
            })
        });
        var myChart1 = echarts.init(document.getElementById('box1'));
        // Chart options and data.
        var option1 = {
            title: {
                text: '各类型专辑的数量统计图',
                x: 'left'
            },
            tooltip: {
                trigger: 'item',
                // Must be a single-line string literal: series name, line
                // break, then "genre : count (percentage)".
                formatter: "{a} <br/>{b} : {c} ({d}%)"
            },
            legend: {
                x: 'center',
                y: 'bottom',
                data: name
            },
            toolbox: {
                show: true,
                feature: {
                    mark: {
                        show: true
                    },
                    dataView: {
                        show: true,
                        readOnly: false
                    },
                    magicType: {
                        show: true,
                        type: ['pie', 'funnel']
                    },
                    restore: {
                        show: true
                    },
                    saveAsImage: {
                        show: true
                    }
                }
            },
            calculable: true,
            series: [{
                name: '半径模式',
                type: 'pie',
                radius: [30, 180],
                center: ['50%', '50%'],
                roseType: 'radius',
                // Labels and their guide lines appear only on hover.
                label: {
                    normal: {
                        show: false
                    },
                    emphasis: {
                        show: true
                    }
                },
                // The option key is "labelLine"; a misspelt key is ignored.
                labelLine: {
                    normal: {
                        show: false
                    },
                    emphasis: {
                        show: true
                    }
                },
                data: value
            }]
        };
        // Render the chart with the options defined above.
        myChart1.setOption(option1);
    })
</script>
<script type="text/javascript">
    // Chart 2: bar chart of total sales per genre, in millions of albums.
    // result2.json is an array of objects with "_1" (genre) and "_2" (sales).
    $.getJSON("/static/result2.json", function(data) {
        var dataAxis = []
        var dataValue = []
        $.each(data, function(key, val) {
            dataAxis.push(val["_1"]);
            // NOTE(review): Math.abs hides negative totals; a negative value
            // would point to an overflowed sum upstream - confirm and fix there.
            dataValue.push(Math.abs(val["_2"]) / 1000000)
        });
        // Initialise the ECharts instance on the prepared DOM node.
        var myChart2 = echarts.init(document.getElementById('box2'), 'light');
        var option2 = {
            title: {
                text: '各类型专辑的销量统计图',
                x: 'center',
                // bottom: 10
                padding: [0, 0, 15, 0]
            },
            color: ['#3398DB'],
            tooltip: {
                trigger: 'axis',
                axisPointer: { // Axis indicator, active for axis-triggered tooltips.
                    type: 'shadow' // Defaults to a line; options: 'line' | 'shadow'.
                }
            },
            grid: {
                left: '3%',
                right: '4%',
                bottom: '10%',
                containLabel: true
            },
            xAxis: [{
                type: 'category',
                data: dataAxis,
                axisTick: {
                    show: true,
                    alignWithLabel: true,
                    interval: 0
                },
                axisLabel: {
                    // Show every genre label, rotated so they do not overlap.
                    interval: 0,
                    rotate: 45
                }
            }],
            yAxis: [{
                type: 'value',
                name: 'Million Albums',
                nameLocation: 'middle',
                nameGap: 50
            }],
            series: [{
                // Shown in the tooltip, so it names the plotted quantity.
                name: '专辑销量',
                type: 'bar',
                barWidth: '60%',
                data: dataValue
            }]
        };
        // Render the chart with the options defined above.
        myChart2.setOption(option2);
    })
</script>
<script type="text/javascript">
    // Chart 3: per-year track count and album count as overlapping bars.
    // result3.json is an array of objects with "_1" (year) and
    // "_2" ([sum of track counts, number of albums]).
    $.getJSON("/static/result3.json", function(data) {
        var years_list = []
        var tracks_list = []
        var albums_list = []
        $.each(data, function(key, val) {
            years_list.push(val["_1"])
            tracks_list.push(val["_2"][0])
            albums_list.push(val["_2"][1])
        });
        var myChart3 = echarts.init(document.getElementById('box3'), 'light');
        var option3 = {
            title: {
                text: '近20年的专辑数量和单曲数量的变化趋势',
            },
            tooltip: {
                trigger: 'axis'
            },
            legend: {
                x: 'center',
                y: '25',
                data: ['单曲数量', '专辑数量'],
            },
            toolbox: {
                show: true,
                feature: {
                    dataZoom: {
                        yAxisIndex: 'none'
                    },
                    dataView: {
                        readOnly: false
                    },
                    magicType: {
                        type: ['line', 'bar']
                    },
                    restore: {},
                    saveAsImage: {}
                }
            },
            xAxis: {
                type: 'category',
                data: years_list,
                // A category axis takes a boolean here; true keeps the first
                // and last bars inside the plotting area.
                boundaryGap: true
            },
            yAxis: {
                type: 'value',
                axisLabel: {
                    formatter: '{value}'
                }
            },
            series: [{
                name: '单曲数量',
                type: 'bar',
                data: tracks_list,
                barWidth: 15,
            }, {
                name: '专辑数量',
                type: 'bar',
                data: albums_list,
                // -100% draws this bar on top of the previous series' bar.
                barGap: '-100%',
                barWidth: 15,
            }]
        };
        // Render the chart with the options defined above.
        myChart3.setOption(option3);
    })
</script>
<script type="text/javascript">
    // Chart 4: stacked area chart of yearly sales (in millions) for five genres.
    // result4.json is an array of objects with "_1" ({"_1": genre, "_2": year})
    // and "_2" (sales).
    $.getJSON("/static/result4.json", function(data) {
        // Genres drawn by this chart, in legend/stacking order.
        var genres = ['Indie', 'Pop', 'Rap', 'Pop-Rock', 'Latino'];
        var items = {};
        genres.forEach(function(genre) {
            items[genre] = [];
        });

        $.each(data, function(key, val) {
            var genre = val["_1"]["_1"];
            if (!items.hasOwnProperty(genre)) {
                // The file may contain a genre this chart does not draw;
                // report it instead of failing with a TypeError.
                console.warn("result4.json: genre not charted: " + genre);
                return;
            }
            items[genre].push([val["_1"]["_2"], val["_2"]]);
        });

        // The input order is not guaranteed, so sort each genre's points by year.
        genres.forEach(function(genre) {
            items[genre].sort(function(a, b) {
                return a[0] - b[0];
            });
        });

        // Builds one stacked area series; showLabel adds value labels on top.
        function areaSeries(genre, showLabel) {
            var series = {
                name: genre,
                type: 'line',
                stack: '总量',
                areaStyle: {},
                data: items[genre].map(function(point) {
                    return point[1] / 1000000;
                })
            };
            if (showLabel) {
                series.label = {
                    normal: {
                        show: true,
                        position: 'top'
                    }
                };
            }
            return series;
        }

        var myChart4 = echarts.init(document.getElementById('box4'));
        var option4 = {
            title: {
                text: '堆叠区域图'
            },
            tooltip: {
                trigger: 'axis',
                axisPointer: {
                    type: 'cross',
                    label: {
                        backgroundColor: '#6a7985'
                    }
                }
            },
            legend: {
                data: genres
            },
            toolbox: {
                feature: {
                    saveAsImage: {}
                }
            },
            grid: {
                left: '3%',
                right: '4%',
                bottom: '3%',
                containLabel: true
            },
            xAxis: [{
                type: 'category',
                boundaryGap: false,
                // Years are taken from the 'Indie' series.
                // NOTE(review): assumes every genre covers the same years - confirm.
                data: items['Indie'].map(function(point) {
                    return point[0];
                })
            }],
            yAxis: [{
                type: 'value',
                name: 'Million Albums',
                nameLocation: 'middle',
                nameGap: 30
            }],
            // Only the last (topmost) series carries value labels.
            series: genres.map(function(genre, index) {
                return areaSeries(genre, index === genres.length - 1);
            })
        };
        myChart4.setOption(option4);
    });
</script>
</body>
</html>
运行编写好的Flask应用程序,可得到以下数据图表
cd /usr/local/code/music_flask
python3 app.py
浏览器访问
http://Master:5000/view