爬取数据部分
导包
from time import sleep

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
无头模式与禁用GPU设置(固定代码,一般最后加上)
# Chrome start-up options: run the browser without opening a visible window.
opt = Options()
opt.add_argument('--headless')
# Chrome switches are single hyphenated tokens; the previous value
# '--disable gpu' (with a space) is not the GPU switch.
opt.add_argument('--disable-gpu')
请求网址并事先创建一个文件
# Launch Chrome with the options built in `opt`.
chrome = Chrome(options=opt)
# Page that the scraping code below reads its table rows from.
url = 'https://www.endata.com.cn/BoxOffice/BO/Year/index.html'
chrome.get(url)
sleep(1)  # fixed one-second pause after navigation; presumably lets the page finish rendering
# Output file for the scraped rows. Append mode means rows written by earlier
# runs are kept, so running the scraper again adds to the existing file.
f = open('./movies.csv', mode='a',encoding='utf-8')
support库里面的Select模块处理年份问题
# Step through every entry of the 'OptionDate' drop-down and append the table
# rows shown for that entry to the CSV file `f`.
#
# Output layout: the cells of a row are written one after another, each
# followed by a comma; every row ends with a newline, and every drop-down
# entry is followed by two extra blank lines.
try:
    # Locator-based lookups: the find_element_by_* helpers no longer exist in
    # current Selenium releases, while find_element(By...) works in old and new.
    select_el = chrome.find_element(By.ID, 'OptionDate')
    select = Select(select_el)
    for i in range(len(select.options)):
        select.select_by_index(i)
        sleep(2)  # fixed pause; presumably gives the table time to reload
        # [1:] skips the first <tr> of the table (presumably the header row).
        tr_list = chrome.find_elements(
            By.XPATH, '//table[@class="bo-table img-table"]//tr')[1:]
        for tr in tr_list:
            for td in tr.find_elements(By.XPATH, './td'):
                f.write(td.text.strip())
                f.write(',')
            f.write('\n')
        f.write('\n\n')
        print('第%d页打印完毕' % i)
finally:
    # Close the file so buffered rows are flushed to disk before anything
    # reads the CSV, and end the whole driver session (not just the window)
    # even when scraping fails part-way through.
    f.close()
    chrome.quit()
数据处理部分
导包
import pandas as pd
数据的清洗,去重
# Build the per-type average of the 'money' value from the scraped CSV and
# save it to './数据.csv'.
#
# Columns 3 and 4 are read as text: they are joined as strings below, so any
# numeric parsing would drop leading zeros ("045" -> 45) or append ".0" and
# corrupt the joined number.
# NOTE(review): this assumes columns 3 and 4 are the two halves of one number
# that was split by its thousands separator when the CSV was written — confirm.
data = pd.read_csv('./movies.csv', header=None, dtype={3: str, 4: str})
data = data.loc[:, 2:4].copy()

# Column 2 may hold two types separated by '/'. Column 5 gets the first type,
# column 6 the second one, or the sentinel 666 when there is no second type.
data[5] = data[2].map(lambda genres: genres.split('/')[0])
data[6] = data[2].map(lambda genres: genres.split('/')[1] if '/' in genres else 666)
data[7] = data[3].astype(str) + data[4].astype(str)

# One (type, money) row per first type, plus one more row for every second
# type, so a two-type entry counts towards both types.
data_1 = data.loc[:, [5, 7]]
data_2 = data.loc[data[6] != 666, [6, 7]]
data_2 = data_2.rename(columns={6: 5})
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
data = pd.concat([data_1, data_2])
data = data.rename(columns={5: 'type', 7: 'money'})
data['money'] = data['money'].astype(int)

booking_o = data.groupby('type').mean().round(2)
# '-' is dropped as a type; errors='ignore' keeps this from raising when the
# scraped data contains no such row.
booking_o = booking_o.drop(['-'], errors='ignore')
booking_o.to_csv('./数据.csv')
可视化展示
导包
from flask import Flask,render_template
import pandas as pd
Flask可视化展示
# Flask application object; the route below is registered on it.
app = Flask(__name__)
@app.route('/')
def index():
    """Render the chart page from the aggregated CSV.

    Reads './数据.csv' on every request, labels its two columns 'name' and
    'value', floor-divides the values by 1000 and passes the rows to the
    '数据.html' template as a list of {'value': ..., 'name': ...} dicts.
    """
    table = pd.read_csv('./数据.csv')
    table.columns = ['name', 'value']
    # Build the frame in 'value', 'name' order so each record lists the
    # value first, exactly as the reordered original frame did.
    chart_rows = pd.DataFrame({
        'value': table['value'] // 1000,
        'name': table['name'],
    })
    records = chart_rows.to_dict(orient='records')
    return render_template('数据.html', data=records)
# Start Flask's built-in server only when this file is run as a script.
# NOTE(review): debug=True turns on Flask's debug mode; keep it to local use.
if __name__ == '__main__':
    app.run(debug=True)
HTML界面与ECharts源代码的结合
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
<!-- Load the ECharts library. -->
<script src="https://cdn.bootcdn.net/ajax/libs/echarts/5.0.2/echarts.min.js"></script>
<!-- Container element the chart is rendered into. The property name used to
     be misspelled "blackgroup", so the pink background was never applied. -->
<div id="main" style="width: 800px;height:600px;background:pink"></div>
<script type="text/javascript">
    // Create the chart instance on the container element.
    var myChart = echarts.init(document.getElementById('main'));
    var option = {
        legend: {
            top: 'bottom'
        },
        toolbox: {
            show: true,
            feature: {
                mark: {show: true},
                dataView: {show: true, readOnly: false},
                restore: {show: true},
                saveAsImage: {show: true}
            }
        },
        series: [
            {
                name: '面积模式',
                type: 'pie',
                radius: [50, 250],
                center: ['50%', '50%'],
                roseType: 'area',
                itemStyle: {
                    borderRadius: 8
                },
                // The template variable `data` is serialized to JSON here by
                // the `tojson` filter when the page is rendered.
                data: {{ data|tojson }}
            }
        ]
    };
    myChart.setOption(option);
</script>
</body>
</html>