Author: 雨晨源码
About: Java, WeChat mini-programs, Android; custom development, remote debugging, code walkthroughs, documentation guidance, and PPT preparation.
This article introduces a Douban book data visualization and analysis system built on Hadoop, Spark, ECharts, and machine learning. The system has two roles, user and administrator (highlights: big data, the Hadoop distributed framework, and large-screen visual analytics).
User: share of books per author, the full book listing, top ten highest-rated books, rating-level distribution, book counts per time period, top ten most-reviewed books, a word cloud, and more.
Administrator: login, user management, authentication and authorization, Douban data analysis, and more.
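Since the user-facing statistics are computed over the Hadoop/Spark layer, here is a minimal PySpark sketch of how the "top ten highest-rated books" figure could be produced. The HDFS path and the books(title, score) column names are assumptions for illustration, not taken from the project source.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("douban-books").getOrCreate()
# Assumed input: a cleaned book table on HDFS with at least title and score columns
books = spark.read.csv("hdfs:///douban/books.csv", header=True, inferSchema=True)
top10 = books.orderBy(F.col("score").desc()).limit(10).select("title", "score")
top10.show()  # the rows behind the dashboard's "top ten by rating" chart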
☀️ Douban Book List ☀️
1. Data cleaning (sample code below):
import re
import time

import pymysql
from bs4 import BeautifulSoup

import tools  # project config module holding the MySQL connection settings
# get_page_data comes from the project source; a minimal sketch follows this block

if __name__ == '__main__':
    html_re = re.compile(r'<[^>]+>', re.S)  # matches HTML tags so they can be stripped
    db = pymysql.connect(user=tools.Config.MYSQL_USER,
                         password=tools.Config.MYSQL_PWD,
                         host=tools.Config.MYSQL_HOST,
                         database=tools.Config.MYSQL_DB,
                         charset=tools.Config.MYSQL_CHARSET)
    db_cursor = db.cursor(cursor=pymysql.cursors.DictCursor)
    db_cursor.execute("TRUNCATE TABLE article")  # start from an empty table

    host = "www.hsnc.edu.cn"
    url = "https://www.hsnc.edu.cn/index/xyxx/xyxw.htm"
    resp = get_page_data(host, url)
    sp = BeautifulSoup(resp, 'lxml')
    div_titles = sp.find_all("div", "biaoti")
    for div_title in div_titles:
        a = div_title.find_all("a")[1]  # the second <a> carries the title link
        title = a.string                # news title
        link = a.attrs["href"]
        # resolve the relative href into an absolute detail-page URL
        link = "https://" + host + link.replace("../..", "").replace("/..", "")
        print(link)
        resp_cnt = get_page_data(host, link)
        sp_cnt = BeautifulSoup(resp_cnt, 'lxml')
        div_author = sp_cnt.find("div", "auther")  # class name as spelled on the site
        html_author = html_re.sub("", str(div_author)).split(" ")
        author = html_author[0].replace("来源:", "").strip()        # source
        if html_author[1].strip() != "":
            ptime = html_author[1].replace("发布日期:", "").strip()  # publish date
        else:
            ptime = html_author[2].replace("发布日期:", "").strip()  # publish date
        div_cnt = sp_cnt.find("div", "v_news_content")
        cnt = html_re.sub("", str(div_cnt)).replace("\n\n", "\n")  # body text with tags stripped
        cnt = cnt.strip("\n")
        # parameterized INSERT instead of string concatenation with addslashes,
        # which avoids quoting bugs and SQL injection
        insert_sql = "INSERT INTO article (title, content, source, t) VALUES (%s, %s, %s, %s)"
        try:
            db_cursor.execute(insert_sql, (title, cnt, author, ptime))
            db.commit()
        except pymysql.MySQLError as e:
            print("insert error:", e)
        print("sleeping 1 second")
        time.sleep(1)
    print("sleeping 10 seconds")
    time.sleep(10)
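The get_page_data helper used above is not shown in the post. Here is a minimal requests-based sketch, under the assumption that it simply fetches a URL and returns the page HTML as text (the header values are illustrative):

import requests

def get_page_data(host, url):
    # Plain GET with a browser-like User-Agent; the Host header mirrors the caller's argument
    headers = {"Host": host, "User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = resp.apparent_encoding  # the site may not declare its encoding
    return resp.text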
2. Data analysis (sample code below):
# Walk the paginated list pages, from page 162 down to page 140
cur_page = 162
url = "https://www.hsnc.edu.cn/index/xyxx/xyxw/" + str(cur_page) + ".htm"
while cur_page >= 140:
    resp = get_page_data(host, url)
    sp = BeautifulSoup(resp, 'lxml')
    div_titles = sp.find_all("div", "biaoti")
    for div_title in div_titles:
        a = div_title.find_all("a")[1]  # the second <a> carries the title link
        title = a.string                # news title
        link = a.attrs["href"]
        link = "https://" + host + link.replace("../..", "").replace("/..", "")  # detail-page URL
        print(link)
        resp_cnt = get_page_data(host, link)
        sp_cnt = BeautifulSoup(resp_cnt, 'lxml')
        div_author = sp_cnt.find("div", "auther")
        html_author = html_re.sub("", str(div_author)).split(" ")
        author = html_author[0].replace("来源:", "").strip()        # source
        if html_author[1].strip() != "":
            ptime = html_author[1].replace("发布日期:", "").strip()  # publish date
        else:
            ptime = html_author[2].replace("发布日期:", "").strip()  # publish date
        div_cnt = sp_cnt.find("div", "v_news_content")
        cnt = html_re.sub("", str(div_cnt)).replace("\n\n", "\n")  # body text with tags stripped
        cnt = cnt.strip("\n")
        insert_sql = "INSERT INTO article (title, content, source, t) VALUES (%s, %s, %s, %s)"
        try:
            db_cursor.execute(insert_sql, (title, cnt, author, ptime))
            db.commit()
        except pymysql.MySQLError as e:
            print("insert error:", e)
        print("sleeping 1 second")
        time.sleep(1)
    print("sleeping 10 seconds")
    time.sleep(10)
    cur_page -= 1
    url = "https://www.hsnc.edu.cn/index/xyxx/xyxw/" + str(cur_page) + ".htm"
db_cursor.close()
db.close()
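On the web side, ECharts charts are typically fed from JSON endpoints. Below is a hedged sketch of a Django view that aggregates the article table written by the crawler above; the view name and the choice of grouping column are illustrative, not taken from the original post.

from django.db import connection
from django.http import JsonResponse

def article_counts(request):
    # Count crawled articles per source; an ECharts bar chart can bind
    # "categories" to its x-axis and "values" to its series data
    with connection.cursor() as cursor:
        cursor.execute("SELECT source, COUNT(*) FROM article GROUP BY source")
        rows = cursor.fetchall()
    return JsonResponse({"categories": [r[0] for r in rows],
                         "values": [r[1] for r in rows]})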
3. Backend entry point (manage.py):

import os
import sys


def main():
    # Point Django at the project's settings module before anything else
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'stfeel.settings')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        raise ImportError(
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        ) from exc
    execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()
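With this entry point in place, the dashboard can be started locally with python manage.py runserver and the analysis pages served to the browser.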
If you have any questions, or are interested in this system, feel free to like, bookmark, and leave a comment!
You are welcome to discuss in detail in the comments section below.