Python课程设计

文章目录

  • 前言
  • 一、数据爬取
  • 二、数据存储
  • 总结
    • spider代码


前言

本文涉及的代码在最后,希望能获取你的认可和小小的赞
更为详细的代码介绍和课程设计在我的Python项目专栏中,有需要的uu可以自行查看,代码链接在总结的github链接处
项目链接
一 课程设计任务说明及总体设计说明
二 总体设计说明
三 核心功能与代码
四 问题与讨论

参考了:IT私塾
想看最终效果图可以直接移步至三 核心功能与代码

一、数据爬取

1.requests:请求网页所必需的模块
2.json:用于处理JSON格式的数据
3.xlwt:用于生成excel文件
4.sqlite3:用于将数据存储到数据库文件(.db)中

import requests as rq, json as js
import xlwt
import sqlite3  # 进行SQLite数据库操作

爬取热门榜单的数据
以下注释可供参考

# Scrape the Bilibili "popular" list and collect one flat row per video.
datalist = []
# Browser-like User-Agent. The original built this dict but never passed it
# to the request, so it had no effect.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
for page in range(1,5):  # pages 1-4, ps=50 entries each; range found by the author's testing
    # 1. Fetch one page of the JSON API. The timeout keeps a stalled
    #    connection from hanging the script forever.
    r = rq.get('https://api.bilibili.com/x/web-interface/popular?ps=50&pn={}'.format(page),
               headers=headers, timeout=10)
    # 2. Fail loudly on HTTP errors instead of trying to parse an error page.
    r.raise_for_status()
    # 3. Decode the JSON payload; the video entries are under data -> list.
    #    (No r.encoding tweak is needed: the raw bytes are decoded explicitly.)
    data = js.loads(r.content.decode('utf8'))['data']['list']
    # 4. Flatten every video dict `d` into a 10-item row.
    for d in data:
        stat = d['stat']
        count = [
            d['title'],          # title
            d['tname'],          # category
            d['owner']['name'],  # uploader name
            stat['view'],        # views
            stat['reply'],       # comments
            stat['like'],        # likes
            stat['coin'],        # coins
            stat['favorite'],    # favorites
            stat['share'],       # shares
            d['short_link'],     # short link
        ]
        datalist.append(count)

# Rows ordered by view count (index 3), highest first.
b = sorted(datalist, key=lambda x: x[3], reverse=True)

二、数据存储

保存数据到excel文件中

def saveData(datalist,savepath,limit=200):
    """Write scraped rows to an Excel ``.xls`` workbook.

    Args:
        datalist: List of rows. Each row holds its values in header order:
            title, category, uploader, views, comments, likes, coins,
            favorites, shares, link.
        savepath: Path handed to ``Workbook.save``.
        limit: Maximum number of data rows written. Defaults to 200, the
            count the previous implementation hard-coded.
    """
    print("save.......")
    book = xlwt.Workbook(encoding="utf-8",style_compression=0)  # new workbook
    sheet = book.add_sheet('哔哩哔哩Top200', cell_overwrite_ok=True)  # single worksheet
    col = ("标题","类型","up主","播放量","评论","点赞","投币","收藏","分享","链接")
    # Header row.
    for j, heading in enumerate(col):
        sheet.write(0, j, heading)
    # Slice instead of indexing 0..199 so a short result set (fewer than
    # `limit` rows) no longer raises IndexError.
    for i, row in enumerate(datalist[:limit], start=1):
        # Only as many cells as there are headers are written.
        for j, value in enumerate(row[:len(col)]):
            sheet.write(i, j, value)
    book.save(savepath)

# Export the sorted rows `b` to an Excel workbook at a relative path.
savepath = "bilibili.xls"
saveData(datalist=b, savepath=savepath)

保存数据到数据库中

def saveData2DB(datalist,dbpath):
    """Create the ``bilibili200`` table in *dbpath* and insert all rows.

    Values are bound through ``?`` placeholders rather than spliced into the
    SQL text, so titles containing quotes or SQL fragments are stored
    verbatim. The rows in *datalist* are not modified. All rows are inserted
    in one transaction: either every row is committed or none is.

    Args:
        datalist: Iterable of 10-item rows in column order (title, ttype,
            up, views, reply, dianzan, coin, favorite, share, link).
        dbpath: Path of the SQLite database file.
    """
    init_db(dbpath)
    sql = '''
            insert into bilibili200(
            title,ttype,up,views,reply,dianzan,coin,favorite,share,link)
            values (?,?,?,?,?,?,?,?,?,?)'''
    conn = sqlite3.connect(dbpath)
    try:
        cur = conn.cursor()
        # tuple(row) copies each row, leaving the caller's lists untouched.
        cur.executemany(sql, [tuple(row) for row in datalist])
        conn.commit()
        cur.close()
    finally:
        # Release the connection even if an insert fails.
        conn.close()


def init_db(dbpath):
    """Ensure the ``bilibili200`` table exists in the SQLite file *dbpath*.

    Safe to call repeatedly: ``if not exists`` makes the statement a no-op
    when the table is already present, instead of raising as a plain
    ``create table`` would on a second run.

    Args:
        dbpath: Path of the SQLite database file (created if missing).
    """
    sql = '''
        create table if not exists bilibili200
        (
        id integer  primary  key autoincrement,
        title varchar,
        ttype varchar,
        up varchar,
        views numeric,
        reply numeric,
        dianzan numeric,
        coin numeric,
        favorite numeric,
        share numeric,
        link text
        )
    '''
    conn = sqlite3.connect(dbpath)
    try:
        conn.execute(sql)
        conn.commit()
    finally:
        # Close the connection even when the DDL statement fails.
        conn.close()

# dbpath = "bilibili.db"              #当前目录新建数据库,存储进去
# saveData2DB(b,dbpath)

csv文件存储:对爬取到的热门视频按类型进行排序比较,再写入csv文件

# -*- coding: utf-8 -*-
import requests as rq, json as js
import csv

# Scrape the Bilibili "popular" list and collect one flat row per video.
datalist = []
# Browser-like User-Agent, matching the other scraper scripts in this file.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
for page in range(1,5):  # pages 1-4, ps=50 entries each
    # 1. Fetch one page of the JSON API. The timeout keeps a stalled
    #    connection from hanging the script forever.
    r = rq.get('https://api.bilibili.com/x/web-interface/popular?ps=50&pn={}'.format(page),
               headers=headers, timeout=10)
    # 2. Fail loudly on HTTP errors instead of trying to parse an error page.
    r.raise_for_status()
    # 3. Decode the JSON payload; the video entries are under data -> list.
    #    After the loop `data` still holds only the LAST page that was fetched.
    data = js.loads(r.content.decode('utf8'))['data']['list']
    # 4. Flatten every video dict `d` into a 9-item row.
    for d in data:
        stat = d['stat']
        count = [
            d['title'],          # title
            d['tname'],          # category
            d['owner']['name'],  # uploader name
            stat['view'],        # views
            stat['reply'],       # comments
            stat['like'],        # likes
            stat['coin'],        # coins
            stat['favorite'],    # favorites
            stat['share'],       # shares
        ]
        datalist.append(count)



# Write every scraped video grouped by category to 'bilibili热门分类.csv'.
#
# Fixes relative to the previous version:
#   * rows come from `datalist` (all fetched pages) instead of `data`, which
#     only holds the last page;
#   * a video is listed under its own category only, not under every category;
#   * each group is fully sorted instead of receiving a single bubble pass;
#   * the builtin name `list` is no longer shadowed;
#   * the file is closed even if writing fails.
#
# A dict keeps categories in first-seen order, so the output is reproducible
# (iterating a set of strings is not stable between interpreter runs).
rows_by_category = {}
for title, tname, _up, _view, reply, like, *_rest in datalist:
    score = like*0.3 + reply*0.7  # composite score: 30% likes, 70% comments
    rows_by_category.setdefault(tname, []).append(
        (tname, title, tname, like, reply, score))

with open('bilibili热门分类.csv', 'w', newline='', encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['分类', '视频', '类型', '点赞', '评论','综合评分'])
    for category_rows in rows_by_category.values():
        # NOTE(review): ordering uses the comment count (index 4), the same
        # column the original comparison used; confirm whether the composite
        # score (index 5) was intended instead.
        category_rows.sort(key=lambda row: row[4], reverse=True)
        writer.writerows(category_rows)

# Write all scraped videos, ranked by composite score, to
# 'bilibili热门分类-1.csv'.
#
# Rows are taken from `datalist` (every fetched page) rather than `data`,
# which only holds the last page after the scraping loop.
list1 = [
    (title, tname, like, reply, like*0.3 + reply*0.7)  # score: 30% likes, 70% comments
    for title, tname, _up, _view, reply, like, *_rest in datalist
]
# Highest composite score (index 4) first.
b = sorted(list1, key=lambda x: x[4], reverse=True)

# The context manager closes the file even if writing fails.
with open('bilibili热门分类-1.csv', 'w', newline='', encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow([ '视频', '类型', '点赞', '评论','综合评分'])
    writer.writerows(b)

总结

一次小小的项目经验吧
已上传至github上,有需要可以自取(切勿做非法用途)
项目地址

spider代码

bilibilispider.py

import requests as rq, json as js
import xlwt
import sqlite3  # 进行SQLite数据库操作

# Scrape the Bilibili "popular" list and collect one flat row per video.
datalist = []
# Browser-like User-Agent; built once because it is the same for every page.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
for page in range(1,5):  # pages 1-4, ps=50 entries each; range found by the author's testing
    # 1. Fetch one page of the JSON API. The timeout keeps a stalled
    #    connection from hanging the script forever.
    r = rq.get('https://api.bilibili.com/x/web-interface/popular?ps=50&pn={}'.format(page),
               headers=headers, timeout=10)
    # 2. Fail loudly on HTTP errors instead of trying to parse an error page.
    r.raise_for_status()
    # 3. Decode the JSON payload; the video entries are under data -> list.
    #    (No r.encoding tweak is needed: the raw bytes are decoded explicitly.)
    data = js.loads(r.content.decode('utf8'))['data']['list']
    # 4. Flatten every video dict `d` into a 10-item row.
    for d in data:
        stat = d['stat']
        count = [
            d['title'],          # title
            d['tname'],          # category
            d['owner']['name'],  # uploader name
            stat['view'],        # views
            stat['reply'],       # comments
            stat['like'],        # likes
            stat['coin'],        # coins
            stat['favorite'],    # favorites
            stat['share'],       # shares
            d['short_link'],     # short link
        ]
        datalist.append(count)

# Rows ordered by view count (index 3), highest first.
b = sorted(datalist, key=lambda x: x[3], reverse=True)
#定义函数存储数据到excel文件中
def saveData(datalist,savepath,limit=200):
    """Write scraped rows to an Excel ``.xls`` workbook.

    Args:
        datalist: List of rows. Each row holds its values in header order:
            title, category, uploader, views, comments, likes, coins,
            favorites, shares, link.
        savepath: Path handed to ``Workbook.save``.
        limit: Maximum number of data rows written. Defaults to 200, the
            count the previous implementation hard-coded.
    """
    print("save.......")
    book = xlwt.Workbook(encoding="utf-8",style_compression=0)  # new workbook
    sheet = book.add_sheet('哔哩哔哩Top200', cell_overwrite_ok=True)  # single worksheet
    col = ("标题","类型","up主","播放量","评论","点赞","投币","收藏","分享","链接")
    # Header row.
    for j, heading in enumerate(col):
        sheet.write(0, j, heading)
    # Slice instead of indexing 0..199 so a short result set (fewer than
    # `limit` rows) no longer raises IndexError.
    for i, row in enumerate(datalist[:limit], start=1):
        # Only as many cells as there are headers are written.
        for j, value in enumerate(row[:len(col)]):
            sheet.write(i, j, value)
    book.save(savepath)

# Export the sorted rows `b` to an Excel workbook at a relative path.
savepath = "bilibili.xls"
saveData(datalist=b, savepath=savepath)


def saveData2DB(datalist,dbpath):
    """Create the ``bilibili200`` table in *dbpath* and insert all rows.

    Values are bound through ``?`` placeholders rather than spliced into the
    SQL text, so titles containing quotes or SQL fragments are stored
    verbatim. The rows in *datalist* are not modified. All rows are inserted
    in one transaction: either every row is committed or none is.

    Args:
        datalist: Iterable of 10-item rows in column order (title, ttype,
            up, views, reply, dianzan, coin, favorite, share, link).
        dbpath: Path of the SQLite database file.
    """
    init_db(dbpath)
    sql = '''
            insert into bilibili200(
            title,ttype,up,views,reply,dianzan,coin,favorite,share,link)
            values (?,?,?,?,?,?,?,?,?,?)'''
    conn = sqlite3.connect(dbpath)
    try:
        cur = conn.cursor()
        # tuple(row) copies each row, leaving the caller's lists untouched.
        cur.executemany(sql, [tuple(row) for row in datalist])
        conn.commit()
        cur.close()
    finally:
        # Release the connection even if an insert fails.
        conn.close()


def init_db(dbpath):
    """Ensure the ``bilibili200`` table exists in the SQLite file *dbpath*.

    Safe to call repeatedly: ``if not exists`` makes the statement a no-op
    when the table is already present, instead of raising as a plain
    ``create table`` would on a second run.

    Args:
        dbpath: Path of the SQLite database file (created if missing).
    """
    sql = '''
        create table if not exists bilibili200
        (
        id integer  primary  key autoincrement,
        title varchar,
        ttype varchar,
        up varchar,
        views numeric,
        reply numeric,
        dianzan numeric,
        coin numeric,
        favorite numeric,
        share numeric,
        link text
        )
    '''
    conn = sqlite3.connect(dbpath)
    try:
        conn.execute(sql)
        conn.commit()
    finally:
        # Close the connection even when the DDL statement fails.
        conn.close()

# dbpath = "bilibili.db"              #当前目录新建数据库,存储进去
# saveData2DB(b,dbpath)

你可能感兴趣的:(Python项目,爬虫)