Final course project: scraping postgraduate supplementary-admission (考研补录) data with a Python crawler and analyzing it

1. Fetching the data with the crawler

import requests
import pandas as pd
from tqdm import tqdm


def get_page(n):
    """Fetch page n of the 2022 adjustment list and return it as a DataFrame."""
    url = 'https://api.kaoyan.cn/pc/adjust/adjustList'

    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"
    }
    body = {
        "degree_type": "",
        "keyword": "",
        "level1": "",
        "level2": "",
        "limit": 9000,
        "page": n,        # use the requested page instead of a hard-coded 1
        "year": 2022,
    }
    req = requests.post(url, headers=header, data=body).json()
    content = req["data"]["data"]
    if not content:       # past the last page: nothing to collect
        return pd.DataFrame()
    order = ['tj_id', 'spe_id', 'school_id', 'year', 'recruit_number', 'school_name',
             'special_code', 'special_name', 'recruit_type_name', 'depart_name', 'province_name']
    pf = pd.DataFrame(list(content))[order]
    pf.fillna(' ', inplace=True)
    return pf


if __name__ == '__main__':
    # collect every page (assumed to start at 1), then write a single Excel file
    # instead of overwriting the same file on every call
    pages = [get_page(i) for i in tqdm(range(1, 12))]
    result = pd.concat(pages, ignore_index=True)
    with pd.ExcelWriter('F:\\考研数据\\调剂数据.xlsx') as writer:
        result.to_excel(writer, index=False)
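
Before looping over pages, it is worth confirming the response layout with one small request. A minimal sanity-check sketch, assuming the data -> data -> list-of-dicts shape used above (the limit of 5 here is just an illustrative guess; the API may ignore it):

import requests

url = 'https://api.kaoyan.cn/pc/adjust/adjustList'
header = {"User-Agent": "Mozilla/5.0"}
body = {"degree_type": "", "keyword": "", "level1": "", "level2": "",
        "limit": 5, "page": 1, "year": 2022}

resp = requests.post(url, headers=header, data=body).json()
records = resp["data"]["data"]
print(len(records), "records returned")
if records:
    # the field names here should match the 'order' list used above
    print(sorted(records[0].keys()))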

2. Analyzing the fetched data

import openpyxl, pprint

print("Opening workbook...... ")

file = "F:\\考研数据\\调剂数据.xlsx"
file2 = "F:\\考研数据\\result1.py"
file3 = "F:\\考研数据\\result2.py"
wb = openpyxl.load_workbook(file)

sheet = wb.active

# Data1: per university, how many adjustment entries and how much quota fall in each province
Data1 = {}
# Data2: per province, overall entry count and total quota, grouped under a single '' key
# so that the visualization step can read it back as result2.con['']
Data2 = {}

for row in range(2, sheet.max_row + 1):
    # province the quota is offered in (column K = province_name)
    Province = sheet['K' + str(row)].value
    # quota of this entry (column E = recruit_number)
    Recruit_number = sheet['E' + str(row)].value
    # university (column F = school_name)
    School_name = sheet['F' + str(row)].value

    Data1.setdefault(School_name, {})
    Data1[School_name].setdefault(Province, {'tract': 0, 'Recruit_number': 0})
    Data1[School_name][Province]['tract'] += 1
    Data1[School_name][Province]['Recruit_number'] += int(Recruit_number)

    Data2.setdefault('', {})
    Data2[''].setdefault(Province, {'Recruit_number': 0, 'NUM': 0})
    Data2[''][Province]['NUM'] += 1
    Data2[''][Province]['Recruit_number'] += int(Recruit_number)

print("Writing workbook......")
# write each dict as an importable module ('con = {...}') so step 3 can use result2.con['']
with open(file2, 'w', encoding='utf-8') as resultFile1:
    resultFile1.write('con = ' + pprint.pformat(Data1))
with open(file3, 'w', encoding='utf-8') as resultFile2:
    resultFile2.write('con = ' + pprint.pformat(Data2))
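
As a cross-check, the same per-province totals can be computed with pandas straight from the Excel file written in step 1. This is only a sketch of an alternative, not part of the original pipeline:

import pandas as pd

df = pd.read_excel("F:\\考研数据\\调剂数据.xlsx")
# recruit_number may be read back as text; force it to numbers before summing
df["recruit_number"] = pd.to_numeric(df["recruit_number"], errors="coerce").fillna(0)

by_province = (
    df.groupby("province_name")
      .agg(NUM=("school_name", "size"),                 # number of adjustment entries
           Recruit_number=("recruit_number", "sum"))    # total quota per province
)
print(by_province.sort_values("Recruit_number", ascending=False).head(10))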

3. Visualization and generating the HTML page

#!/usr/bin/python
# -*- coding: utf-8 -*-

from pyecharts import options as opts
from pyecharts.charts import Pie
# import the per-province result written in step 2 (result2.py placed in the 数据包 package)
from 数据包 import result2

python_data = result2.con['']
university_name = []   # province names
university_num = []    # number of adjustment entries per province
student_num = []       # total quota per province
for key in python_data:
    university_name.append(key)
    university_num.append(python_data[key]['NUM'])
    student_num.append(python_data[key]['Recruit_number'])
l = list(zip(university_name, student_num))
c = (
    Pie()
    .add(
        "",
        l,
        radius=["40%", "55%"],
        label_opts=opts.LabelOpts(
            position="outside",
            formatter="{a|{a}}{abg|}\n{hr|}\n {b|{b}: }{c}  {per|{d}%}  ",
            background_color="#eee",
            border_color="#aaa",
            border_width=1,
            border_radius=4,
            rich={
                "a": {"color": "#999", "lineHeight": 22, "align": "center"},
                "abg": {
                    "backgroundColor": "#e3e3e3",
                    "width": "100%",
                    "align": "right",
                    "height": 20,
                    "borderRadius": [4, 4, 0,0 ],
                },
                "hr": {
                    "borderColor": "#aaa",
                    "width": "100%",
                    "borderWidth": 5,
                    "height": 0,
                },
                "b": {"fontSize": 20, "lineHeight": 35},
                "per": {
                    "color": "#eee",
                    "backgroundColor": "#334455",
                    "padding": [2, 4],
                    "borderRadius": 5,
                },
            },
        ),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="大学在各省补录人数"),
        legend_opts=opts.LegendOpts(padding=20, pos_left=200),
    )
    .render("pie_rich_label.html")
)
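
Besides the pie chart, the per-province entry counts (NUM) can be shown as a bar chart from the same data. A minimal sketch, assuming result2.py was written with the con = prefix from step 2; the output name bar_province_count.html is just an example:

from pyecharts import options as opts
from pyecharts.charts import Bar
from 数据包 import result2

provinces = list(result2.con[''].keys())
entry_counts = [result2.con[''][p]['NUM'] for p in provinces]

bar = (
    Bar()
    .add_xaxis(provinces)
    .add_yaxis("补录条目数", entry_counts)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="各省补录条目数"),
        # rotate the province labels so they do not overlap
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=45)),
    )
    .render("bar_province_count.html")
)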

4. Serving the generated page over a socket

'''
Author : 一盆萝卜丁
Date   : 2022/12
'''
# -*- coding: utf-8 -*-

import socket

server_html = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

server_html.bind(("127.0.0.1", 8080))

server_html.listen(10)

while True:
    conn, addr = server_html.accept()
    # read (and discard) the browser's request so the connection is drained
    msg = conn.recv(1024 * 12)
    print(conn)
    # open the generated chart page in binary mode and read its contents
    with open("pie_rich_label.html", "rb") as file_html:
        data = file_html.read()
    # a minimal HTTP status line must come before the body
    conn.sendall(bytes("HTTP/1.1 200 OK\r\n\r\n", "utf-8"))
    # send the page itself
    conn.sendall(data)

    conn.close()
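
Once the server is running, opening http://127.0.0.1:8080 in a browser should display the chart. For quick local testing, the standard library can serve the same file without hand-writing the HTTP response; a tiny sketch (run from the folder that contains pie_rich_label.html):

from http.server import HTTPServer, SimpleHTTPRequestHandler

# serve the current directory at http://127.0.0.1:8080/pie_rich_label.html
HTTPServer(("127.0.0.1", 8080), SimpleHTTPRequestHandler).serve_forever()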
