从网站上爬取疫情数据的方法

从网站上爬取疫情数据的方法

爬取疫情数据的网站是:https://ncov.dxy.cn/ncovh5/view/pneumonia(丁香园疫情页面)。

这是实现爬虫的主要方法:

# 国内各省疫情情况(主爬虫脚本)
import requests
import re
from util.commonFunc import WriteToDb

def parse_url(page_url):
    """Fetch the Dingxiangyuan epidemic page, parse per-region stats, write them to MySQL.

    :param page_url: URL of the epidemic status page to scrape.
    :returns: None. Rows are written to the database as a side effect.
    """
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    try:
        r = requests.get(page_url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        html = r.text
    except requests.RequestException:
        # Bug fix: the original bare `except:` only printed and fell
        # through, which then raised NameError on the undefined `html`
        # below. Abort the function on any request failure instead.
        print('访问失败')
        return

    # Replace "provinceShortName" with "cityName" first so that provinces
    # and cities are both captured by the single pattern below.
    html = re.sub(r'provinceShortName', 'cityName', html)
    # Keep only the China section ("getAreaStat") of the embedded JS data.
    match = re.search('{ window.getAreaStat =.+?window.getIndexRecommendList2', html)
    if match is None:
        # Page layout changed or unexpected content — nothing to parse.
        return
    html = match.group()
    cities = re.findall(r"""
    {.*?"cityName":"(.+?)",                         # region name
    "currentConfirmedCount":(-?\d+),                  # currently confirmed
    "confirmedCount":(\d+),                         # accumulated confirmed
    .+?"curedCount":(\d+),                          # cured
    "deadCount":(\d+)                               # dead
    """, html, re.VERBOSE|re.DOTALL)

    # Connection settings of the target database.
    writer = WriteToDb('localhost', '3306', 'first_db')
    writer.connect()

    for city in cities:
        city = list(city)
        print(city)
        writer.writetoSQL(city)

def main():
    """Entry point: scrape the Dingxiangyuan epidemic page into MySQL."""
    parse_url('https://ncov.dxy.cn/ncovh5/view/pneumonia')

# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

这是实现爬虫并导入数据库需要的辅助方法:

# 数据库写入辅助类 WriteToDb
import pymysql

class WriteToDb(object):
    """Write scraped epidemic rows into a MySQL table via pymysql."""

    def __init__(self, hostname, port, database):
        """Store connection settings; call connect() before writing.

        :param hostname: MySQL server host name.
        :param port: MySQL server port (string or int).
        :param database: name of the database to write into.
        """
        self.host = hostname
        self.port = port
        self.database = database

    def connect(self):
        """Open the MySQL connection and create a cursor."""
        # Bug fix: the original ignored the constructor arguments and
        # hard-coded host='localhost', db='first_db'; use the stored
        # settings instead so the class is actually configurable.
        # NOTE(review): user/password are still hard-coded — move them
        # to configuration before production use.
        self.connent = pymysql.connect(host=self.host, port=int(self.port),
                                       user='root', passwd='admin',
                                       db=self.database, charset='utf8')
        self.cursor = self.connent.cursor()

    def writetoSQL(self, data):
        """Insert one row into covid_current.

        :param data: sequence of 5 values — region name, currently
            confirmed, accumulated confirmed, cured, dead.
        """
        # Bug fix: the original interpolated scraped values directly into
        # the SQL string (`%` formatting), which breaks on quotes and is
        # an SQL-injection vector. Use a parameterized query so pymysql
        # escapes the values. The table is unqualified because connect()
        # now selects the target database explicitly.
        sql = ("INSERT INTO covid_current"
               "(province_name, current_comfirmed, acculate_comfirmed, cured, dead) "
               "VALUES (%s, %s, %s, %s, %s)")
        print(sql)
        self.cursor.execute(sql, (data[0], data[1], data[2], data[3], data[4]))
        self.connent.commit()

通过以上两段代码就可以实现从丁香园网站爬取疫情数据并写入MySQL数据库。
写入MySQL数据库后效果如下:
从网站上爬取疫情数据的方法_第1张图片

你可能感兴趣的:(数据开发,sql,数据库)