Saving Python Data to MySQL

This article again uses Hupu as the example, saving the scraped data to a MySQL database:

First, import the required libraries (the MySQLdb module is provided by the mysqlclient package, installable with pip install mysqlclient):

import requests
from bs4 import BeautifulSoup
import time
import random
import MySQLdb

Define a function to scrape the data:

def get_information(page=0):
    # Hupu numbers its list pages from 1, so page 0 maps to bxj-postdate-1
    url = 'https://bbs.hupu.com/bxj-postdate-' + str(page + 1)
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
        "Referer": "https://bbs.hupu.com/bxj"
    }
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")
    out = soup.find("ul", attrs={"class": "for-list"})
    datas = out.find_all('li')
    datas_list = []
    for data in datas:
        try:
            title = data.find('a', attrs={"class": "truetit"}).text.split()[0]
            artical_link = "https://bbs.hupu.com" + data.find('a', attrs={"class": "truetit"}).attrs['href']
            author = data.find('a', class_="aulink").text
            author_link = data.find('a', class_="aulink").attrs['href']
            create_time = data.find('a', style="color:#808080;cursor: initial; ").text
            reply_num = data.find('span', class_='ansour box').text.split("/")[0]
            lastest_reply = data.find('span', class_='endauthor').text
            lastest_reply_time = data.find('div', class_='endreply box').a.text
        except (AttributeError, IndexError):
            # Skip list items that do not match the expected structure,
            # instead of silently aborting the rest of the page
            continue
        datas_list.append({"title": title, "artical_link": artical_link,
                           "author": author, "author_link": author_link,
                           "create_time": create_time, "reply_num": reply_num,
                           "lastest_reply": lastest_reply,
                           "lastest_reply_time": lastest_reply_time})
    return datas_list
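
The INSERT statements in the next step assume a hupu_datas table already exists in the scraping database. A minimal sketch of a matching schema, created once through the same MySQLdb connection — the column types and lengths here are my assumptions, not taken from the original setup:

conn = MySQLdb.connect(host='localhost', user='root', password='12345678', db='scraping', charset="utf8")
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS hupu_datas (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        artical_link VARCHAR(255),
        author VARCHAR(100),
        author_link VARCHAR(255),
        create_time VARCHAR(50),
        reply_num VARCHAR(20),
        lastest_reply VARCHAR(100),
        lastest_reply_time VARCHAR(50)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
cur.close()
conn.close()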

Run it and save the results to MySQL:

if __name__ == "__main__":
    conn = MySQLdb.connect(host='localhost', user='root', password='12345678', db='scraping', charset="utf8")  # create the connection
    cur = conn.cursor()  # create a cursor
    for page in range(10):
        datas = get_information(page)
        for data in datas:
            cur.execute("INSERT INTO hupu_datas (title, artical_link, author, author_link, create_time, reply_num, lastest_reply, lastest_reply_time) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)",
                        (data['title'], data['artical_link'], data['author'], data['author_link'], data['create_time'], data['reply_num'], data['lastest_reply'], data['lastest_reply_time']))
        print("Scraping page %s" % (page + 1))
        time.sleep(1 + random.random() * 2)  # random delay to avoid hammering the server
    conn.commit()  # commit the transaction before closing
    cur.close()  # close the cursor
    conn.close()  # close the connection
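
As a design note, inserting row by row issues one round trip per record; cursor.executemany batches an entire page into a single call. A sketch of the inner loop rewritten that way, using the same table and columns as above:

rows = [(d['title'], d['artical_link'], d['author'], d['author_link'],
         d['create_time'], d['reply_num'], d['lastest_reply'], d['lastest_reply_time'])
        for d in datas]
cur.executemany(
    "INSERT INTO hupu_datas (title, artical_link, author, author_link, create_time, reply_num, lastest_reply, lastest_reply_time) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)",
    rows)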

Check the results in MySQL:
[Screenshot: querying the hupu_datas table in MySQL]
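
You can also verify from Python instead of the mysql client; a quick sketch, reusing the same connection settings as above:

conn = MySQLdb.connect(host='localhost', user='root', password='12345678', db='scraping', charset="utf8")
cur = conn.cursor()
cur.execute("SELECT COUNT(*) FROM hupu_datas")
print("rows saved:", cur.fetchone()[0])
cur.execute("SELECT title, author, create_time FROM hupu_datas LIMIT 5")
for row in cur.fetchall():
    print(row)
cur.close()
conn.close()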
