python爬虫爬取个人博客导入sqlite数据库

python爬虫爬取个人博客导入sqlite数据库

【代码示例】

import re
from bs4 import BeautifulSoup
import urllib.request,urllib.error
import sqlite3


def main():
    """Scrape the blog's article list and store the result in SQLite."""
    # Target database file, opened relative to the current working directory.
    database_file = "blogs.db"
    # URL prefix of the blog's article list that is handed to getData.
    list_url_prefix = "https://blog.csdn.net/gets_s/article/list/"
    # Fetch the rows first, then hand them straight to the storage step.
    saveData(database_file, getData(list_url_prefix))

# Regular expressions used to pull fields out of one article entry's HTML.
# NOTE(review): the tag text inside the original patterns was lost (they were
# left without any HTML anchors and one literal was unterminated). The tags
# below are a reconstruction and are assumed, not confirmed -- verify them
# against the live page markup before relying on the results.
# Title: the text that follows the article-type badge inside the entry link.
findtitle = re.compile(r'<span class="article-type.*?">.*?</span>(.*?)</a>', re.S)
# Link: the href of the first anchor in the entry.
findlink = re.compile(r'<a href="(.*?)"')
# Publication time: presumably rendered in a span with class "date".
findtime = re.compile(r'<span class="date">(.*?)</span>')
# Read count: presumably rendered in a span with class "num".
findnum = re.compile(r'<span class="num">(.*?)</span>')
def _first_match(pattern, text):
    """Return the first match of *pattern* in *text*, or "" if there is none."""
    found = re.findall(pattern, text)
    return found[0] if found else ""


def getData(baseurl, pages=2):
    """Scrape article metadata from the blog's paginated list pages.

    Args:
        baseurl: URL prefix of the article list; the 1-based page number is
            appended to it to build each page URL.
        pages: Number of list pages to fetch, starting at page 1. Defaults
            to 2.

    Returns:
        A list with one ``[title, link, time, num]`` list of strings per
        article entry found. A field whose pattern does not match is stored
        as "" instead of raising IndexError.
    """
    datalist = []
    for page in range(1, pages + 1):
        html = askURL(baseurl + str(page))

        # Parse the downloaded page and walk over every article entry.
        soup = BeautifulSoup(html, "html.parser")
        for entry in soup.find_all('div', class_="article-item-box csdn-tracking-statistics"):
            markup = str(entry)

            # Titles span several lines in the markup; drop the line breaks
            # and the surrounding indentation.
            title = _first_match(findtitle, markup).replace("\n", "").strip()
            link = _first_match(findlink, markup)
            posted = _first_match(findtime, markup)
            num = _first_match(findnum, markup)

            datalist.append([title, link, posted, num])

    return datalist
def askURL(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address to request.

    Returns:
        The decoded response text, or "" when the request fails with
        ``urllib.error.URLError`` (its code and/or reason are printed).
    """
    # Send a browser-like User-Agent header with the request.
    head = {
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 80.0.3987.163Safari / 537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read/decode raises.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries a status code; a plain URLError only has a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def saveData(dbpath,datalist):
    """Initialise the database and insert every scraped row into ``blogs``.

    Args:
        dbpath: Path of the SQLite database file.
        datalist: Iterable of sequences whose first four items are, in
            order, title, blog_link, sigup_time and num.
    """
    init_db(dbpath)

    # Placeholders let sqlite3 bind the values itself, so quotes inside a
    # title or link can neither break nor alter the statement.
    sql = '''
        insert into blogs(
        title, blog_link, sigup_time, num)
        values(?, ?, ?, ?)'''
    rows = [tuple(row[:4]) for row in datalist]

    conn = sqlite3.connect(dbpath)
    try:
        conn.executemany(sql, rows)
        conn.commit()
    finally:
        # Always release the connection, even when an insert fails.
        conn.close()
def init_db(dbpath):
    """Create the ``blogs`` table in the database at *dbpath* if it is missing.

    Safe to call repeatedly: an existing table is left untouched.

    Args:
        dbpath: Path of the SQLite database file; sqlite3 creates the file
            when it does not exist yet.
    """
    # "if not exists" keeps a second run from failing on the existing table.
    sql = '''
           create table if not exists blogs
           (
           id integer primary key autoincrement,
           title text,
           blog_link text,
           sigup_time numeric,
           num numeric
           )
       '''
    conn = sqlite3.connect(dbpath)
    try:
        conn.execute(sql)
        conn.commit()
    finally:
        # Always release the connection, even when the statement fails.
        conn.close()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

【运行结果】
python爬虫爬取个人博客导入sqlite数据库_第1张图片

【思路总结】
python爬虫爬取个人博客导入sqlite数据库_第2张图片

你可能感兴趣的:(python爬虫,代码,python,sqlite,爬虫)