Implementing a blog crawler in Python

The crawled posts are saved to Word documents in order.

Target URL: https://www.kingname.info/archives/
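
Since every article is written out with python-docx, here is a minimal, self-contained sketch of the handful of calls the script relies on (Document, add_heading, add_paragraph, add_picture, save). The file name demo.docx, the sample strings, and the image path are placeholders for illustration, not values taken from the crawler.

# -*- coding:utf-8 -*-
# Minimal python-docx sketch (illustrative only)
from docx import Document
from docx.shared import Inches

doc = Document()
doc.add_heading("Sample post title")             # article title
doc.add_heading("2021-02-18", level=1)           # publish time as a sub-heading
doc.add_paragraph("First paragraph of the post body.")
# doc.add_picture("local_image.jpg", width=Inches(5))  # insert a previously downloaded image
doc.save("demo.docx")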

# -*- coding:utf-8 -*-

from bs4 import BeautifulSoup
import urllib.request, urllib.response, urllib.error, urllib.parse
from docx import Document
from docx.shared import Inches
import re



# Fetch a web page and return its HTML source
def request(url):
    html = ""  # HTML source of the fetched page
    URL = urllib.parse.quote(url, safe=':/.')  # percent-encode the URL, keeping ':' '/' '.'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/89.0.4389.114 Safari/537.36"
    }  # request headers, so the site does not reject us as an obvious crawler
    req = urllib.request.Request(URL, headers=headers, method="GET")  # build the request object
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode()
    except urllib.error.URLError as e:
        print("Request failed:", e)
    return html


# Fetch and process a list of archive (listing) pages
def getData(urllist, urleds):
    # Only the difference between urllist and urleds still needs to be crawled
    newurl = list(set(urllist) - set(urleds))
    # Exit condition: no new URLs are left in urllist
    if len(newurl) == 0:
        printok()
        return
    data = []  # HTML of each listing page
    urls = []  # real URLs of the individual articles
    for url in newurl:
        data.append(request(url))  # pages fetched earlier get smaller indexes in the list
        # record the URL as visited once it has been fetched
        urleds.append(url)
    # Process the fetched listing pages
    nexturl = []  # URLs of other listing pages discovered during this pass
    for dataone in data:
        # Use CSS selectors to pull out the fields we need
        bs = BeautifulSoup(dataone, "html.parser")
        # data_year = bs.select("div[class='collection-title'] > h1[class=archive-year]")
        data_title_url = bs.select("article > header > h2 > a")  # links to the actual articles

        # 1: collect the article URLs
        page_start = len(urls)  # index of this page's first article inside `urls`
        for url in data_title_url:
            urls.append("https://www.kingname.info" + url['href'])
        data_title_name = bs.select("article > header > h2 > a > span")  # article titles
        data_title_detailtime = bs.select("article > header > div >time")  # article publish times
        for i in range(0, len(data_title_name)):
            # write each article's title, time, body and images into a Word document
            # print(data_title_detailtime[i].get_text(), end="\t")
            # print(data_title_name[i].get_text(), end="\n")

            # follow the article link and crawl its content
            title_html = request(urls[page_start + i])
            s = BeautifulSoup(title_html, "html.parser")
            imgs = s.findAll("img")
            # parse the article body
            nav = s.select("div[class='post-body'] > p")
            save_to_doc(data_title_name[i].get_text(), data_title_detailtime[i].get_text(), nav, imgs)
            # for n in nav:
            #     print(n.get_text())
        # 2: collect the URLs of the other listing pages
        tem = bs.select("a[class='page-number']")
        for url in tem:
            nexturl.append("https://www.kingname.info" + url['href'])

    # Finally, recurse on the newly discovered listing pages
    getData(nexturl, urleds)

def save_to_doc(title, time, plist, imgs):
    # Write one article (title, time, paragraphs and images) into its own .docx file
    doc = Document()
    save_title_name(doc, title, time)
    print(title)
    # drop the last <img> found on the page (assumed not to belong to the article body)
    if imgs:
        imgs.remove(imgs[len(imgs) - 1])
    src = []
    for img in imgs:
        src.append(img['src'])
    print(len(src))
    flag = 0
    # save the article body, interleaving paragraphs and images in page order
    for n in plist:
        if str(n).find("img") == -1:
            # this node is a plain paragraph
            doc.add_paragraph(n.get_text())
            print(n.get_text())
        else:
            # this node contains an image
            if flag < len(src):
                requestimg(doc, src[flag])
                flag = flag + 1

    doc.save(title[1:-3] + ".docx")  # use (part of) the title as the file name

def save_title_name(doc, title, time):
    # write the article title and its publish time as headings
    doc.add_heading(title)
    print(title)
    doc.add_heading(time, level=1)
    print(time)



def requestimg(doc, url):
    # download the image at `url` and insert it into the Word document
    print(url)
    URL = urllib.parse.quote(url, safe=':/.')
    print(URL)

    # derive a local file name from the tail of the URL; take a longer slice
    # when the URL ends with a recognizable image extension
    tail = URL[-5:]
    if tail.find("jpg") == -1 and tail.find("png") == -1 and tail.find("JPEG") == -1:
        name = URL[-13:]
    else:
        name = URL[-23:]
    # drop any directory part so the file is saved in the working directory
    if name.find("/") != -1:
        sub = name.index("/")
        name = name[sub + 1:]
    print(name)
    try:
        urllib.request.urlretrieve(URL, name)
    except Exception as e:
        print("Image download failed:", e)
    try:
        doc.add_picture(name, width=Inches(5))
    except FileNotFoundError:
        print("Image file not found")
    print("ok")

def test():
    html = request("https://www.kingname.info/2021/02/18/entry-file/")
    bs = BeautifulSoup(html, "html.parser")
    imgs = bs.findAll("img")
    imgs.remove(imgs[len(imgs) - 1])
    for img in imgs:
        print(img['src'])


def printok():
    print("--------------------------------------------------------------", end="\n")
    print("##############################################################", end="\n")
    print("                                                              ", end="\n")
    print("           $$$$$$$$$$      ###     ###                        ", end="\n")
    print("          $$$$$$$$$$$$     ###    ###                         ", end="\n")
    print("         $############$    ###  ####                          ", end="\n")
    print("         $$$$$$$$$$$$$$    ########                           ", end="\n")
    print("         $$$$$$$$$$$$$$    ###  ###                           ", end="\n")
    print("          $$$$$$$$$$$$     ###    ###                         ", end="\n")
    print("            $$$$$$$$       ###     ###                        ", end="\n")
    print("                                                              ", end="\n")
    print("##############################################################", end="\n")
    print("______________________________________________________________", end="\n")


if __name__ == '__main__':
    # test()
    # entry URL
    urls = ['https://www.kingname.info/archives/']
    # URLs that have already been crawled, to avoid crawling them twice
    urleds = []
    # start crawling
    getData(urls, urleds)

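As a quick reference, the sketch below (using a made-up HTML snippet, not content fetched from the real site) shows the two ideas getData combines: extracting pagination links with the a[class='page-number'] selector and taking a set difference against the already-visited URLs so nothing is crawled twice.

# -*- coding:utf-8 -*-
# Illustrative sketch only: the HTML snippet below is made up
from bs4 import BeautifulSoup

listing_html = """
<nav>
  <a class="page-number" href="/archives/page/2/">2</a>
  <a class="page-number" href="/archives/page/3/">3</a>
</nav>
"""

bs = BeautifulSoup(listing_html, "html.parser")
# same selector the crawler uses to discover other listing pages
nexturl = ["https://www.kingname.info" + a['href']
           for a in bs.select("a[class='page-number']")]

urleds = ["https://www.kingname.info/archives/page/2/"]  # already crawled
newurl = list(set(nexturl) - set(urleds))  # only this still needs to be crawled
print(newurl)  # ['https://www.kingname.info/archives/page/3/']
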
Base template: just crawl and print the article text to the console

# -*- coding:utf-8 -*-

from bs4 import BeautifulSoup
import urllib.request, urllib.response, urllib.error, urllib.parse
import re


# Fetch a web page and return its HTML source
def request(url):
    html = ""  # HTML source of the fetched page
    URL = urllib.parse.quote(url, safe=':/.')  # percent-encode the URL, keeping ':' '/' '.'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/89.0.4389.114 Safari/537.36"
    }  # request headers, so the site does not reject us as an obvious crawler
    req = urllib.request.Request(URL, headers=headers, method="GET")  # build the request object
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode()
    except urllib.error.URLError as e:
        print("Request failed:", e)
    return html


# Fetch and process a list of archive (listing) pages
def getData(urllist, urleds):
    # Only the difference between urllist and urleds still needs to be crawled
    newurl = list(set(urllist) - set(urleds))
    # Exit condition: no new URLs are left in urllist
    if len(newurl) == 0:
        printok()
        return
    data = []  # HTML of each listing page
    urls = []  # real URLs of the individual articles
    for url in newurl:
        data.append(request(url))  # pages fetched earlier get smaller indexes in the list
        # record the URL as visited once it has been fetched
        urleds.append(url)
    # Process the fetched listing pages
    nexturl = []  # URLs of other listing pages discovered during this pass
    for dataone in data:
        # Use CSS selectors to pull out the fields we need
        bs = BeautifulSoup(dataone, "html.parser")
        # data_year = bs.select("div[class='collection-title'] > h1[class=archive-year]")
        data_title_url = bs.select("article > header > h2 > a")  # links to the actual articles

        # 1: collect the article URLs
        page_start = len(urls)  # index of this page's first article inside `urls`
        for url in data_title_url:
            urls.append("https://www.kingname.info" + url['href'])
        data_title_name = bs.select("article > header > h2 > a > span")  # article titles
        data_title_detailtime = bs.select("article > header > div >time")  # article publish times
        for i in range(0, len(data_title_name)):
            print(data_title_detailtime[i].get_text(), end="\t")
            print(data_title_name[i].get_text(), end="\n")

            # follow the article link and crawl its content
            title_html = request(urls[page_start + i])
            s = BeautifulSoup(title_html, "html.parser")
            # parse the article body
            nav = s.select("div[class='post-body'] > p")
            for n in nav:
                print(n.get_text())
        # 2: collect the URLs of the other listing pages
        tem = bs.select("a[class='page-number']")
        for url in tem:
            nexturl.append("https://www.kingname.info" + url['href'])

    # Finally, recurse on the newly discovered listing pages
    getData(nexturl, urleds)


def printok():
    print("--------------------------------------------------------------", end="\n")
    print("##############################################################", end="\n")
    print("                                                              ", end="\n")
    print("           $$$$$$$$$$      ###     ###                        ", end="\n")
    print("          $$$$$$$$$$$$     ###    ###                         ", end="\n")
    print("         $############$    ###  ####                          ", end="\n")
    print("         $$$$$$$$$$$$$$    ########                           ", end="\n")
    print("         $$$$$$$$$$$$$$    ###  ###                           ", end="\n")
    print("          $$$$$$$$$$$$     ###    ###                         ", end="\n")
    print("            $$$$$$$$       ###     ###                        ", end="\n")
    print("                                                              ", end="\n")
    print("##############################################################", end="\n")
    print("______________________________________________________________", end="\n")


if __name__ == '__main__':
    # entry URL
    urls = ['https://www.kingname.info/archives/']
    # URLs that have already been crawled, to avoid crawling them twice
    urleds = []
    # start crawling
    getData(urls, urleds)
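
To run either script, the two third-party dependencies need to be installed first (pip install beautifulsoup4 python-docx); everything else comes from the standard library. With Python 3, the full version writes one .docx file per article, plus the downloaded images, into the current working directory, while the base template only prints each article's publish time, title, and body text to the console.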





