Saving the crawled articles to Word in order
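
Before the full script, here is a minimal sketch of the flow it builds on: fetch a page with urllib, extract paragraphs with BeautifulSoup, and write them into a .docx file with python-docx. The selector, the demo heading, and the output file name are placeholders for illustration only, not the ones the real script uses.

import urllib.request
from bs4 import BeautifulSoup
from docx import Document

# Fetch one page (the blog archive is used here only as a demo target)
html = urllib.request.urlopen("https://www.kingname.info/archives/").read().decode()
soup = BeautifulSoup(html, "html.parser")

# Write every <p> on the page into a Word document (placeholder selector)
doc = Document()
doc.add_heading("Demo page")
for p in soup.select("p"):
    doc.add_paragraph(p.get_text())
doc.save("demo.docx")  # placeholder output file name
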
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib.request, urllib.error, urllib.parse
from docx import Document
from docx.shared import Inches

# Fetch a web page and return its HTML source
def request(url):
    html = ""  # HTML source of the fetched page
    URL = urllib.parse.quote(url, safe=':/.')  # URL-encode the address to be fetched
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"
    }  # request headers, so the site does not block us as a crawler
    req = urllib.request.Request(URL, headers=headers, method="GET")  # build the request object
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode()
    except urllib.error.URLError as e:  # covers HTTPError as well as connection errors
        print("Request failed:", e)
    return html

# Fetch the listing pages and extract the article data
def getData(urllist, urleds):
    # Only the difference between urllist and urleds needs to be crawled
    newurl = list(set(urllist) - set(urleds))
    # Exit condition: no new URLs left in urllist
    if len(newurl) == 0:
        printok()
        return
    data = []  # HTML of every listing page
    for url in newurl:
        data.append(request(url))  # append in order, so pages fetched earlier get smaller indices
        # Record the address as crawled
        urleds.append(url)
    # Process the data list
    for dataone in data:
        # Use CSS selectors to pull out the pieces we need
        bs = BeautifulSoup(dataone, "html.parser")
        # data_year = bs.select("div[class='collection-title'] > h1[class=archive-year]")
        data_title_url = bs.select("article > header > h2 > a")  # links to the real article pages
        # 1: collect the article URLs on this listing page
        urls = []  # real link addresses of the articles
        for url in data_title_url:
            urls.append("https://www.kingname.info" + url['href'])
        data_title_name = bs.select("article > header > h2 > a > span")  # article titles
        data_title_detailtime = bs.select("article > header > div > time")  # publication times
        for i in range(0, len(data_title_name)):
            # Write this article into the Word document
            # print(data_title_detailtime[i].get_text(), end="\t")
            # print(data_title_name[i].get_text(), end="\n")
            # Visit the article link and crawl its content
            title_html = request(urls[i])
            s = BeautifulSoup(title_html, "html.parser")
            imgs = s.findAll("img")
            # Parse the article body
            nav = s.select("div[class='post-body'] > p")
            save_to_doc(data_title_name[i].get_text(), data_title_detailtime[i].get_text(), nav, imgs)
            # for n in nav:
            #     print(n.get_text())
        # 2: collect the URLs of the other listing pages (pagination links)
        nexturl = []
        tem = bs.select("a[class='page-number']")
        for url in tem:
            nexturl.append("https://www.kingname.info" + url['href'])
        # Finally, recurse on the newly found pages
        getData(nexturl, urleds)

def save_to_doc(title, time, plist, imgs):
    doc = Document()
    save_title_name(doc, title, time)
    print(title)
    if imgs:
        imgs.pop()  # the last <img> on the page is not part of the article body
    src = []
    for img in imgs:
        src.append(img['src'])
    print(len(src))
    flag = 0
    # Save the article body, paragraph by paragraph
    for n in plist:
        if str(n).find("img") == -1:
            # plain text paragraph
            doc.add_paragraph(n.get_text())
            print(n.get_text())
        else:
            # paragraph containing an image
            if flag < len(src):
                requestimg(doc, src[flag])
                flag = flag + 1
    doc.save(title[1:-3] + ".docx")  # the slice trims the stray characters around the title

def save_title_name(doc, title, time):
    doc.add_heading(title)
    print(title)
    doc.add_heading(time, level=1)
    print(time)

# Download an image and insert it into the Word document
def requestimg(doc, url):
    print(url)
    URL = urllib.parse.quote(url, safe=':/.')
    print(URL)
    # Work out a local file name from the tail of the image URL
    if URL[-5:].find("jpg") == -1 and URL[-5:].find("png") == -1 and URL[-5:].find("JPEG") == -1:
        name = URL[-13:]
    else:
        name = URL[-23:]
    if name.find("/") != -1:
        name = name[name.rfind("/") + 1:]  # keep only the part after the last slash
    print(name)
    try:
        urllib.request.urlretrieve(URL, name)
    except Exception as e:
        print("Image download failed:", e)
    try:
        doc.add_picture(name, width=Inches(5))
    except FileNotFoundError:
        print("Image file not found")
    print("ok")

def test():
    html = request("https://www.kingname.info/2021/02/18/entry-file/")
    bs = BeautifulSoup(html, "html.parser")
    print("".find("img"))  # quick check: find() returns -1 when the substring is absent
    imgs = bs.findAll("img")
    imgs.remove(imgs[len(imgs) - 1])  # drop the last <img>, which is not an article image
    for img in imgs:
        print(img['src'])

# Print an "OK" banner once crawling is finished
def printok():
    print("--------------------------------------------------------------", end="\n")
    print("##############################################################", end="\n")
    print(" ", end="\n")
    print(" $$$$$$$$$$ ### ### ", end="\n")
    print(" $$$$$$$$$$$$ ### ### ", end="\n")
    print(" $############$ ### #### ", end="\n")
    print(" $$$$$$$$$$$$$$ ######## ", end="\n")
    print(" $$$$$$$$$$$$$$ ### ### ", end="\n")
    print(" $$$$$$$$$$$$ ### ### ", end="\n")
    print(" $$$$$$$$ ### ### ", end="\n")
    print(" ", end="\n")
    print("##############################################################", end="\n")
    print("______________________________________________________________", end="\n")

if __name__ == '__main__':
    # test()
    # Entry address
    urls = ['https://www.kingname.info/archives/']
    # Addresses that have already been crawled, to avoid fetching the same page twice
    urleds = []
    # Start crawling
    getData(urls, urleds)
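
For comparison, the listing below is the simpler variant of the same crawler: it walks the archive pages in the same way but only prints the article titles and text to the console instead of writing them to a Word document.
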
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib.request, urllib.error, urllib.parse

# Fetch a web page and return its HTML source
def request(url):
    html = ""  # HTML source of the fetched page
    URL = urllib.parse.quote(url, safe=':/.')  # URL-encode the address to be fetched
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"
    }  # request headers, so the site does not block us as a crawler
    req = urllib.request.Request(URL, headers=headers, method="GET")  # build the request object
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode()
    except urllib.error.URLError as e:  # covers HTTPError as well as connection errors
        print("Request failed:", e)
    return html

# Fetch the listing pages and print the article data
def getData(urllist, urleds):
    # Only the difference between urllist and urleds needs to be crawled
    newurl = list(set(urllist) - set(urleds))
    # Exit condition: no new URLs left in urllist
    if len(newurl) == 0:
        printok()
        return
    data = []  # HTML of every listing page
    for url in newurl:
        data.append(request(url))  # append in order, so pages fetched earlier get smaller indices
        # Record the address as crawled
        urleds.append(url)
    # Process the data list
    for dataone in data:
        # Use CSS selectors to pull out the pieces we need
        bs = BeautifulSoup(dataone, "html.parser")
        # data_year = bs.select("div[class='collection-title'] > h1[class=archive-year]")
        data_title_url = bs.select("article > header > h2 > a")  # links to the real article pages
        # 1: collect the article URLs on this listing page
        urls = []  # real link addresses of the articles
        for url in data_title_url:
            urls.append("https://www.kingname.info" + url['href'])
        data_title_name = bs.select("article > header > h2 > a > span")  # article titles
        data_title_detailtime = bs.select("article > header > div > time")  # publication times
        for i in range(0, len(data_title_name)):
            print(data_title_detailtime[i].get_text(), end="\t")
            print(data_title_name[i].get_text(), end="\n")
            # Visit the article link and crawl its content
            title_html = request(urls[i])
            s = BeautifulSoup(title_html, "html.parser")
            # Parse the article body
            nav = s.select("div[class='post-body'] > p")
            for n in nav:
                print(n.get_text())
        # 2: collect the URLs of the other listing pages (pagination links)
        nexturl = []
        tem = bs.select("a[class='page-number']")
        for url in tem:
            nexturl.append("https://www.kingname.info" + url['href'])
        # Finally, recurse on the newly found pages
        getData(nexturl, urleds)

# Print an "OK" banner once crawling is finished
def printok():
    print("--------------------------------------------------------------", end="\n")
    print("##############################################################", end="\n")
    print(" ", end="\n")
    print(" $$$$$$$$$$ ### ### ", end="\n")
    print(" $$$$$$$$$$$$ ### ### ", end="\n")
    print(" $############$ ### #### ", end="\n")
    print(" $$$$$$$$$$$$$$ ######## ", end="\n")
    print(" $$$$$$$$$$$$$$ ### ### ", end="\n")
    print(" $$$$$$$$$$$$ ### ### ", end="\n")
    print(" $$$$$$$$ ### ### ", end="\n")
    print(" ", end="\n")
    print("##############################################################", end="\n")
    print("______________________________________________________________", end="\n")

if __name__ == '__main__':
    # Entry address
    urls = ['https://www.kingname.info/archives/']
    # Queue of addresses that have already been crawled, to avoid fetching the same page twice
    urleds = []
    # Start crawling
    getData(urls, urleds)