Objective: get familiar with using tools such as Selenium and Puppeteer to scrape the basic content of a website.
Task: save the information of every book on one page of https://antispider3.scrape.center/ into its own JSON file; each file is named <book title>.json and contains that book's details.
import bs4
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup
import re
import numpy as np
import json
from pyquery import PyQuery as pq
EC.presence_of_all_elements_located(locator): checks whether at least one element matching the locator exists on the page; if so, it returns a list of all matching elements, otherwise an empty list.
browser = webdriver.Chrome()  # create the browser (driver) object
browser.get('https://antispider3.scrape.center/')
# Use CSS_SELECTOR to match all elements with class "item"; the returned list is not needed here, we only wait for the elements to appear
WebDriverWait(browser, 10) \
.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.item')))
html = browser.page_source  # page source after the JS-rendered content has loaded
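As an aside, the list returned by the wait can also be used directly instead of being discarded; a minimal sketch reusing the selector above:

items = WebDriverWait(browser, 10) \
    .until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.item')))
print(len(items))  # how many .item elements the page contains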
pyquery usage:
doc = pq(html)
# print(html)
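To see pyquery's CSS selection in isolation, here is a tiny sketch on a made-up HTML fragment (the fragment exists only for illustration):

demo = pq('<div class="item"><p class="authors">A, B</p></div>')
print(demo('.authors').text())  # prints: A, B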
Define lists for the titles, authors, and cover URLs:
titles = []
authors = []
urls = []
soup = BeautifulSoup(html, "html.parser")  # standard BeautifulSoup setup
soup.find_all(): searches all tag descendants of the current tag and checks each one against the filter conditions.
.text: the text content of a node.
re.findall(): returns a list containing all matches of the pattern.
numpy.argsort(): returns the indices that would sort the array values in ascending order:
>>> import numpy as np
>>> x = np.array([1, 4, 3, -1, 5, 9])
>>> x.argsort()
array([3, 0, 2, 1, 4, 5])
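A similarly quick check of find_all, .text, and re.findall, on a toy snippet (the HTML fragment is made up purely for illustration):

>>> demo = BeautifulSoup('<p>price: <b>42</b> yuan</p>', 'html.parser')
>>> demo.find_all('b')
[<b>42</b>]
>>> demo.find_all('b')[0].text
'42'
>>> re.findall(r'\d+', 'a1b22c333')
['1', '22', '333']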
def get_title(tag: bs4.Tag) -> str:
    """Reassemble a shuffled title: each <span> holds one character, and the
    numeric offset in its style attribute gives the character's real position."""
    tokens = []
    tokens_pos = []
    for span in tag.children:
        if not isinstance(span, bs4.Tag):
            continue  # skip whitespace text nodes between the spans
        matches = re.findall(r"\S+", span.text)
        token = matches[0] if matches else " "
        tokens.append(token)
        style = span["style"]
        # r"\d+" matches the numeric part of the style, e.g. the 16 in "left:16px"
        token_pos = int(re.findall(r"\d+", style)[0])
        tokens_pos.append(token_pos)
    # argsort() returns the indices that sort the offsets in ascending order
    idxs = np.array(tokens_pos).argsort()
    name = ""
    for idx in idxs:
        name += tokens[idx]
    return name
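As a sanity check, get_title can be fed a hand-made fragment that mimics the site's shuffled markup (the characters and offsets below are invented for illustration):

demo_h3 = BeautifulSoup(
    '<h3 class="m-b-sm name">'
    '<span style="left:16px">界</span>'
    '<span style="left:0px">世</span>'
    '</h3>', 'html.parser').h3
print(get_title(demo_h3))  # prints 世界: the spans are reordered by offset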
# Scrape the titles
h3s = soup.find_all("h3")
for h3 in h3s:  # the two <h3> class variants need different handling
    title = ""
    if h3["class"] == ["name", "whole"]:
        title = h3.text  # plain title, already in reading order
    elif h3["class"] == ["m-b-sm", "name"]:
        title = get_title(h3)  # shuffled title, reorder by CSS offsets
    titles.append(title)
    print(title)
tag["src"]
src=“”# 爬取url
tag_img = soup.find_all("img", {"class": "cover"})
for tag in tag_img:
    url = tag["src"]
    urls.append(url)
author.text(): in pyquery, .text() returns the text content of the selection.
# Scrape the author names; .authors is a child node of .item
authors1 = doc('.item .authors')
for author in authors1.items():
    authors.append(author.text())
# Save each book as a JSON file named after its title, as the task requires
for i in range(len(titles)):
    book_dict = {
        "title": titles[i],
        "cover_url": urls[i],
        "authors": authors[i],
    }
    with open("./{}.json".format(titles[i]), "w+", encoding="utf-8") as fp:
        json.dump(book_dict, fp, ensure_ascii=False, indent=2)
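One caveat with naming files after titles: a title may contain characters that are illegal in file names (such as / or ?). A defensive variant under that assumption, using a hypothetical sanitize helper that is not part of the original code:

def sanitize(name: str) -> str:
    # replace characters that common filesystems reject in file names
    return re.sub(r'[\\/:*?"<>|]', "_", name)

# then open the file as: open("./{}.json".format(sanitize(titles[i])), ...)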