Table of Contents
- Simple crawler
- 1. Send the request
- 2. Filter the information
- 3. Save the results
Three major steps: send the request, filter the information, and save the results.
Mainstream Python libraries: requests for HTTP requests and BeautifulSoup (bs4) for HTML parsing.
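Both are third-party packages; if they are not already available, they can usually be installed with pip (assuming a standard Python 3 environment):

    pip install requests beautifulsoup4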
1. Send the request

requests API reference: https://www.osgeo.cn/requests/
import requests

# Target address
url1 = "http://www.ibiqu.org"
url2 = "/book/50537/"

# Send the request
info = requests.get(url1 + url2)

# Check whether the request succeeded
if info.ok:
    html = info.text
    print(html)
else:
    print("Network error")
Result: the raw HTML of the book's index page is printed to the console.
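In practice, some sites reject clients that lack a browser-like User-Agent header, and a slow server can otherwise block the script indefinitely. A minimal hardened variant of the same request, assuming the site accepts a common desktop User-Agent (the exact header string below is just an example):

import requests

url = "http://www.ibiqu.org/book/50537/"
# A browser-like User-Agent; the exact string is an arbitrary example
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

try:
    # timeout is in seconds; connection problems raise an exception
    info = requests.get(url, headers=headers, timeout=10)
    info.raise_for_status()  # raise for 4xx/5xx status codes
    print(info.text[:200])   # print only the first 200 characters
except requests.RequestException as e:
    print(f"Request failed: {e}")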
2. Filter the information

BeautifulSoup API reference: https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/
import requests
from bs4 import BeautifulSoup

# Filter the information
def handle(html):
    # Build the parse tree
    soup = BeautifulSoup(html, 'html.parser')
    # Look up elements with a CSS selector; returns a list
    links = soup.select("dd>a")
    # Walk the matches
    for link in links:
        print(f'{link.attrs["href"]}\t{link.text}')

# Same request code as before
url1 = "http://www.ibiqu.org"
url2 = "/book/50537/"
info = requests.get(url1 + url2)
if info.ok:
    handle(info.text)
else:
    print("Network error")
Result: every chapter link is printed as an href/title pair, one per line.
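CSS selectors via select() are only one way to search the parse tree. A small sketch of other common BeautifulSoup lookups, run against an inline HTML snippet so it needs no network access (the snippet itself is made up for illustration):

from bs4 import BeautifulSoup

# A tiny stand-in document, just for illustration
html = (
    '<dl>'
    '<dd><a href="/c/1.html">Chapter 1</a></dd>'
    '<dd><a href="/c/2.html">Chapter 2</a></dd>'
    '</dl>'
)
soup = BeautifulSoup(html, 'html.parser')

# find_all: search by tag name instead of a CSS selector
for a in soup.find_all("a"):
    # .get() returns None instead of raising if the attribute is missing
    print(a.get("href"), a.text)

# find: the first match only
first = soup.find("a")
print(first["href"])  # indexing works too, but raises KeyError if absent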
3. Save the results

This step only needs basic Python file reading and writing.
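As a quick refresher, a minimal sketch of appending to and then reading back a text file; the file name demo.txt is arbitrary:

# Append a line; mode "a+" creates the file if it does not exist
with open("demo.txt", mode="a+", encoding="utf-8") as f:
    f.write("hello\n")

# Read the whole file back
with open("demo.txt", mode="r", encoding="utf-8") as f:
    print(f.read())

With that in hand, the three steps combine into one script (A: request, B: filter, C: save):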
import requests
from bs4 import BeautifulSoup

# C: save one piece of data by appending it to result.txt
def save(data):
    with open("result.txt", mode="a+", encoding="utf-8") as txt:
        txt.write(data)

# B: filter the information
def handle(html):
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.select("dd>a")
    for link in links:
        save(link.attrs['href'] + "\n")

# A: send the request
url1 = "http://www.ibiqu.org"
url2 = "/book/50537/"
info = requests.get(url1 + url2)
if info.ok:
    handle(info.text)
else:
    print("Network error")
Result: the chapter links are appended to result.txt, one per line.
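To sanity-check the run, a short snippet that reads result.txt back and reports how many links were saved:

# Count the lines written by the crawler above
with open("result.txt", encoding="utf-8") as f:
    lines = f.readlines()
print(f"Saved {len(lines)} links")
if lines:
    print(f"First entry: {lines[0].strip()}")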