pip install requests
import requests
# Send a request with requests
result = requests.get("https://www.douban.com/")
print(result)  # the Response object (shows the status code)
print(result.headers)  # response headers
with open(r"H:\File\Python\爬虫\response\01 resquest\index.html", "a", encoding="utf-8") as File:
File.write(result.text)
If the page refuses the request, it is because no request headers were set: by default, requests identifies itself with its own Python User-Agent, so the site's anti-crawler rules block it.
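You can see the identity requests sends out of the box; a quick check using the standard requests API:
import requests

# requests advertises itself as "python-requests/<version>" by default,
# which many sites block outright
print(requests.utils.default_headers()["User-Agent"])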
The fix is to send a browser User-Agent in the request headers:
import requests
# Pretend to be a browser by sending its User-Agent
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}
data = {
    "wd": "python"
}
# POST sends the form data in the request body
result = requests.post("https://www.baidu.com/", headers=headers, data=data)
print(result.text)
import requests
# Pretend to be a browser by sending its User-Agent
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}
keyWord = {
    "wd": "python"
}
# GET query parameters go in params
result = requests.get("https://www.baidu.com/", headers=headers, params=keyWord)
# Set the response encoding explicitly
result.encoding = "utf-8"
# .text returns the body as text
print(result.text)
# .content returns raw bytes (for images, video, etc.)
print(result.content)
# .url shows the final, fully-built URL
print(result.url)
# .encoding shows the encoding in use
print(result.encoding)
with open(r"H:\File\Python\爬虫\response\01 resquest\02.html", "a", encoding="utf-8") as File:
    File.write(result.text)
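The Response object also carries the HTTP status; a quick check with the standard requests API, continuing from the result above:
print(result.status_code)   # numeric HTTP status, e.g. 200
result.raise_for_status()   # raises an HTTPError on 4xx/5xx responses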
json
The .json() call parses the response body and returns it as a dict.
import requests
url = "https://zhuanlan.zhihu.com/api/articles/xxx/xxx"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
"Referer": "https://zhuanlan.zhihu.com/p/620865916",
"Cookie": "_zap=c0b8f512-012a-4601-a317-5772cb"
}
resp = requests.get(url, headers=headers).json()
print(resp["paging"]["previous"])
import requests
url = "https://v.uuuab.com/72/72143/23xxx1.shtml"
data = {"usename": "dad", "password": "dasd"}
resp = requests.Session().post(url, data=data)
resp.encoding = "gbk"
print(resp.text)
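The reason for Session here is cookie persistence: cookies set by the login POST are replayed on every later request from the same session. A sketch of the pattern; the follow-up URL is a hypothetical placeholder:
session = requests.Session()
session.post(url, data=data)  # login; any Set-Cookie is stored on the session
# hypothetical protected page, reachable only with the login cookies
page = session.get("https://v.uuuab.com/72/72143/2xxx32.shtml")
page.encoding = "gbk"
print(page.text)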
lxml
pip install lxml
import requests
from lxml import etree

text = etree.HTML(requests.get("https://www.csdn.net/").text)
# Append /a/text() to pull the text out of the <a> tag
print(text.xpath('//*[@id="floor-www-index_558"]/a/text()'))
# contains(): match an attribute whose value has a changing part, e.g. class="navigation-..."
print(text.xpath('//*[@id="floor-av_557"]/ul/li[contains(@class,"navigation-")]/a/text()'))
# Select <a> tags whose href contains "https://"
print(text.xpath('//*[@id="floor-nav_557"]/div/div/div/ul/li/a[contains(@href,"https://")]'))
import requests
from lxml import etree

htmlText = requests.get("https://www.csdn.net/")
text = etree.HTML(htmlText.text)
# Find the nodes whose class is "headswiper-item"
print(text.xpath(
    '//*[@id="floor-www-index_558"]/div/div[3]/div[1]/div[2]/div[1]/div[2]/div/div[@class="headswiper-item"]'))
# The text inside those <a> tags
print(text.xpath(
    '//*[@id="floor-www-index_558"]/div/div[3]/div[1]/div[2]/div[1]/div[2]/div/div[@class="headswiper-item"]/a/text()'))
# Match by the exact text of the tag
print(text.xpath('//*[@id="floor-nav_557"]/div/div/div/ul/li/a[text()="后端"]/text()'))
# contains(): match the changing "navigation-" class values
print(text.xpath('//*[@id="floor-nav_557"]/div/div/div/ul/li[contains(@class,"navigation-")]/a/text()'))
# Get the <a> elements whose href contains "https://"
for x in text.xpath('//*[@id="floor-nav_557"]/div/div/div/ul/li/a[contains(@href,"https://")]'):
    print(etree.tostring(x))
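etree.tostring() dumps the whole element as bytes; when only the link is needed, selecting the attribute directly is tidier (same assumption about the CSDN markup):
print(text.xpath('//*[@id="floor-nav_557"]/div/div/div/ul/li/a[contains(@href,"https://")]/@href'))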
from lxml import etree
import requests

url = "https://tieba.baidu.com/p/84301xxx83"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}
htmlText = requests.get(url, headers=headers).text
text = etree.HTML(htmlText)
imgUrlList = text.xpath('//*[@id="post_content_147694672167"]/img/@src')
for index, imgUrl in enumerate(imgUrlList):
    with open(r"H:\File\Python\爬虫\response\02 xpath\{}.jpg".format(index), "wb") as File:
        File.write(requests.get(imgUrl, headers=headers).content)
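open() raises FileNotFoundError when the target folder is missing; a small guard with the standard library, using the same path as above:
import os

saveDir = r"H:\File\Python\爬虫\response\02 xpath"
os.makedirs(saveDir, exist_ok=True)  # create the folder tree if it is not there yet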
import time
import requests
from lxml import etree

# Send the page request
class Request:
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }

    def __init__(self, url: str, encoding: str = "gbk") -> None:
        # Instance attributes, so responses are not shared between instances
        self.response_text = ""
        self.response_content = b""
        try:
            # Send the request
            response = requests.get(url, headers=self.headers)
            response.encoding = encoding
            self.response_text = response.text
            self.response_content = response.content
        except Exception as e:
            print(e)

    def __str__(self):
        return str(self.response_text)

    def get_text(self) -> str:
        return str(self.response_text)

# Parse the page
class EtreeHtml:
    def __init__(self, html_text: str):
        # Instance attribute, so each parsed page gets its own list
        self.img_infolist = []
        etree_html = etree.HTML(html_text)
        # Pagination links on the current page
        self.page_list = etree_html.xpath('//*[@id="main"]/div[@class="page"]/a/@href')
        # Image list
        for item in etree_html.xpath('//*[@id="main"]/div[3]/ul/li'):
            try:
                img_src = item.xpath("a/img/@src")[0]
                img_info = item.xpath("a/b/text()")[0]
                img_obj = {
                    "img_info": img_info,
                    "img_src": img_src
                }
                self.img_infolist.append(img_obj)
            except Exception as e:
                print("no data for this item", e)

    def __str__(self):
        return str(self.img_infolist)

    def ImgInfolist(self):
        return self.img_infolist

    def Pagelist(self):
        return self.page_list

# Write the images to disk
class WriteImages:
    def __init__(self, path: str, source_data: bytes, Filename: str):
        with open(f"{path}/{Filename}", "wb") as File:
            print(f"Writing {Filename}")
            File.write(source_data)

class Craw:
    def __init__(self, url: str, index: int):
        html_text = Request(url).response_text
        html_info = EtreeHtml(html_text).img_infolist
        for item in html_info:
            img_info = item["img_info"]
            img_src = item["img_src"]
            WriteImages("./response/02 xpath/美女图网", Request(img_src).response_content, f"{img_info}.jpg")  # write the image
        print("------------- page written ----------------")
        time.sleep(1)  # be polite: pause between pages

if __name__ == "__main__":
    url_list = [
        f"http://www.netbian.com/index_{index}.htm" for index in range(2, 30)
    ]
    count = 2
    for item in url_list:
        print(f"-------------{count}-------------")
        Craw(item, count)
        count += 1
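Before looping over all 28 pages, it helps to sanity-check the helpers on a single page; a quick sketch, assuming the site is reachable and the XPath still matches its markup:
page = Request("http://www.netbian.com/", encoding="gbk")
parsed = EtreeHtml(page.get_text())
print(parsed.Pagelist())         # pagination links found on the page
print(parsed.ImgInfolist()[:3])  # first few scraped image entries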
BeautifulSoup
pip install beautifulsoup4
url = "https://v.uuuab.com/72/72143/2xxx31.shtml"
resp = requests.Session().post(url)
resp.encoding = "gbk"
bs = BeautifulSoup(resp.text, "html.parser")
print(bs.title) # 读取title标签
print(bs.title.text) # 读取title标签内容
print(bs.title.string) # 读取title标签内容
print(bs.p.attrs) # 获取p的属性
# 获取单个属性
print(bs.p.get("class"))
print(bs.p["class"])
print(bs.a["href"])
from bs4 import BeautifulSoup
from lxml import etree
import requests
url = "https://v.uuuab.com/72/72143/232xxx1.shtml"
resp = requests.Session().post(url)
resp.encoding = "gbk"
bs = BeautifulSoup(resp.text, "html.parser")
find_p_text = bs.find("p", class_="version channel")
print(etree.HTML(str(find_p_text)).xpath("//a/@href"))  # combine with XPath; find_p_text must be converted to str first
# Extract the chapter content of the novel
find_div_content = bs.find("div", id="chaptercontent").get_text()
with open("./response/BeautifulSoup/楼柒沉煞小说-第1章 落入血人怀抱.txt", "w", encoding="utf-8") as File:
    File.write(find_div_content)
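The lxml round-trip above is optional: BeautifulSoup can pull the same hrefs on its own, as a sketch:
print([a.get("href") for a in find_p_text.find_all("a")])  # same links without etree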