# Send the request
import requests
# Target URL
url = 'http://book.dangdang.com/?_utm_brand_id=11106&_ddclickunion=460-5-biaoti|ad_type=0|sys_id=1'
# Add request headers (User-Agent)
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Mobile Safari/537.36'
}
# Send the request and get the response
response = requests.get(url, headers=headers)
# Status code 200 means the request succeeded
# print(response.text)  # the page source as a string
# Parse the data
# parsel is the data-parsing module
import parsel
selector = parsel.Selector(response.text)  # convert the page source into a Selector object
# CSS selector, based on tag attributes; right-click the element in the browser and copy the selector
lis = selector.css('#component_403754__5298_5294__5294 > li')
#print(lis)
for li in lis:
    title = li.css('.name a::attr(title)').get()
    print(title)
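The ::attr(title) and ::text pseudo-elements are what pull data out of the selected tags. A tiny standalone illustration with parsel on a made-up HTML snippet (not Dangdang's real markup):
import parsel
demo = parsel.Selector('<p class="name"><a title="Book A" href="/a">Book A</a></p>')
print(demo.css('.name a::attr(title)').get())  # Book A  (value of the title attribute)
print(demo.css('.name a::text').get())         # Book A  (text inside the <a> tag)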
In the price column, the integer part and the decimal part sit in separate tags, so the two strings have to be concatenated. Also move the extraction of every field into the same for loop; if each field is scraped in its own loop, you only end up with one book's information in the end.
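A minimal sketch of the join, with a guard for the case where a selector finds nothing (parsel's .get() returns None then); the values here are made up, not scraped:
price1 = '108'   # integer part of the price (example value)
price2 = '.50'   # decimal tail (example value)
if price1 and price2:
    total = price1.strip() + price2.strip()  # '108.50'
else:
    total = ''   # leave the price blank if either part is missing
print(total)
The full revised script: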
# Send the request
import requests
# Target URL
url = 'http://book.dangdang.com/?_utm_brand_id=11106&_ddclickunion=460-5-biaoti|ad_type=0|sys_id=1'
# Add request headers (User-Agent)
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Mobile Safari/537.36'
}
# Send the request and get the response
response = requests.get(url, headers=headers)
# Status code 200 means the request succeeded
# print(response.text)  # the page source as a string
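# (optional sketch) the status code mentioned above is never actually checked;
# one minimal way to confirm the request succeeded before parsing:
if response.status_code != 200:
    print('request failed, status code:', response.status_code)
# requests also offers response.raise_for_status(), which raises an exception for 4xx/5xx responses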
# Parse the data
# parsel is the data-parsing module
import parsel
selector = parsel.Selector(response.text)  # convert the page source into a Selector object
# CSS selectors: ::attr() for a tag attribute, ::text for the text; right-click the element and copy the selector
lis = selector.css('#component_403754__5298_5294__5294 > li')
#print(lis)
# Save the data: open the CSV file and write the header once, before the loop
import csv
f = open('书籍信息.csv', mode='a', encoding='utf-8', newline='')
csv_writer = csv.DictWriter(f, fieldnames=[
    '标题',
    '作者',
    '价格',
])
csv_writer.writeheader()  # write the header row
for li in lis:
    title = li.css('.name a::attr(title)').get()
    print(title)
    author = li.css('p.author ::text').get()
    print(author)
    price1 = li.css('p.price > span.rob > span.num ::text').get()
    # print(price1)
    price2 = li.css('p.price > span.rob > span.tail ::text').get()
    # print(price2)
    total = price1 + price2  # join the integer part and the decimal tail of the price
    print(total)
    dit = {
        '标题': title,
        '作者': author,
        '价格': total,
    }
    print(dit)
    csv_writer.writerow(dit)  # write one row per book
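Because the file is opened in append mode, every new run of the script appends another header row. A small sketch (assuming the same filename and fieldnames) that writes the header only when the file does not exist yet or is still empty:
import csv
import os

file_path = '书籍信息.csv'
need_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
f = open(file_path, mode='a', encoding='utf-8', newline='')
csv_writer = csv.DictWriter(f, fieldnames=['标题', '作者', '价格'])
if need_header:
    csv_writer.writeheader()  # only a fresh or empty file gets the header row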
Final result:
To crawl multiple pages, add an outer loop:
for page in range(1, 100):
Work out how the URL changes from page to page, then build it with an f-string, e.g.
url = f'...0-0-1-{page}'
where '...' stands for the unchanged front part of the URL.
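A minimal sketch of that multi-page structure; the URL pattern is a placeholder that has to be filled in from the real page URLs, and whether the same CSS selector works on every page still needs to be checked against the actual pages:
for page in range(1, 100):
    url = f'...0-0-1-{page}'  # placeholder: replace '...' with the real, fixed part of the URL
    response = requests.get(url, headers=headers)
    selector = parsel.Selector(response.text)
    lis = selector.css('#component_403754__5298_5294__5294 > li')
    for li in lis:
        # extract title / author / price and call csv_writer.writerow() exactly as above
        ...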