I've recently been learning Python web scraping from a course on Bilibili. I typed the code out following the instructor's tutorial and ran into some problems, mainly caused by Taobao's anti-scraping measures; after adding the headers and cookie, everything worked fine.
You can follow this tutorial to set up the cookie: https://www.cnblogs.com/huahuayu/p/8207037.html
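In essence, the fix is just sending a realistic set of browser headers together with your logged-in cookie. A minimal sketch of the idea, assuming you have copied the Cookie request header for s.taobao.com from your browser's DevTools (Network tab) while logged in; the cookie value below is a placeholder, not a real one:

import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36',
    'cookie': 'PASTE_YOUR_COOKIE_HEADER_HERE',  # placeholder: copy the real value from DevTools
}
# %E4%B9%A6 is the URL-encoded search keyword '书'
r = requests.get('https://s.taobao.com/search?q=%E4%B9%A6', headers=headers, timeout=30)
print(r.status_code)  # 200 once the cookie is accepted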
Source code:
import requests  # the requests HTTP library
from bs4 import BeautifulSoup  # imported following the course; not actually used below
import bs4
import re  # regular expressions


# Fetch the page content with requests
def getHTMLText(url):
    headers = {  # counter the anti-scraping checks: send full browser headers plus the cookie
        'authority': 's.taobao.com',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36',
        'sec-fetch-dest': 'document',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-site',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'referer': '*',  # referer and cookie omitted here; fill in your own
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'cookie': '*',
    }
    try:
        print(url)
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()  # raise an exception if r.status_code is not 200
        r.encoding = r.apparent_encoding
        print(r.request.url)
        # print(r.text[0:50000])
        return r.text
    except Exception:
        print('getHTMLText Fail')
        return ""


# Parse the fetched HTML
def parsePage(ilt, html):
    # compiled patterns as raw strings; the \" and \: escapes from the course
    # are unnecessary inside a raw string, so they are dropped here
    price_regex = re.compile(r'"view_price":"[\d.]*"')
    title_regex = re.compile(r'"raw_title":".*?"')
    try:
        plt = price_regex.findall(html)  # list of matching strings
        tlt = title_regex.findall(html)
        for t in range(len(plt)):
            price = plt[t].split('"')[-2]  # eval() carries some risk, so just split on " instead
            # eval() strips the surrounding quotes; split on the first : only,
            # in case the title itself contains a colon
            title = eval(tlt[t].split(':', 1)[1])
            ilt.append([price, title])
    except Exception:
        print('parsePage Fail')


# Print the product list
def printGoodsList(ilt):
    tplt = "{:4}\t{:8}\t{:^16}"
    print(tplt.format("No.", "Price", "Title"))
    count = 0
    for i in ilt:
        count = count + 1
        print(tplt.format(count, i[0], i[1]))


def main():
    goods = '书'  # search keyword ('books')
    depth = 2  # number of result pages to crawl
    start_url = 'https://s.taobao.com/search?initiative_id=tbindexz_20170306&ie=utf8&spm=a21bo.2017.201856-taobao-item.2&sourceId=tb.index&search_type=item&ssid=s5-e&commend=all&imgfile=&q=' + goods
    infoList = []
    for i in range(depth):
        try:
            url = start_url + '&suggest=history_1&_input_charset=utf-8&wq=&suggest_query=&source=suggest&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=' + str(
                44 * i)  # each results page holds 44 items, so s is the start offset
            html = getHTMLText(url)
            print("getHTMLText Succeed")
            parsePage(infoList, html)
            print("parsePage succeed")
        except Exception:
            continue
    printGoodsList(infoList)


if __name__ == '__main__':
    main()
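For reference, the reason parsePage can get away with regular expressions instead of BeautifulSoup is that the search page embeds its results as a JSON blob containing "view_price" and "raw_title" fields. A quick illustration, runnable after pasting in the functions above; the titles and prices in the fragment are made up for the demo, not real search results:

sample = '"raw_title":"Python编程","view_price":"62.00","raw_title":"数据结构","view_price":"45.50"'
demoList = []
parsePage(demoList, sample)
print(demoList)  # [['62.00', 'Python编程'], ['45.50', '数据结构']]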