This script uses the requests library (to fetch the pages) and the re library (to extract the product fields with regular expressions).
import requests
import re
lis = []
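# Reference URLs for pages 1-3 of a search; they differ only in the trailing &s= offset (0, 44, 88):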
# https://s.taobao.com/search?q=荣耀v20&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306
# https://s.taobao.com/search?q=荣耀v20&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=44
# https://s.taobao.com/search?q=荣耀V20&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=88

kv = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    "Cookie": "replace this with your own Cookie",
}

# Fetch the HTML of a page
def getHTMLpages(url):
    try:
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""

# Extract product info from the page and append it to the list
def getGoodsinfo(lis, html):
    # The search page embeds JSON fields such as (see the standalone check after the script):
    # "raw_title":"honor/荣耀 荣耀play 微商手机 营销手机 小V 不凡 霸屏推"
    # "view_price":"6280.00"
    # "view_sales":"7人付款" or "2.0万+人付款" or "6000+人付款"  (number of buyers)
    title = re.findall(r'"raw_title":".*?"', html)  # .*? is a non-greedy (minimal) match up to the closing quote
    price = re.findall(r'"view_price":"[\d.]*"', html)  # [\d.]* matches digits or '.' zero or more times
    sales = re.findall(r'"view_sales":"[\d.]*[\u4e00-\u9fa5]?\+?人付款"', html)  # [\u4e00-\u9fa5] matches one Chinese character (e.g. 万)
    for i in range(len(title)):
        # split on the first ':' only, so titles that contain ':' stay intact;
        # eval() strips the surrounding double quotes from the matched value
        tit = eval(title[i].split(':', 1)[1])
        pri = eval(price[i].split(':', 1)[1])
        sal = eval(sales[i].split(':', 1)[1])
        lis.append([tit, pri, sal])

# Print the product info collected in the list
def printGoodsinfo(lis):
    # {4} picks chr(12288), the full-width space, as the fill character so that
    # columns containing Chinese text line up
    form = "{0:^4}\t{1:{4}<50}\t{2:>8}\t{3:>8}"
    print(form.format("No.", "Title", "Price", "Sales", chr(12288)))
    print()
    count = 1
    for i in lis:  # each i is one row: [title, price, sales]
        print(form.format(count, i[0], i[1], i[2], chr(12288)))
        count += 1

if __name__ == "__main__":
    # For offline debugging, the saved page can be read back from a local file
    # instead of fetching it again (see the note below the script):
    # f = open(r"D:\VscodePy\pytest\html.txt", encoding='utf-8')
    # sss = f.read()
    # f.close()
    s = input("Enter the product name to search for: ")
    start_url = "https://s.taobao.com/search?q=" + s
    print(start_url)
    count = int(input("Enter the number of result pages to fetch: "))
    for i in range(count):
        url = start_url + '&s=' + str(i * 44)  # each results page holds 44 items, so page i starts at s = i * 44
        html = getHTMLpages(url)
        # print(html)
        getGoodsinfo(lis, html)
        # getGoodsinfo(lis, sss)  # parse the locally saved page instead while debugging
    printGoodsinfo(lis)
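As a standalone check of the three regular expressions used in getGoodsinfo, the snippet below runs them against a hand-built string with the same shape as the embedded JSON fields quoted in the comments above (the string is illustrative only, not a real Taobao response):

import re

# Illustrative input only: the field layout copies the examples quoted in the comments above.
sample = '"raw_title":"honor/荣耀 荣耀play 微商手机","view_price":"6280.00","view_sales":"2.0万+人付款"'

title = re.findall(r'"raw_title":".*?"', sample)
price = re.findall(r'"view_price":"[\d.]*"', sample)
sales = re.findall(r'"view_sales":"[\d.]*[\u4e00-\u9fa5]?\+?人付款"', sample)

print(title)   # ['"raw_title":"honor/荣耀 荣耀play 微商手机"']
print(price)   # ['"view_price":"6280.00"']
print(sales)   # ['"view_sales":"2.0万+人付款"']

# split(':', 1)[1] keeps everything after the first colon; eval() then strips the quotes.
print(eval(title[0].split(':', 1)[1]))   # honor/荣耀 荣耀play 微商手机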
Sample output of the full script: (screenshot)
Note: if you crawl repeatedly with the same Cookie and IP, Taobao's anti-scraping measures will kick in and demand a slider CAPTCHA. Unless you find a way past that slider verification, the scraped data will come back empty, because the page you receive is the slider-verification page itself. My workaround is to save a page fetched once into a text file and then read the page source from that file while debugging, so there is no need to crawl the site over and over. The code posted above is the final version after that debugging.
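A minimal sketch of that save-once, read-from-file workflow (the local file name html.txt and the single fetch below are my own illustration, reusing the headers and search URL from the script above):

import requests

kv = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    "Cookie": "replace this with your own Cookie",
}

# Step 1: fetch one results page and save it, so Taobao only has to be hit once.
r = requests.get("https://s.taobao.com/search?q=荣耀V20&s=0", headers=kv, timeout=30)
r.raise_for_status()
r.encoding = r.apparent_encoding
with open("html.txt", "w", encoding="utf-8") as f:
    f.write(r.text)

# Step 2: while debugging the parsing/printing code, read the saved page back
# instead of re-crawling (this is what the commented-out lines in main hint at).
with open("html.txt", encoding="utf-8") as f:
    html = f.read()
# getGoodsinfo(lis, html)   # then parse and print exactly as in the script above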