import json
import re

import requests
def getHtmlText(url):
    """Download *url* and return the response body as text.

    The request is sent with custom ``User-Agent`` and ``cookie`` headers to
    get past anti-crawler checks. Both header values below are placeholders
    and must be replaced with your own; treat the cookie as private data.

    Args:
        url: Address of the page to download.

    Returns:
        The page text, decoded with the encoding detected from the content,
        or ``None`` when the page could not be fetched (a message is printed
        in that case).
    """
    headers = {
        'User-Agent': '自己的',
        'cookie': '自己的(注意隐私)',
    }
    try:
        resp = requests.get(url, headers=headers, timeout=30)
        resp.raise_for_status()
    except (requests.RequestException, UnicodeError):
        # RequestException covers connection problems, timeouts, invalid URLs
        # and HTTP error statuses. UnicodeError is kept so that header values
        # which cannot be encoded for the wire (such as the untouched
        # placeholders above) still end in a soft failure instead of a crash.
        # KeyboardInterrupt / SystemExit are deliberately NOT swallowed.
        print('获取页面源代码失败')
        return None
    # Prefer the encoding guessed from the body over the declared one.
    resp.encoding = resp.apparent_encoding
    return resp.text
def _decode_json_string(raw):
    """Decode *raw*, the text found between the quotes of a JSON string."""
    try:
        return json.loads('"{}"'.format(raw))
    except ValueError:
        # Not a well-formed JSON string body: keep the text as captured.
        return raw


def parsePage(ilt, html):
    """Extract (price, title) pairs from the page source and append them.

    The data is read with regular expressions from the ``"view_price"`` and
    ``"raw_title"`` fields embedded in the page source (look at the raw
    source: these fields may not be visible in the rendered page).

    Args:
        ilt: List that receives one ``[price, title]`` entry per item found;
            it is modified in place.
        html: Page source as a string. Anything that is not a string (for
            example ``None`` after a failed download) is reported with a
            message and ignored.

    Returns:
        None.
    """
    if not isinstance(html, str):
        print('正则错误')
        return
    # Capture only the value between the quotes, so no splitting on ':' is
    # needed (titles may themselves contain colons).
    prices = re.findall(r'"view_price":"(\d.*?)"', html)
    titles = re.findall(r'"raw_title":"(.*?)"', html)
    # zip() pairs values by position and stops at the shorter list, so a
    # page with an unequal number of prices and titles cannot raise.
    for price, title in zip(prices, titles):
        # The values are decoded as JSON strings instead of being passed to
        # eval(): the page content is untrusted input.
        ilt.append([_decode_json_string(price), _decode_json_string(title)])
def printList(ilt):
    """Print the collected items as a numbered three-column table.

    A header row is printed first, followed by one row per entry of *ilt*,
    numbered from 1.

    Args:
        ilt: Sequence of items; element 0 of each item is shown in the price
            column and element 1 in the name column.
    """
    # Column layout: left-aligned 4-wide index, 8-wide price, 30-wide name.
    row_template = '{:<4}{:8}{:30}'
    header = row_template.format('序号', '价格', '名称')
    print(header)
    for number, item in enumerate(ilt, start=1):
        print(row_template.format(number, item[0], item[1]))
def main():
    """Crawl the search results for one keyword and print them as a table.

    Searches for the keyword ``书包`` and walks ``depth`` (2) result pages.
    A failure on one page is reported and the remaining pages are still
    processed, so a single bad page does not abort the run.
    """
    goods = '书包'
    depth = 2
    infor_list = []
    url = 'https://s.taobao.com/search?q={}'.format(goods)
    # NOTE(review): the ``s`` query parameter looks like a result offset that
    # advances by 44 per page, which puts the first page at s=0. The loop
    # therefore starts at page index 0; the previous range(1, 3) requested
    # offsets 44 and 88 and never fetched the first page. Confirm against the
    # site if skipping the first page was intentional.
    for page in range(depth):
        start_url = url + '&s={}'.format(44 * page)
        try:
            text = getHtmlText(start_url)
            parsePage(infor_list, text)
        except Exception as exc:
            # Top-level boundary: report the problem and move on to the next
            # page. Ctrl-C (KeyboardInterrupt) is intentionally not caught.
            print('第{}页处理失败: {}'.format(page + 1, exc))
            continue
    printList(infor_list)
# Run the crawler only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()