爬取瓜子二手车在售车辆列表。

注意：必须在请求头中提供有效的 Cookie（从已登录的浏览器中复制），否则网站不会正常返回数据。

import requests
from  lxml import etree

# Shared request headers for every page fetch.
# The Cookie value is a placeholder: replace 'xxxxxxxxx' with a real session
# cookie copied from a logged-in browser, or the site returns no listing data.
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
           'Cookie':'xxxxxxxxx',}

def get_html(url, timeout=10):
	"""Fetch *url* and return the decoded response body as a str.

	Uses the module-level ``headers`` (User-Agent + Cookie); without a
	valid Cookie the site does not return listing data.

	:param url: absolute URL of the listing page to fetch
	:param timeout: seconds to wait before giving up (prevents hanging forever)
	:return: response body decoded with the default UTF-8 codec
	"""
	response = requests.get(url, headers=headers, timeout=timeout)
	return response.content.decode()

def html_detail(html):
	"""Parse one listing page and print a dict per car advert.

	Each printed dict has keys: title, detail (flattened spec text),
	price, reduce (price cut, whitespace stripped) and href (absolute
	detail-page URL).

	:param html: raw HTML text of a guazi.com listing page
	"""
	tree = etree.HTML(html)
	# Each <a> directly under the listing <ul>/<li> is one car advert.
	# NOTE: renamed from `list`, which shadowed the builtin.
	cars = tree.xpath('//ul[@class="carlist clearfix js-top"]/li/a')
	for car in cars:
		item = {}
		item['title'] = car.xpath('./@title')[0]
		# string(.) flattens nested text nodes; drop the '|' separators.
		item['detail'] = car.xpath('string(.//*[@class="t-i"])').replace('|','')
		item['price'] = car.xpath('string(.//p)')
		item['reduce'] = car.xpath('string(.//em)').strip().replace('\r\n','').replace(' ','')
		item['href'] = 'https://www.guazi.com'+car.xpath('./@href')[0]
		print(item)

def main(first_page=1, last_page=49):
	"""Crawl listing pages ``first_page``..``last_page`` inclusive and print each car.

	Defaults reproduce the original behavior (pages 1-49).

	:param first_page: first listing page number to fetch
	:param last_page: last listing page number to fetch (inclusive)
	"""
	for page in range(first_page, last_page + 1):
		# The 'o{}' path segment selects the page number.
		url = 'https://www.guazi.com/www/buy/o{}c-1/#bread'.format(page)
		html = get_html(url)
		html_detail(html)

# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
	main()

 

你可能感兴趣的:(爬虫)