实战:用xpath爬取瓜子二手车数据

思路:
1、获取各种车辆的详情页面url
2、解析详情页面,并获得想要抓取的内容
3、保存抓取到的数据
4、封装函数

import csv

import requests
from lxml import etree

# Request headers sent with every HTTP call: a desktop Chrome User-Agent plus
# a session Cookie copied from a real browser visit — guazi.com rejects
# requests that arrive without them. NOTE(review): the cookie is tied to one
# browser session and will expire; refresh it before re-running the crawler.
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
'Cookie':'antipas=848bb1079O6946305Of461920; uuid=25b5ac2a-6ddf-4e84-8c74-b27d865dae39; cityDomain=luan; user_city_id=132; ganji_uuid=7903651165629267632441; lg=1; close_finance_popup=2020-03-03; clueSourceCode=%2A%2300; sessionid=ed2e63d8-6107-4159-a342-73a335a8fb02; Hm_lvt_936a6d5df3f3d309bda39e92da3dd52f=1583207104,1583207159,1583207217,1583217498; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A52698093558%7D; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22seo_baidu%22%2C%22ca_n%22%3A%22default%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22-%22%2C%22ca_campaign%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22display_finance_flag%22%3A%22-%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%2225b5ac2a-6ddf-4e84-8c74-b27d865dae39%22%2C%22ca_city%22%3A%22luan%22%2C%22sessionid%22%3A%22ed2e63d8-6107-4159-a342-73a335a8fb02%22%7D; preTime=%7B%22last%22%3A1583222136%2C%22this%22%3A1583204349%2C%22pre%22%3A1583204349%7D; Hm_lpvt_936a6d5df3f3d309bda39e92da3dd52f=1583222232'
}

def get_detail_urls(url):
    """Fetch one listing page and return the absolute detail-page URLs on it.

    The listing page carries relative hrefs under the car list <ul>; each is
    prefixed with the site root to make it directly fetchable.
    """
    response = requests.get(url, headers=headers)
    page = etree.HTML(response.content.decode('utf-8'))
    hrefs = page.xpath('//ul[@class="carlist clearfix js-top"]/li/a/@href')
    return ["https://www.guazi.com" + href for href in hrefs]

def parse_detail_page(detail_url):
    """Fetch one car detail page and extract the fields we store.

    Returns a dict with keys: title, cardtime (registration date), km
    (mileage), displacement, speedbox (gearbox type).
    """
    resp = requests.get(detail_url, headers=headers)
    text = resp.content.decode('utf-8')
    html = etree.HTML(text)
    title = html.xpath('//div[@class="product-textbox"]/h2/text()')[0]
    # BUG FIX: the original called title.replace('r"\r\n"', ''), which strips
    # the literal seven-character substring r"\r\n" (essentially never present)
    # instead of the actual CR/LF characters embedded in the scraped title.
    title = title.replace('\r\n', '').strip()
    info = html.xpath('//div[@class="product-textbox"]/ul[@class="assort clearfix"]/li/span/text()')
    infos = {}
    infos['title'] = title
    # Field order is fixed by the page layout — TODO confirm it still holds
    # if guazi.com changes its template.
    infos['cardtime'] = info[0]
    infos['km'] = info[1]
    infos['displacement'] = info[2]
    infos['speedbox'] = info[3]
    return infos

def save_data(datas, f):
    """Append one scraped record to the already-open CSV file handle *f*.

    Uses csv.writer so that fields containing commas or quotes (common in car
    titles) are escaped correctly — the original hand-built format string
    would silently corrupt such rows. lineterminator='\n' preserves the
    original one-'\n'-per-row output.
    """
    csv.writer(f, lineterminator='\n').writerow([
        datas['title'], datas['cardtime'], datas['km'],
        datas['displacement'], datas['speedbox'],
    ])

def main():
    """Crawl listing pages 1-5 of the Lu'an region and append every car's
    record to guaziershouche.csv."""
    base_url = "https://www.guazi.com/luan/buy/o"
    with open('guaziershouche.csv', 'a', encoding='utf-8') as f:
        for page_no in range(1, 6):
            # Listing pages are paginated by appending the page number.
            for link in get_detail_urls(base_url + str(page_no)):
                save_data(parse_detail_page(link), f)

# Run the crawler only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

你可能感兴趣的:(python)