Scraping Guazi used-car listings and saving them

The script below crawls Guazi (guazi.com) listing pages, follows each car's detail page, extracts the name, displayed mileage, displacement, transmission, and price, and appends each record to a CSV file.

import requests
from lxml import etree
# Ctrl+F to search within the page source
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    'cookie': 'antipas=31G02831165132412Ok2k1838Fe90b; uuid=a857f373-09fa-4e10-9be0-f3c30d703299; cityDomain=www; clueSourceCode=%2A%2300; user_city_id=-1; Hm_lvt_936a6d5df3f3d309bda39e92da3dd52f=1595308713; ganji_uuid=4092319091295512877004; sessionid=14ceb1dd-6e53-47b8-d781-e4cd049e4a28; lg=1; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22seo_baidu%22%2C%22ca_n%22%3A%22default%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22-%22%2C%22ca_campaign%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22display_finance_flag%22%3A%22-%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22a857f373-09fa-4e10-9be0-f3c30d703299%22%2C%22ca_city%22%3A%22changzhou%22%2C%22sessionid%22%3A%2214ceb1dd-6e53-47b8-d781-e4cd049e4a28%22%7D; preTime=%7B%22last%22%3A1595308763%2C%22this%22%3A1595308712%2C%22pre%22%3A1595308712%7D; Hm_lpvt_936a6d5df3f3d309bda39e92da3dd52f=1595308764; GZ_TOKEN=685bH%2B6THB6fOW9i6w5vMreCiEHFrLoq0qL5FWGKChlPEqZkkfVt%2FucMKpHyFmg%2BdjK5IFWE6fLm6MLlAoHU2EkhvLxo3S%2FbzM3wmSzKMJcMKX2PuluB%2BbQ5qmY3nmSIhARAFPNi%2FpPhd1GOPw; guaZiUserInfo=2MSZvXS0DrBNBedydE%2AiQ4; userid=743177993; CHDSSO=685bH%2B6THB6fOW9i6w5vMreCiEHFrLoq0qL5FWGKChlPEqZkkfVt%2FucMKpHyFmg%2BdjK5IFWE6fLm6MLlAoHU2EkhvLxo3S%2FbzM3wmSzKMJcMKX2PuluB%2BbQ5qmY3nmSIhARAFPNi%2FpPhd1GOPw'
}


def Get_detail_url(url, headers):   # collect the detail-page URLs from one listing page
    res = requests.get(url, headers=headers)
    #print(res.content.decode('utf-8'))

    html = etree.HTML(res.content.decode('utf-8'))
    #print(html)
    ul = html.xpath('/html/body/div[6]/ul')[0]
    #ul = html.xpath('//ul[@class="carlist clearfix js-top"]')
    #print(ul)
    #lis = html.xpath('/html/body/div[6]/ul/li')
    lis = ul.xpath('./li')   # all <li> elements under the listing <ul>
    urls = []
    for li in lis:
        url = li.xpath('./a/@href')    # value of the <a> tag's href attribute
        #print(urls)
        base_url = "https://www.guazi.com"
        url = base_url + url[0]
        urls.append(url)
    return urls

def Get_text(url):          # extract the detail fields for one car
    res = requests.get(url,headers=headers)
    #print(res.text)
    html = etree.HTML(res.content.decode('utf-8'))
    name = html.xpath('//div/h2[@class="titlebox"]/text()')     # brand / model name
    price = html.xpath('//div/span[@class="price-num"]/text()')     # price
    displayed_mileage = html.xpath('//div/div/div/ul/li[@class="two"]/span/text()')      # displayed mileage
    displacement = html.xpath('//div/ul/li[@class="three"]/span/text()')     # engine displacement
    transmission_case = html.xpath('//div/ul/li[@class="last"]/span/text()')  # transmission

    # Alternative absolute XPaths (more fragile than the class-based ones above):
    #name = html.xpath('/html/body/div[4]/div[3]/div[2]/h2/text()')
    #price = html.xpath('/html/body/div[4]/div[3]/div[2]/div[1]/div[2]/span/text()')
    #displayed_mileage = html.xpath('/html/body/div[4]/div[3]/div[2]/ul/li[2]/span/text()')
    #displacement = html.xpath('/html/body/div[4]/div[3]/div[2]/ul/li[3]/span/text()')
    #transmission_case = html.xpath('/html/body/div[4]/div[3]/div[2]/ul/li[4]/span/text()')  # transmission
    info = {
        'name': name[0].strip(),
        'displayed_mileage': displayed_mileage[0].strip(),
        'displacement': displacement[0].strip(),
        'transmission_case': transmission_case[0].strip(),
        'price': price[0].strip()
    }
    #print(info)
    # print("Brand:", name[0].strip())
    # print("Displayed mileage:", displayed_mileage[0].strip())
    # print("Displacement:", displacement[0].strip())
    # print("Transmission:", transmission_case[0].strip())
    # print("Price:", price[0].strip())
    # print()
    return info
    
def save_csv(info, f):
    f.write('{},{},{},{},{}\n'.format(info['name'], info['displayed_mileage'],
                                      info['displacement'], info['transmission_case'], info['price']))



def main():
    urls = ['https://www.guazi.com/sc/buy/o{}'.format(i) for i in range(1, 5)]   # listing pages 1-4
    with open('guazi.csv', 'a', encoding='utf-8') as f:   # open the file once instead of once per record
        for url in urls:
            detail_urls = Get_detail_url(url, headers)
            for detail_url in detail_urls:
                info = Get_text(detail_url)
                save_csv(info, f)

if __name__ == '__main__':
    main()
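Joining the fields with '{},{},{},{},{}' works until a value itself contains a comma, which would corrupt that row. A minimal alternative sketch using the standard csv module, which quotes such fields automatically (a drop-in replacement for save_csv above, assuming the same info dict keys):

import csv

def save_csv(info, f):
    # csv.writer quotes any field that contains a comma or newline
    writer = csv.writer(f)
    writer.writerow([info['name'], info['displayed_mileage'], info['displacement'],
                     info['transmission_case'], info['price']])

If you switch to the csv module, also open the file with newline='', e.g. open('guazi.csv', 'a', encoding='utf-8', newline=''), so rows are not double-spaced on Windows.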

(Screenshots omitted: they show the browser developer tools, which is where the Cookie and User-Agent values used above are copied from.)
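For reference, the copied values simply go into the headers dict at the top of the script; the angle-bracket strings below are placeholders, not real values:

headers = {
    'User-Agent': '<User-Agent copied from the browser developer tools>',
    'cookie': '<Cookie copied from the browser developer tools>',
}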
