Scraping Guazi used-car listings

# Scrape used-car listings from guazi.com
import requests
from lxml import etree
import csv
# Listing page: https://www.guazi.com/cs/buy/#bread
# Request headers
headers = {
    # fill in your own request headers here (at minimum a browser User-Agent;
    # a Cookie copied from your own session is usually needed as well)
}
k = 1  # running counter of saved records
def save_data(data):
    # append one scraped record to guazi.csv
    global k
    with open('guazi.csv', 'a', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        title = ['品牌', '时间', '里程', '排量', '现价', '原价']
        writer.writerow([data[i] for i in title])
    print('Saved record {}'.format(k))
    k += 1

def get_urls(url):  # fetch and parse one car's detail page
    res = requests.get(url, headers=headers)
    # page source
    text = res.content.decode('utf-8')
    # parse into an lxml etree
    html = etree.HTML(text)
    content = html.xpath('//div[@class="product-textbox"]')[0]
    # title
    h = content.xpath('./h2/text()')[0]
    h = h.replace('\r', '').replace('\n', '').strip()  # remove line breaks and surrounding whitespace
    h = h.split()[0]  # keep only the first token (the brand)
    # print(h)
    # time
    time = content.xpath('./ul/li[@class="one"]/span/text()')[0]
    # mileage; the slice drops the trailing unit text (e.g. 万公里)
    mileage = content.xpath('./ul/li[@class="two"]/span/text()')[0][0:-3]
    # engine displacement; the slice drops the trailing unit character
    displacement = html.xpath('//ul[@class="basic-eleven clearfix"]/li[@class="six"]/div/text()')[0][0:-1]
    # current price: the integer and decimal parts sit in separate text nodes, so join them
    Price = html.xpath('//div[@class="pricebox js-disprice"]/span[@class="pricestype"]')[0]
    presentPrice = Price.xpath('./text()')[0] + Price.xpath('./span[@class="f14"]/text()')[0]
    presentPrice = presentPrice[1:-1]  # strip the currency symbol and unit character
    # original price; the slice strips the leading label and trailing unit
    originalPrice = html.xpath('//div[@class="pricebox js-disprice"]/span[@class="originprice"]/text()')[0][2:-1]
    data = {}
    data['品牌']=h
    data['时间']=time
    data['里程']=mileage
    data['排量']=displacement
    data['现价']=presentPrice
    data['原价']=originalPrice
    save_data(data)
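
# Before crawling all 50 listing pages, it can help to sanity-check the parser on a
# single detail page; the URL below is purely illustrative, not a real listing:
# get_urls('https://www.guazi.com/cs/xxxxxxxxxxxx.htm')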

if __name__ == '__main__':
    # write the CSV header row once
    with open('guazi.csv', 'a', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        title = ['品牌', '时间', '里程', '排量', '现价', '原价']
        writer.writerow(title)
    # iterate over listing pages 1-50
    for num in range(1, 51):
        url = 'https://www.guazi.com/cs/buy/o{}/#bread'.format(num)
        res = requests.get(url, headers=headers)
        # page source
        text = res.content.decode('utf-8')
        # parse into an lxml etree
        html = etree.HTML(text)
        # print(text)
        ul = html.xpath('//ul[@class="carlist clearfix js-top"]')[0]
        lis = ul.xpath('./li')
        for li in lis:
            # each li is one car; its link is relative, so prepend the domain
            detail_url = li.xpath('./a/@href')
            detail_url = 'https://www.guazi.com' + detail_url[0]
            get_urls(detail_url)
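
As written, one detail page that fails to download or parse (a missing field, a captcha page, a layout change) raises an exception and aborts the whole run. Below is a minimal sketch of a wrapper you could call in place of get_urls inside the loop above; safe_get is a hypothetical helper, not part of the original script:

def safe_get(detail_url):
    # log and skip pages that fail, instead of stopping the whole crawl
    try:
        get_urls(detail_url)
    except (IndexError, requests.RequestException) as e:
        print('skipped {}: {}'.format(detail_url, e))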

