学习笔记(11):21天搞定分布式Python网络爬虫-xpath-实战-爬取瓜子二手车网站(3)...

立即学习:https://edu.csdn.net/course/play/24756/280694?utm_source=blogtoedu

import lxml.etree as le
import requests

# HTTP headers sent with every request. guazi.com blocks requests without a
# browser-like User-Agent and a valid session cookie; the real values were
# redacted ('***') when these notes were published — substitute your own
# browser's values before running.
headers = {
    'User-Agent': '*******************************************',
    'cookie':'*********************************************'
}
def get_hrefs(url):
    """Fetch one guazi.com listing page and return the detail-page URLs.

    Args:
        url: Listing-page URL (one page of search results).

    Returns:
        List of absolute detail-page URLs, one per car card that has a link.
    """
    resp = requests.get(url, headers=headers)
    text = resp.content.decode('utf-8')
    html = le.HTML(text)
    # The first <ul class="carlist clearfix js-top"> holds the result cards.
    ul = html.xpath('//ul[@class="carlist clearfix js-top"]')[0]
    # Each <li> wraps one car; its <a href> is a site-relative link.
    # [:1] skips any <li> without an anchor instead of crashing on href[0].
    return ['https://www.guazi.com' + href
            for li in ul.xpath('./li')
            for href in li.xpath('./a/@href')[:1]]


def get_car(data):
    """Fetch one car's detail page and extract its basic attributes.

    Args:
        data: Absolute URL of a car detail page.

    Returns:
        Dict with keys 名称 (name), 排量 (displacement), 上牌年份
        (registration year), 行驶里程 (mileage), 变速箱 (gearbox).
    """
    resp = requests.get(data, headers=headers)
    text = resp.content.decode('utf-8')
    html = le.HTML(text)
    name = html.xpath('//div[@class="product-textbox"]/h2/text()')[0]
    # BUG FIX: the original used r'\r\n' — a raw string of four literal
    # characters — so the real CR/LF inside the scraped title was never
    # removed. Use the actual escape sequence.
    name = name.replace('\r\n', '').strip()
    # BUG FIX: the original XPath started with 'div[...]' (relative path),
    # which matches nothing from the document root; info2 was always empty
    # and info2[0] raised IndexError. Prefix with '//'.
    info2 = html.xpath('//div[@class="product-textbox"]/ul/li/span/text()')[0:5]
    # Field order on the page: year, mileage, displacement, gearbox.
    reg_year = info2[0]
    mileage = info2[1]
    displacement = info2[2]
    gearbox = info2[3]
    return {
        '名称': name,
        '排量': displacement,
        '上牌年份': reg_year,
        '行驶里程': mileage,
        '变速箱': gearbox,
    }

def save_data(data, f):
    """Append one car record to the open file *f* as a CSV-style line."""
    fields = (data['名称'], data['排量'], data['上牌年份'],
              data['行驶里程'], data['变速箱'])
    f.write(','.join(str(v) for v in fields) + '\n')

def main():
    """Crawl listing pages 1-5 and append every car record to aaa.txt."""
    page_url = 'https://www.guazi.com/hf/dazhong/o{}'
    with open('aaa.txt', 'a', encoding='utf-8') as out:
        # First 5 result pages.
        for page in range(1, 6):
            # Collect the detail-page links on this listing page,
            # then parse and persist each one.
            for link in get_hrefs(page_url.format(page)):
                save_data(get_car(link), out)


if __name__ == '__main__':
    main()

 

你可能感兴趣的:(研发管理,python,网络爬虫,编程语言,Python,数据存储)