Python Web Scraping, Part 2: Guazi Used Car Data

Introduction

Most people have heard of Guazi, the direct-sale used-car site; after all, its slogan, "no middlemen taking a cut: sellers earn more, buyers pay less," is hard to forget. On Guazi's official site we can indeed browse plenty of used cars for sale, along with detailed data on each vehicle.
[Figure 1: used-car listings on the Guazi site]
This data can be useful to anyone shopping for a car or studying the used-car market, so let's try to collect it.

Implementation Steps

Here we only fetch the first 100 pages of Chengdu listings. Guazi's pages are mostly static HTML, so we don't have to deal with Ajax or JS rendering, and the scraping logic stays simple and clear:

  • Send a request and receive the response (requests sent with the requests library must carry a cookie; see the sketch after this list)
  • Get the page source and parse out each car's detail-page URL
  • Fetch the detail-page data
    – Send a request and receive the response
    – Get the detail-page source and parse out the fields
  • Store the data in MongoDB
  • Turn the page and repeat steps 2, 3, and 4
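
As a minimal sketch of step 1 (the Cookie value here is a placeholder, so copy a real one from your browser's developer tools; 'o1' is simply page 1 of the Chengdu listings):

import requests

headers = {
    'Cookie': '<paste the Cookie header from your own browser session>',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

# Without a valid Cookie, Guazi typically serves an anti-crawler page
# instead of the real listing HTML
resp = requests.get('https://www.guazi.com/cd/buy/o1', headers=headers)
print(resp.status_code, len(resp.text))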

In the end we obtain data like this:
[Figure 2: a sample of the scraped data]
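
To spot-check what was saved, a quick read-back with pymongo works; this assumes the same local MongoDB instance and the spider.ershouche_cd collection used in the code below:

import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
collection = client.spider.ershouche_cd

# Print a few stored records to verify the scrape worked
for doc in collection.find().limit(5):
    print(doc['title'], doc['price'])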

Full Code

The full code is below. To collect other fields or more data, just modify the get_detail method (an example of adding a field follows the code).

import requests
from requests import RequestException
from bs4 import BeautifulSoup as bs
import time
import re
import pymongo


class GuaZi:
    def __init__(self):
        self.link = 'https://www.guazi.com/cd/buy/o'
        # NOTE: the Cookie below is session-bound and will expire;
        # replace it with a fresh value copied from your own browser
        self.headers = {
                'Cookie': 'uuid=8e189c5e-4b3c-4eca-9f69-50d11cd70f62; ganji_uuid=5614770255330340838852; lg=1; antipas=L693382z8954211H66291335Huw3; clueSourceCode=10104346512%2300; sessionid=f340bcee-4390-4aab-e05d-4273c453d102; cainfo=%7B%22ca_s%22%3A%22dh_hao123llq%22%2C%22ca_n%22%3A%22hao123mzpc%22%2C%22ca_i%22%3A%22-%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22-%22%2C%22ca_campaign%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22scode%22%3A%2210104346512%22%2C%22ca_transid%22%3Anull%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22ca_b%22%3A%22-%22%2C%22ca_a%22%3A%22-%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%228e189c5e-4b3c-4eca-9f69-50d11cd70f62%22%2C%22sessionid%22%3A%22f340bcee-4390-4aab-e05d-4273c453d102%22%7D; cityDomain=cd; preTime=%7B%22last%22%3A1545284609%2C%22this%22%3A1544171667%2C%22pre%22%3A1544171667%7D',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
                }
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.db = self.client.spider        

    def get_page(self, url):
        try:
            resp = requests.get(url, headers=self.headers)
            # raise_for_status is a method; without the parentheses the
            # original line was a no-op
            resp.raise_for_status()
            resp.encoding = resp.apparent_encoding
            return resp.text
        except RequestException:
            print('Can not get the page:', url)
            return None
    
    def get_link(self, html):
        # Parse a listing page and build the absolute URL of each car's
        # detail page
        soup = bs(html, 'lxml')
        result = soup.select('div.list-wrap.js-post > ul > li > a')
        detail_urls = ['https://www.guazi.com' + i['href'] for i in result]
        return detail_urls
    
    def get_detail(self, html):
        # Extract the fields we want from a detail page; add or change
        # selectors here to collect other data
        detail = bs(html, 'lxml')
        title = detail.select_one('div.infor-main.clearfix > div.product-textbox > h2').get_text()
        # Collapse newlines and runs of spaces inside the title
        title = re.sub(r'\s+', ' ', title).strip()
        # Named reg_time so it does not shadow the imported time module
        reg_time = detail.select_one('div.product-textbox > ul > li.one > span').get_text()
        used_distance = detail.select_one('div.product-textbox > ul > li.two > span').get_text()
        city = detail.select('div.product-textbox > ul > li.three > span')[0].get_text()
        displacement = detail.select('div.product-textbox > ul > li.three > span')[1].get_text()
        transmission = detail.select_one('div.product-textbox > ul > li.last > span').get_text()
        price = detail.select_one('div.product-textbox > div.pricebox.js-disprice > span.pricestype').get_text()
        guiding_price = detail.select_one('div.product-textbox > div.pricebox.js-disprice > span.newcarprice').get_text()
        guiding_price = re.sub(r'[\r\n ]', '', guiding_price)

        result = {
                'title': title,
                'time': reg_time,
                'used_distance': used_distance,
                'city': city,
                'displacement': displacement,
                'transmission': transmission,
                'price': price.replace(' ', ''),
                'guiding_price': guiding_price
                }
        return result
    
    def save_to_mongo(self, content):
        if content:
            # insert_one replaces the deprecated Collection.insert
            self.db.ershouche_cd.insert_one(content)
            print(content['title'], 'DONE')
    
    def main(self):
        for page in range(1, 101):
            url = self.link + str(page)
            html = self.get_page(url)
            if not html:
                continue
            for detail_url in self.get_link(html):
                time.sleep(2)  # throttle requests to stay polite
                detail_html = self.get_page(detail_url)
                if detail_html:
                    content = self.get_detail(detail_html)
                    self.save_to_mongo(content)
        

if __name__ == '__main__':
    ershouche = GuaZi()
    ershouche.main()
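
For example, to capture one more field, add another extraction inside get_detail and a matching key in the result dict before it is returned. The selector below is purely illustrative (I haven't checked it against Guazi's current markup), but the pattern mirrors the extractions already in the method:

# Hypothetical addition inside get_detail; 'li.four > span' is an
# unverified, illustrative selector
emission = detail.select_one('div.product-textbox > ul > li.four > span').get_text()
result['emission'] = emission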
