Scraping Guazi Used Car Listings

Guazi's listing pages have to be requested with a cookie attached; otherwise the site will not return the real page.
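Before running the full crawl, it helps to confirm that the cookie is actually being accepted. The sketch below is a minimal check, assuming the anti-crawl placeholder page does not contain the ul.carlist element that the parser relies on; cookie_ok is a helper name introduced here for illustration, and it reuses the headers dict defined in the script that follows.

import requests
from bs4 import BeautifulSoup

# Returns True if the response looks like the real listing page, i.e. it
# contains the ul.carlist element that the parser below also relies on.
# (Assumption: the anti-crawl placeholder page does not contain this element.)
def cookie_ok(url, headers):
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    return soup.select_one('ul.carlist') is not None

For example, cookie_ok('https://www.guazi.com/sz/buy/o1/#bread', headers) should return True while the cookie is still valid, and False once it has expired.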

import time
from bs4 import BeautifulSoup
import requests
import random
import re
import csv
from multiprocessing import Pool

# Request headers: the Cookie value is copied from a browser session; replace it with your own if the crawl stops returning real pages
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
           "Cookie": "uuid=f1abeca8-9bc2-470b-aa33-958a284a6a71; ganji_uuid=7007255948289753191046; antipas=4t63P191c4n51180s57oB78A8Rn06; clueSourceCode=%2A%2300; user_city_id=17; sessionid=0f04b190-26ab-4b0a-b741-dce9d485b224; lg=1; Hm_lvt_936a6d5df3f3d309bda39e92da3dd52f=1591274682; close_finance_popup=2020-06-04; lng_lat=113.97053_22.459507; gps_type=1; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22seo_baidu%22%2C%22ca_n%22%3A%22default%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%2245536327017287680%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22f1abeca8-9bc2-470b-aa33-958a284a6a71%22%2C%22sessionid%22%3A%220f04b190-26ab-4b0a-b741-dce9d485b224%22%2C%22ca_city%22%3A%22sz%22%7D; preTime=%7B%22last%22%3A1591274715%2C%22this%22%3A1591274680%2C%22pre%22%3A1591274680%7D; cityDomain=ty; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A75519901104%7D; Hm_lpvt_936a6d5df3f3d309bda39e92da3dd52f=1591275520"}

# Fetch one listing page and return the list of detail-page links
def geturl(pageurl):
    urllist = []
    mainurl = 'https://www.guazi.com'
    time.sleep(random.randint(2, 5))  # random delay between listing-page requests to reduce the risk of being blocked
    response = requests.get(pageurl, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    lis = soup.select('ul.carlist li')
    for li in lis:
        url = mainurl + li.a['href']
        urllist.append(url)
    return urllist

# Parse the detail pages
def parsepage(urllist):
    carlist = []
    for url in urllist:
        time.sleep(random.randint(2, 5))  # same random delay as in geturl, to avoid hammering the detail pages
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        title = soup.find('h2', class_='titlebox').text.strip()
        title = re.sub(r'\s+', '', title)  # remove all whitespace from the title
        distance = soup.find('li', class_='two').span.text.strip()
        work = soup.find('li', class_='three').span.text.strip()
        price = soup.find('span', class_='price-num').text.strip()
        carlist.append([title, distance, work, price])
    return carlist


# Append the scraped rows to car.csv
def savedata(data):
    with open('car.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for d in data:
            writer.writerow(d)

# Main function: crawl listing pages 1-10 and save the results
def main():
    for x in range(1,11):
        url = f'https://www.guazi.com/sz/buy/o{x}/#bread'
        pages = geturl(url)
        cars = parsepage(pages)
        savedata(cars)

# Possible improvements: (1) wrap the whole crawl-parse-save pipeline in a single
# function and run it with multiprocessing; (2) restructure it as a
# producer/consumer pipeline. (A multiprocessing sketch follows the script.)

if __name__ == '__main__':
    main()
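
A minimal sketch of the first improvement idea (the script already imports multiprocessing.Pool but never uses it): wrap the crawl of one listing page in a single function and fan it out over a small pool of workers. The pool size of 4 is an arbitrary choice, and results are written sequentially in the parent process so that only one process ever appends to car.csv. To use it, call main_parallel() from the __main__ guard instead of main().

# Crawl and parse one listing page; runs inside a worker process
def crawl_page(x):
    url = f'https://www.guazi.com/sz/buy/o{x}/#bread'
    return parsepage(geturl(url))

def main_parallel():
    # 4 workers is an arbitrary choice; results are saved in the parent
    # process so only one process writes to car.csv
    with Pool(processes=4) as pool:
        for cars in pool.imap_unordered(crawl_page, range(1, 11)):
            savedata(cars)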
