爬瓜子二手车的信息1.0(仅500条)

因为能力有限,不会用多线程来爬,所以时间上会慢一点,等以后学了多线程,再把这个改进一下。

import urllib.request
import bs4
import time
import random
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import re

#得到网页信息,这里一定要加cookie,如果network没有,那就到console控制台输入document.cookie就能出来
# Fetch a page and return its parsed DOM. The cookie header is mandatory for
# guazi.com's anti-scraping check; if it is absent from the Network tab, run
# document.cookie in the browser console to obtain it.
def gethtml(url):
    """Fetch *url* with browser-like headers and return a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Absolute URL of the page to download.

    Returns
    -------
    bs4.BeautifulSoup
        The page parsed with the ``lxml`` backend.
    """
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    headers['Referer'] = 'https://www.guazi.com/nc/?ca_s=sem_baiduss&ca_n=bdpc_sye&ca_keywordid=142010967086&ca_term=%B6%FE%CA%D6%B3%B5&tk_p_mti=ad.sem_baiduss.bdpc_sye.1.6874372086501376'
    headers['cookie'] = '"uuid=e5be0028-2c14-4b66-c27a-3cc7ad77e072; antipas=Tn1R55x3426N94214717FT75E9; clueSourceCode=%2A%2300; user_city_id=214; ganji_uuid=4831191946612775493376; sessionid=90ccf494-7db3-4bfc-d6a4-cc12be38f777; lg=1; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A83102756915%7D; track_id=6874372086501376; guazitrackersessioncadata=%7B%22ca_kw%22%3A%22-%22%7D; cityDomain=nc; preTime=%7B%22last%22%3A1572798596%2C%22this%22%3A1572793479%2C%22pre%22%3A1572793479%7D; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22sem_baiduss%22%2C%22ca_n%22%3A%22bdpc_sye%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22%25B6%25FE%25CA%25D6%25B3%25B5%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22142010967086%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%226874372086501376%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22e5be0028-2c14-4b66-c27a-3cc7ad77e072%22%2C%22sessionid%22%3A%2290ccf494-7db3-4bfc-d6a4-cc12be38f777%22%2C%22ca_city%22%3A%22nc%22%7D; close_finance_popup=2019-11-04"'
    req = urllib.request.Request(url=url, headers=headers)
    # Close the HTTP connection deterministically instead of leaking it
    # until garbage collection (the original never closed the response).
    with urllib.request.urlopen(req) as response:
        html = response.read()
    html_1 = html.decode('utf-8')

    bs_html = bs4.BeautifulSoup(html_1, 'lxml')
    return bs_html

#得到汽车的地址链接,限制40是页面最底下会有推荐车型,也会搜索到
# Accumulated detail-page hrefs across all listing pages. The limit of 40
# keeps out the "recommended cars" block at the bottom of each page, which
# uses the same anchor class.
car_address = []
def carlink(url):
    """Append the hrefs of up to 40 car anchors found on *url* to car_address."""
    soup = gethtml(url)
    anchors = soup.find_all('a', class_='car-a', limit=40)
    car_address.extend(tag['href'] for tag in anchors)

#得到翻页的链接,这里设置了七页
# Listing-page URLs; seven pages are scraped in total.
page_url = []
def getpageurl():
    """Populate page_url with listing pages o1 through o7."""
    page_url.extend(
        f'https://www.guazi.com/nc/buy/o{n}/#bread' for n in range(1, 8)
    )


#得到车子的信息
# Per-car result columns, filled in parallel by getcarinfo().
carname_list = []         # car title (first 50 chars, stripped)
cardate_list = []         # registration date
carmile_list = []         # mileage driven, unit 万公里 (10,000 km) — the original note said "miles", which is wrong
carprice_list = []        # asking price, unit 万 (10,000 yuan)
newcarprice_list1 = []    # original new-car price, unit 万
def getcarinfo(url_base):
    """Visit every detail page listed in car_address (prefixed with *url_base*)
    and append one value per car to each of the five module-level lists.

    NOTE(review): the fixed slice offsets and the CJK-stripping regexes below
    are tied to guazi.com's 2019 detail-page layout; any layout change will
    break the float() conversions.
    """
    for j in car_address:
        car_url = url_base + j
        bs_html = gethtml(car_url)
        
        # Title: first 50 characters of the <h2 class="titlebox"> text.
        all_carname = bs_html.find('h2', class_="titlebox")
        carname = all_carname.text
        carname1 = carname[:50]
        carname2 = carname1.strip()
        carname_list.append(carname2)
        
        
        # Registration date lives in the first <li class="one"> span.
        all_cardate = bs_html.find('li', class_="one")
        cardate = all_cardate.span.string    
        cardate_list.append(cardate)
        
        # Mileage: strip all CJK characters (the "万公里" suffix) and keep the number.
        all_carmile = bs_html.find('li', class_="two")
        carmile = all_carmile.span.string
#        carmile1 = carmile.replace('万公里', '')  
        carmile1 = re.sub(r'[\u4e00-\u9fa5]', '', carmile)
        carmile_list.append(float(carmile1))
        
        
        # Asking price: drop the leading currency symbol (index 0), then strip CJK.
        all_carprice = bs_html.find('span', class_="pricestype")
        carprice = all_carprice.text
        carprice1 = carprice[1:15]
        carprice2 = re.sub(r'[\u4e00-\u9fa5]', '', carprice1)
        carprice_list.append(float(carprice2))
        
        
        # New-car price: same CJK-stripping trick on the first 10 stripped chars.
        all_newcarprice = bs_html.find('span', class_="newcarprice")
        newcarprice = all_newcarprice.text
        newcarprice1 = newcarprice.strip()
        newcarprice2 = newcarprice1[:10]
#        newcarprice3 = newcarprice2.replace('万', '')
        newcarprice3 = re.sub(r'[\u4e00-\u9fa5]', '', newcarprice2)
        newcarprice_list1.append(float(newcarprice3))
        
        # Random 1–5 s pause between detail pages to avoid triggering rate limiting.
        time.sleep(random.randint(1,5))

'''
#
for url in page_url:
    with ThreadPoolExecutor(10) as exT:
        url_base = 'https://www.guazi.com'
        carlink(url)
        exT.submit(getcarinfo,url_base)
    print('一页结束了...')
    time.sleep(random.randint(1,5))
'''


# Driver: the triple-quoted block above is an abandoned ThreadPoolExecutor
# attempt, kept for reference. The live path below is sequential: build the
# seven listing URLs, collect every detail link, then scrape each car.
getpageurl()               
url_base = 'https://www.guazi.com'
for url in page_url:
    carlink(url)
getcarinfo(url_base)
    
# Persist the five parallel lists as a tab-separated CSV.
df = pd.DataFrame({'carname': carname_list, 'cardate' : cardate_list, 'carmile(万公里)' : carmile_list, 'carprice(万)' : carprice_list, 'newcarprice(万)' : newcarprice_list1})
df.to_csv('wmt1.csv', sep='\t')

下面的是我最开始的一个思路,代码会略有冗余,不过重在看思路。

# --- First-draft version (kept by the author to show the original approach).
# Fetches a single listing page with browser-like headers, no pagination.
url = 'https://www.guazi.com/nc/buy/'
headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
headers['Referer'] = 'https://www.guazi.com/nc/?ca_s=sem_baiduss&ca_n=bdpc_sye&ca_keywordid=142010967086&ca_term=%B6%FE%CA%D6%B3%B5&tk_p_mti=ad.sem_baiduss.bdpc_sye.1.6874372086501376'
headers['cookie'] = '"uuid=e5be0028-2c14-4b66-c27a-3cc7ad77e072; antipas=Tn1R55x3426N94214717FT75E9; clueSourceCode=%2A%2300; user_city_id=214; ganji_uuid=4831191946612775493376; sessionid=90ccf494-7db3-4bfc-d6a4-cc12be38f777; lg=1; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A83102756915%7D; track_id=6874372086501376; guazitrackersessioncadata=%7B%22ca_kw%22%3A%22-%22%7D; cityDomain=nc; preTime=%7B%22last%22%3A1572798596%2C%22this%22%3A1572793479%2C%22pre%22%3A1572793479%7D; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22sem_baiduss%22%2C%22ca_n%22%3A%22bdpc_sye%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22%25B6%25FE%25CA%25D6%25B3%25B5%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22142010967086%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%226874372086501376%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22e5be0028-2c14-4b66-c27a-3cc7ad77e072%22%2C%22sessionid%22%3A%2290ccf494-7db3-4bfc-d6a4-cc12be38f777%22%2C%22ca_city%22%3A%22nc%22%7D; close_finance_popup=2019-11-04"'
req = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(req)
html = response.read()
html_1 = html.decode('utf-8')

bs_html = bs4.BeautifulSoup(html_1,'lxml')
#all_car = bs_html.find_all('ul', class_ = 'carlist clearfix')

#bs_html = bs_html.find_all('ul', class_="carlist clearfix js-top")
# Collect every car anchor's href (no limit here, so recommended cars at the
# bottom of the page are swept up too — fixed in the refactored version).
all_address = bs_html.find_all('a', class_='car-a')
car_address = []
for i in all_address:
    i_href = i['href']
    car_address.append(i['href'])

url_base = 'https://www.guazi.com'

# Car names: one full request per detail page just for the title.
carname_list = []
for j in car_address:
    car_url = url_base + j
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    headers['Referer'] = 'https://www.guazi.com/nc/?ca_s=sem_baiduss&ca_n=bdpc_sye&ca_keywordid=142010967086&ca_term=%B6%FE%CA%D6%B3%B5&tk_p_mti=ad.sem_baiduss.bdpc_sye.1.6874372086501376'
    headers['cookie'] = 'uuid=e5be0028-2c14-4b66-c27a-3cc7ad77e072; antipas=Tn1R55x3426N94214717FT75E9; clueSourceCode=%2A%2300; user_city_id=214; ganji_uuid=4831191946612775493376; sessionid=90ccf494-7db3-4bfc-d6a4-cc12be38f777; lg=1; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A83102756915%7D; track_id=6874372086501376; guazitrackersessioncadata=%7B%22ca_kw%22%3A%22-%22%7D; cityDomain=nc; preTime=%7B%22last%22%3A1572798596%2C%22this%22%3A1572793479%2C%22pre%22%3A1572793479%7D; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22sem_baiduss%22%2C%22ca_n%22%3A%22bdpc_sye%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22%25B6%25FE%25CA%25D6%25B3%25B5%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22142010967086%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%226874372086501376%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22e5be0028-2c14-4b66-c27a-3cc7ad77e072%22%2C%22sessionid%22%3A%2290ccf494-7db3-4bfc-d6a4-cc12be38f777%22%2C%22ca_city%22%3A%22nc%22%7D; close_finance_popup=2019-11-04'
    req = urllib.request.Request(url=car_url, headers=headers)
    response = urllib.request.urlopen(req)
    html = response.read()
    html_1 = html.decode('utf-8')
    
    # Title: first 50 chars of the <h2 class="titlebox"> text, stripped.
    bs_html = bs4.BeautifulSoup(html_1,'lxml')
    all_carname = bs_html.find('h2', class_="titlebox")
    carname = all_carname.text
    carname1 = carname[:50]
    carname2 = carname1.strip()
    carname_list.append(carname2)
    # 5–10 s pause per page to avoid triggering anti-scraping measures.
    time.sleep(random.randint(5,10))


# Registration dates: a second full pass over the same detail pages.
cardate_list = []
for j in car_address:
    car_url = url_base + j
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    headers['Referer'] = 'https://www.guazi.com/nc/?ca_s=sem_baiduss&ca_n=bdpc_sye&ca_keywordid=142010967086&ca_term=%B6%FE%CA%D6%B3%B5&tk_p_mti=ad.sem_baiduss.bdpc_sye.1.6874372086501376'
    headers['cookie'] = 'uuid=e5be0028-2c14-4b66-c27a-3cc7ad77e072; antipas=Tn1R55x3426N94214717FT75E9; clueSourceCode=%2A%2300; user_city_id=214; ganji_uuid=4831191946612775493376; sessionid=90ccf494-7db3-4bfc-d6a4-cc12be38f777; lg=1; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A83102756915%7D; track_id=6874372086501376; guazitrackersessioncadata=%7B%22ca_kw%22%3A%22-%22%7D; cityDomain=nc; preTime=%7B%22last%22%3A1572798596%2C%22this%22%3A1572793479%2C%22pre%22%3A1572793479%7D; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22sem_baiduss%22%2C%22ca_n%22%3A%22bdpc_sye%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22%25B6%25FE%25CA%25D6%25B3%25B5%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22142010967086%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%226874372086501376%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22e5be0028-2c14-4b66-c27a-3cc7ad77e072%22%2C%22sessionid%22%3A%2290ccf494-7db3-4bfc-d6a4-cc12be38f777%22%2C%22ca_city%22%3A%22nc%22%7D; close_finance_popup=2019-11-04'
    req = urllib.request.Request(url=car_url, headers=headers)
    response = urllib.request.urlopen(req)
    html = response.read()
    html_1 = html.decode('utf-8')
    
    # Registration date is the span inside the first <li class="one">.
    bs_html = bs4.BeautifulSoup(html_1,'lxml')
    all_cardate = bs_html.find('li', class_="one")
    cardate = all_cardate.span.string    
    cardate_list.append(cardate)
    time.sleep(random.randint(5,10))


# Mileage (unit 万公里, i.e. 10,000 km): third pass over the detail pages.
carmile_list = []
for j in car_address:
    car_url = url_base + j
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    headers['Referer'] = 'https://www.guazi.com/nc/?ca_s=sem_baiduss&ca_n=bdpc_sye&ca_keywordid=142010967086&ca_term=%B6%FE%CA%D6%B3%B5&tk_p_mti=ad.sem_baiduss.bdpc_sye.1.6874372086501376'
    headers['cookie'] = 'uuid=e5be0028-2c14-4b66-c27a-3cc7ad77e072; antipas=Tn1R55x3426N94214717FT75E9; clueSourceCode=%2A%2300; user_city_id=214; ganji_uuid=4831191946612775493376; sessionid=90ccf494-7db3-4bfc-d6a4-cc12be38f777; lg=1; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A83102756915%7D; track_id=6874372086501376; guazitrackersessioncadata=%7B%22ca_kw%22%3A%22-%22%7D; cityDomain=nc; preTime=%7B%22last%22%3A1572798596%2C%22this%22%3A1572793479%2C%22pre%22%3A1572793479%7D; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22sem_baiduss%22%2C%22ca_n%22%3A%22bdpc_sye%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22%25B6%25FE%25CA%25D6%25B3%25B5%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22142010967086%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%226874372086501376%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22e5be0028-2c14-4b66-c27a-3cc7ad77e072%22%2C%22sessionid%22%3A%2290ccf494-7db3-4bfc-d6a4-cc12be38f777%22%2C%22ca_city%22%3A%22nc%22%7D; close_finance_popup=2019-11-04'
    req = urllib.request.Request(url=car_url, headers=headers)
    response = urllib.request.urlopen(req)
    html = response.read()
    html_1 = html.decode('utf-8')
    
    # Strip the '万公里' suffix, keeping the numeric part as a string
    # (the refactored version converts to float instead).
    bs_html = bs4.BeautifulSoup(html_1,'lxml')
    all_carmile = bs_html.find('li', class_="two")
    carmile = all_carmile.span.string
    carmile1 = carmile.replace('万公里', '')    
    carmile_list.append(carmile1)
    time.sleep(random.randint(5,10))


# Asking price (unit 万): fourth pass over the detail pages.
carprice_list = []
for j in car_address:
    car_url = url_base + j
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    headers['Referer'] = 'https://www.guazi.com/nc/?ca_s=sem_baiduss&ca_n=bdpc_sye&ca_keywordid=142010967086&ca_term=%B6%FE%CA%D6%B3%B5&tk_p_mti=ad.sem_baiduss.bdpc_sye.1.6874372086501376'
    headers['cookie'] = 'uuid=e5be0028-2c14-4b66-c27a-3cc7ad77e072; antipas=Tn1R55x3426N94214717FT75E9; clueSourceCode=%2A%2300; user_city_id=214; ganji_uuid=4831191946612775493376; sessionid=90ccf494-7db3-4bfc-d6a4-cc12be38f777; lg=1; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A83102756915%7D; track_id=6874372086501376; guazitrackersessioncadata=%7B%22ca_kw%22%3A%22-%22%7D; cityDomain=nc; preTime=%7B%22last%22%3A1572798596%2C%22this%22%3A1572793479%2C%22pre%22%3A1572793479%7D; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22sem_baiduss%22%2C%22ca_n%22%3A%22bdpc_sye%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22%25B6%25FE%25CA%25D6%25B3%25B5%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22142010967086%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%226874372086501376%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22e5be0028-2c14-4b66-c27a-3cc7ad77e072%22%2C%22sessionid%22%3A%2290ccf494-7db3-4bfc-d6a4-cc12be38f777%22%2C%22ca_city%22%3A%22nc%22%7D; close_finance_popup=2019-11-04'
    req = urllib.request.Request(url=car_url, headers=headers)
    response = urllib.request.urlopen(req)
    html = response.read()
    html_1 = html.decode('utf-8')
    
    # Drop the leading currency symbol (index 0) and parse the rest as a float.
    # NOTE(review): float() will raise if the '万' suffix falls inside [1:10].
    bs_html = bs4.BeautifulSoup(html_1,'lxml')
    all_carprice = bs_html.find('span', class_="pricestype")
    carprice = all_carprice.text
    carprice1 = carprice[1:10]    
    carprice_list.append(float(carprice1))
    time.sleep(random.randint(5,10))


# Original new-car price (unit 万): fifth and final pass over the detail pages.
newcarprice_list1 = []
for j in car_address:
    car_url = url_base + j
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    headers['Referer'] = 'https://www.guazi.com/nc/?ca_s=sem_baiduss&ca_n=bdpc_sye&ca_keywordid=142010967086&ca_term=%B6%FE%CA%D6%B3%B5&tk_p_mti=ad.sem_baiduss.bdpc_sye.1.6874372086501376'
    headers['cookie'] = 'uuid=e5be0028-2c14-4b66-c27a-3cc7ad77e072; antipas=Tn1R55x3426N94214717FT75E9; clueSourceCode=%2A%2300; user_city_id=214; ganji_uuid=4831191946612775493376; sessionid=90ccf494-7db3-4bfc-d6a4-cc12be38f777; lg=1; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A83102756915%7D; track_id=6874372086501376; guazitrackersessioncadata=%7B%22ca_kw%22%3A%22-%22%7D; cityDomain=nc; preTime=%7B%22last%22%3A1572798596%2C%22this%22%3A1572793479%2C%22pre%22%3A1572793479%7D; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22sem_baiduss%22%2C%22ca_n%22%3A%22bdpc_sye%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22%25B6%25FE%25CA%25D6%25B3%25B5%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22142010967086%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%226874372086501376%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22e5be0028-2c14-4b66-c27a-3cc7ad77e072%22%2C%22sessionid%22%3A%2290ccf494-7db3-4bfc-d6a4-cc12be38f777%22%2C%22ca_city%22%3A%22nc%22%7D; close_finance_popup=2019-11-04'
    req = urllib.request.Request(url=car_url, headers=headers)
    response = urllib.request.urlopen(req)
    html = response.read()
    html_1 = html.decode('utf-8')
    
    # Take chars [5:10] of the stripped text, drop the '万' suffix, parse as float.
    # NOTE(review): the fixed [5:10] slice assumes a constant-width prefix in the
    # span text — fragile against any layout change.
    bs_html = bs4.BeautifulSoup(html_1,'lxml')
    all_newcarprice = bs_html.find('span', class_="newcarprice")
    newcarprice = all_newcarprice.text
    newcarprice1 = newcarprice.strip()
    newcarprice2 = newcarprice1[5:10]
    newcarprice3 = newcarprice2.replace('万', '')
    newcarprice_list1.append(float(newcarprice3))
    time.sleep(random.randint(5,10))


# Assemble the five parallel lists into a DataFrame and save as tab-separated CSV.
df = pd.DataFrame({'carname': carname_list, 'cardate' : cardate_list, 'carmile(万公里)' : carmile_list, 'carprice(万)' : carprice_list, 'newcarprice(万)' : newcarprice_list1})
df.to_csv('wmt1.csv', sep='\t')

其实感觉都挺不精简的,不过重在对爬虫技能又精进了一步。
而且本意也不是爬虫,最开始也是想做数据分析,但是又苦于没有数据,只好自己动手,丰衣足食了。

你可能感兴趣的:(简单的爬虫)