因为能力有限,不会用多线程来爬,所以时间上会慢一点;等以后学了多线程,再把这个改进一下。
import urllib.request
import bs4
import time
import random
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import re
# Fetch a page and parse it. The cookie header is required here; if the
# Network tab doesn't show it, type document.cookie in the browser console.
def gethtml(url):
    """Download *url* with browser-like headers and return a BeautifulSoup tree.

    :param url: absolute URL of the page to fetch
    :return: ``bs4.BeautifulSoup`` parsed with the lxml parser
    :raises urllib.error.URLError: on network failure / HTTP errors
    """
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    headers['Referer'] = 'https://www.guazi.com/nc/?ca_s=sem_baiduss&ca_n=bdpc_sye&ca_keywordid=142010967086&ca_term=%B6%FE%CA%D6%B3%B5&tk_p_mti=ad.sem_baiduss.bdpc_sye.1.6874372086501376'
    headers['cookie'] = '"uuid=e5be0028-2c14-4b66-c27a-3cc7ad77e072; antipas=Tn1R55x3426N94214717FT75E9; clueSourceCode=%2A%2300; user_city_id=214; ganji_uuid=4831191946612775493376; sessionid=90ccf494-7db3-4bfc-d6a4-cc12be38f777; lg=1; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A83102756915%7D; track_id=6874372086501376; guazitrackersessioncadata=%7B%22ca_kw%22%3A%22-%22%7D; cityDomain=nc; preTime=%7B%22last%22%3A1572798596%2C%22this%22%3A1572793479%2C%22pre%22%3A1572793479%7D; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22sem_baiduss%22%2C%22ca_n%22%3A%22bdpc_sye%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22%25B6%25FE%25CA%25D6%25B3%25B5%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22142010967086%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%226874372086501376%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22e5be0028-2c14-4b66-c27a-3cc7ad77e072%22%2C%22sessionid%22%3A%2290ccf494-7db3-4bfc-d6a4-cc12be38f777%22%2C%22ca_city%22%3A%22nc%22%7D; close_finance_popup=2019-11-04"'
    req = urllib.request.Request(url=url, headers=headers)
    # Bug fix: the original never closed the response, leaking one socket
    # per request over a crawl of hundreds of pages.
    with urllib.request.urlopen(req) as response:
        html = response.read()
    bs_html = bs4.BeautifulSoup(html.decode('utf-8'), 'lxml')
    return bs_html
# Detail-page links for individual cars. limit=40 is needed because the
# "recommended cars" widget at the bottom of a listing page would
# otherwise be picked up as well.
car_address = []
def carlink(url):
    """Append the car detail-page hrefs found on listing page *url* to car_address.

    :param url: URL of one listing page
    :return: None (results accumulate in the module-level ``car_address``)
    """
    bs_html = gethtml(url)
    for anchor in bs_html.find_all('a', class_='car-a', limit=40):
        car_address.append(anchor['href'])
# Pagination links for the listing pages.
page_url = []
def getpageurl(pages=7):
    """Append the first *pages* listing-page URLs to the module-level page_url.

    :param pages: number of listing pages to crawl (default 7, as before)
    :return: None (results accumulate in ``page_url``)
    """
    # Listing pages are 1-indexed: .../buy/o1/ is the first page.
    for i in range(1, pages + 1):
        page_url.append('https://www.guazi.com/nc/buy/o{page}/#bread'.format(page=i))
# Scraped fields, one entry per car, kept in the same order as car_address.
carname_list = []       # car name
cardate_list = []       # registration date
carmile_list = []       # mileage driven (万公里)
carprice_list = []      # asking price (万)
newcarprice_list1 = []  # original new-car price (万)
def getcarinfo(url_base):
    """Fetch every detail page listed in car_address and fill the five field lists.

    All five values for a car are extracted *before* any of them is
    appended, so a malformed page raises before touching the lists and
    cannot leave them with unequal lengths (which would later break the
    pd.DataFrame construction).

    :param url_base: site root (e.g. 'https://www.guazi.com') prefixed to each href
    :return: None (results accumulate in the module-level lists above)
    """
    for href in car_address:
        bs_html = gethtml(url_base + href)
        # [:50] trims trailing page furniture from the title text.
        carname = bs_html.find('h2', class_="titlebox").text[:50].strip()
        cardate = bs_html.find('li', class_="one").span.string
        # The regex strips any CJK unit suffix such as '万公里' / '万',
        # leaving just the number.
        carmile = float(re.sub(r'[\u4e00-\u9fa5]', '', bs_html.find('li', class_="two").span.string))
        # [1:15] drops the leading currency symbol before stripping units.
        carprice = float(re.sub(r'[\u4e00-\u9fa5]', '', bs_html.find('span', class_="pricestype").text[1:15]))
        newcarprice = float(re.sub(r'[\u4e00-\u9fa5]', '', bs_html.find('span', class_="newcarprice").text.strip()[:10]))
        carname_list.append(carname)
        cardate_list.append(cardate)
        carmile_list.append(carmile)
        carprice_list.append(carprice)
        newcarprice_list1.append(newcarprice)
        # Random pause so the crawl looks less like a bot.
        time.sleep(random.randint(1, 5))
# Driver: build the listing-page URLs, collect every car link, scrape each
# car's details, then persist everything. (A dead, commented-out
# ThreadPoolExecutor experiment that used to live here was removed.)
getpageurl()
url_base = 'https://www.guazi.com'
for url in page_url:
    carlink(url)
getcarinfo(url_base)
# Save the collected columns to a tab-separated CSV file.
df = pd.DataFrame({'carname': carname_list, 'cardate' : cardate_list, 'carmile(万公里)' : carmile_list, 'carprice(万)' : carprice_list, 'newcarprice(万)' : newcarprice_list1})
df.to_csv('wmt1.csv', sep='\t')
下面的是我最开始的一个思路,代码会略有冗余,不过重在看思路。
# ----------------------------------------------------------------------
# First-draft version of the crawler, kept for reference. The original
# draft fetched every car page FIVE times — once per field (name, date,
# mileage, price, new-car price) — and repeated the request boilerplate
# before each loop. This revision keeps the exact same extraction logic
# per field but downloads each car page only once.
url = 'https://www.guazi.com/nc/buy/'
headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
headers['Referer'] = 'https://www.guazi.com/nc/?ca_s=sem_baiduss&ca_n=bdpc_sye&ca_keywordid=142010967086&ca_term=%B6%FE%CA%D6%B3%B5&tk_p_mti=ad.sem_baiduss.bdpc_sye.1.6874372086501376'
headers['cookie'] = '"uuid=e5be0028-2c14-4b66-c27a-3cc7ad77e072; antipas=Tn1R55x3426N94214717FT75E9; clueSourceCode=%2A%2300; user_city_id=214; ganji_uuid=4831191946612775493376; sessionid=90ccf494-7db3-4bfc-d6a4-cc12be38f777; lg=1; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A83102756915%7D; track_id=6874372086501376; guazitrackersessioncadata=%7B%22ca_kw%22%3A%22-%22%7D; cityDomain=nc; preTime=%7B%22last%22%3A1572798596%2C%22this%22%3A1572793479%2C%22pre%22%3A1572793479%7D; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22sem_baiduss%22%2C%22ca_n%22%3A%22bdpc_sye%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22%25B6%25FE%25CA%25D6%25B3%25B5%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22142010967086%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%226874372086501376%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22e5be0028-2c14-4b66-c27a-3cc7ad77e072%22%2C%22sessionid%22%3A%2290ccf494-7db3-4bfc-d6a4-cc12be38f777%22%2C%22ca_city%22%3A%22nc%22%7D; close_finance_popup=2019-11-04"'
# Fetch the listing page and collect the per-car detail links.
req = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(req)
bs_html = bs4.BeautifulSoup(response.read().decode('utf-8'), 'lxml')
all_address = bs_html.find_all('a', class_='car-a')
car_address = [anchor['href'] for anchor in all_address]
url_base = 'https://www.guazi.com'
# Per-car request headers: same UA/Referer; this cookie variant has no
# surrounding double quotes (as in the original per-car loops).
headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
headers['Referer'] = 'https://www.guazi.com/nc/?ca_s=sem_baiduss&ca_n=bdpc_sye&ca_keywordid=142010967086&ca_term=%B6%FE%CA%D6%B3%B5&tk_p_mti=ad.sem_baiduss.bdpc_sye.1.6874372086501376'
headers['cookie'] = 'uuid=e5be0028-2c14-4b66-c27a-3cc7ad77e072; antipas=Tn1R55x3426N94214717FT75E9; clueSourceCode=%2A%2300; user_city_id=214; ganji_uuid=4831191946612775493376; sessionid=90ccf494-7db3-4bfc-d6a4-cc12be38f777; lg=1; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A83102756915%7D; track_id=6874372086501376; guazitrackersessioncadata=%7B%22ca_kw%22%3A%22-%22%7D; cityDomain=nc; preTime=%7B%22last%22%3A1572798596%2C%22this%22%3A1572793479%2C%22pre%22%3A1572793479%7D; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22sem_baiduss%22%2C%22ca_n%22%3A%22bdpc_sye%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22%25B6%25FE%25CA%25D6%25B3%25B5%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22142010967086%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%226874372086501376%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22e5be0028-2c14-4b66-c27a-3cc7ad77e072%22%2C%22sessionid%22%3A%2290ccf494-7db3-4bfc-d6a4-cc12be38f777%22%2C%22ca_city%22%3A%22nc%22%7D; close_finance_popup=2019-11-04'
carname_list = []        # car name
cardate_list = []        # registration date
carmile_list = []        # mileage (string, '万公里' suffix removed)
carprice_list = []       # asking price (float, 万)
newcarprice_list1 = []   # original new-car price (float, 万)
for j in car_address:
    car_url = url_base + j
    req = urllib.request.Request(url=car_url, headers=headers)
    response = urllib.request.urlopen(req)
    bs_html = bs4.BeautifulSoup(response.read().decode('utf-8'), 'lxml')
    # Field extractions, unchanged from the original draft.
    carname_list.append(bs_html.find('h2', class_="titlebox").text[:50].strip())
    cardate_list.append(bs_html.find('li', class_="one").span.string)
    carmile_list.append(bs_html.find('li', class_="two").span.string.replace('万公里', ''))
    carprice_list.append(float(bs_html.find('span', class_="pricestype").text[1:10]))
    newcarprice = bs_html.find('span', class_="newcarprice").text.strip()
    newcarprice_list1.append(float(newcarprice[5:10].replace('万', '')))
    # Random pause between requests so the crawl looks less like a bot.
    time.sleep(random.randint(5, 10))
# Save the collected columns to a tab-separated CSV file.
df = pd.DataFrame({'carname': carname_list, 'cardate' : cardate_list, 'carmile(万公里)' : carmile_list, 'carprice(万)' : carprice_list, 'newcarprice(万)' : newcarprice_list1})
df.to_csv('wmt1.csv', sep='\t')
其实感觉都挺不精简的,不过重在对爬虫技能又精进了一步。
而且本意也不是爬虫,最开始也是想做数据分析,但是又苦于没有数据,只好自己动手、丰衣足食了。