代码如下:
from bs4 import BeautifulSoup
import requests
def get_link_from(whos_sells=0):
    """Collect item-detail URLs from the first listing page on xa.58.com.

    Args:
        whos_sells: 0 for personal sellers, anything else for merchants.
            The two listing types use different page markup, so different
            CSS selectors are needed.

    Returns:
        A list of detail-page URL strings (may be empty if the page
        structure changed or the request returned no matching tags).
    """
    list_view = 'http://xa.58.com/pingbandiannao/{}/pn1/'.format(whos_sells)
    wb_data = requests.get(list_view)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if whos_sells == 0:
        # Personal listings: the anchor sits in <td class="t"><a class="t">;
        # drop any query string so the URL is canonical.
        return [link.get('href').split('?')[0] for link in soup.select('td.t a.t')]
    # Merchant listings use a different layout: <div class="left"><a class="title t">.
    return [link.get('href') for link in soup.select('div.left a.title.t')]
def get_views_from(url):
    """Fetch the page-view count for one listing via 58.com's counter API.

    Args:
        url: A detail-page URL whose last path segment looks like
            '<infoid>x.shtml'.

    Returns:
        The view count as a string (the text after '=' in the API reply).
    """
    # BUG FIX: the original used .strip('x.shtml'), but str.strip removes
    # ANY of the characters {x, ., s, h, t, m, l} from both ends — not the
    # literal suffix. It only worked by accident for purely numeric ids.
    # Splitting on the literal 'x.shtml' marker extracts the id safely.
    # (Renamed 'id' -> 'info_id' to avoid shadowing the builtin.)
    info_id = url.split('/')[-1].split('x.shtml')[0]
    api = 'http://jst1.58.com/counter?infoid={}'.format(info_id)
    js = requests.get(api)
    views = js.text.split('=')[-1]
    return views
def get_info_item(whos_sells):
    """Scrape and print details for every listing on the first page.

    Args:
        whos_sells: 0 for personal sellers, anything else for merchants.
            Forwarded to get_link_from(); also recorded in the 'cate' field.

    Side effects:
        Prints the URL list and then one dict per listing to stdout.
    """
    urls = get_link_from(whos_sells)
    print(urls)
    for url in urls:
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # ROBUSTNESS FIX: the original guarded indexing with find_all()
        # but then indexed a *different* select() result — if the markup
        # differs (e.g. merchant pages), select() can come back empty and
        # [0] raises IndexError. Guard on the list actually indexed.
        # NOTE: 'palce_li' looks misspelled but matches the site's real
        # class name, so it is kept byte-for-byte.
        price_tags = soup.select('span.price_now i')
        addr_tags = soup.select('div.palce_li span i')
        data = {
            # soup.title is None when the page has no <title>.
            'title': soup.title.text if soup.title else None,
            'price': price_tags[0].text if price_tags else None,
            'addr': list(addr_tags[0].stripped_strings) if addr_tags else None,
            'cate': 'person' if whos_sells == 0 else 'seller',
            'views': get_views_from(url),
        }
        print(data)
# Guard the entry point so importing this module does not fire off a
# network scrape as a side effect; behavior when run as a script is unchanged.
if __name__ == '__main__':
    get_info_item(0)
    # get_link_from(1)
    # get_views_from(url)
当get_info_item()传入参数为0时,可爬取个人第一页数据,当传入参数为1时,按道理说应该爬取商家第一页信息,但是结果却出错,可能是因为商家信息爬取时标签不一样,导致爬取失败,后续将会进行代码优化