Scraping information from detail pages

Build two functions: Func A scrapes the data on each detail page (title, subtitle, images, and so on), while Func B collects the links from the list pages. Then simply visit each link returned by Func B in turn.


from bs4 import BeautifulSoup
import requests
import time

def get_info(page):
    urls = get_links_from(page)   # call the link collector, then visit each detail page in turn
    for url in urls:
        web_data = requests.get(url)
        soup = BeautifulSoup(web_data.text, 'lxml')
        titles = soup.select('div.box_left_top > h1')
        views = soup.select('span.look_time')
        prices = soup.select('span.price_now > i')
        areas = soup.select('div.palce_li > span > i')
        for title, view, price, area in zip(titles, views, prices, areas):
            data = {
                'title': title.get_text(),
                'views': view.get_text(),
                'price': price.get_text(),
                'area': list(area.stripped_strings)
            }
            print(data)
        time.sleep(1)   # pause briefly between detail-page requests to avoid hammering the site

def get_links_from(page):
    urls = []                     # accumulate links across all list pages (must be initialized outside the loop)
    for page_num in range(1, page):
        list_url = 'http://wh.58.com/shandizixingche/pn{}/'.format(page_num)
        web_data = requests.get(list_url)
        soup = BeautifulSoup(web_data.text, 'lxml')
        links = soup.select('td.t a.t')
        for link in links:
            urls.append(link.get('href').split('?')[0])   # drop the query string, keep the clean detail URL
    return urls
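
To run the crawler, call get_info with the number of list pages to cover; the value below is only an illustrative choice (range(1, page) stops before page, so 5 covers list pages 1 through 4):

if __name__ == '__main__':
    get_info(5)   # scrape list pages 1-4 and print each detail page's data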
