爬虫例子——多页、函数模板

爬取地址:http://bj.xiaozhu.com/
包含信息:多页;每页24个链接
爬取要求:爬取每个链接的标题、地址、价格、图片链接、主人名称、主人性别

from bs4 import BeautifulSoup
import requests

def get_info(page_number):
    """Scrape and print the details of every listing linked from the search pages.

    The listing URLs come from ``get_page_link(page_number)``. Each listing
    page is downloaded and parsed, and one dict per listing is printed with
    the keys ``title``, ``address``, ``price``, ``pic``, ``host_name`` and
    ``host_gender``.

    Args:
        page_number: Passed through unchanged to ``get_page_link``.

    Raises:
        IndexError: If a listing page has no element matching one of the
            CSS selectors used below.
        requests.exceptions.RequestException: If a download fails or exceeds
            the timeout.
    """
    # CSS class of the host's gender icon -> label. Built once here instead
    # of redefining a helper function on every loop iteration.
    gender_by_class = {
        'member_ico1': '女',
        'member_ico': '男',
    }

    for url in get_page_link(page_number):
        # Without a timeout a stalled connection would block the crawl forever.
        wb_data = requests.get(url, timeout=10)
        soup = BeautifulSoup(wb_data.text, 'html.parser')

        data = {
            'title': soup.title.text,
            'address': soup.select('div.pho_info > p')[0].get('title'),
            'price': soup.select('div.day_l > span')[0].text,
            'pic': soup.select('#curBigImage')[0].get('src'),
            'host_name': soup.select('a.lorder_name')[0].text,
            # Any icon class other than the two known ones maps to None.
            'host_gender': gender_by_class.get(
                soup.select('div.member_pic > div')[0].get('class')[0]
            ),
        }
        print(data)

def get_page_link(page_number):
    """Collect the listing URLs found on search-result pages 1..page_number.

    Args:
        page_number: Total number of search-result pages to crawl. A value
            below 1 yields an empty list without any network access.

    Returns:
        A list with the ``href`` of every ``a.resule_img_a`` anchor, in the
        order the pages were fetched.

    Raises:
        requests.exceptions.RequestException: If a download fails or exceeds
            the timeout.
    """
    page_link = []
    # range() excludes its stop value, so use page_number + 1 to make sure
    # the last requested page is crawled as well.
    for each_number in range(1, page_number + 1):
        full_url = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(each_number)
        # Without a timeout a stalled connection would block the crawl forever.
        wb_data = requests.get(full_url, timeout=10)
        soup = BeautifulSoup(wb_data.text, 'html.parser')
        page_link.extend(
            anchor.get('href') for anchor in soup.select('a.resule_img_a')
        )
    # Return the list itself: print() returns None, which the caller could
    # not iterate over.
    return page_link
if __name__ == '__main__':
    # `page_number` was never defined at module level, so the bare call
    # raised NameError. Ask the user for the total number of pages to crawl.
    get_info(int(input('Total number of pages to crawl: ')))

部分输出:

{'title': '三里屯繁华区太古里酒吧街工体摩登时代主题一居-北京朝阳短租房|日租房 -小猪', 'host_name': 'zoehh', 'price': '398', 'address': '北京市朝阳区三里屯南路', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,61,6262,1800,1200,05be8a2a.jpg', 'host_gender': '女'}
{'title': '【独卫】望京Soho百兆宽带&宜家床垫大床房-北京朝阳短租房|日租房 -小猪', 'host_name': 'Liicy', 'price': '285', 'address': '北京市朝阳区国风北京', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,56,6219,1800,1200,27d55c0f.jpg', 'host_gender': '女'}
{'title': '望京华彩十四号线精美豪华大一居-北京朝阳短租房|日租房 -小猪', 'host_name': '想要', 'price': '395', 'address': '北京市朝阳区望京利泽西园', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,62,2913,1800,1200,4ecf03a3.jpg', 'host_gender': None}
{'title': '地铁6号线常营长楹天街豪华公寓-北京朝阳短租房|日租房 -小猪', 'host_name': '爱猪我家萍姐', 'price': '197', 'address': '北京市朝阳区6号地铁常营天街豪华公寓', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/4,0,26,6729,1800,1200,768006fe.jpg', 'host_gender': '女'}
{'title': '双井地铁口豪华欧式装修大二居-北京朝阳短租房|日租房 -小猪', 'host_name': '少寒', 'price': '596', 'address': '北京市朝阳区双井西大望路珠江帝景', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,3,992,1800,1200,32297300.jpg', 'host_gender': '男'}
{'title': '故宫东门大院古典套间,步行至故宫天安门王府井-北京东城短租房|日租房 -小猪', 'host_name': 'zhengfanwu', 'price': '998', 'address': '北京市东城区磁器库北巷', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,99,6792,1800,1200,65ec7d49.jpg', 'host_gender': '女'}
{'title': '【爱上我的家】亚运村10分钟地铁站阳光大主卧-北京朝阳短租房|日租房 -小猪', 'host_name': '新新的家', 'price': '228', 'address': '北京市朝阳区亚运村小营北路(大屯东地铁站往东500米,比邻完美世界)', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/1,0,94,4002,825,550,d2a2390e.jpg', 'host_gender': '男'}
{'title': '六道口轻民宿-与书同眠6人间-北京海淀短租房|日租房 -小猪', 'host_name': '霹雳娇娃子', 'price': '108', 'address': '北京市海淀区学院路逸成东苑', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,5,3548,1800,1200,9f2e73e2.jpg', 'host_gender': '女'}
{'title': '望京商圈,毗邻地铁5分钟,漫威主题大两居-北京朝阳短租房|日租房 -小猪', 'host_name': '想要', 'price': '395', 'address': '北京市朝阳区广顺北大街利泽西园', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,66,803,1800,1200,38a4c686.jpg', 'host_gender': None}
{'title': '独立卫浴邻798、望京、酒仙桥更多优惠房源。-北京朝阳短租房|日租房 -小猪', 'host_name': '暖阳洋Sunny', 'price': '268', 'address': '北京市朝阳区彩虹路', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/2,0,71,458,1800,1200,a9c5ea82.jpg', 'host_gender': None}

你可能感兴趣的:(Web,Spider)