Week 3 Assignment: Scraping Rental Listings

Scrape the details of one xiaozhu.com listing, then collect detail-page links from the short-term rental index pages.

from bs4 import BeautifulSoup
import requests


url = 'http://bj.xiaozhu.com/fangzi/1508951935.html'
wb_data = requests.get(url)                    # fetch the listing detail page
soup = BeautifulSoup(wb_data.text, 'lxml')     # parse with the lxml backend
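Note: if the site rejects the default requests user agent, sending a browser-like header usually helps. A minimal variant of the fetch above (the User-Agent value here is an illustrative placeholder, not from the original):

headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder UA string; swap in a real browser UA if needed
wb_data = requests.get(url, headers=headers, timeout=10)  # timeout avoids hanging on a slow response
soup = BeautifulSoup(wb_data.text, 'lxml')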


# Pull the listing fields from the detail page.
title = soup.select('div.pho_info > h4')[0].text
address = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p')[0].get('title')
price = soup.select('#pricePart > div.day_l > span')[0].text
pic = soup.select('#imgMouseCusor')[0].get('src')
host_name = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')[0].get('title')
# Alternative selector for the gender icon:
# host_gender = soup.select('div.member_pic > div')[0].get('class')[0]
host_gender = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > span')[0].get('class')[0]


def get_gender(gender):
    # Map the avatar icon class to a readable label.
    if gender == 'member_girl_ico':
        return '女'
    if gender == 'member_boy_ico':
        return '男'
    return '未知'  # fallback instead of returning None for an unrecognized class

# Quick sanity check of the mapping.
print(get_gender('member_girl_ico'))


data = {
    'title': title,
    'address': address,
    'price': price,
    'pic': pic,
    'host_name': host_name,
    'host_gender': get_gender(host_gender)
}

print(data)


page_link = []  # collected detail-page URLs


def get_page_link(page_number):
    # Walk the short-rent index pages 1..page_number and collect detail links.
    for each_number in range(1, page_number + 1):  # range(1, n) stops at n-1, so add 1
        full_url = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(each_number)
        wb_data = requests.get(full_url)
        soup = BeautifulSoup(wb_data.text, 'lxml')

        for link in soup.select('a.resule_img_a'):
            page_link.append(link.get('href'))  # store the URL itself, not the whole <a> tag

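To tie the two parts together, each collected link can be fed back through the same detail-page parsing. A minimal sketch, assuming the selectors above still match the live page (get_detail is a hypothetical helper, not in the original):

def get_detail(detail_url):
    # Re-run the detail-page parsing for a single listing URL.
    wb_data = requests.get(detail_url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    return {
        'title': soup.select('div.pho_info > h4')[0].text,
        'price': soup.select('#pricePart > div.day_l > span')[0].text,
        'host_gender': get_gender(soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > span')[0].get('class')[0]),
    }

get_page_link(3)  # collect links from index pages 1-3
for each_link in page_link:
    print(get_detail(each_link))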