Python实战作业1-3:爬取租房信息

成果:

Python实战作业1-3:爬取租房信息_第1张图片
Snip20170522_9.png

任务:

Level 1
爬取网页 http://sh.xiaozhu.com/fangzi/1650345535.html 中的以下信息:
1、标题————title;
2、地址————address;
3、日租金————rent;
4、第一张房源图片链接————housePic;
5、房东图片链接————landlordPic;
6、房东名字————landlordName;
7、房东性别————sex。

Level 2
抓取300个房源详情:http://sh.xiaozhu.com/search-duanzufang-p1-0/

代码:

from bs4 import BeautifulSoup
import requests

def SexJudge(sex):
    """Map the landlord icon's CSS class string to a sex label.

    Returns 'man' when ``sex`` is exactly 'member_ico'; every other value
    yields 'woman'.
    """
    return 'man' if sex == 'member_ico' else 'woman'

def GetInfo(houseUrl):
    """Fetch one listing detail page and extract its key fields.

    Downloads ``houseUrl``, parses the response text with BeautifulSoup's
    ``lxml`` parser and reads the title, address, daily rent, main house
    picture URL, landlord picture URL and landlord sex via CSS selectors.
    The extracted record is printed before being returned.

    Args:
        houseUrl: URL of a listing detail page.

    Returns:
        A dict with the keys ``title``, ``address``, ``rent``, ``housePic``,
        ``landlordPic`` and ``sex``, or ``None`` when at least one of the
        selectors matches nothing on the page.
    """
    wb_data = requests.get(houseUrl)
    soup = BeautifulSoup(wb_data.text, 'lxml')

    titles = soup.select('head > title')
    addresses = soup.select('span.pr5')
    rents = soup.select('#pricePart > div.day_l > span')
    housePics = soup.select('#curBigImage')
    landlordPics = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    sexes = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')

    # Bind `data` up front: zip() yields nothing as soon as one selector has
    # no match, and the loop body would then never assign it.
    data = None
    for title, address, rent, housePic, landlordPic, sex in zip(
            titles, addresses, rents, housePics, landlordPics, sexes):
        data = {
            'title': title.get_text(),
            'address': address.get_text(),
            'rent': rent.get_text(),
            'housePic': housePic.get('src'),
            'landlordPic': landlordPic.get('src'),
            # The class attribute comes back as a list of class names; join
            # them into one string before comparing.
            'sex': SexJudge(''.join(sex.get('class'))),
        }

    if data is None:
        # Report the miss instead of crashing so one bad page does not stop
        # a long crawl.
        print('No listing data found at', houseUrl)
        return None

    print(data)
    return data

# Search-result pages 1 through 13 that the crawl starts from.
menuUrl = list(map('http://sh.xiaozhu.com/search-duanzufang-p{}-0/'.format, range(1, 14)))

def GetHouseUrl(menuUrl):
    """Collect listing detail URLs from the given search-result pages.

    Each page is downloaded and parsed with BeautifulSoup's ``lxml`` parser;
    the ``href`` of every ``#page_list > ul > li > a`` anchor is gathered.
    A progress line is printed after each page.

    Args:
        menuUrl: Iterable of search-result page URLs.

    Returns:
        A list of href strings, ordered by page and then by position within
        the page. Anchors without an ``href`` attribute are skipped.
    """
    data = []
    for n, url in enumerate(menuUrl, start=1):
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')

        houseUrls = soup.select('#page_list > ul > li > a')
        for houseUrl in houseUrls:
            href = houseUrl.get('href')
            # Append rather than insert(-1, ...): inserting at -1 places each
            # new item before the current last element, which left the very
            # first URL stranded at the end of the list.
            if href:
                data.append(href)
        print('Complete Page ', n)
    return data

# Visit every listing URL gathered from the search pages and scrape it,
# printing a running count of listings processed so far.
n = 0  # stays 0 when no listing URLs were found
for n, houseUrl in enumerate(GetHouseUrl(menuUrl), start=1):
    GetInfo(houseUrl)
    print('Complete House ', n)

你可能感兴趣的:(Python实战作业1-3:爬取租房信息)