爬取租房信息
需要提取的信息包括:
标题、地址、价格、第一张房源图片、房东名字、房东性别、房东图片。
完整代码如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# get info from a multipage website
#
from bs4 import BeautifulSoup
import requests
import os
import urllib
def getemtext(element):
    """Return the text of ``element`` with surrounding whitespace removed.

    ``element`` is any object exposing a ``get_text()`` method whose result
    is a string (in this script, a BeautifulSoup tag).
    """
    raw_text = element.get_text()
    return raw_text.strip()
def getgender(element):
    """Translate a landlord gender-icon element into a gender label.

    Args:
        element: Object exposing ``get("class")`` (in this script, a
            BeautifulSoup tag). The class value may be a list of class
            names, a whitespace-separated string, or ``None`` when the
            attribute is absent.

    Returns:
        ``'male'`` if the classes include ``member_boy_ico``, ``'female'``
        if they include ``member_girl_ico``, otherwise ``'not sure'``.
    """
    classes = element.get("class") or []
    if isinstance(classes, str):
        classes = classes.split()
    # Membership instead of exact list equality: the icon class may be
    # accompanied by other class names on the same element.
    if "member_boy_ico" in classes:
        return 'male'
    if "member_girl_ico" in classes:
        return 'female'
    return 'not sure'
def geturls(url, timeout=10):
    """Collect the listing links found on one search-result page.

    Args:
        url: Address of the search-result page to fetch.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled connection would block the crawl
            indefinitely.

    Returns:
        A list with the ``href`` of every anchor matched by
        ``#page_list > ul > li > a``; anchors without an ``href`` are
        skipped so callers never receive ``None``.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "lxml")
    anchors = soup.select('#page_list > ul > li > a')
    hrefs = (anchor.get('href') for anchor in anchors)
    return [href for href in hrefs if href]
def downimg(url, filename, path='xiaozhuimg'):
    """Download ``url`` and store it as ``filename`` inside ``path``.

    Args:
        url: Address of the resource to download.
        filename: Name for the saved file. Path separators in it are
            replaced with ``_`` so it always names a file directly inside
            ``path`` (listing titles are used as file names by the caller).
        path: Target directory; it is created when it does not exist yet.

    Returns:
        The full path of the saved file.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    safe_name = filename
    for separator in (os.sep, os.altsep):
        if separator:
            safe_name = safe_name.replace(separator, '_')

    os.makedirs(path, exist_ok=True)
    target = os.path.join(path, safe_name)
    urllib.request.urlretrieve(url, target)
    return target
def get_target_info(url, timeout=10):
    """Scrape one listing page, print its details and download its photo.

    Args:
        url: Address of the listing detail page.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled connection would block the crawl
            indefinitely.

    Returns:
        A dict with the keys ``title``, ``address``, ``image``, ``price``,
        ``lordname``, ``gender`` and ``lordpic``.

    Raises:
        IndexError: If the page lacks any of the elements selected below.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "lxml")

    # Every field is looked up inside the div.con_bg wrapper, so short
    # selectors relative to it replace full paths from the document root.
    wrap = soup.select('div.con_bg')[0]
    title = wrap.select('h4 > em')[0]
    address = wrap.select('p')[0]
    image = wrap.select('div.pho_show_big img')[0]
    price = wrap.select('div.day_l span')[0]
    lordname = wrap.select('a.lorder_name')[0]
    gender = wrap.select('div.w_240 span')[0]
    lordpic = wrap.select('div.member_pic img')[0]

    data = {
        "title": getemtext(title),
        # The address is read from the element's ``title`` attribute.
        "address": address.get('title'),
        "image": image.get('src'),
        "price": getemtext(price),
        "lordname": getemtext(lordname),
        "gender": getgender(gender),
        "lordpic": lordpic.get('src'),
    }
    print(data)
    # The listing photo is saved under the listing title.
    downimg(data['image'], data['title'] + '.jpg')
    return data
if __name__ == "__main__":
    # Make sure the image directory exists before any download starts;
    # exist_ok removes the separate existence check and the window between
    # checking and creating.
    os.makedirs('xiaozhuimg', exist_ok=True)

    # range(1, 2) yields only page 1; widen the range to crawl more pages.
    urls = [
        "http://sh.xiaozhu.com/search-duanzufang-p{}-0/".format(pageid)
        for pageid in range(1, 2)
    ]
    for url in urls:
        # Each search page yields listing links; scrape every one in turn.
        for urlitem in geturls(url):
            get_target_info(urlitem)
运行结果
{'lordpic': 'http://image.xiaozhustatic1.com/21/6,0,31,917,375,376,2ae24142.jpg', 'title': '近中山公园地铁站 超便宜两房 干净清爽', 'price': '418', 'gender': 'not sure', 'lordname': '雅居时代', 'image': 'http://image.xiaozhustatic1.com/00,800,533/6,0,6,1584,1799,1200,a1000765.jpg', 'address': '上海市长宁区中山西路380号'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/4,0,21,8378,260,260,887b46dc.jpg', 'title': '看外滩的超大阳台 沙逊1935的河滨大楼', 'price': '538', 'gender': 'male', 'lordname': '王老闆', 'image': 'http://image.xiaozhustatic1.com/00,800,533/4,0,68,8498,1798,1200,ca0c88ef.jpg', 'address': '上海市虹口区北苏州路400号'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/4,0,21,8378,260,260,887b46dc.jpg', 'title': '外滩边的钢琴房 英国1931的披亚斯公寓', 'price': '458', 'gender': 'male', 'lordname': '王老闆', 'image': 'http://image.xiaozhustatic1.com/00,800,533/1,0,20,5385,825,550,1aeb14df.jpg', 'address': '上海市虹口区蟠龙街'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/6,0,97,1310,488,488,b24e1e39.jpg', 'title': '居有故事老洋房,逛法租界巨富长', 'price': '558', 'gender': 'female', 'lordname': '上海乔安娜', 'image': 'http://image.xiaozhustatic1.com/00,800,533/6,0,22,2044,1800,1200,a83dd3ef.jpg', 'address': '上海市徐汇区长乐路'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/6,0,2,3806,363,364,84490794.jpg', 'title': '[今日特价]床大也就算了,还出门就是地铁', 'price': '158', 'gender': 'male', 'lordname': 'DP', 'image': 'http://image.xiaozhustatic1.com/00,800,533/3,0,1,3234,1800,1200,96f38390.jpg', 'address': '上海市普陀区长征金沙江路金沙江路2299弄金沙雅苑(距和美广场,祁连山南路地铁站80米,距华东师大3站地铁,虹桥火车站近在咫)'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/2,0,9,2490,375,375,98d3731a.jpg', 'title': '外滩,豫园,新天地,旅游温馨便利两居室', 'price': '730', 'gender': 'male', 'lordname': 'linzhijing', 'image': 'http://image.xiaozhustatic1.com/00,800,533/4,0,65,6603,1800,1202,7bb65f83.jpg', 'address': '上海市黄浦区中华路868弄'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/3,0,14,4883,333,333,9396fb91.jpg', 'title': '1905CAFE碧云店【轰趴民宿!迪士尼】', 'price': '1299', 'gender': 'female', 'lordname': 'Grace0428', 'image': 'http://image.xiaozhustatic1.com/00,800,533/3,0,74,4931,800,580,7c65a07b.jpg', 'address': '上海市浦东区金桥路255号(红星美凯龙斜对面)'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/6,0,37,4020,260,260,6222df08.jpg', 'title': '两湾城苏州河火车站3号4号地铁2室2厅百平', 'price': '380', 'gender': 'female', 'lordname': 'gjg166', 'image': 'http://image.xiaozhustatic1.com/00,800,533/6,0,69,2462,1800,1200,cb2961d1.jpg', 'address': '上海市普陀区中潭路99弄'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/6,0,31,898,260,260,6eb4c9e3.jpg', 'title': '限时特价!8号世博家园阳光大床房荷马的家', 'price': '188', 'gender': 'male', 'lordname': '荷马', 'image': 'http://image.xiaozhustatic1.com/00,800,533/6,0,79,1089,1798,1200,06cad89e.jpg', 'address': '上海市闵行区浦申路200弄35号'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/4,0,22,9530,287,287,1e493ceb.jpg', 'title': '【玫瑰】10号线图书馆复兴西路老洋房舒适单间', 'price': '358', 'gender': 'female', 'lordname': 'XiaoV123', 'image': 'http://image.xiaozhustatic1.com/00,802,533/5,0,48,2180,1954,1300,2efaf75f.jpg', 'address': '上海市徐汇区复兴西路44弄'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/5,0,80,656,375,375,b7a3385b.jpg', 'title': '浦东机场迪士尼野生动物园精致2房', 'price': '338', 'gender': 'male', 'lordname': '刘国庆930', 'image': 'http://image.xiaozhustatic1.com/00,800,533/5,0,24,2512,1798,1200,bef9206f.jpg', 'address': '上海市浦东区惠南镇拱海路79弄'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/6,0,31,898,260,260,6eb4c9e3.jpg', 'title': '限时特价!近迪斯尼温馨大床房——荷马的家', 'price': '168', 'gender': 'male', 'lordname': '荷马', 'image': 'http://image.xiaozhustatic1.com/00,800,533/6,0,99,1135,1798,1200,25015c95.jpg', 'address': '上海市闵行区浦申路200弄35号'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/6,0,8,1972,260,260,3bb730b0.jpg', 'title': '精致豪华宫廷风私家单间附带独卫 共享超大茶厅', 'price': '299', 'gender': 'not sure', 'lordname': 'spring4dyu', 'image': 'http://image.xiaozhustatic1.com/00,800,533/3,0,4,3787,1800,1200,c0a10e80.jpg', 'address': '上海市长宁区番禺路118弄'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/4,0,64,7870,260,260,3eef612f.jpg', 'title': '烏魯木齊路老式壁爐老洋房·岁月苒·', 'price': '358', 'gender': 'female', 'lordname': 'vivion', 'image': 'http://image.xiaozhustatic1.com/00,800,533/4,0,36,8138,1798,1200,0088fdf3.jpg', 'address': '上海市徐汇区烏魯木齊路157號'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/5,0,27,1772,329,329,82f2d686.jpg', 'title': '两晚立减!2站直达迪士尼|米奇亲子主题两居室', 'price': '480', 'gender': 'female', 'lordname': 'lilac120', 'image': 'http://image.xiaozhustatic1.com/00,800,533/6,0,4,3743,1334,889,8027e82c.jpg', 'address': '上海市浦东区周东路'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/5,0,90,2029,281,280,9fd7e546.jpg', 'title': '驴友小憩1:市中心地铁口电梯景观房客厅床', 'price': '150', 'gender': 'female', 'lordname': 'Hecher', 'image': 'http://image.xiaozhustatic1.com/00,800,533/4,0,92,6498,1798,1200,7295d69a.jpg', 'address': '上海市静安区新客站卓悦居1235座(太阳city正对面,太平洋百货(嘉里不夜城)斜对面20米)'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/3,0,94,779,375,375,5ce07ca6.jpg', 'title': '国家会展中心 虹桥火车站附近的景观房', 'price': '368', 'gender': 'female', 'lordname': '小薇儿的家', 'image': 'http://image.xiaozhustatic1.com/00,800,533/4,0,83,7556,1800,1202,56561915.jpg', 'address': '上海市青浦区农房西郊半岛'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/2,0,12,1234,382,382,edf77c10.jpg', 'title': '近交大徐家汇商圈&虹桥路地铁口小窝', 'price': '138', 'gender': 'female', 'lordname': '戒不掉的奶茶', 'image': 'http://image.xiaozhustatic1.com/00,800,533/5,0,54,67,1798,1200,2544e0b6.jpg', 'address': '上海市徐汇区鑫城苑'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/4,0,2,9380,374,374,194746bc.jpg', 'title': '人民广场外滩上海火车站地铁站的豪华公寓', 'price': '618', 'gender': 'female', 'lordname': '空白色', 'image': 'http://image.xiaozhustatic1.com/00,800,533/4,0,72,7858,1800,1202,33ae0531.jpg', 'address': '上海市闸北区宝昌路'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/5,0,44,63,373,373,54bc8c68.jpg', 'title': '2号线虹桥机场火车站凌空soho携程临空园区', 'price': '218', 'gender': 'female', 'lordname': '妮小妮', 'image': 'http://image.xiaozhustatic1.com/00,800,533/5,0,79,629,1798,1200,5d804f61.jpg', 'address': '上海市长宁区福泉路123弄'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/3,0,66,4482,329,329,56fd6eea.jpg', 'title': '浦东新博展|1妇婴|迪士尼|2号线龙阳路站', 'price': '336', 'gender': 'male', 'lordname': '星期日', 'image': 'http://image.xiaozhustatic1.com/00,801,533/3,0,15,4687,1800,1198,d291672d.jpg', 'address': '上海市浦东区鹏裕苑'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/4,0,27,9096,375,376,5854cbec.jpg', 'title': '【纳美】2号线迪士尼 近浦东机场温馨两房', 'price': '498', 'gender': 'not sure', 'lordname': 'Piccolo_Wong', 'image': 'http://image.xiaozhustatic1.com/00,800,533/6,0,72,2348,1800,1201,44ed24dc.jpg', 'address': '上海市浦东区川沙新镇南桥路707弄永达城市公寓'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/6,0,99,710,260,260,ee0b2a8b.jpg', 'title': '2号地铁世纪公园,近新国际博览中心上海科技馆', 'price': '268', 'gender': 'female', 'lordname': 'bridgetzhang', 'image': 'http://image.xiaozhustatic1.com/00,800,533/6,0,52,2061,1800,1200,979fb7e4.jpg', 'address': '上海市浦东区花木路718弄'}
{'lordpic': 'http://image.xiaozhustatic1.com/21/6,0,31,898,260,260,6eb4c9e3.jpg', 'title': '特价! 8号线地铁35分达人民广场 荷马的家', 'price': '148', 'gender': 'male', 'lordname': '荷马', 'image': 'http://image.xiaozhustatic1.com/00,800,533/6,0,49,1103,1798,1200,f67feb0b.jpg', 'address': '上海市闵行区浦申路200弄35号'}
总结
- urlretrieve 的用法
Python 3(需要先 import urllib.request):
urllib.request.urlretrieve(url, os.path.join(path, filename))
Python 2:
urllib.urlretrieve(url, os.path.join(path, filename))
- BeautifulSoup 选择目标标签的简单写法:先选出外层容器 wrap,再在其中使用较短的相对选择器,例如:
title = wrap.select('h4 > em')[0]