1.
标题 | 说明 |
---|---|
网址 | http://sh.xiaozhu.com/search-duanzufang-p1-0/ |
要求1 | 爬取前3页的数据并存储到mongodb中 |
要求2 | 从mongodb中筛选房价大于500元的房源并打印出来 |
2. 分析
- 需要导入连接mongodb的库
- 页面随着p后面的数字变化而变化
- 每个列表页中有24条房源链接
- 详情页中的房东性别根据 `member_girl_ico`、`member_boy_ico` 这两个 class 来判断
3. 实现
# vim spider_xiaozhu.py
代码
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
__author__ = 'jhw'
# 导入连接mongodb的库
from pymongo import MongoClient
from bs4 import BeautifulSoup
import requests
# Connect to the mongodb server
client = MongoClient(host='10.66.17.17', port=27017)
# Select the database
database = client['xiaozhu']
# Select the collection
item_info = database['item_info_sh']
# Request headers carrying a browser User-Agent, passed to every requests.get call
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
}
# Define the function that collects the listing links
def get_url_from(pages=3):
    """Collect the detail-page URLs found on the search-result pages.

    Args:
        pages: Number of search-result pages to crawl, starting at page 1.
            Defaults to 3, the page count the task asks for.

    Returns:
        A list with the ``detailurl`` attribute of every matched listing,
        in the order the listings were encountered.
    """
    url_list = []
    # Only the number after "p" changes from one result page to the next.
    urls = ['http://sh.xiaozhu.com/search-duanzufang-p{}-0/'.format(i) for i in range(1, pages + 1)]
    for url in urls:
        data = requests.get(url, headers=headers)
        soup = BeautifulSoup(data.text, 'lxml')
        links = soup.select('.result_btm_con.lodgeunitname')
        for link in links:
            detail_url = link.get('detailurl')
            # Skip nodes without the attribute so callers never receive None.
            if not detail_url:
                continue
            url_list.append(detail_url)
            print(detail_url)
    return url_list
# Define the function that scrapes one listing's details
def get_item_from(url):
    """Scrape one listing detail page, store the record in mongodb and print it.

    Every field falls back to ``None`` when its selector matches nothing.

    Args:
        url: Address of the listing detail page.
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    # Listing title
    house_titles = soup.select('.pho_info > h4 > em')
    # Listing address
    house_addrs = soup.select('.pr5')
    # Listing score
    house_scores = soup.select('em.score-rate')
    # Listing price
    house_prices = soup.select('.day_l > span')
    # First picture of the listing; the link exists but opening it directly fails
    house_pics = soup.select('.pho_show_big > div > img')
    # Landlord picture; the link exists but opening it directly fails
    landlord_imgs = soup.select('.member_pic > a > img')
    # Landlord name
    landlord_names = soup.select('div.w_240 > h6 > a')
    # Landlord Zhima credit
    landlord_zms = soup.select('.zm_ico.zm_credit')
    # Derive the landlord gender label from whichever gender icon is present
    if soup.select('.member_girl_ico'):
        landlord_gens = 'MM'
    elif soup.select('.member_boy_ico'):
        landlord_gens = 'FM'
    else:
        landlord_gens = 'FF'
    item = {
        'title': house_titles[0].get_text() if house_titles else None,
        'house_addr': house_addrs[0].get_text().strip() if house_addrs else None,
        'house_score': house_scores[0].get_text() if house_scores else None,
        # Guard on the price nodes themselves; the price is stored as an int
        # so the numeric $gt query on house_price can match it.
        'house_price': int(house_prices[0].get_text()) if house_prices else None,
        'house_pic': house_pics[0].get('src') if house_pics else None,
        'landlord_img': landlord_imgs[0].get('src') if landlord_imgs else None,
        'landlord_gen': landlord_gens,
        'landlord_name': landlord_names[0].get_text() if landlord_names else None,
        'landlord_zm': landlord_zms[0].get_text() if landlord_zms else None,
        'url': url,
    }
    # Store the record in mongodb
    item_info.insert_one(item)
    print(item)
# Gather the listing links into a list
url_list = get_url_from()
# Feed each link to get_item_from, which scrapes and stores that listing
for url in url_list:
    get_item_from(url)
# Query mongodb for the listings priced above 500 yuan and print them
expensive_houses = item_info.find({'house_price': {'$gt': 500}})
for house in expensive_houses:
    print(house)
# python3 spider_xiaozhu.py // 运行结果
结果
http://sh.xiaozhu.com/fangzi/1174953165.html
http://sh.xiaozhu.com/fangzi/1677560635.html
.
.
.
http://sh.xiaozhu.com/fangzi/2514835762.html
http://sh.xiaozhu.com/fangzi/3424890030.html
http://sh.xiaozhu.com/fangzi/3092487529.html
.
.
.
{'landlord_zm': '671', 'house_price': 466, 'title': '迪士尼|新国际博览中心|地铁零距离|舒适三房', 'house_addr': '上海市浦东区环桥路1137弄', 'url': 'http://sh.xiaozhu.com/fangzi/3266673930.html', 'landlord_gen': 'FM', 'landlord_img': 'http://image.xiaozhustatic1.com/21/4,0,68,7037,374,374,1d69c0b2.jpg', 'house_score': '5分', '_id': ObjectId('577ba20b3dd54e6a47569fec'), 'landlord_name': '水中花7089', 'house_pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,50,4614,1798,1200,021d8c01.jpg'}
{'landlord_zm': '731', 'house_price': 198, 'title': '打浦桥#日月光#田子坊#瑞金南路独卫主卧', 'house_addr': '上海市徐汇区瑞金南路546号海晖公寓', 'url': 'http://sh.xiaozhu.com/fangzi/2792949163.html', 'landlord_gen': 'MM', 'landlord_img': 'http://image.xiaozhustatic1.com/21/2,0,12,1234,382,382,edf77c10.jpg', 'house_score': '5分', '_id': ObjectId('577ba20c3dd54e6a47569fed'), 'landlord_name': '戒不掉的奶茶', 'house_pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,64,797,1798,1200,d21c9325.jpg'}
.
.
.
{'landlord_zm': '737', 'house_price': 730, 'title': '外滩,豫园,新天地,旅游温馨便利两居室', 'house_addr': '上海市黄浦区中华路868弄', 'url': 'http://sh.xiaozhu.com/fangzi/1882270235.html', 'landlord_gen': 'FM', 'landlord_img': 'http://image.xiaozhustatic1.com/21/2,0,9,2490,375,375,98d3731a.jpg', 'house_score': '5分', '_id': ObjectId('577ba20c3dd54e6a47569fee'), 'landlord_name': 'linzhijing', 'house_pic': 'http://image.xiaozhustatic1.com/00,800,533/4,0,65,6603,1800,1202,7bb65f83.jpg'}
.
.
.
{'house_price': 566, 'title': '迪士尼|新国展|中式温馨三房', 'landlord_gen': 'FM', 'house_addr': '上海市浦东区秀沿路2585弄', 'landlord_name': '水中花7089', 'landlord_zm': '671', 'landlord_img': 'http://image.xiaozhustatic1.com/21/4,0,68,7037,374,374,1d69c0b2.jpg', 'house_pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,22,5143,1798,1200,706a8741.jpg', 'url': 'http://sh.xiaozhu.com/fangzi/3267192930.html', '_id': ObjectId('577b83833dd54e5ba1aa663a'), 'house_score': '5分'}
{'house_price': 650, 'title': '法租界 衡山路【复古法式老洋房】带花园', 'landlord_gen': 'MM', 'house_addr': '上海市徐汇区永嘉新村', 'landlord_name': 'CarolineCheung', 'landlord_zm': '724', 'landlord_img': 'http://image.xiaozhustatic1.com/21/3,0,31,877,375,375,e1511563.jpg', 'house_pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,70,2817,1800,1200,a2d1b86f.jpg', 'url': 'http://sh.xiaozhu.com/fangzi/1265282535.html', '_id': ObjectId('577b83843dd54e5ba0aa663b'), 'house_score': None}
{'house_price': 1680, 'title': '豪华联排别墅虹桥火车站虹桥机场国家会展中心', 'landlord_gen': 'MM', 'house_addr': '上海市青浦区徐泾明珠路555弄(国家会展中心西4公...', 'landlord_name': 'jenny0103', 'landlord_zm': '751', 'landlord_img': 'http://image.xiaozhustatic1.com/21/2,0,73,2626,375,375,ef9e175b.jpg', 'house_pic': 'http://image.xiaozhustatic1.com/00,800,533/3,0,17,5911,1798,1200,d8d5b9d0.jpg', 'url': 'http://sh.xiaozhu.com/fangzi/1682762135.html', '_id': ObjectId('577b83863dd54e5ba0aa663f'), 'house_score': None}
{'house_price': 730, 'title': '外滩,豫园,新天地,旅游温馨便利两居室', 'landlord_gen': 'FM', 'house_addr': '上海市黄浦区中华路868弄', 'landlord_name': 'linzhijing', 'landlord_zm': '737', 'landlord_img': 'http://image.xiaozhustatic1.com/21/2,0,9,2490,375,375,98d3731a.jpg', 'house_pic': 'http://image.xiaozhustatic1.com/00,800,533/4,0,65,6603,1800,1202,7bb65f83.jpg', 'url': 'http://sh.xiaozhu.com/fangzi/1882270235.html', '_id': ObjectId('577ba20c3dd54e6a47569fee'), 'house_score': '5分'}
.
.
.
{'house_price': 558, 'title': '居有故事老洋房,逛法租界巨富长 ', 'landlord_gen': 'MM', 'house_addr': '上海市徐汇区长乐路', 'landlord_name': '上海乔安娜', 'landlord_zm': '788', 'landlord_img': 'http://image.xiaozhustatic1.com/21/6,0,97,1310,488,488,b24e1e39.jpg', 'house_pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,22,2044,1800,1200,a83dd3ef.jpg', 'url': 'http://sh.xiaozhu.com/fangzi/2980986063.html', '_id': ObjectId('577ba20d3dd54e6a47569fef'), 'house_score': '5分'}
4. 总结
- python借助于pymongo连接mongodb, 连接时指定IP地址和端口
- mongodb中的表用 `collection` 表示
- mongodb常用的命令:
  - `find`: 查询文档
    - `database.collection.find()` 查找某collection中的所有数据
    - `database.collection.find({}, {'_id': 1, 'field': 1})` 只显示所有数据中的 `_id` 和 `field` 字段
  - 条件操作符:
    - (>) 大于 - `$gt`
    - (<) 小于 - `$lt`
    - (>=) 大于等于 - `$gte`
    - (<=) 小于等于 - `$lte`
MongoDB 与 RDBMS Where 语句比较
如果你熟悉常规的 SQL 数据,通过下表可以更好的理解 MongoDB 的条件语句查询:
操作 | 格式 | 范例 | RDBMS中的类似语句 |
---|---|---|---|
等于 | `{<key>:<value>}` | `db.col.find({"by":"菜鸟教程"}).pretty()` | `where by = '菜鸟教程'` |
小于 | `{<key>:{$lt:<value>}}` | `db.col.find({"likes":{$lt:50}}).pretty()` | `where likes < 50` |
小于或等于 | `{<key>:{$lte:<value>}}` | `db.col.find({"likes":{$lte:50}}).pretty()` | `where likes <= 50` |
大于 | `{<key>:{$gt:<value>}}` | `db.col.find({"likes":{$gt:50}}).pretty()` | `where likes > 50` |
大于或等于 | `{<key>:{$gte:<value>}}` | `db.col.find({"likes":{$gte:50}}).pretty()` | `where likes >= 50` |
不等于 | `{<key>:{$ne:<value>}}` | `db.col.find({"likes":{$ne:50}}).pretty()` | `where likes != 50` |