Python实战 - 第5节:开始使用MongoDB

笔记

  • 连接数据库服务:
    client = pymongo.MongoClient('localhost', 27017)
    
  • 创建/访问数据库:
    $dbName = client['$dbName']
    
  • 创建/访问数据表:
    $tableName = $dbName['$tableName']
    
  • 插入数据:
    $tableName.insert_one(data)
    
  • 查询数据:
    $tableName.find()
    $tableName.find({'$columnName':$columnValue})
    # $lt/$lte/$gt/$gte/$ne 依次为 </<=/>/>=/!=
    $tableName.find({'$columnName':{'$lte':$value}})
    

作业

  • 爬取租房信息入库
import pymongo
from bs4 import BeautifulSoup
import requests
import time

# Connect to the MongoDB server on localhost:27017 and pick the
# "duanzufang" collection of the "xiaozhu" database.
client = pymongo.MongoClient('localhost', 27017)
xiaozhu = client['xiaozhu']
duanzufang = xiaozhu['duanzufang']

# Search-result (list) pages to crawl: page numbers 1 through 3.
urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(i) for i in range(1, 4)]


def parse_gender(gender_class):
    """Translate a CSS class name into a gender label.

    Returns '女' for 'member_ico1', '男' for 'member_ico', and '未知' for
    any other value.
    """
    known_labels = (
        ('member_ico1', '女'),
        ('member_ico', '男'),
    )
    for css_class, label in known_labels:
        if gender_class == css_class:
            return label
    return '未知'


def parse_datail_page(url):
    """Scrape one listing detail page and insert the record into MongoDB.

    Fetches ``url``, extracts the title, address, price, main picture, owner
    picture, owner name and owner gender with CSS selectors, prints the
    resulting record and inserts it into the module-level ``duanzufang``
    collection. If any required element is missing the page is reported and
    skipped instead of raising ``IndexError``. Sleeps for one second after
    every page, stored or skipped.

    Args:
        url: URL of the detail page to fetch.

    Raises:
        ValueError: If the price text cannot be converted to ``int``.
    """
    # A timeout keeps one unresponsive page from hanging the whole crawl.
    web_data = requests.get(url, timeout=10)
    soap = BeautifulSoup(web_data.text, 'lxml')

    titles = soap.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    addrs = soap.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span')
    rants = soap.select('#pricePart > div.day_l > span')
    pics = soap.select('#curBigImage')
    owner_pics = soap.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    gender_divs = soap.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')
    owner_names = soap.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')

    # A missing div, or a div without a class attribute, yields '' here,
    # which parse_gender maps to its "unknown" label.
    gender_classes = gender_divs[0].get('class') if gender_divs else None
    gender_class = gender_classes[0] if gender_classes else ''

    required = (titles, addrs, rants, pics, owner_pics, owner_names)
    if all(required):
        data = {
            'title': titles[0].get_text(),
            'addr': addrs[0].get_text().strip(),
            'rant': int(rants[0].get_text()),
            'pic': pics[0].get('src'),
            'owner_pic': owner_pics[0].get('src'),
            'owner_name': owner_names[0].get_text(),
            'gender': parse_gender(gender_class),
        }

        print(data)
        # insert to MongoDB
        duanzufang.insert_one(data)
    else:
        # At least one selector matched nothing, so there is no complete record.
        print('skip (missing fields): {}'.format(url))

    # Throttle: pause between consecutive detail-page requests.
    time.sleep(1)


def parse_list_page(url):
    """Fetch one search-result page and scrape every listing it links to.

    Selects the anchors under ``#page_list > ul > li`` and passes each
    anchor's ``href`` to ``parse_datail_page``.

    Args:
        url: URL of the list page to fetch.
    """
    # A timeout keeps one unresponsive page from hanging the whole crawl.
    web_data = requests.get(url, timeout=10)
    soap = BeautifulSoup(web_data.text, 'lxml')

    for anchor in soap.select('#page_list > ul > li > a'):
        href = anchor.get('href')
        # An anchor without an href has nothing to fetch; skip it.
        if href:
            parse_datail_page(href)


# Crawl the list pages one after another.
for list_page_url in urls:
    parse_list_page(list_page_url)

  • 过滤查询租金大于等于500的房源信息
import pymongo

# Open the "duanzufang" collection of the "xiaozhu" database on localhost:27017.
client = pymongo.MongoClient('localhost', 27017)
xiaozhu = client['xiaozhu']
duanzufang = xiaozhu['duanzufang']

# Print every stored document whose 'rant' field is greater than or equal to 500.
matching = duanzufang.find({'rant': {'$gte': 500}})
for item in matching:
    print(item)

你可能感兴趣的:(Python实战 - 第5节:开始使用MongoDB)