爬取租房信息

from bs4 import BeautifulSoup
import requests
import time

def _sex_from_classes(classes):
    """Translate the CSS classes of the host's gender badge into a label.

    Args:
        classes: The value of the badge element's ``class`` attribute as
            returned by BeautifulSoup (a list of class names), or ``None``
            when the attribute is absent.

    Returns:
        ``"man"`` when the first class is ``member_ico1``, ``"unknown"`` when
        there is no class (missing attribute, empty list, or empty string),
        and ``"women"`` for any other class.
    """
    # BeautifulSoup may give None (no attribute) or [] (empty attribute);
    # indexing either one directly would raise, so normalise first.
    first_class = classes[0] if classes else ''
    if first_class == 'member_ico1':
        return "man"
    if first_class == '':
        return "unknown"
    return "women"


def url_get(url_number, delay=4, timeout=10):
    """Scrape xiaozhu.com Beijing short-term rental listings and print them.

    Listing pages ``1`` up to, but not including, ``url_number`` are
    downloaded. Every link found in a page's ``#page_list`` is then fetched
    and its title, address, price, first image, host avatar, host name and
    host gender are printed as a dict.

    Args:
        url_number: Exclusive upper bound of the page numbers to visit;
            ``url_get(2)`` visits page 1 only, ``url_get(1)`` visits nothing.
        delay: Seconds to sleep before every HTTP request.
        timeout: Seconds to wait for each HTTP response before ``requests``
            raises a timeout error instead of blocking forever.

    Returns:
        None. Results are written to standard output with ``print``.
    """
    list_urls = [
        'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(page)
        for page in range(1, url_number)
    ]
    for list_url in list_urls:
        time.sleep(delay)
        wb_data = requests.get(list_url, timeout=timeout)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        for link in soup.select('div[id="page_list"] > ul > li > a'):
            href = link.get('href')
            if not href:
                # An anchor without a target cannot be fetched; skip it.
                continue
            time.sleep(delay)
            wb_data1 = requests.get(href, timeout=timeout)
            # BeautifulSoup needs the markup text, not the Response object.
            soup1 = BeautifulSoup(wb_data1.text, 'lxml')
            titles = soup1.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
            addresses = soup1.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span')
            prices = soup1.select('div.day_l > span')
            imgs = soup1.select('div.pho_show_l > div > div > img')
            human_imgs = soup1.select('div.js_box.clearfix > div.member_pic > a > img')
            names = soup1.select('div.js_box.clearfix > div.w_240 > h6 > a')
            sexs = soup1.select(' div.js_box.clearfix > div.member_pic > div')
            # zip stops at the shortest list, so a page missing any one of
            # these elements produces no output.
            for title, address, price, img, human_img, name, sex in zip(
                    titles, addresses, prices, imgs, human_imgs, names, sexs):
                data = {
                    'title': title.get_text(),
                    'address': address.get_text().strip(),
                    'price': price.get_text(),
                    'img': img.get('src'),
                    'human_img': human_img.get('src'),
                    'name': name.get_text(),
                    'sex': _sex_from_classes(sex.get('class')),
                }
                print(data)
# Run the scraper; the page range inside url_get is range(1, 2), i.e. page 1.
url_get(url_number=2)

重点

  • 性别判断
    sex.get('class') 返回的是一个列表(class 是多值属性),
    所以要用 sex.get('class')[0] 取出第一个类名再进行比较
  • CSS 选择器中,class 用点号表示,例如 div.day_l
    其他属性用方括号表示,例如 div[id="page_list"]
  • strip() 用来去除字符串首尾的空白符(不会去除中间的空白)

你可能感兴趣的:(爬取租房信息)