W1-3作业

from bs4 import BeautifulSoup
import requests
import re

# CSS selector for the links to detail pages on a listing (search result) page.
_DETAIL_LINK_SELECTOR = '#page_list > ul > li > a'

# Maps the first CSS class of the owner's badge <div> to a gender label.
# Any other class falls back to an empty string.
_GENDER_BY_CLASS = {
    'member_ico': 'man',
    'member_ico1': 'woman',
}


def _fetch_soup(url, timeout):
    """Download *url* and return its HTML parsed by BeautifulSoup with lxml."""
    # Without a timeout, requests waits indefinitely on a stalled server,
    # which would hang a crawl over many pages.
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'lxml')


def _detail_urls(list_page_url, timeout):
    """Return the href of every detail-page link found on one listing page."""
    soup = _fetch_soup(list_page_url, timeout)
    return [link.get('href') for link in soup.select(_DETAIL_LINK_SELECTOR)]


def _parse_detail(soup):
    """Extract one record (dict) from a parsed detail page.

    Raises IndexError if any of the expected elements is missing, because
    each field is taken from the first match of its selector.
    """
    title = soup.select(
        'body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em'
    )[0].get_text()
    # Spaces are removed and the final character is dropped
    # (presumably a trailing newline -- confirm against the live page).
    address = soup.select(
        'body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span.pr5'
    )[0].get_text().replace(' ', '')[:-1]
    price = soup.select('#pricePart > div.day_l > span')[0].get_text()
    house_image = soup.select(
        '#detailImageBox > div.pho_show_r > div > ul > li:nth-of-type(2) > img'
    )[0].get('data-src')
    owner_image = soup.select(
        '#floatRightBox > div.js_box.clearfix > div.member_pic > a > img'
    )[0].get('src')
    owner_name = soup.select(
        '#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a'
    )[0].get_text()
    # get('class') returns a list of class names; the first one encodes gender.
    gender_class = soup.select(
        '#floatRightBox > div.js_box.clearfix > div.member_pic > div'
    )[0].get('class')[0]

    return {
        'title': title,
        'address': address,
        'price': price,
        'house_image': house_image,
        'owner_image': owner_image,
        'owner_name': owner_name,
        'gender': _GENDER_BY_CLASS.get(gender_class, ''),
    }


def _scrape_details(detail_urls, timeout):
    """Fetch and parse each detail page, printing every record as it is built."""
    info = []
    for detail_url in detail_urls:
        record = _parse_detail(_fetch_soup(detail_url, timeout))
        print(record)
        info.append(record)
    return info


def singal_url(url_arg, timeout=10):
    """Scrape every detail page linked from a single listing page.

    Args:
        url_arg: URL of the listing page.
        timeout: Value passed to ``requests.get(..., timeout=...)`` for
            every HTTP request.

    Returns:
        A list of dicts with the keys ``title``, ``address``, ``price``,
        ``house_image``, ``owner_image``, ``owner_name`` and ``gender``,
        in the order the links appear on the listing page.
    """
    return _scrape_details(_detail_urls(url_arg, timeout), timeout)


def some_url(urls_arg, timeout=10):
    """Scrape every detail page linked from several listing pages.

    All listing pages are fetched first to collect the detail links; the
    detail pages are then fetched in that collected order.

    Args:
        urls_arg: Iterable of listing-page URLs.
        timeout: Value passed to ``requests.get(..., timeout=...)`` for
            every HTTP request.

    Returns:
        A list of dicts shaped like the ones returned by ``singal_url``.
        An empty iterable yields an empty list without any network access.
    """
    urls = []
    for list_page_url in urls_arg:
        urls.extend(_detail_urls(list_page_url, timeout))
        # Progress output: the cumulative list of detail links so far.
        print(urls)
    return _scrape_details(urls, timeout)



# The bare string literal below acts as a section label ("single listing page").
'''
单张列表页
'''
# Disabled example: scrape every detail page linked from one listing page.
# url = 'http://bj.xiaozhu.com/search-duanzufang-p1-0/'
# singal_info = singal_url(url)
# print(singal_info)


# The bare string literal below acts as a section label ("multiple listing pages").
'''
多张列表页
'''
# Disabled example: build the URLs of listing pages 1 through 300 and scrape them all.
# urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(i) for i in range(1,301,1)]
# some_info = some_url(urls)
# print(some_info)

总结

  • soup.select('XXXXXXXXXXX') 返回的是元素列表,需要先取出元素(例如 [0]);元素的 get('class') 返回的是 list(class 是多值属性),而 get('src') 返回的是 str
  • 用 wb_data = requests.get(url) 获取网页响应后,再用 BeautifulSoup(wb_data.text, 'lxml') 把 HTML 文本解析成可以用选择器查询的对象

你可能感兴趣的:(W1-3作业)