python爬虫获取自如,爱上租固定小区房源

1. 通过模拟网页请求获取到自如,爱上租页面信息(需先导入依赖: import requests 以及 from bs4 import BeautifulSoup):
def get_html(url):
    """Download a page and return its body as text.

    The request is sent with a browser-like User-Agent header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a string, or None when the request fails
        (connection error, timeout, malformed URL, ...).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0(Windows NT 6.1; WOW64)',
    }
    try:
        # Without a timeout a stalled server would block the script forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Only network/request failures are handled here; programming errors
        # and KeyboardInterrupt are allowed to propagate.
        print('request error')
        return None
    return response.text
2. 取出房源信息,soup匹配到关键的ul字段,自如为houseList,爱上租没看到ul字段,改用div:
  1. 自如房源获取:
def get_ziru_hourse(html):
    """Extract listing names, URLs and detail strings from a Ziroom page.

    Args:
        html: HTML text of the search-result page. May be None or empty,
            for example when the download failed.

    Returns:
        A zip iterator of (house_name, house_url, details) tuples. It is
        empty when ``html`` is falsy or the page has no ``ul#houseList``.
        The three underlying lists are collected independently and paired
        by position, so the shortest list limits the number of tuples.
    """
    if not html:
        return zip([], [], [])

    soup = BeautifulSoup(html, 'lxml')
    house_list = soup.find('ul', id='houseList')
    if house_list is None:
        # find() returns None when the element is absent (layout change,
        # error page, empty result set).
        return zip([], [], [])

    # Span texts are accumulated into one detail string per listing; a span
    # containing '每月' closes the current group.
    spes = []
    parts = []
    for span in house_list.find_all('span'):
        text = span.get_text()
        parts.append(text)
        if '每月' in text:
            spes.append(' '.join(parts) + ' ')
            parts = []

    house_names = []
    house_urls = []
    for link in house_list.find_all('a'):
        house_name = link.get_text()
        if '龙湖春江' in house_name:
            house_names.append(house_name)

        house_url = link.get('href')
        if house_url is None:
            # Anchors without an href have no URL to record.
            continue
        if 'http' not in house_url:
            house_url = 'http:' + house_url
        # Skip duplicates and links whose URL contains 'youjia_fbh'.
        if house_url not in house_urls and 'youjia_fbh' not in house_url:
            house_urls.append(house_url)

    return zip(house_names, house_urls, spes)
  2. 爱上租房源获取:
def get_isz_hourse(html):
    """Extract listing names and URLs from an Ishangzu search page.

    Args:
        html: HTML text of the search-result page. May be None or empty,
            for example when the download failed.

    Returns:
        A zip iterator of (house_name, house_url) tuples. It is empty when
        ``html`` is falsy, the page has no ``div.left``, or no link text
        contains '龙湖春江'.
    """
    if not html:
        return zip([], [])

    soup = BeautifulSoup(html, 'lxml')
    container = soup.find('div', class_='left')
    if container is None:
        # find() returns None when the element is absent.
        return zip([], [])

    house_names = []
    house_urls = []
    house_name = ''
    for link in container.find_all('a'):
        span = link.get_text()
        # A link whose text contains the community name starts a new group:
        # store what was accumulated so far and start over.
        if '龙湖春江' in span:
            house_names.append(house_name)
            house_name = ''
        house_name += span + ' '

        house_url = link.get('href')
        if (house_url is not None
                and house_url not in house_urls
                and '%E9%BE%99%E6%B9%96%E6%98%A5%E6%B1%9F' not in house_url):
            house_urls.append(house_url)

    # The first stored entry is the text gathered before the first matching
    # link, so it is dropped. Slicing (instead of ``del``) is safe when
    # nothing matched.
    # NOTE(review): the group still being accumulated when the loop ends is
    # never appended, so the last listing's name is not returned - confirm
    # whether that is intended.
    return zip(house_names[1:], house_urls)

3. 最后是模块调用,以杭州龙湖春江小区为例:

if __name__=='__main__':
    # Ziroom: fetch the search page and print name, URL and details for
    # every listing returned by the parser.
    url = 'http://hz.ziroom.com/z/nl/z3-d330108.html?qwd=%E9%BE%99%E6%B9%96%E6%98%A5%E6%B1%9F'
    html = get_html(url)
    # The Ziroom parser defined in this file is get_ziru_hourse; the
    # previously called get_hourse does not exist.
    houses = get_ziru_hourse(html)
    print('自如房子:')
    for house in houses:
        print(house[0], house[1], house[2])

    # Ishangzu: same flow, but the parser yields (name, URL) pairs only.
    isz_url = 'http://www.ishangzu.com/zufang/?q=%E9%BE%99%E6%B9%96%E6%98%A5%E6%B1%9F'
    html = get_html(isz_url)
    houses = get_isz_hourse(html)
    print('爱上租房子:')
    for house in houses:
        print(house[0], house[1])

结果:

python爬虫获取自如,爱上租固定小区房源_第1张图片
ziru.jpg

源码地址:

https://github.com/Blueyin/ziru-isz-house/tree/master

你可能感兴趣的:(python爬虫获取自如,爱上租固定小区房源)