Python Crawler: Site-Wide Scraping of Anjuke New-Home Listing Data

This script performs a site-wide crawl of Anjuke's new-home listings, collecting new-home data for every city in the country.

The extracted fields all come from the first-level listing pages; no second-level (detail) pages are mined.
During a sustained crawl a captcha occasionally appears, but it is a friendly one: just drag the
slider image in a browser and the script can resume. Over the entire crawl I only had to do this
twice, so Anjuke counts as a very crawler-friendly site; the captcha interruptions could also be
avoided with proxy IPs. Natural next steps would be to add proxy IPs, add multithreading (a minimal
sketch of this follows the script below), and mine the second-level detail pages.
Thanks go to friendly Anjuke for giving beginners like me a site to practice on.
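
As a rough illustration of the proxy idea, requests accepts a proxies mapping; the address below is a placeholder, not a real endpoint, and you would substitute one from your own proxy pool:

import requests

proxies = {
    "http": "http://127.0.0.1:8888",   # placeholder proxy address
    "https": "http://127.0.0.1:8888",
}
response = requests.get("https://www.anjuke.com/sy-city.html",
                        proxies=proxies, timeout=10)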

"""
整站爬取安居客的新房数据
"""

import requests
import re
from pyquery import PyQuery as pq
import csv


headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}


# Fetch the page that lists all cities
def get_city_page():
    city_url = "https://www.anjuke.com/sy-city.html"
    # The timeout keeps a hung connection from stalling the whole crawl
    response = requests.get(city_url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.content.decode("utf-8")
    return None
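

# Optional hardening (a sketch, not part of the original script): a fetch
# helper that retries transient network errors a few times before giving up.
def get_with_retry(url, retries=3):
    for _ in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response.content.decode("utf-8")
        except requests.RequestException:
            continue
    return None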


# Parse the city list page, yielding each city's name and new-home URL
def parse_city_name(html):
    doc = pq(html)
    items = doc(".letter_city ul li .city_list a").items()
    pattern = re.compile(r"https://(.*?)\.anjuke", re.S)
    for item in items:
        city_name = item.text()
        city_url = item.attr.href

        # Rewrite the city homepage URL into the new-home subdomain,
        # e.g. "https://bj.anjuke.com/" -> "bj" -> "https://bj.fang.anjuke.com/"
        city_english_name = re.search(pattern, city_url).group(1)
        city_fang_url = "https://" + city_english_name + ".fang.anjuke.com/"

        yield {
            "city_name": city_name,
            # "city_url": city_url,
            # "city_english_name": city_english_name,
            "city_fang_url": city_fang_url,
        }
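

# Illustrative shape of one yielded record (values hypothetical):
#   {"city_name": "北京", "city_fang_url": "https://bj.fang.anjuke.com/"}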


# Fetch one new-home listings page for a city
def get_city_fang_page(url):
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.content.decode("utf-8")
    return None


# Parse one listings page, yielding a record per property
def parse_first_page(html):
    doc = pq(html)
    items = doc(".key-list.imglazyload .item-mod").items()
    for item in items:
        # item is already a PyQuery object, so sub-select on it directly
        name = item(".infos .lp-name .items-name").text()
        address = item(".infos .address .list-map").text()
        shape = item(".infos .huxing span").text()
        if shape:
            shape = shape.strip().replace("\r\n", "")
        price = item("a.favor-pos > p.price").text()
        around_price = item(".favor-pos .around-price").text()
        if around_price:
            around_price = around_price.strip().replace("\r\n", "")

        yield {
            "name": name,
            "address": address.replace("\xa0", ""),  # strip non-breaking spaces
            "shape": shape,
            "price": price,
            "around_price": around_price,
        }


# Get the URL of the next page
def next_page(html):
    doc = pq(html)
    next_url = doc(".pagination .next-page").attr.href
    return next_url
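
# next_page returns None on the last page (the "next" link is absent),
# which is what terminates the pagination loop in main() below.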


# Write the CSV header row
def write_title_file():
    with open("anjuke_newhome.csv", "a+", encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f, delimiter=",")
        writer.writerow(["city_name", "name", "address", "shape", "price", "around_price"])


# Append one record to the CSV file
def write_content_file(content):
    with open("anjuke_newhome.csv", "a+", encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f, delimiter=",")
        writer.writerow([content["city_name"], content["name"], content["address"],
                         content["shape"], content["price"], content["around_price"]])
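

# An equivalent alternative (a sketch, not used by main() below): csv.DictWriter
# can write the dict records from parse_first_page directly by field name.
FIELDNAMES = ["city_name", "name", "address", "shape", "price", "around_price"]

def write_content_file_dict(content):
    with open("anjuke_newhome.csv", "a+", encoding="utf-8-sig", newline="") as f:
        csv.DictWriter(f, fieldnames=FIELDNAMES).writerow(content)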


def main():
    html = get_city_page()
    if not html:
        return
    for city in parse_city_name(html):
        city_html = get_city_fang_page(city["city_fang_url"])
        # Follow the pagination chain; city_html becomes None after the
        # last page (or on a failed request), which ends the loop.
        while city_html:
            for content in parse_first_page(city_html):
                content["city_name"] = city["city_name"]
                write_content_file(content)
            next_url = next_page(city_html)
            city_html = get_city_fang_page(next_url) if next_url else None


if __name__ == '__main__':
    write_title_file()
    main()
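
As mentioned at the top, a natural extension is to crawl cities concurrently. Below is a minimal sketch using the standard library's ThreadPoolExecutor and the functions defined above; each worker collects one city's rows and the writing happens in the main thread, so the CSV file is never written from two threads at once:

from concurrent.futures import ThreadPoolExecutor

def crawl_city(city):
    # Collect every record for one city instead of writing immediately
    rows = []
    city_html = get_city_fang_page(city["city_fang_url"])
    while city_html:
        for content in parse_first_page(city_html):
            content["city_name"] = city["city_name"]
            rows.append(content)
        next_url = next_page(city_html)
        city_html = get_city_fang_page(next_url) if next_url else None
    return rows

def threaded_main(workers=4):
    cities = list(parse_city_name(get_city_page()))
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for rows in pool.map(crawl_city, cities):
            for content in rows:
                write_content_file(content)

Note that more workers will trip the captcha sooner, so keep the pool small or pair it with proxies.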

Partial results: 126,916 records in total, roughly 16 MB of data.
Overall this is fairly straightforward, and well suited for beginners to practice on.
