python_安居客区域房源均价工具(matplotlib)

让数据更有价值,就需要对抓取的信息进行适当的处理,然后展现出来。

0.打开源码,修改源码开头的 city(城市缩写)和 sheet_name(城市名称)两个变量来选择城市:

1.抓取安居客不同区域的房价,计算各区域的均价,然后通过matplotlib绘图:
python_安居客区域房源均价工具(matplotlib)_第1张图片

python_安居客区域房源均价工具(matplotlib)_第2张图片

2.python源码:

#-*- encoding=UTF-8 -*-
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import xlwt
import time
import xlrd
import matplotlib.pyplot as plt

# ---- Target city ----------------------------------------------------------
# Abbreviation of the city; it becomes the sub-domain of the listing site.
city = "xa"
# Human-readable city name; used as the Excel sheet name and in the chart title.
sheet_name = "西安"
url_area = "https://{}.fang.anjuke.com/loupan/".format(city)

# ---- Substrings used to recognise fields inside the listing HTML -----------
# NOTE(review): area_key is an empty string in this source. It looks like the
# original marker was lost when the code was published - confirm and restore
# it before running, because an empty marker matches every line.
area_key = ""
huxing_key = "huxing"
url_key = 'class="tags-wrap" href="'
panel_key = 'class="tag-panel"'
price_key = 'class="price"'
price_around_key = '"favor-tag around-price"'
tel_key = 'class="tel">'

# ---- Result columns ---------------------------------------------------------
# Parallel lists: the crawl appends to each of them and the export step reads
# them back by a shared index.
loupan_area = []     # district
loupan_title = []    # development name
loupan_address = []  # address
loupan_huxing = []   # floor plans
loupan_url = []      # detail page URL
loupan_panel = []    # panel / status tags
loupan_price = []    # price text
loupan_tel = []      # phone number

################################抓取安居客楼盘价格#################################
# Discover all districts: download the city landing page and map each district
# name to the URL prefix of its paginated listing.
# NOTE(review): area_key is empty in this source, so the filter below matches
# every line - confirm the intended marker.
anjuke_area = requests.get(url_area).text.split("\n")
area_loupan = {}
for html_line in anjuke_area:
    if area_key not in html_line:
        continue
    # The link target is the text between 'a href=' and the next '>'; a page
    # number is appended after the trailing "p" by the crawl loop.
    area_href = html_line.split("a href=")[1].split(">")[0]
    area_name = html_line.split(">")[1].split("<")[0]
    area_loupan[area_name] = area_href + "p"

# Crawl every district page by page, collect one entry per listing in the
# module-level loupan_* lists, then export the rows with a numeric price to an
# Excel workbook whose path is left in t_path.
#
# NOTE(review): the markers below are empty strings in the published source.
# They look like HTML fragments that were lost when the code was posted
# (str.split("") raises ValueError), so they must be restored before the crawl
# can work. They are named here so each one can be fixed in a single place.
title_key_start = ""   # separates one listing from the next in the page HTML
title_key_stop = ""    # marks the end of the last listing on a page
huxing_stop_key = ""   # line that ends the floor-plan section
huxing_item_key = ""   # separator between floor-plan items on one line
panel_stop_key = ""    # line that ends the tag panel (literal was broken across lines)
# NOTE(review): address_key is used below but is not defined anywhere in the
# visible source - define it next to the other *_key markers.

for ikey in area_loupan:
    # Entries whose name ends with "线" are skipped (presumably metro lines
    # rather than districts - TODO confirm). endswith() is safe on empty names.
    if ikey.endswith("线"):
        continue
    print(ikey)
    inum = 0
    sheet_count = []
    while True:
        inum = inum + 1
        real_url = area_loupan[ikey] + str(inum) + "w1_/"
        # Close the HTTP response as soon as the page has been read.
        with urlopen(real_url) as html:
            anjuke_html = BeautifulSoup(html.read())

        # Raw chunks: split every top-level node on the listing start marker.
        loupan_arr = []
        for ihtml in anjuke_html:
            data = str(ihtml).split(title_key_start)
            if len(data) > 1:
                loupan_arr.extend(data)

        # Drop the preamble chunk and cut the last chunk at the stop marker.
        dest_loupan_arr = loupan_arr[1:-1]
        if loupan_arr:
            dest_loupan_arr.append(str(loupan_arr[-1]).split(title_key_stop)[0])

        # Parse each listing chunk line by line.
        for i in dest_loupan_arr:
            price_flag = 0   # some listings have no price yet
            huxing_flag = 0  # some listings have no floor-plan info
            tel_flag = 0     # some listings have no phone number
            data = str(i).split("\n")
            for j in range(len(data)):
                if j == 0:
                    # The first line of a chunk starts with the listing name.
                    loupan_title.append(data[j].split("<")[0])
                    continue
                if address_key in data[j]:
                    loupan_address.append(data[j].split(address_key)[1].split("<")[0])
                    continue
                if huxing_key in data[j]:
                    huxing_flag = 1
                    real_j = j + 1
                    tmp_huxing_str = ""
                    # Bounded scan: stop at the end marker or at the end of
                    # the chunk instead of running past the last line.
                    while real_j < len(data) and data[real_j] != huxing_stop_key:
                        if huxing_item_key in data[real_j]:
                            for it in data[real_j].split(huxing_item_key):
                                if "<" in it:
                                    t = it.split("<")[0]
                                    if "建筑面积" in t:
                                        # The floor-area text replaces the trailing "/".
                                        tmp_huxing_str = tmp_huxing_str.rstrip("/") + " "
                                        tmp_huxing_str = tmp_huxing_str + t
                                    else:
                                        tmp_huxing_str = tmp_huxing_str + t + "/"
                        else:
                            tmp_huxing_str = tmp_huxing_str + data[real_j].strip()
                        real_j = real_j + 1
                    loupan_huxing.append(tmp_huxing_str)
                    continue
                if url_key in data[j]:
                    loupan_url.append(data[j].split(url_key)[1].split("\"")[0])
                    continue
                if panel_key in data[j]:
                    real_j = j + 1
                    tmp_panel_str = ""
                    while real_j < len(data) and data[real_j] != panel_stop_key:
                        tmp_panel_str = tmp_panel_str + data[real_j].split(">")[1].split("<")[0] + " "
                        real_j = real_j + 1
                    loupan_panel.append(tmp_panel_str.strip())
                    continue
                if price_key in data[j]:
                    price_flag = 1
                    tmp_price_str = ""
                    # Concatenate the text that precedes each tag on the line.
                    for it in data[j].split(">"):
                        tmp_price_str = tmp_price_str + it.split("<")[0]
                    loupan_price.append(tmp_price_str)
                    continue
                if price_around_key in data[j]:
                    price_flag = 1
                    # The surrounding-area price text is on the following line.
                    real_j = j + 1
                    tmp_price_str = ""
                    for it in data[real_j].split(">"):
                        tmp_price_str = tmp_price_str + it.split("<")[0].strip()
                    loupan_price.append(tmp_price_str)
                    continue
                if tel_key in data[j]:
                    tel_flag = 1
                    loupan_tel.append(data[j].split(tel_key)[1].split("<")[0])
                    continue
            # Placeholders keep these columns aligned with loupan_title.
            # NOTE(review): address, url and panel have no such placeholder, so
            # a listing missing one of them shifts those columns - verify.
            if price_flag == 0:
                loupan_price.append("售价待定")
            if huxing_flag == 0:
                loupan_huxing.append("户型未知")
            if tel_flag == 0:
                loupan_tel.append("号码未知")
            loupan_area.append(ikey)

        # Stop paging once a page holds a different number of listings than
        # the previous page did.
        if sheet_count != [] and sheet_count[-1] != len(dest_loupan_arr):
            break
        sheet_count.append(len(dest_loupan_arr))

# Build the spreadsheet rows: a header plus one row per listing with a
# numeric price.
excel_col = [[u'楼盘', u'价格', u'区域', u'户型', u'地址', u'状态', u'网址', u'电话']]
for icol in range(len(loupan_title)):
    # Prices that mention "套" are skipped.
    if "套" in loupan_price[icol]:
        continue
    # Keep only the ASCII digits of the price text.
    tmp_price = "".join(ch for ch in loupan_price[icol] if '0' <= ch <= '9')
    if tmp_price == "":
        continue
    loupan_price[icol] = int(tmp_price)
    excel_col.append([loupan_title[icol], loupan_price[icol], loupan_area[icol],
                      loupan_huxing[icol], loupan_address[icol], loupan_panel[icol],
                      loupan_url[icol], loupan_tel[icol]])

# Write the rows to a workbook.
app = xlwt.Workbook()
sheet1 = app.add_sheet(sheet_name, cell_overwrite_ok=True)
for irow, row_values in enumerate(excel_col):
    for jcol, cell_value in enumerate(row_values):
        sheet1.write(irow, jcol, cell_value)
t = time.strftime('%Y-%m-%d_%H_%M_%S')
# xlwt produces the legacy binary .xls format, so the file is named .xls
# rather than .xlsx.
t_path = "C:/bz/" + str(city) + t + ".xls"
app.save(t_path)
################################ Read the Excel data ################################
# Re-open the workbook written by the export step (path in t_path) and load
# the city's sheet.
workbook = xlrd.open_workbook(t_path)
sheet_data = workbook.sheet_by_name(sheet_name)
cols = sheet_data.col_values(0)
title = sheet_data.row_values(0)

# Column positions, looked up from the header row.
loupan_index = title.index(u'楼盘')
area_index = title.index(u'区域')
price_index = title.index(u'价格')

loupan_key = []       # listing names already seen (used to drop duplicates)
area_key = []         # distinct districts, in first-seen order
lines_data = [title]  # header row followed by the de-duplicated data rows
for icol in range(1, len(cols)):
    rows = sheet_data.row_values(icol)
    if rows[loupan_index] not in loupan_key:
        loupan_key.append(rows[loupan_index])
        lines_data.append(rows)
    if rows[area_index] not in area_key:
        area_key.append(rows[area_index])

# Average price per district; a district with no counted row gets no entry.
area_price = {}
for ikey in area_key:
    tmp_price = 0
    tmp_count = 0
    for iline in lines_data:
        if iline[area_index] == ikey:
            tmp_count = tmp_count + 1
            tmp_price = tmp_price + int(iline[price_index])
    if tmp_count != 0:
        area_price[ikey] = int(tmp_price / tmp_count)

################################ Plot ################################
# Use a font with CJK glyphs so the Chinese labels render correctly.
plt.rcParams['font.sans-serif'] = ['simHei']
plt.rcParams['axes.unicode_minus'] = False

avg_price_x = []
avg_price_y = []
for ikey in area_key:
    # Only districts that actually received an average are plotted.
    if ikey in area_price:
        avg_price_x.append(ikey)
        avg_price_y.append(area_price[ikey])

width = 0.5  # the width of the bars
x = range(len(avg_price_x))
# One inch of figure width per district; never zero, which matplotlib rejects.
fig, ax = plt.subplots(figsize=(max(len(avg_price_x), 1), 6))
rects1 = ax.bar(x, avg_price_y, width, color='yellowgreen')
ax.set_title(sheet_name + '各个区域楼盘均价')
plt.ylabel(u"区域均价(元)")
plt.xticks(x, avg_price_x)
# Print each bar's value just above it.
for rect in rects1:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height, '%d' % int(height),
            ha='center', va='bottom')
plt.show()

# 3. Planned follow-up from the article: build a GUI where the user picks a
# city and clicks a button to show that city's housing-price information.

你可能感兴趣的:(python)