让数据更有价值,就需要对抓取的信息进行适当的处理,然后展现出来。
2. Python 源码:
#-*- encoding=UTF-8 -*-
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import xlwt
import time
import xlrd
import matplotlib.pyplot as plt
# Target city: its abbreviation is used as the subdomain of the listing site.
city = 'xa'
sheet_name = '西安'
url_area = f'https://{city}.fang.anjuke.com/loupan/'

# Marker substrings searched for in the fetched HTML lines to locate each field.
# NOTE(review): area_key is empty here, and `"" in line` is always true, so
# every line matches it — the real marker may have been lost; confirm.
# NOTE(review): address_key is used by the parsing loop further down but is
# not assigned in this section — confirm where it is supposed to be defined.
area_key = ''
huxing_key = 'huxing'
url_key = 'class="tags-wrap" href="'
panel_key = 'class="tag-panel"'
price_key = 'class="price"'
price_around_key = '"favor-tag around-price"'
tel_key = 'class="tel">'
# Module-level accumulators, one list per scraped field. Each starts empty
# and is a distinct list object.
loupan_area = []      # district
loupan_title = []     # development name
loupan_address = []   # development address
loupan_huxing = []    # floor-plan description
loupan_url = []       # development detail URL
loupan_panel = []     # development tag panel
loupan_price = []     # development price
loupan_tel = []       # development telephone
################################ Scrape Anjuke development prices #################################
### Collect every district and its listing-URL prefix.
# Download the city's listing page. The timeout keeps the script from hanging
# forever on a stalled connection, and raise_for_status() stops an HTTP error
# page from being parsed as if it were the listing.
area_response = requests.get(url_area, timeout=30)
area_response.raise_for_status()
anjuke_area = area_response.text.split("\n")

# Maps the text found between the first ">" and the following "<" on a
# matching line to the text that follows "a href=" (up to the next ">"),
# with "p" appended.
area_loupan = {}
for ihtml in anjuke_area:
    if area_key not in ihtml:
        continue
    # The split()-based extraction below needs both a ">" and an "a href=" on
    # the line; skip lines that lack either instead of raising IndexError.
    if ">" not in ihtml or "a href=" not in ihtml:
        continue
    area_name = ihtml.split(">")[1].split("<")[0]
    area_link = ihtml.split("a href=")[1].split(">")[0]
    area_loupan[area_name] = area_link + "p"
for ikey in area_loupan.keys():
if ikey[-1] == "线":
continue
print(ikey)
inum = 0
sheet_count = []
while 1:
real_url = area_loupan[ikey] + str(inum + 1) + "w1_/"
inum=inum+1
html = urlopen(real_url)
anjuke_html = BeautifulSoup(html.read())
###得到原始信息
title_key_start = ""
loupan_arr = []
for ihtml in anjuke_html:
data = str(ihtml).split(title_key_start)
if len(data) > 1:
for i in data:
loupan_arr.append(i)
###筛选原始信息
title_key_stop=""
dest_loupan_arr = []
for i in range(len(loupan_arr)):
if i != 0 and i != len(loupan_arr) - 1:
dest_loupan_arr.append(loupan_arr[i])
if i == len(loupan_arr) - 1:
data = str(loupan_arr[i]).split(title_key_stop)
dest_loupan_arr.append(data[0])
###解析网页
for i in dest_loupan_arr:
price_flag = 0 ##部分楼盘售价待定
huxing_flag = 0 ##部分楼盘户型未知
tel_flag = 0 ##部分楼盘电环未知
data = str(i).split("\n")
for j in range(len(data)):
if j == 0: # loupan_key
loupan_title.append(data[j].split("<")[0])
continue
if address_key in data[j]:
loupan_address.append(data[j].split(address_key)[1].split("<")[0])
continue
if huxing_key in data[j]:
huxing_flag = 1
real_j = j + 1
tmp_huxing_str = ""
while data[real_j] != "":
if "" in data[real_j]:
tmp = data[real_j].split("")
for it in tmp:
if "<" in it:
t = it.split("<")[0]
if "建筑面积" in t:
tmp_huxing_str = tmp_huxing_str.rstrip("/") + " "
tmp_huxing_str = tmp_huxing_str + t
else:
tmp_huxing_str = tmp_huxing_str + t + "/"
else:
tmp_huxing_str = tmp_huxing_str + data[real_j].strip()
real_j = real_j + 1
loupan_huxing.append(tmp_huxing_str)
continue
if url_key in data[j]:
loupan_url.append(data[j].split(url_key)[1].split("\"")[0])
continue
if panel_key in data[j]:
real_j = j + 1
tmp_panel_str = ""
while data[real_j] != "