This post tries to use a Python crawler to fetch part of the publicly available information on the xxx site and save it in an Excel workbook. The project mainly relies on a Python crawler, Redis database access, and Excel file operations. The opening code snippet is given first. I am a complete programming beginner, and my main aim here is to share my approach:
from bs4 import BeautifulSoup
import requests, random, fake_useragent
import redis, re, time
import pandas as pd
# Fetch one random proxy IP from the self-hosted pool stored in Redis
def get_proxy():
    ip = '127.0.0.1'
    password = None
    r = redis.Redis(host=ip, password=password, port=6379, db=0, decode_responses=True)
    total = int(r.zcard('proxies:universal'))            # size of the proxy sorted set
    min_len = max(0, total - 100)                        # keep only the 100 highest-scored proxies
    ip_list = r.zrange('proxies:universal', min_len, total, withscores=True)
    random_ip = random.choice(ip_list)[0]                # (member, score) tuple -> proxy address
    return random_ip
Because the xxx site has fairly strong anti-crawling measures, a self-hosted IP proxy pool is used here to obtain random IP addresses, picking at random from only the 100 highest-ranked proxies. For building the pool I mainly followed this article: 自建IP代理池 (building your own IP proxy pool). There are of course other write-ups on self-hosted proxy pools online, and readers who know them are welcome to share. If you don't want to pay for proxy IPs, a self-hosted pool is a decent choice, although paid proxies offer better anonymity and are harder to detect.
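Note that get_proxy() only reads the pool; the separate proxy-pool project referenced above is what keeps it filled. Below is a minimal sketch of what the writing side might look like, assuming the same 'proxies:universal' sorted set in db 0 and a simple numeric liveness score (the sample proxy address is made up):
import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0, decode_responses=True)
r.zadd('proxies:universal', {'1.2.3.4:8080': 10})      # register a proxy with an initial score
r.zincrby('proxies:universal', 5, '1.2.3.4:8080')      # raise its score after a successful check
print(r.zrange('proxies:universal', 0, -1, withscores=True))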
# Initialise the crawler: build request headers, pick a proxy, and return the parsed page
def _init(url, city):
    ua = fake_useragent.UserAgent()
    random_ua = ua.random                                # random User-Agent string
headers = {
'User-Agent': random_ua,
'referer': "https://{}.anjuke.com/sale/p1/".format(city),
        'cookie': ''                                     # paste your own cookie here (see the note below)
}
agent_ip = get_proxy()
    proxy = {"http": "http://{}".format(agent_ip), "https": "http://{}".format(agent_ip)}  # proxy both schemes; the target URLs are https
response = requests.get(url, headers=headers, proxies=proxy)
soup = BeautifulSoup(response.text, 'html.parser')
return soup
# Check whether the visit succeeded (the site serves a verification page when it blocks a request)
def test_ping(soup):
head = soup.find('head')
title = head.find('title').string
if '访问验证-安居客' in title:
print('访问失败')
return False
else:
print('访问成功')
print(title)
return True
The initialisation step uses a randomly generated browser User-Agent. You will need to obtain the cookie in the code yourself; for lack of time I did not write code to fetch it automatically. Even with these counter-measures in place there is still no guarantee that every request gets through, so after fetching a URL we still need to verify that the visit actually succeeded.
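Because a single attempt can still land on the verification page, a small wrapper (not part of the original code, the name fetch_with_retry is hypothetical) can combine _init() and test_ping() to retry with a fresh proxy and User-Agent; a rough sketch:
def fetch_with_retry(url, city, max_tries=3):
    # Retry the fetch with a new random proxy/User-Agent until test_ping() succeeds or attempts run out
    for _ in range(max_tries):
        soup = _init(url, city)
        if test_ping(soup):
            return soup
        time.sleep(random.randint(5, 10))   # back off before switching identity
    return None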
# Collect the code of every city listed on the site and cache it in Redis
def get_city_info():
url = 'https://nc.anjuke.com/sale/'
web = _init(url, 'nc')
_ul = web.find_all('ul', {'class': 'city-list'})
ip = '127.0.0.1'
password = None
r = redis.Redis(host=ip, password=password, port=6379, db=1, decode_responses=True)
li_end = 0
pattern = re.compile('/[a-z]+')
for ul in _ul:
_a = ul.find_all('a')
for i in range(len(_a)):
a = _a[i]
city_info = re.search(pattern, a.attrs['href']).group().replace('/', '')
r.hset('city_info', '{}'.format(li_end + i), '{}'.format(city_info))
li_end += len(_a)
The first step is to collect every city listed on the xxx site and save it in the Redis database. There are plenty of tutorials online about working with Redis, and the operations used here are very simple, so I won't go over them again.
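After get_city_info() runs, db 1 should hold a 'city_info' hash mapping a running index to each city code. A quick way to check (the example output is only illustrative):
r = redis.Redis(host='127.0.0.1', port=6379, db=1, decode_responses=True)
print(r.hlen('city_info'))        # number of cities collected
print(r.hget('city_info', '0'))   # city code stored at index 0, e.g. 'nc'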
# Save the listing URLs scraped for each city in the Redis database
def get_redis(update=False, target_city=None, page=1):
ip = '127.0.0.1'
password = None
r = redis.Redis(host=ip, password=password, port=6379, db=1, decode_responses=True)
    check_point = int(r.hget('checkpoint', '0') or 0)   # resume from the last saved city index
cities = list(r.hgetall('city_info').values())
for city in cities[check_point:]:
if update is False and target_city is None:
time.sleep(random.randint(5, 10))
url = 'https://{0}.anjuke.com/sale/p{1}/'.format(city, page)
web = _init(url, city)
Flag = test_ping(web)
if Flag is False:
break
else:
_a = web.find_all('a', {'class': 'property-ex'})
for i in range(len(_a)):
a = _a[i]
href = a.attrs['href']
r.hset('my_city:{0}'.format(city), '{}'.format(i), '{}'.format(href))
r.hset('checkpoint', '0', '{}'.format(cities.index(city)))
elif update is True and target_city is not None:
city = target_city
time.sleep(random.randint(5, 10))
url = 'https://{0}.anjuke.com/sale/p{1}/'.format(city, page)
web = _init(url, city)
Flag = test_ping(web)
if Flag is False:
break
else:
_a = web.find_all('a', {'class': 'property-ex'})
for i in range(len(_a)):
a = _a[i]
href = a.attrs['href']
                    r.hset('my_city:{0}'.format(city), '{}'.format(i), '{}'.format(href))
            break  # only the requested target_city needs refreshing, so stop after one pass
Once all the city information has been collected, the crawler visits each city in turn, scrapes the second-hand-housing URLs on the given page, and stores them in Redis so they can be reused at any time. The function takes three parameters: whether to update, the target city to update, and the page to update. Because the data on the xxx site changes constantly, saved pages that are not visited for a long time may well disappear and have to be crawled again; passing a specific target_city makes such a targeted refresh possible.
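For example (the city code 'nc' is taken from the snippet above and only illustrative):
get_redis()                                        # full pass over all cities from the checkpoint, page 1
get_redis(update=True, target_city='nc', page=2)   # targeted refresh of a single city, page 2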
# Read or update the crawl checkpoint (a four-digit code: two digits for the city index, two for the URL index)
def checkpoint(code='0000'):
ip = '127.0.0.1'
password = None
r = redis.Redis(host=ip, password=password, port=6379, db=1, decode_responses=True)
if code != '0000':
r.hset('checkpoint', '0', code)
try:
current_point = r.hget('checkpoint', '0')
return [int(current_point[0:2]), int(current_point[2:])]
    except (TypeError, ValueError):  # no checkpoint stored yet, or the stored value is not a valid code
r.hset('checkpoint', '0', code)
current_point = r.hget('checkpoint', '0')
return [int(current_point[0:2]), int(current_point[2:])]
# Scrape the detailed data of each listing and append it to the Excel file
def get_house_data():
ip = '127.0.0.1'
password = None
r = redis.Redis(host=ip, password=password, port=6379, db=1, decode_responses=True)
getPoint = checkpoint()
cityCode, listCode = getPoint[0], getPoint[1]
    pattern = re.compile(r'\d+(\.\d+)?')  # matches a positive number, integer or decimal
    # Loop over the remaining cities and listing URLs, starting from the checkpoint
    while cityCode <= 65:  # upper bound on the city index; adjust it to the number of cities stored in city_info
city = r.hget('city_info', '{}'.format(cityCode))
url = r.hget('my_city:{}'.format(city), '{}'.format(listCode))
web = _init(url, city)
time.sleep(random.randint(5, 10))
flag = test_ping(web)
if flag is False: break
try:
houseAvgPrice = re.search(pattern, web.find('div', {'class': 'maininfo-avgprice-price'}).string).group()
item_01 = web.find('div', {'class': 'maininfo-model-item maininfo-model-item-1'})
item_02 = web.find('div', {'class': 'maininfo-model-item maininfo-model-item-2'})
item_03 = web.find('div', {'class': 'maininfo-model-item maininfo-model-item-3'})
houseModelNum = item_01.find_all('i', {'class': 'maininfo-model-strong-num'})
houseBedRoomNum = houseModelNum[0].string
houseSaloonNum = houseModelNum[1].string
            try:
                houseBathroomNum = houseModelNum[2].string
            except IndexError:  # some listings do not list a bathroom count
                houseBathroomNum = '0'
houseModelPos = item_01.find('div', {'class': 'maininfo-model-weak'}).string
houseTotalArea = item_02.find('i', {'class': 'maininfo-model-strong-num'}).string
houseFitmentLv = item_02.find('div', {'class': 'maininfo-model-weak'}).string
houseToward = item_03.find('i', {'class': 'maininfo-model-strong-text'}).string
houseAge = re.search(pattern, item_03.find('div', {'class': 'maininfo-model-weak'}).string).group()
houseName = web.find('div', {'class': 'crumbs crumbs-middle'}).find_all('a')[-1].string
span = web.find_all('span', {'class': 'houseInfo-main-item-name'})[1]
houseProperty = span.string
communityAvgPrice = re.search(pattern, web.find('span', {'class': 'monthchange-money'}).string).group()
tr = web.find_all('div', {'class': 'community-info-tr'})
            communityPopulation = re.search(pattern, tr[0].find('p', {'class': 'community-info-td-value'}).string).group()
            div = tr[1].find('div', {'class': 'community-info-td'})
            communityPropertyPrice = re.search(pattern, div.find('p', {'class': 'community-info-td-value'}).string).group()
div2 = tr[1].find('div', {'class': 'community-info-td community-info-right'})
communityHouse = div2.find_all('p')[-1].string
communityGreening = re.search(pattern, tr[2].find('p', {'class': 'community-info-td-value'}).string).group()
            # Append the record as a new row of the Excel sheet
            excelName = '安居客数据收集表.xlsx'
            Data = pd.read_excel(excelName, sheet_name='Sheet2')   # read_excel already returns a DataFrame
            row = len(Data.index)                                  # next free integer label -> a brand-new row
Data.at[row, '房屋名称'] = '{}:'.format(city) + houseName
Data.at[row, '小区历史均价'] = communityAvgPrice
Data.at[row, '卧室数'] = houseBedRoomNum
Data.at[row, '客厅数'] = houseSaloonNum
Data.at[row, '浴室数'] = houseBathroomNum
Data.at[row, '面积'] = houseTotalArea
Data.at[row, '楼层'] = houseModelPos
Data.at[row, '装修'] = houseFitmentLv
Data.at[row, '房龄'] = houseAge
Data.at[row, '物业类型'] = houseProperty
Data.at[row, '朝向'] = houseToward
Data.at[row, '物业费'] = communityPropertyPrice
Data.at[row, '小区户数'] = communityPopulation
Data.at[row, '绿化率'] = communityGreening
Data.at[row, '容积率'] = communityHouse
Data.at[row, '售价'] = houseAvgPrice
Data.to_excel(excelName, index=False, sheet_name='Sheet2')
        except Exception as e:
            # Skip listings with missing or unexpected fields
            print(e)
        # Record the current position so the crawl can resume from here next time
listCode += 1
        if listCode == 60:  # assumes 60 listing URLs were saved per city; move on to the next one
cityCode += 1
listCode = 0
        code = '{:02d}{:02d}'.format(cityCode, listCode)  # two digits of city index + two digits of URL index
print(code)
checkpoint(code)
This is the core of the code. A checkpoint is created first to record the position of the page currently being crawled, so the crawler can be stopped and restarted at any time without worrying about scraping the same pages twice. The checkpoint is a four-digit code: the first two digits are the city index and the last two are the index of the listing URL within that city. The core snippet uses a regular expression to pull out the numeric values it needs, and pandas to save the data to Excel, appending one new row below the last existing row on every save. To avoid crashing on missing fields, a try block filters out records with missing values.
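To make the two helpers concrete, here is how the number regex and the checkpoint code behave on a couple of made-up inputs:
pattern = re.compile(r'\d+(\.\d+)?')
print(re.search(pattern, '均价25643元/㎡').group())   # -> '25643'
print(re.search(pattern, '绿化率35.5%').group())      # -> '35.5'

print(checkpoint())    # read the current position, e.g. [1, 2] if the stored code is '0102'
checkpoint('0315')     # jump to city index 3, listing URL index 15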
# Clear the saved listing URLs from the Redis database
def clear_data():
ip = '127.0.0.1'
password = None
r = redis.Redis(host=ip, password=password, port=6379, db=1, decode_responses=True)
cities = list(r.hgetall('city_info').values())
for city in cities:
maxLen = len(list(r.hgetall('my_city:{}'.format(city)).keys()))
for i in range(maxLen):
r.hdel('my_city:{}'.format(city), '{}'.format(i))
print('数据库my_city:{}已清除'.format(city))
print('数据库已清空!')
Finally, once all the URLs stored in the database have been crawled, the stored information is cleared so that a fresh crawl can record new pages.
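Since every my_city:<city> hash is wiped in full anyway, an equivalent and slightly shorter cleanup can simply delete each key outright; a sketch (clear_data_simple is a hypothetical name, not part of the original code):
def clear_data_simple():
    # Delete each my_city:<city> hash in a single call instead of field by field
    r = redis.Redis(host='127.0.0.1', port=6379, db=1, decode_responses=True)
    for city in r.hgetall('city_info').values():
        r.delete('my_city:{}'.format(city))
    print('数据库已清空!')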