Build Your Own Web Crawler Project

This post tries to use a Python crawler to fetch some of the publicly available information on the xxx site and save it to an Excel workbook. The project mainly combines a Python crawler, database storage, and Excel file handling. The opening code snippet is given below; I'm a complete programming beginner, so this is mostly meant to share my way of thinking:

from bs4 import BeautifulSoup
import requests, random, fake_useragent
import redis, re, time
import pandas as pd


# Randomly pick one IP from the self-built proxy pool (a Redis sorted set)
def get_proxy():
    ip = '127.0.0.1'
    password = None
    r = redis.Redis(host=ip, password=password, port=6379, db=0, decode_responses=True)
    total = int(r.zcard('proxies:universal'))
    # only sample from the 100 highest-ranked proxies
    min_rank = 0 if total < 100 else total - 100
    ip_list = r.zrange('proxies:universal', min_rank, total, withscores=True)
    random_ip = random.choice(ip_list)[0]
    return random_ip

Because the xxx site has fairly strong anti-crawling measures, a self-built IP proxy pool is used here to obtain random IP addresses, sampling only from the 100 highest-ranked entries. For building the pool itself I mainly followed this article: 自建IP代理池 (building your own IP proxy pool). There are of course other write-ups on the topic online; if you know a good one, feel free to share it. For anyone who doesn't want to pay for proxy IPs, a self-built pool is a decent choice, although paid proxies tend to be more anonymous and their requests are harder for the site to detect.
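For reference, the pool that get_proxy() reads from is nothing more than a Redis sorted set. The sketch below shows one possible way the pool side could feed 'proxies:universal'; the helper names (add_proxy, mark_proxy) and the 0-100 scoring scheme are my own assumptions, not something fixed by this project.

# A minimal sketch (assumptions noted above) of how a proxy checker might
# maintain the 'proxies:universal' sorted set that get_proxy() samples from.
import redis

pool = redis.Redis(host='127.0.0.1', port=6379, db=0, decode_responses=True)

def add_proxy(proxy, score=10):
    # register a newly found proxy such as '1.2.3.4:8080' with an initial score
    pool.zadd('proxies:universal', {proxy: score})

def mark_proxy(proxy, ok):
    # proxies that keep passing checks rise in the ranking; failed ones are dropped
    if ok:
        pool.zincrby('proxies:universal', 1, proxy)
    else:
        pool.zrem('proxies:universal', proxy)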

# Initialise one crawler request: random User-Agent, referer, cookie and a random proxy
def _init(url, city):
    ua = fake_useragent.UserAgent()
    random_ua = ua.random  # fake_useragent's documented way to get a random UA string
    headers = {
        'User-Agent': random_ua,
        'referer': "https://{}.anjuke.com/sale/p1/".format(city),
        'cookie': ''
    }
    agent_ip = get_proxy()
    # the listing pages are served over https, so route both schemes through the proxy
    proxy = {"http": "http://{}".format(agent_ip),
             "https": "http://{}".format(agent_ip)}
    response = requests.get(url, headers=headers, proxies=proxy, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup


# Verify whether the request actually got through; blocked requests are redirected
# to a verification page whose <title> contains '访问验证-安居客'
def test_ping(soup):
    head = soup.find('head')
    title = head.find('title').string if head else ''
    if not title or '访问验证-安居客' in title:
        print('Access failed')
        return False
    else:
        print('Access succeeded')
        print(title)
        return True

The initialisation step fakes a random browser User-Agent. You will have to grab the cookie in the code yourself; for lack of time I didn't write code to fetch it automatically. Even though these anti-blocking measures are fairly complete, they are still some way from guaranteed success, so after fetching a URL we still need to verify that the request actually went through.
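Since a single request can still land on the verification page, a simple pattern is to combine _init() and test_ping() into a retry loop that switches to a fresh proxy and User-Agent on each failure. The helper below (fetch_with_retry, not part of the original code) is just a sketch of that idea.

# Hypothetical helper: retry _init() with a new random proxy/UA until
# test_ping() reports success or the attempts run out.
def fetch_with_retry(url, city, max_retries=3):
    for attempt in range(max_retries):
        soup = _init(url, city)
        if test_ping(soup):
            return soup
        time.sleep(random.randint(5, 10))  # back off before trying another proxy
    return None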

# Collect every city abbreviation listed on the site and store it in Redis
def get_city_info():
    url = 'https://nc.anjuke.com/sale/'
    web = _init(url, 'nc')
    _ul = web.find_all('ul', {'class': 'city-list'})
    ip = '127.0.0.1'
    password = None
    r = redis.Redis(host=ip, password=password, port=6379, db=1, decode_responses=True)
    li_end = 0
    pattern = re.compile('/[a-z]+')  # extracts the city abbreviation from each link
    for ul in _ul:
        _a = ul.find_all('a')
        for i in range(len(_a)):
            a = _a[i]
            city_info = re.search(pattern, a.attrs['href']).group().replace('/', '')
            # hash 'city_info': field = running index, value = city abbreviation
            r.hset('city_info', '{}'.format(li_end + i), '{}'.format(city_info))
        li_end += len(_a)

This first collects all the cities listed on the xxx site and saves them in the Redis database. There are plenty of tutorials on Redis operations online, and the ones used here are very simple, so I won't repeat them.
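Reading the stored mapping back is a one-liner, which is handy for checking what was actually collected:

# Inspect the stored city list: hgetall returns {index: city abbreviation}
r = redis.Redis(host='127.0.0.1', port=6379, db=1, decode_responses=True)
print(r.hgetall('city_info'))   # e.g. {'0': 'nc', '1': 'sh', ...} (actual values depend on the site)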

# Save the listing urls of each city into Redis
def get_redis(update=False, target_city=None, page=1):
    ip = '127.0.0.1'
    password = None
    r = redis.Redis(host=ip, password=password, port=6379, db=1, decode_responses=True)
    # resume from the city index saved by a previous run (0 if no checkpoint exists)
    check_point = int(r.hget('checkpoint', '0') or 0)
    cities = list(r.hgetall('city_info').values())
    for city in cities[check_point:]:
        if update is False and target_city is None:
            time.sleep(random.randint(5, 10))
            url = 'https://{0}.anjuke.com/sale/p{1}/'.format(city, page)
            web = _init(url, city)
            flag = test_ping(web)
            if flag is False:
                break
            else:
                _a = web.find_all('a', {'class': 'property-ex'})
                for i in range(len(_a)):
                    a = _a[i]
                    href = a.attrs['href']
                    # hash 'my_city:<city>': field = listing index, value = listing url
                    r.hset('my_city:{0}'.format(city), '{}'.format(i), '{}'.format(href))
                r.hset('checkpoint', '0', '{}'.format(cities.index(city)))
        elif update is True and target_city is not None:
            # targeted refresh: only re-crawl the given city and leave the checkpoint alone
            city = target_city
            time.sleep(random.randint(5, 10))
            url = 'https://{0}.anjuke.com/sale/p{1}/'.format(city, page)
            web = _init(url, city)
            flag = test_ping(web)
            if flag is False:
                break
            else:
                _a = web.find_all('a', {'class': 'property-ex'})
                for i in range(len(_a)):
                    a = _a[i]
                    href = a.attrs['href']
                    r.hset('my_city:{0}'.format(city), '{}'.format(i), '{}'.format(href))

Once all the city information has been collected, each city is visited in turn, the second-hand listing urls on the current page are crawled, and they are saved in Redis so they can be reused at any time. The function takes three parameters: whether to update, the target city to update, and the page to update. Because the data on the xxx site changes all the time, saved pages that are not visited for a long while may well disappear and have to be crawled again; passing the target_city parameter makes such a targeted refresh possible.
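For example, the two modes would be called roughly like this ('nc' is just the city abbreviation already used elsewhere in the post):

# Full pass over all cities, resuming from the saved checkpoint
get_redis()

# Targeted refresh: re-crawl page 2 of a single city without touching the checkpoint
get_redis(update=True, target_city='nc', page=2)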

# Checkpoint: a four-digit code, the first two digits are the city index
# and the last two the url index within that city
def checkpoint(code='0000'):
    ip = '127.0.0.1'
    password = None
    r = redis.Redis(host=ip, password=password, port=6379, db=1, decode_responses=True)
    if code != '0000':
        r.hset('checkpoint', '0', code)
    try:
        current_point = r.hget('checkpoint', '0')
        return [int(current_point[0:2]), int(current_point[2:])]
    except (TypeError, ValueError):
        # no (or an unparsable) checkpoint stored yet: reset it to the given code
        r.hset('checkpoint', '0', code)
        current_point = r.hget('checkpoint', '0')
        return [int(current_point[0:2]), int(current_point[2:])]


# Crawl the detailed data of every saved listing
def get_house_data():
    ip = '127.0.0.1'
    password = None
    r = redis.Redis(host=ip, password=password, port=6379, db=1, decode_responses=True)
    getPoint = checkpoint()
    cityCode, listCode = getPoint[0], getPoint[1]
    pattern = re.compile(r'\d+(\.\d+)?')  # matches a positive number (integer or decimal)
    # main loop over the checkpoint codes; 65 is the index of the last city stored in city_info
    while cityCode <= 65:
        city = r.hget('city_info', '{}'.format(cityCode))
        url = r.hget('my_city:{}'.format(city), '{}'.format(listCode))
        web = _init(url, city)
        time.sleep(random.randint(5, 10))
        flag = test_ping(web)
        if flag is False:
            break
        try:
            houseAvgPrice = re.search(pattern, web.find('div', {'class': 'maininfo-avgprice-price'}).string).group()
            item_01 = web.find('div', {'class': 'maininfo-model-item maininfo-model-item-1'})
            item_02 = web.find('div', {'class': 'maininfo-model-item maininfo-model-item-2'})
            item_03 = web.find('div', {'class': 'maininfo-model-item maininfo-model-item-3'})
            houseModelNum = item_01.find_all('i', {'class': 'maininfo-model-strong-num'})
            houseBedRoomNum = houseModelNum[0].string
            houseSaloonNum = houseModelNum[1].string
            try:
                houseBathroomNum = houseModelNum[2].string
            except:
                houseBathroomNum = '0'
            houseModelPos = item_01.find('div', {'class': 'maininfo-model-weak'}).string
            houseTotalArea = item_02.find('i', {'class': 'maininfo-model-strong-num'}).string
            houseFitmentLv = item_02.find('div', {'class': 'maininfo-model-weak'}).string
            houseToward = item_03.find('i', {'class': 'maininfo-model-strong-text'}).string
            houseAge = re.search(pattern, item_03.find('div', {'class': 'maininfo-model-weak'}).string).group()
            houseName = web.find('div', {'class': 'crumbs crumbs-middle'}).find_all('a')[-1].string
            span = web.find_all('span', {'class': 'houseInfo-main-item-name'})[1]
            houseProperty = span.string
            communityAvgPrice = re.search(pattern, web.find('span', {'class': 'monthchange-money'}).string).group()
            tr = web.find_all('div', {'class': 'community-info-tr'})
            communityPopulation = re.search(pattern,
                                            tr[0].find('p', {'class': 'community-info-td-value'}).string).group()
            div = tr[1].find('div', {'class': 'community-info-td'})
            communityPropertyPrice = re.search(pattern,
                                               div.find('p', {'class': 'community-info-td-value'}).string).group()
            div2 = tr[1].find('div', {'class': 'community-info-td community-info-right'})
            communityHouse = div2.find_all('p')[-1].string
            communityGreening = re.search(pattern, tr[2].find('p', {'class': 'community-info-td-value'}).string).group()

            # Write the record into Excel: read the sheet, append one row, write it back
            excelName = '安居客数据收集表.xlsx'
            Data = pd.read_excel(excelName, 'Sheet2')
            row = len(Data.index)  # label of the new row appended below the current last row
            Data.at[row, '房屋名称'] = '{}:'.format(city) + houseName
            Data.at[row, '小区历史均价'] = communityAvgPrice
            Data.at[row, '卧室数'] = houseBedRoomNum
            Data.at[row, '客厅数'] = houseSaloonNum
            Data.at[row, '浴室数'] = houseBathroomNum
            Data.at[row, '面积'] = houseTotalArea
            Data.at[row, '楼层'] = houseModelPos
            Data.at[row, '装修'] = houseFitmentLv
            Data.at[row, '房龄'] = houseAge
            Data.at[row, '物业类型'] = houseProperty
            Data.at[row, '朝向'] = houseToward
            Data.at[row, '物业费'] = communityPropertyPrice
            Data.at[row, '小区户数'] = communityPopulation
            Data.at[row, '绿化率'] = communityGreening
            Data.at[row, '容积率'] = communityHouse
            Data.at[row, '售价'] = houseAvgPrice
            # note: to_excel rewrites the whole file with only this sheet,
            # so the workbook should contain nothing but Sheet2
            Data.to_excel(excelName, index=False, sheet_name='Sheet2')
        except Exception as e:
            print(e)

        # Record the current position as a four-digit checkpoint code
        listCode += 1
        if listCode == 60:  # after 60 urls move on to the next city
            cityCode += 1
            listCode = 0
        code = '{:02d}{:02d}'.format(cityCode, listCode)
        print(code)
        checkpoint(code)

This is the core of the code. First a checkpoint is created to record which page the crawl has reached, so the script can be stopped or restarted at any time without worrying about crawling the same pages twice. The checkpoint is a four-digit code: the first two digits encode the city and the last two the corresponding url index. Inside the core snippet a regular expression extracts the numeric values that are needed, and pandas saves the data to Excel, appending a new row below the last existing row on every save. To avoid crashing when a field cannot be found, a try block is used to skip missing values.
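Concretely, a code such as '0512' means city index 5 and url index 12, and checkpoint() splits it back into the two numbers:

# Writing and reading the four-digit checkpoint code
checkpoint('0512')    # persist: city index 05, url index 12
print(checkpoint())   # -> [5, 12]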

# Clear the saved listing urls from Redis
def clear_data():
    ip = '127.0.0.1'
    password = None
    r = redis.Redis(host=ip, password=password, port=6379, db=1, decode_responses=True)
    cities = list(r.hgetall('city_info').values())
    for city in cities:
        # deleting the hash removes all of its url fields at once
        r.delete('my_city:{}'.format(city))
        print('my_city:{} cleared'.format(city))
    print('All saved urls have been cleared!')

Finally, once every url in the database has been crawled, the stored information is cleared so that a fresh crawl can record new pages.
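Putting the pieces together, one full round would run roughly in this order (a sketch of the intended workflow rather than an automated pipeline present in the original code):

if __name__ == '__main__':
    get_city_info()    # 1. collect the city list once
    get_redis()        # 2. save the listing urls of every city into Redis
    get_house_data()   # 3. walk the saved urls and append rows to the Excel file
    clear_data()       # 4. wipe the url hashes so the next round starts clean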
