Python Crawling, Advanced (Part 3): Scraping Weibo

Note: for learning purposes only.


Below, Weibo is crawled with the two usual approaches: selenium + PhantomJS, and analyzing the API.


1. Using selenium + PhantomJS


The most important step is setting the user agent; without it, PhantomJS fails to follow links.

Other PhantomJS settings can be configured in the same way.

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities


user_agent = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36 QIHU 360SE'
)

dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap['phantomjs.page.settings.userAgent'] = user_agent

driver = webdriver.PhantomJS(desired_capabilities=dcap)


To log in we have to enter the username and password and then click the login button; fortunately no captcha is required here.




As usual, we first locate what we need in the browser's DevTools console:

document.getElementById('loginname').value = '123'

document.getElementsByName('password')[0].value = '123'


Below are two ways to log in. I ran into a captcha at first, but after sleeping a few seconds after typing the username and again after the password, the login went through. Presumably typing too fast is what triggers the check.

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# username and password are assumed to be defined earlier

#driver = webdriver.PhantomJS(desired_capabilities=dcap)
driver = webdriver.Chrome()
#driver.set_window_size(1280,2400)
driver.get('https://www.weibo.com/')

time.sleep(10)

# Method 1: locate the fields directly and pause between inputs
driver.find_element_by_id('loginname').send_keys(username)
time.sleep(5)
driver.find_element_by_name('password').send_keys(password)
time.sleep(2)
driver.find_element_by_xpath('//div[contains(@class,"login_btn")][1]/a').click()

# Method 2: explicit waits with selectors copied from Chrome DevTools
# wait = WebDriverWait(driver, 10)
# u_id = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#loginname')))
# p_word = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#pl_login_form > div > div:nth-child(3) > div.info_list.password > div > input')))
# login = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'#pl_login_form > div > div:nth-child(3) > div.info_list.login_btn > a')))
# u_id.send_keys(username)
# time.sleep(5)
# p_word.send_keys(password)
# time.sleep(2)
# login.click()

time.sleep(15)

html = driver.page_source

print(html)



The first approach is simple, but you have to work out the element locations from the page source yourself and pick a sleep for the page to finish loading: wait too long and it is slow, wait too little and the elements may not be there yet.

The second approach is a little more involved and needs a few extra imports, but there is no need to read the page source or guess at wait times: it waits for the page automatically, and the selectors come straight from the Chrome DevTools via right click > Copy > Copy selector.
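For reference, here is the explicit-wait login on its own as a runnable sketch. The CSS selectors are the ones copied from Chrome above and may need updating if Weibo changes its login form; username and password are assumed to be defined elsewhere.

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()
driver.get('https://www.weibo.com/')

# Poll for up to 10 seconds instead of guessing a fixed sleep
wait = WebDriverWait(driver, 10)

# Selectors copied via right click > Copy > Copy selector in Chrome DevTools
u_id = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#loginname')))
p_word = wait.until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, '#pl_login_form > div > div:nth-child(3) > div.info_list.password > div > input')))
login = wait.until(EC.element_to_be_clickable(
    (By.CSS_SELECTOR, '#pl_login_form > div > div:nth-child(3) > div.info_list.login_btn > a')))

u_id.send_keys(username)
time.sleep(5)               # short pauses seem to help avoid the captcha
p_word.send_keys(password)
time.sleep(2)
login.click()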




And with that, we are logged in!

Note that when you browse someone else's Weibo without being logged in, you can only turn one page before the site demands a login.


What remains is to analyze the page source for your own needs and pull out the content you want.
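For example, once logged in, the visible posts can be pulled out of the rendered page with lxml. A minimal sketch, assuming the WB_text class (also relied on in the full code below) still marks post bodies:

from lxml import etree

# Parse the page that selenium has rendered after login
html = driver.page_source
doc = etree.HTML(html)

# 'WB_text' is the class Weibo uses for post bodies; adjust the XPath
# if the page structure has changed
for node in doc.xpath('//div[contains(@class, "WB_text")]'):
    text = node.xpath('string(.)').strip()
    if text:
        print(text)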


The full code:


import hashlib
import threading

import time

import re
from lxml import etree

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from pybloom import BloomFilter
from collections import deque






user_agent = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36 QIHU 360SE'
)

username = 'your weibo ID'
password = 'your password'

dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap['phantomjs.page.settings.userAgent'] = user_agent

# Crawler for a user's home page (their posts)
#feeds_crawler = webdriver.PhantomJS(desired_capabilities=dcap)
feeds_crawler = webdriver.Chrome()
feeds_crawler.set_window_size(1280,2400)
# Crawler for the profile pages: follows, fans, post counts
#user_crawler = webdriver.PhantomJS(desired_capabilities=dcap)
user_crawler = webdriver.Chrome()
user_crawler.set_window_size(1280,2400)
domain = 'weibo.com'
url_home = 'http://' + domain

download_bf = BloomFilter(1024*1024*16,0.01)
cur_queue = deque()

seed_user = 'http://weibo.com/yaochen'

# Thresholds for deciding whether a user is worth crawling
min_mblogs_allowed = 100             # minimum number of posts a user must have
max_follow_fans_ratio_allowed = 3    # the follows/fans ratio must not exceed 3



def extract_user(users):
    print('extract user')
    for i in range(0,20):
        for user_element in user_crawler.find_elements_by_xpath('//*[contains(@class, "follow_item")]'):
            tried = 0
            while tried < 3:
                try:
                    user = {}
                    user['follows'] = re.findall('(\d+)', user_element.find_element_by_xpath('.//div[@class="info_connect"]/span').text)[0]
                    user['follows_link'] = user_element.find_element_by_xpath('.//div[@class="info_connect"]/span//a').get_attribute('href')
                    user['fans'] = re.findall('(\d+)', user_element.find_elements_by_xpath('.//div[@class="info_connect"]/span')[1].text)[0]
                    user['fans_link'] = user_element.find_elements_by_xpath('.//div[@class="info_connect"]/span//a')[1].get_attribute('href')
                    user['mblogs'] = re.findall('(\d+)', user_element.find_elements_by_xpath('.//div[@class="info_connect"]/span')[2].text)[0]
                    user_link = user_element.find_element_by_xpath('.//div[contains(@class,"info_name")]/a')
                    user['link'] = re.findall('(.+)\?', user_link.get_attribute('href'))[0]
                    if user['link'][:4] != 'http':
                        user['link'] = domain + user['link']
                    user['name'] = user_link.text
                    user['icon'] = re.findall('/([^/]+)$', user_element.find_element_by_xpath('.//dt[@class="mod_pic"]/a/img').get_attribute('src'))[0]
                    # name = user_element.find_element_by_xpath('.//a[@class="S_txt1"]')

                    print('--------------------')
                    print(user['name'] + ' follows: ' + user['follows'] + ' blogs:' + user['mblogs'])
                    print(user['link'])

                    # Skip the user if they have too few posts or their follows/fans ratio is too high
                    if int(user['mblogs']) < min_mblogs_allowed or int(user['follows'])/int(user['fans']) > max_follow_fans_ratio_allowed:
                        break

                    enqueueUrl(user['link'])
                    users.append(user)
                    break
                except Exception:
                    time.sleep(1)
                    tried += 1
        if go_next_page(user_crawler) is False:
            return users


def scroll_to_bottom():
    print('scroll down !!')
    for i in range(50):
        feeds_crawler.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        html = feeds_crawler.page_source
        res = etree.HTML(html)
        next_page_url = res.xpath('//a[contains(@class,"page next")]')
        if len(next_page_url) > 0:
            return next_page_url[0].get('href')
        if len(re.findall('点击重新载入', html)) > 0:
            print('scrolling failed, reload it')
            feeds_crawler.find_element_by_link_text('点击重新载入').click()
        time.sleep(1)


def go_next_page(cur_driver):
    try:
        next_page = cur_driver.find_element_by_xpath('//a[contains(@class, "page next")]').get_attribute('href')
        print('next page is ' + next_page)
        cur_driver.get(next_page)
        time.sleep(3)
        return True
    except Exception:
        print('next page is not found')
        return False


def extract_feed(feeds):
    for i in range(20):
        scroll_to_bottom()
        # Extract the content of each post
        for element in feeds_crawler.find_elements_by_class_name('WB_detail'):
            tried = 0
            while tried < 3:
                try:
                    feed = {}
                    feed['time'] = element.find_element_by_xpath('.//div[@class="WB_from S_txt2"]').text
                    feed['content'] = element.find_element_by_class_name('WB_text').text
                    feed['image_names'] = []
                    for image in element.find_elements_by_xpath('.//li[contains(@class,"WB_pic")]/img'):
                        feed['image_names'].append(re.findall('/([^/]+)$', image.get_attribute('src'))[0])
                    feeds.append(feed)
                    print('--------------------')
                    print(feed['time'])
                    print(feed['content'])
                    break
                except Exception:
                    tried += 1
                    time.sleep(1)
        if go_next_page(feeds_crawler) is False:
            return feeds


def enqueueUrl(url):
    # Add the URL to the crawl queue if its hash has not been seen before
    try:
        md5v = hashlib.md5(url.encode('gb2312')).hexdigest()        # in Python 3 the string must be encoded to bytes before hashing
        if md5v not in download_bf:
            print(url + ' is added to queue')
            cur_queue.append(url)
            download_bf.add(md5v)

    except ValueError:
        print('enqueueUrl err !!!!!')


def Login(username, password):
    # Log both crawlers in on the Weibo home page
    '''
    :param username: your weibo id
    :param password: your password
    :return:
    '''
    feeds_crawler.get(url=url_home)
    user_crawler.get(url=url_home)

    time.sleep(8)

    print('find click button to login')
    feeds_crawler.find_element_by_id('loginname').send_keys(username)
    feeds_crawler.find_element_by_name('password').send_keys(password)
    time.sleep(3)
    # Click the login button
    feeds_crawler.find_element_by_xpath('//div[contains(@class, "login_btn")][1]/a').click()
    # Alternatively, run a snippet of JavaScript with execute_script:
    # feeds_crawler.execute_script('document.getElementsByClassName("W_btn_a btn_32px")[0].click()')

    # The second crawler has to log in the same way
    user_crawler.find_element_by_id('loginname').send_keys(username)
    user_crawler.find_element_by_name('password').send_keys(password)
    time.sleep(3)
    # Click the login button
    user_crawler.find_element_by_xpath('//div[contains(@class,"login_btn")][1]/a').click()


def dequeUrl():
    return cur_queue.popleft()      # pop and return the next URL from the queue

def get_element_by_xpath(cur_driver, path):
    # Retry for a few seconds, since the page may still be rendering
    tried = 0
    while tried < 6:
        html = cur_driver.page_source
        res = etree.HTML(html)
        elements = res.xpath(path)
        if len(elements) == 0:
            tried += 1
            time.sleep(1)
            continue
        return elements
    return []


def fetch_user(url):
    print('Downloading ' + url)
    feeds_crawler.get(url)
    time.sleep(5)
    # Extract the user's display name
    account_name = get_element_by_xpath(feeds_crawler,'//h1')[0].text
    photo = get_element_by_xpath(feeds_crawler, '//p[@class="photo_wrap"]/img')[0].get('src')

    account_photo = re.findall('/([^/]+)$', photo)

    # Extract the link to their follows page
    follows_link = get_element_by_xpath(feeds_crawler, '//a[@class="t_link S_txt1"]')[0].get('href')

    print('account: ' + account_name)
    print('follows link is ' + follows_link)
    follows_link = 'http:' + follows_link
    user_crawler.get( follows_link )

    feeds = []
    users = []
    # Run the crawls in separate threads (the user thread is commented out here)
    t_feeds = threading.Thread(target=extract_feed, name=None, args=(feeds,))
    #t_users = threading.Thread(target=extract_user, name=None, args=(users,))

    t_feeds.setDaemon(True)
    #t_users.setDaemon(True)

    t_feeds.start()
    #t_users.start()

    t_feeds.join()
    #t_users.join()


def crawl():
    while True:
        url = dequeUrl()
        fetch_user(url)


def main():
    enqueueUrl(seed_user)
    Login(username,password)
    crawl()
    
if __name__ == '__main__':
    main()
    

Note: when hashing with hashlib in Python 3, the content must first be encoded to bytes:

The hash is used to check whether a URL has already been crawled; the mmh library covered earlier works just as well (see the sketch after the session below).

>>> 
>>> import hashlib
>>> url = 'http://www.weibo.com'
>>> md5v = hashlib.md5(url).hexdigest()
Traceback (most recent call last):
  File "", line 1, in 
    md5v = hashlib.md5(url).hexdigest()
TypeError: Unicode-objects must be encoded before hashing
>>> md5v = hashlib.md5(url.encode('gb2312')).hexdigest()
>>> 
>>> 
>>> md5v2 = hashlib.md5(b'www.baidu.com').hexdigest()
>>> md5v
'17d7b29a31328702848d2d42ae79a240'
>>> md5v2
'dab19e82e1f9a681ee73346d3e7a575e'
>>> 
>>> 
>>> md5v3 = hashlib.md5(url.encode('gb2312')).hexdigest()
>>> md5v3
'17d7b29a31328702848d2d42ae79a240'
>>> 
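If you prefer the MurmurHash approach from the earlier post, a minimal sketch with the mmh3 package looks like this (assuming mmh3 is installed; its hash() accepts the string directly, so no manual encoding is needed):

import mmh3

seen = set()

def is_new(url):
    # mmh3.hash returns a signed 32-bit integer for the given string
    h = mmh3.hash(url)
    if h in seen:
        return False
    seen.add(h)
    return True

print(is_new('http://www.weibo.com'))   # True the first time
print(is_new('http://www.weibo.com'))   # False afterwards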

About Weibo images:



In practice, only the image file name stays the same on Weibo; the storage domain and the resolution segment of the URL may change. So it is enough to store just the file name and rebuild the full URL whenever it is needed.
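A minimal sketch of that idea; the host and size segment used here are placeholders and may differ from what Weibo actually serves:

import re

def image_name(src):
    # Keep only the file name, e.g. 'abcd1234.jpg', from a full image URL
    return re.findall('/([^/]+)$', src)[0]

def image_url(name, host='wx1.sinaimg.cn', size='mw690'):
    # Rebuild a displayable URL from the stored file name;
    # host and size are assumptions and may need adjusting
    return 'https://{}/{}/{}'.format(host, size, name)

name = image_name('https://wx1.sinaimg.cn/thumb150/abcd1234.jpg')
print(name)             # abcd1234.jpg
print(image_url(name))  # https://wx1.sinaimg.cn/mw690/abcd1234.jpg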


2. Analyzing the API


This was covered in an earlier post, so I won't repeat it here.

There are plenty of write-ups online as well.




