Note: this is for learning purposes only.
Below are the two methods commonly used to crawl Weibo: selenium + PhantomJS, and parsing the API.
The most important step is to set the user_agent, otherwise the pages will not redirect properly.
Of course, you can set many other parameters as well.
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
user_agent = (
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36 QIHU 360SE'
)
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap['phantomjs.page.settings.userAgent'] = user_agent
driver = webdriver.PhantomJS(desired_capabilities=dcap)
Typically, we first poke around in the browser's console to find the elements we need, for example:
document.getElementById('loginname').value = '123'
document.getElementsByName('password')[0].value = '123'
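These same console expressions can be replayed from selenium with execute_script. A minimal sketch, assuming driver is the PhantomJS/Chrome driver created above and that the login form has already rendered:
import time
# replay the console snippets above through selenium (assumes the driver created earlier)
driver.get('https://www.weibo.com/')
time.sleep(5)  # crude wait for the login form to render
driver.execute_script("document.getElementById('loginname').value = '123'")
driver.execute_script("document.getElementsByName('password')[0].value = '123'")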
Below are two ways to log in. During login I ran into a captcha, but after adding a pause of a few seconds after entering the username and again after the password, the login went through. Presumably the input was being typed too fast and got blocked.
import time
username = 'your weibo ID'
password = 'your password'
# the commented-out explicit-wait approach below additionally needs:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
#driver = webdriver.PhantomJS(desired_capabilities=dcap)
driver = webdriver.Chrome()
#driver.set_window_size(1280,2400)
driver.get('https://www.weibo.com/')
time.sleep(10)
driver.find_element_by_id('loginname').send_keys(username)
time.sleep(5)
driver.find_element_by_name('password').send_keys(password)
time.sleep(2)
driver.find_element_by_xpath('//div[contains(@class,"login_btn")][1]/a').click()
# wait = WebDriverWait(driver, 10)
# u_id = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#loginname')))
# p_word = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#pl_login_form > div > div:nth-child(3) > div.info_list.password > div > input')))
# login = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'#pl_login_form > div > div:nth-child(3) > div.info_list.login_btn > a')))
# u_id.send_keys(username)
# time.sleep(5)
# p_word.send_keys(password)
# time.sleep(2)
# login.click()
time.sleep(15)
html = driver.page_source
print(html)
The second method (the commented-out block above) is a bit more verbose and needs a few extra imports, but it does not require digging through the page source and waits for the page to load automatically: just locate the element in Chrome DevTools, then right-click > Copy > Copy selector.
And with that, the login succeeds.
When browsing someone else's Weibo without logging in, you can only view one page before being asked to log in.
What remains is to analyze the page source according to your needs and extract the content you want.
Full code:
import hashlib
import threading
import time
import re
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from pybloom import BloomFilter
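# note: on Python 3 the pybloom_live fork may be needed in place of pybloom (same BloomFilter API)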
from collections import deque
user_agent = (
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36 QIHU 360SE'
)
username = 'your weibo ID'
password = 'your password'
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap['phantomjs.page.settings.userAgent'] = user_agent
# crawler for the user's personal feed page
#feeds_crawler = webdriver.PhantomJS(desired_capabilities=dcap)
feeds_crawler = webdriver.Chrome()
feeds_crawler.set_window_size(1280,2400)
# crawler for the profile pages: follows, fans, posts
#user_crawler = webdriver.PhantomJS(desired_capabilities=dcap)
user_crawler = webdriver.Chrome()
user_crawler.set_window_size(1280,2400)
domain = 'weibo.com'
url_home = 'http://' + domain
download_bf = BloomFilter(1024*1024*16,0.01)
cur_queue = deque()
seed_user = 'http://weibo.com/yaochen'
# thresholds for deciding whether a user is worth crawling
min_mblogs_allowed = 100 # minimum number of posts a user must have
max_follow_fans_ratio_allowed = 3 # the follows/fans ratio must not exceed 3
def extract_user(users):
    print('extract user')
    for i in range(0,20):
        for user_element in user_crawler.find_elements_by_xpath('//*[contains(@class, "follow_item")]'):
            tried = 0
            while tried < 3:
                try:
                    user = {}
                    user['follows'] = re.findall('(\d+)', user_element.find_element_by_xpath('.//div[@class="info_connect"]/span').text)[0]
                    user['follows_link'] = user_element.find_element_by_xpath('.//div[@class="info_connect"]/span//a').get_attribute('href')
                    user['fans'] = re.findall('(\d+)', user_element.find_elements_by_xpath('.//div[@class="info_connect"]/span')[1].text)[0]
                    user['fans_link'] = user_element.find_elements_by_xpath('.//div[@class="info_connect"]/span//a')[1].get_attribute('href')
                    user['mblogs'] = re.findall('(\d+)', user_element.find_elements_by_xpath('.//div[@class="info_connect"]/span')[2].text)[0]
                    user_link = user_element.find_element_by_xpath('.//div[contains(@class,"info_name")]/a')
                    user['link'] = re.findall('(.+)\?', user_link.get_attribute('href'))[0]
                    if user['link'][:4] != 'http':
                        user['link'] = domain + user['link']
                    user['name'] = user_link.text
                    user['icon'] = re.findall('/([^/]+)$', user_element.find_element_by_xpath('.//dt[@class="mod_pic"]/a/img').get_attribute('src'))[0]
                    # name = user_element.find_element_by_xpath('.//a[@class="S_txt1"]')
                    print('--------------------')
                    print(user['name'] + ' follows: ' + user['follows'] + ' blogs:' + user['mblogs'])
                    print(user['link'])
                    # skip the user if the post count is below the threshold or the follows/fans ratio exceeds it
                    if int(user['mblogs']) < min_mblogs_allowed or int(user['follows'])/int(user['fans']) > max_follow_fans_ratio_allowed:
                        break
                    enqueueUrl(user['link'])
                    users.append(user)
                    break
                except Exception:
                    time.sleep(1)
                    tried += 1
        if go_next_page(user_crawler) is False:
            return users
def scroll_to_bottom():
    print('scroll down !!')
    for i in range(50):
        feeds_crawler.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        html = feeds_crawler.page_source
        res = etree.HTML(html)
        next_page_url = res.xpath('//a[contains(@class,"page next")]')
        if len(next_page_url) > 0:
            return next_page_url[0].get('href')
        if len(re.findall('点击重新载入', html)) > 0:
            print('scrolling failed, reload it')
            feeds_crawler.find_element_by_link_text('点击重新载入').click()
        time.sleep(1)
def go_next_page(cur_driver):
    try:
        next_page = cur_driver.find_element_by_xpath('//a[contains(@class, "page next")]').get_attribute('href')
        print('next page is ' + next_page)
        cur_driver.get(next_page)
        time.sleep(3)
        return True
    except Exception:
        print('next page is not found')
        return False
def extract_feed(feeds):
    for i in range(20):
        scroll_to_bottom()
        # extract the content of each post
        for element in feeds_crawler.find_elements_by_class_name('WB_detail'):
            tried = 0
            while tried < 3:
                try:
                    feed = {}
                    feed['time'] = element.find_element_by_xpath('.//div[@class="WB_from S_txt2"]').text
                    feed['content'] = element.find_element_by_class_name('WB_text').text
                    feed['image_names'] = []
                    for image in element.find_elements_by_xpath('.//li[contains(@class,"WB_pic")]/img'):
                        feed['image_names'].append(re.findall('/([^/]+)$', image.get_attribute('src'))[0])
                    feeds.append(feed)
                    print('--------------------')
                    print(feed['time'])
                    print(feed['content'])
                    break
                except Exception:
                    tried += 1
                    time.sleep(1)
        if go_next_page(feeds_crawler) is False:
            return feeds
def enqueueUrl(url):
    # add a URL to the crawl queue if it has not been seen before
    try:
        md5v = hashlib.md5(url.encode('gb2312')).hexdigest() # in Python 3 the string must be encoded to bytes before hashing
        if md5v not in download_bf:
            print(url + ' is added to queue')
            cur_queue.append(url)
            download_bf.add(md5v)
    except ValueError:
        print('enqueueUrl err !!!!!')
def Login(username, password):
    # log in on the Weibo home page
    '''
    :param username: your weibo id
    :param password: your password
    :return:
    '''
    feeds_crawler.get(url=url_home)
    user_crawler.get(url=url_home)
    time.sleep(8)
    print('find click button to login')
    feeds_crawler.find_element_by_id('loginname').send_keys(username)
    feeds_crawler.find_element_by_name('password').send_keys(password)
    time.sleep(3)
    # click the login button
    feeds_crawler.find_element_by_xpath('//div[contains(@class, "login_btn")][1]/a').click()
    # alternatively, use execute_script to run a snippet of JavaScript:
    # feeds_crawler.execute_script('document.getElementsByClassName("W_btn_a btn_32px")[0].click()')
    #
    # the second crawler needs to log in as well
    user_crawler.find_element_by_id('loginname').send_keys(username)
    user_crawler.find_element_by_name('password').send_keys(password)
    time.sleep(3)
    # perform the click
    user_crawler.find_element_by_xpath('//div[contains(@class,"login_btn")][1]/a').click()
def dequeUrl():
    return cur_queue.popleft() # pop queued URLs one at a time
def get_element_by_xpath(cur_driver, path):
    tried = 0
    while tried < 6:
        html = cur_driver.page_source
        res = etree.HTML(html)
        elements = res.xpath(path)
        if len(elements) == 0:
            tried += 1
            time.sleep(1)
            continue
        return elements
def fetch_user(url):
    print('Downloading ' + url)
    feeds_crawler.get(url)
    time.sleep(5)
    # extract the user's display name
    account_name = get_element_by_xpath(feeds_crawler, '//h1')[0].text
    photo = get_element_by_xpath(feeds_crawler, '//p[@class="photo_wrap"]/img')[0].get('src')
    account_photo = re.findall('/([^/]+)$', photo)
    # extract the link to the user's follows page
    follows_link = get_element_by_xpath(feeds_crawler, '//a[@class="t_link S_txt1"]')[0].get('href')
    print('account: ' + account_name)
    print('follows link is ' + follows_link)
    follows_link = 'http:' + follows_link
    user_crawler.get(follows_link)
    feeds = []
    users = []
    # run the two extractors in separate threads
    t_feeds = threading.Thread(target=extract_feed, name=None, args=(feeds,))
    #t_users = threading.Thread(target=extract_user, name=None, args=(users,))
    t_feeds.setDaemon(True)
    #t_users.setDaemon(True)
    t_feeds.start()
    #t_users.start()
    t_feeds.join()
    #t_users.join()
def crawl():
    while True:
        url = dequeUrl()
        fetch_user(url)
def main():
    enqueueUrl(seed_user)
    Login(username, password)
    crawl()
if __name__ == '__main__':
    main()
The MD5 digest is used to decide whether a URL has already been crawled; the mmh (MurmurHash) library mentioned earlier works too (see the sketch after the transcript below).
>>>
>>> import hashlib
>>> url = 'http://www.weibo.com'
>>> md5v = hashlib.md5(url).hexdigest()
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
    md5v = hashlib.md5(url).hexdigest()
TypeError: Unicode-objects must be encoded before hashing
>>> md5v = hashlib.md5(url.encode('gb2312')).hexdigest()
>>>
>>>
>>> md5v2 = hashlib.md5(b'www.baidu.com').hexdigest()
>>> md5v
'17d7b29a31328702848d2d42ae79a240'
>>> md5v2
'dab19e82e1f9a681ee73346d3e7a575e'
>>>
>>>
>>> md5v3 = hashlib.md5(url.encode('gb2312')).hexdigest()
>>> md5v3
'17d7b29a31328702848d2d42ae79a240'
>>>
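As an alternative to MD5, the same seen-before check can be done with MurmurHash. A minimal sketch, assuming the mmh3 package (pip install mmh3) stands in for the mmh library mentioned earlier; the resulting digest could equally be fed into the BloomFilter used above:
import mmh3

seen_hashes = set()

def seen_before(url):
    # mmh3.hash() returns a signed 32-bit MurmurHash3 digest of the string
    h = mmh3.hash(url)
    if h in seen_hashes:
        return True
    seen_hashes.add(h)
    return False

print(seen_before('http://www.weibo.com'))  # False the first time
print(seen_before('http://www.weibo.com'))  # True afterwards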
In practice, only the image filename stays the same on Weibo, while the hosting domain and resolution segment may change. So when storing, it is enough to keep just the image filename and rebuild the full URL when it is needed.
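A minimal sketch of that idea, using the same regex as the crawler above; the host and size segment (wx1.sinaimg.cn, large) are illustrative assumptions, so substitute whatever appears in the real src attributes:
import re

src = 'https://wx1.sinaimg.cn/orj360/abc123xyz.jpg'      # example src attribute (assumed)
image_name = re.findall('/([^/]+)$', src)[0]             # 'abc123xyz.jpg' -- store only this
full_url = 'https://wx1.sinaimg.cn/large/' + image_name  # rebuild the URL when needed
print(image_name)
print(full_url)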
That part was covered earlier, so I won't repeat it here; there are plenty of examples online as well.