Python使用Selenium + PhantomJS抓取动态网页:今日头条

下载安装PhantomJS,这是一个无界面(headless)浏览器,不能使用pip安装;另外还需安装lxml库和Selenium库。此外,BeautifulSoup库和Selenium自带的CSS选择器也想用用看。

#coding:utf8
#python27
import re
from selenium import webdriver
import time
from lxml.html import fromstring

class Toutiao():
    """Toutiao (今日头条) news scraper.

    Collects article links via getLinks() and stores them on the
    instance for later per-article scraping.
    """

    def __init__(self):
        # Per-instance list of collected article URLs.  The original
        # used a mutable class attribute (links = []), which is shared
        # by every instance -- a classic Python pitfall.
        self.links = []

首先是获取今日头条指定栏目下的新闻链接

class Toutiao():
    #省略...
    def getLinks(self, url):
        """Render the Toutiao front page in PhantomJS, scroll to trigger
        lazy loading, and keep the rendered HTML for link extraction."""
        # For debugging you can use Firefox instead (needs geckodriver):
        #driver = webdriver.Firefox(executable_path=r'C:\Python27\geckodriver.exe')
        driver = webdriver.PhantomJS()
        try:
            driver.get(url)
            # Implicit wait: element lookups poll for up to 5 seconds.
            driver.implicitly_wait(5)
            for i in range(2):
                # Scroll down to make the page load more entries.
                driver.execute_script("window.scrollBy(0,700)")
                # NOTE: implicitly_wait() does NOT pause the script --
                # it only affects element lookups -- so actually sleep
                # between scrolls to give the page time to load.
                time.sleep(2)
            # Keep the fully rendered page source.
            html = driver.page_source
        finally:
            # Always shut PhantomJS down, even if loading fails;
            # otherwise ghost phantomjs processes accumulate.
            driver.close()
        tree = fromstring(html)

另外,在爬取的时候,发现今日头条网站页面的元素属性并不是固定的,而且链接也不总是http,有时候是https,因此需要两者都考虑到。

    def getLinks(self,url):
        #省略...
        try:
            list = tree.xpath('//a[@class="link"]/@href')
        except:
            list = tree.xpath('//a[@class="link title"]/@href')
        for i in list:
            #需将抓到的链接拼接完全
            link = 'http://www.toutiao.com' + i
            print 'Get link:',link
            self.links.append(link)

看一下抓到的链接:

if __name__ == '__main__':
    # Quick smoke test: collect the front-page article links.
    spider = Toutiao()
    spider.getLinks('http://www.toutiao.com/')

Python使用Selenium + PhantomJS抓取动态网页:今日头条_第1张图片
接下来自然是抓取新闻页面的内容了。我们只需要今日头条自己的新闻,并且是本日的新闻,今日头条里面有一部分是视频新闻,还有一些是其他网站的新闻,因此我们需要做一个过滤。
Python使用Selenium + PhantomJS抓取动态网页:今日头条_第2张图片
时间上的过滤方法:将网页中的发布时间抓取下来与本地时间做一个对比

def isToday(fb_time):
    """Return True when *fb_time*'s date part is NOT today's date.

    NOTE: despite the name, True means "skip this article" -- the
    caller uses it as ``if self.isToday(t): # 日期不符,跳过``.  The
    inverted semantics are preserved for compatibility.

    fb_time -- a publish-time string such as '2017-05-01 12:30'; a
    bare date without a time part is also accepted (the original
    regex raised IndexError on strings without a space).
    """
    today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    # Keep only the date part (text before the first space, if any).
    fb_date = fb_time.split(' ', 1)[0]
    return fb_date != today

接下来抓取正文,这里用下Selenium自带的选择器:

def getNews(self,url):
        print url,'开始抓取'
        driver = webdriver.PhantomJS()
        #driver = webdriver.Firefox(executable_path=r'C:\Python27\geckodriver.exe') 
        driver.get(url)
        driver.implicitly_wait(5) 
        #文章网址
        wen_zhang_url = url
        #文章标题,遇到视频跳出
        try:
            title = driver.find_element_by_xpath('//h1[@class="article-title"]').text
        except:
            print url,'条件不符,跳过'
            driver.close()
            exit(0)
        #发布时间
        try:
            fa_bu_shi_jian = driver.find_elements_by_xpath('//div[@class="articleInfo"]/span')[2].text
        except:
            try:
                fa_bu_shi_jian = driver.find_elements_by_xpath('//div[@class="article-sub"]/span')[2].text
            except:
                try:
                    fa_bu_shi_jian = driver.find_elements_by_xpath('//div[@class="article-sub"]/span')[1].text
                except:
                    fa_bu_shi_jian = driver.find_elements_by_xpath('//div[@class="articleInfo"]/span')[1].text
        #评论数量
        if self.isToday(fa_bu_shi_jian):
            print url,'日期不符,跳过'
            exit(0)
        try:
            submit_button = driver.find_element_by_class_name('c-load-more')
            submit_button.click()
        except:
            pass
        time.sleep(3)
        page = driver.page_source
        pin_lun_shu_liang = driver.find_element_by_xpath('//a[@class="share-count"]//span').text
        if int(pin_lun_shu_liang) :
            try:
                self.getCommen(url,page)
            except:
                pass
        else:
            print url,'无评论'
        #文章来源
        try:
            wen_zhang_source = driver.find_elements_by_xpath('//div[@class="articleInfo"]/span')[0].text
        except:
            wen_zhang_source = driver.find_elements_by_xpath('//div[@class="article-sub"]/span')[0].text
        #文章正文
        wen_zhang_zheng_wen = ''
        wen_zhang = driver.find_elements_by_xpath('//div[@class="article-content"]//p')
        for i in wen_zhang:
            wen_zhang_zheng_wen += i.text
        #抓取时间
        #now = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
        now = time.time()
        #抓取网站
        zhan_dian = u'今日头条'
        #图片链接
        imgs = ''
        try:
            img = driver.find_elements_by_xpath('//div[@class="article-content"]//p/img')
            for i in img:
                i = i.get_attribute('src')
                imgs = imgs + i + ' '
        except:
            imgs = ''
        #文章栏目
        wen_zhang_lan_mu = wen_zhang = driver.find_element_by_xpath('//a[@ga_event="click_channel"]').text
        #文章作者
        author = None
        #关键词
        guan_jian_ci = []
        try:
            cis = driver.find_elements_by_xpath('//a[@class="label-link"]')
        except:
            cis = driver.find_elements_by_xpath('//a[@class="tag-item"]')
        for i in cis:
            guan_jian_ci.append(i.text)
        #阅读数量
        readers = None
        #主键     
        driver.close()
        data = {'wen_zhang_wang_zhi':wen_zhang_url,\
                'wen_zhang_biao_ti':title,\
                'fa_bu_shi_jian':fa_bu_shi_jian,\
                'ping_lun_shu_liang':pin_lun_shu_liang,\
                'wen_zhang_lai_yuan':wen_zhang_source,\
                'wen_zhang_zheng_wen':wen_zhang_zheng_wen,\
                'do_time':now,\
                'zhan_dian':zhan_dian,\
                'tu_pian_lian_jie':imgs,\
                'wen_zhang_lan_mu':wen_zhang_lan_mu,\
                'wen_zhang_zuo_zhe':author,\
                'xiang_guan_biao_qian':guan_jian_ci,\
                'guan_jian_ci':None,\
                'yue_du_shu':readers,\
                '_id':wen_zhang_url}

需要存到数据库,团队用的是MongoDB。这里我把数据库操作另写到一个文件中:

#coding:utf8
#clientDb.py
import pymongo

# NOTE(review): '你自己的ip' and 端口 are placeholders -- substitute your
# MongoDB host string and integer port; this line is not runnable as-is.
client = pymongo.MongoClient('你自己的ip',端口)
# The `news` database holds both the article and comment collections.
db = client.news

def InsertNews(data):
    # Insert one article record into the news.content collection.
    # (A duplicate _id makes MongoDB raise, which callers rely on.)
    db.content.insert(data)

def InsertComments(data):
    # Insert a batch of comment records into the news.comment collection.
    db.comment.insert(data)

接着getNews:

from clientDb import InsertNews,InsertComments
#省略...
class Toutiao():
    #省略...
    def getNews(self,url):
        #省略...
        #存到数据库
        try:
            InsertNews(data)
        except:
            print url,'该文章已存在'
            exit(0)
        print url,'文章获取成功'

还有需要获取评论,这里我们试着用下BeautifulSoup库:

class Toutiao():
    #.....
    def getCommen(self,url,page):
        soup = BeautifulSoup(page,'html.parser')
        all = soup.find_all('div',class_='c-content')
        comments = []
        for i in all:
            id = 'http://www.toutiao.com'+i.find('a').get('href')
            name = i.find('a').get_text()
            c_content =  i.find('p').get_text()
            c_time = i.find('span',class_='c-create-time').get_text()
            if c_time[1] == u'\u5206':
                c_time = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
            else:
                now = time.strftime('%H',time.localtime(time.time()))
                time1 = int(now) - int(c_time[0])
                if time1<10:
                    time1 = '0'+str(time1)+':'
                else:
                    time1 = str(time1)+':'
                now = now+':'
                c_time = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
                c_time = c_time.replace(now,time1)
            try:
                r_count =  i.find('span',class_='c-reply-count').get_text()
            except:
                r_count = None
            z_count = i.find('span',title='点赞').get_text()
            #z_time = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
            z_time = time.time()
            c_url = u'今日头条'
            comment = {}
            #comment['news_url'] = url
            comment['ping_lun_id'] = id
            comment['yong_hu_ming'] = name
            comment['xing_bie'] = None
            comment['yong_hu_deng_ji'] = None
            comment['yong_hu_sheng_fen'] = None
            comment['ping_lun_nei_rong'] = c_content
            comment['hui_fu_shu'] = r_count
            comment['ping_lun_shi_jian'] = c_time
            #print comment['ping_lun_shi_jian']
            comment['do_time'] = z_time
            comment['dian_zan_shu'] = z_count
            comment['zhan_dian'] = c_url
            comment['_id'] = id+url
            comments.append(comment)
        try:
            InsertComments(comments)
        except:
            print url,'该评论已存在'
            exit(0)
        print url,'评论获取成功'

到这里就算基本完了,现在试试最终效果:

if __name__=='__main__':
    test = Toutiao()
    test.getLinks('http://www.toutiao.com/')
    count = len(test.links)
    print '共%d' % count
    while len(test.links):
        url = test.links.pop()
        try:
            test.getNews(url)
        except:
            pass
        count -= 1
        print '余%d' % count

效果大概就是这样:
Python使用Selenium + PhantomJS抓取动态网页:今日头条_第3张图片
数据库中效果:
Python使用Selenium + PhantomJS抓取动态网页:今日头条_第4张图片
源码在这里:源码

你可能感兴趣的:(Python使用Selenium + PhantomJS抓取动态网页:今日头条)