First, download and install PhantomJS, a headless browser that cannot be installed via pip, then install the lxml and Selenium libraries (e.g. `pip install selenium lxml beautifulsoup4`). Along the way I also want to try out the BeautifulSoup library and Selenium's built-in element selectors.
#coding:utf8
#python27
import re
import time
from selenium import webdriver
from lxml.html import fromstring
from bs4 import BeautifulSoup

class Toutiao():
    links = []
First, fetch the news links under the Toutiao channel we want:
class Toutiao():
    # ... (omitted)
    def getLinks(self,url):
        # For debugging you can use Firefox instead (download geckodriver first)
        #driver = webdriver.Firefox(executable_path=r'C:\Python27\geckodriver.exe')
        driver = webdriver.PhantomJS()
        driver.get(url)
        # Implicit wait: element lookups will wait up to 5 seconds
        driver.implicitly_wait(5)
        for i in range(2):
            # Scroll down a couple of times to trigger lazy loading
            driver.execute_script("window.scrollBy(0,700)")
            # implicitly_wait() does not pause the script, so sleep
            # instead to give the new content time to render
            time.sleep(2)
        # Save the page source
        html = driver.page_source
        # Always close the driver
        driver.close()
        tree = fromstring(html)
Also, while crawling I found that the element attributes on Toutiao's pages are not fixed, and the links are not always http (sometimes they are https), so both cases have to be handled.
    def getLinks(self,url):
        # ... (omitted)
        # xpath() returns an empty list rather than raising when nothing
        # matches, so fall back explicitly instead of using try/except
        hrefs = tree.xpath('//a[@class="link"]/@href')
        if not hrefs:
            hrefs = tree.xpath('//a[@class="link title"]/@href')
        for i in hrefs:
            # Complete relative links; absolute http/https links stay as-is
            if i.startswith('http'):
                link = i
            else:
                link = 'http://www.toutiao.com' + i
            print 'Get link:',link
            self.links.append(link)
Let's take a look at the links we get:
if __name__=='__main__':
    Toutiao().getLinks('http://www.toutiao.com/')
The next step, naturally, is to scrape the content of the news pages. We only want Toutiao's own news, and only today's: some items on the site are video news and some come from other websites, so we need to filter.
Filtering by date: grab the publish time from the page and compare it with the local date.
    def isToday(self,fb_time):
        # Compare the date part of the publish time with today's date
        today = time.strftime('%Y-%m-%d',time.localtime(time.time()))
        fb_date = re.findall('(.*?) ',fb_time)[0]
        return fb_date == today
Next, scrape the article body. Here I use Selenium's built-in selectors:
    def getNews(self,url):
        print url,'start crawling'
        driver = webdriver.PhantomJS()
        #driver = webdriver.Firefox(executable_path=r'C:\Python27\geckodriver.exe')
        driver.get(url)
        driver.implicitly_wait(5)
        # Article URL
        wen_zhang_url = url
        # Article title: video pages have no article-title, so bail out there
        try:
            title = driver.find_element_by_xpath('//h1[@class="article-title"]').text
        except:
            print url,'not a regular article, skipping'
            driver.close()
            # return instead of exit(0): exit would raise SystemExit and
            # stop the whole crawl rather than skipping this one page
            return
        # Publish time: the page layout varies, so try each known location
        try:
            fa_bu_shi_jian = driver.find_elements_by_xpath('//div[@class="articleInfo"]/span')[2].text
        except:
            try:
                fa_bu_shi_jian = driver.find_elements_by_xpath('//div[@class="article-sub"]/span')[2].text
            except:
                try:
                    fa_bu_shi_jian = driver.find_elements_by_xpath('//div[@class="article-sub"]/span')[1].text
                except:
                    fa_bu_shi_jian = driver.find_elements_by_xpath('//div[@class="articleInfo"]/span')[1].text
        # Skip articles that were not published today
        if not self.isToday(fa_bu_shi_jian):
            print url,'wrong date, skipping'
            driver.close()
            return
        # Comment count: click "load more" first so the comments render
        try:
            submit_button = driver.find_element_by_class_name('c-load-more')
            submit_button.click()
        except:
            pass
        time.sleep(3)
        page = driver.page_source
        pin_lun_shu_liang = driver.find_element_by_xpath('//a[@class="share-count"]//span').text
        if int(pin_lun_shu_liang):
            try:
                self.getCommen(url,page)
            except:
                pass
        else:
            print url,'no comments'
        # Article source
        try:
            wen_zhang_source = driver.find_elements_by_xpath('//div[@class="articleInfo"]/span')[0].text
        except:
            wen_zhang_source = driver.find_elements_by_xpath('//div[@class="article-sub"]/span')[0].text
        # Article body
        wen_zhang_zheng_wen = ''
        wen_zhang = driver.find_elements_by_xpath('//div[@class="article-content"]//p')
        for i in wen_zhang:
            wen_zhang_zheng_wen += i.text
        # Crawl time
        #now = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
        now = time.time()
        # Site name
        zhan_dian = u'今日头条'
        # Image links
        imgs = ''
        try:
            img = driver.find_elements_by_xpath('//div[@class="article-content"]//p/img')
            for i in img:
                i = i.get_attribute('src')
                imgs = imgs + i + ' '
        except:
            imgs = ''
        # Article channel
        wen_zhang_lan_mu = driver.find_element_by_xpath('//a[@ga_event="click_channel"]').text
        # Article author (not available on the page)
        author = None
        # Keywords: find_elements returns [] instead of raising,
        # so fall back explicitly when the first class name matches nothing
        cis = driver.find_elements_by_xpath('//a[@class="label-link"]')
        if not cis:
            cis = driver.find_elements_by_xpath('//a[@class="tag-item"]')
        guan_jian_ci = []
        for i in cis:
            guan_jian_ci.append(i.text)
        # Read count (not available on the page)
        readers = None
        driver.close()
        # The article URL doubles as the primary key (_id)
        data = {'wen_zhang_wang_zhi':wen_zhang_url,
                'wen_zhang_biao_ti':title,
                'fa_bu_shi_jian':fa_bu_shi_jian,
                'ping_lun_shu_liang':pin_lun_shu_liang,
                'wen_zhang_lai_yuan':wen_zhang_source,
                'wen_zhang_zheng_wen':wen_zhang_zheng_wen,
                'do_time':now,
                'zhan_dian':zhan_dian,
                'tu_pian_lian_jie':imgs,
                'wen_zhang_lan_mu':wen_zhang_lan_mu,
                'wen_zhang_zuo_zhe':author,
                'xiang_guan_biao_qian':guan_jian_ci,
                'guan_jian_ci':None,
                'yue_du_shu':readers,
                '_id':wen_zhang_url}
The data needs to go into a database; the team uses MongoDB. I put the database code in a separate file:
#coding:utf8
#clientDb.py
import pymongo

client = pymongo.MongoClient('your own ip', port)  # placeholders: fill in your host and port

db = client.news

def InsertNews(data):
    collection = db.content
    collection.insert(data)

def InsertComments(data):
    collection = db.comment
    collection.insert(data)
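The dedup logic that follows leans on MongoDB's built-in unique index on `_id`: inserting a document whose `_id` already exists raises `pymongo.errors.DuplicateKeyError`. A minimal sketch of that behavior; the localhost connection and the sample `_id` are assumptions for illustration only:

#coding:utf8
# Sketch: the duplicate-key behavior the crawler relies on.
# Assumes a test MongoDB instance on localhost:27017.
import pymongo
from pymongo.errors import DuplicateKeyError

db = pymongo.MongoClient('localhost', 27017).news
db.content.insert({'_id': 'http://www.toutiao.com/a1', 'title': u'test'})
try:
    # Same _id again: MongoDB rejects it with DuplicateKeyError
    db.content.insert({'_id': 'http://www.toutiao.com/a1', 'title': u'test'})
except DuplicateKeyError:
    print 'duplicate _id rejected'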
Then, back in getNews:
from clientDb import InsertNews,InsertComments
# ... (omitted)
class Toutiao():
    # ... (omitted)
    def getNews(self,url):
        # ... (omitted)
        # Save to the database; a duplicate _id means we already have it
        try:
            InsertNews(data)
        except:
            print url,'article already in the database'
            return
        print url,'article saved'
We also need the comments; here let's try out the BeautifulSoup library:
class Toutiao():
    # ... (omitted)
    def getCommen(self,url,page):
        soup = BeautifulSoup(page,'html.parser')
        divs = soup.find_all('div',class_='c-content')
        comments = []
        for i in divs:
            # The commenter's profile link doubles as a user id
            id = 'http://www.toutiao.com'+i.find('a').get('href')
            name = i.find('a').get_text()
            c_content = i.find('p').get_text()
            c_time = i.find('span',class_='c-create-time').get_text()
            # Toutiao shows relative times such as u'5分钟前' / u'3小时前';
            # convert them to an absolute timestamp
            if c_time[1] == u'\u5206':
                # u'\u5206' is the character 分: "X minutes ago" is
                # close enough to approximate with the current time
                c_time = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
            else:
                # "X hours ago": subtract the hours from the current hour
                # (this breaks across midnight; see the sketch below)
                now = time.strftime('%H',time.localtime(time.time()))
                time1 = int(now) - int(c_time[0])
                if time1<10:
                    time1 = '0'+str(time1)+':'
                else:
                    time1 = str(time1)+':'
                now = now+':'
                c_time = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
                c_time = c_time.replace(now,time1)
            try:
                r_count = i.find('span',class_='c-reply-count').get_text()
            except:
                r_count = None
            z_count = i.find('span',title=u'点赞').get_text()
            #z_time = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
            z_time = time.time()
            c_url = u'今日头条'
            comment = {}
            #comment['news_url'] = url
            comment['ping_lun_id'] = id
            comment['yong_hu_ming'] = name
            comment['xing_bie'] = None
            comment['yong_hu_deng_ji'] = None
            comment['yong_hu_sheng_fen'] = None
            comment['ping_lun_nei_rong'] = c_content
            comment['hui_fu_shu'] = r_count
            comment['ping_lun_shi_jian'] = c_time
            #print comment['ping_lun_shi_jian']
            comment['do_time'] = z_time
            comment['dian_zan_shu'] = z_count
            comment['zhan_dian'] = c_url
            comment['_id'] = id+url
            comments.append(comment)
        try:
            InsertComments(comments)
        except:
            print url,'comments already in the database'
            return
        print url,'comments saved'
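As noted in the comments above, the hour arithmetic breaks when a comment crosses midnight: the subtraction can go negative. A safer conversion using datetime.timedelta is sketched below; the relative formats u'X分钟前' / u'X小时前' are an assumption about what Toutiao renders, and to_absolute is a hypothetical helper, not part of the crawler above:

#coding:utf8
import re
from datetime import datetime, timedelta

def to_absolute(c_time):
    # Turn a relative time like u'5分钟前' or u'3小时前' into an
    # absolute 'YYYY-MM-DD HH:MM:SS' string; timedelta handles the
    # day/month rollover that plain hour subtraction does not
    num = int(re.findall(u'(\d+)', c_time)[0])
    now = datetime.now()
    if u'分' in c_time:
        now -= timedelta(minutes=num)
    elif u'小时' in c_time:
        now -= timedelta(hours=num)
    return now.strftime('%Y-%m-%d %H:%M:%S')

print to_absolute(u'3小时前')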
That's basically it. Let's try the final result:
if __name__=='__main__':
    test = Toutiao()
    test.getLinks('http://www.toutiao.com/')
    count = len(test.links)
    print 'total: %d' % count
    while len(test.links):
        url = test.links.pop()
        # Keep crawling even if a single page fails
        try:
            test.getNews(url)
        except:
            pass
        count -= 1
        print 'remaining: %d' % count