#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/11/13 10:20
# @Author : jia.zhao
# @Desc :
# @File : csdn_demo.py
# @Software: PyCharm
import requests
from lxml import etree
import pymysql
import re
import urllib.request
import urllib.error
import hashlib
from html import unescape
import ssl
from log_content import Logger
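# Globally disable HTTPS certificate verification so pages and images with
# broken certificates can still be fetched (a pragmatic crawler workaround).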
ssl._create_default_https_context = ssl._create_unverified_context
class CSDNSpider():

    def __init__(self):
        '''
        Initialize the start URL and request headers, then kick off the crawl.
        '''
        self.url = 'https://blog.csdn.net/'
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'blog.csdn.net',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }
        # Headers must be set before start_requests() runs, because the
        # 403 retry in replace_content() reads self.headers.
        self.start_requests()
    def start_requests(self):
        '''
        Request the category (nav bar) data.
        :return:
        '''
        xpath_nav = 'div[@class="container clearfix"]/nav[@class="clearfix"]/div[@class="clearfix"]/div[@class="nav_com"]/ul/li/a'
        # Request the start url
        page = requests.get(self.url)
        # Parse the page with XPath
        nav_html = etree.HTML(page.text)
        # Collect each category's url and name
        nav_urls = nav_html.xpath(
            '//%s/@href' % xpath_nav)
        nav_titles = nav_html.xpath(
            '//%s//text()' % xpath_nav)
        for i in range(len(nav_urls)):
            # The blockchain category links to a different host
            if nav_urls[i] == 'https://blockchain.csdn.net':
                nav_url = nav_urls[i]
            else:
                # All other categories are relative and need the base url prepended
                nav_url = self.url + nav_urls[i]
            # Fetch the blog list and article contents for this category
            self.content_article(nav_url, nav_titles[i])
    def content_article(self, nav_url, nav_title):
        # Initialize the result lists so a failed parse leaves them empty,
        # instead of relying on module-level globals as the original did.
        blog_list_urls = []
        blog_list_titles = []
        blog_list_authors = []
        blog_list_read_num = []
        blog_list_common_num = []
        # Every category except blockchain shares one page layout
        if nav_url != 'https://blockchain.csdn.net':
            # Derive the class attribute of the list element from the category url
            if nav_url.split('/')[-1] == '':
                ele = 'home'
            else:
                ele = nav_url.split('/')[-1]
            param = 'ul[@class="feedlist_mod %s"]/li[@class="clearfix"]/div[@class="list_con"]' % ele
            # Request the article list for this category
            page = requests.get(nav_url)
            blog_list_html = etree.HTML(page.text)
            if blog_list_html is not None:
                # Article urls
                blog_list_urls = blog_list_html.xpath('//%s/div[@class="title"]/h2/a/@href' % param)
                # Article titles
                blog_list_titles = blog_list_html.xpath('//%s/div[@class="title"]/h2/a//text()' % param)
                # Authors
                blog_list_authors = blog_list_html.xpath('//%s/dl[@class="list_userbar"]/dd[@class="name"]/a//text()' % param)
                # Read counts
                blog_list_read_num = blog_list_html.xpath('//%s/dl[@class="list_userbar"]/div[@class="interactive floatR"]/dd[@class="read_num"]/a/span[@class="num"]//text()' % param)
                # Comment counts (the trailing space in "common_num " is in CSDN's markup)
                blog_list_common_num = blog_list_html.xpath('//%s/dl[@class="list_userbar"]/div[@class="interactive floatR"]/dd[@class="common_num "]/a/span[@class="num"]//text()' % param)
        elif nav_url == 'https://blockchain.csdn.net':
            page = requests.get(nav_url)
            blog_list_html = etree.HTML(page.text)
            param = 'div[@id="content"]/ul[@class="list"]/li[@class="zixun_img"]'
            param_2 = 'div[@class="cont"]/div[@class="fr right_cont"]/div[@class="bot_info"]'
            # Article urls
            blog_list_urls = blog_list_html.xpath('//%s/a/@href' % param)
            # Article titles
            blog_list_titles = blog_list_html.xpath('//%s/a//text()' % param)
            # Authors
            blog_list_authors = blog_list_html.xpath(
                '//%s/%s/a[@class="nick_name fl"]//text()' % (param, param_2))
            # Read counts
            blog_list_read_num = blog_list_html.xpath(
                '//%s/%s/span[@class="num fr"]//text()' % (param, param_2))
            # Comment counts
            blog_list_common_num = blog_list_html.xpath('//%s/%s/span[@class="comment fr"]//text()' % (param, param_2))
        # Pad the comment and read counts so they line up with the url list
        if len(blog_list_urls) > len(blog_list_common_num):
            for i in range(len(blog_list_urls) - len(blog_list_common_num)):
                blog_list_common_num.append('0')
        if len(blog_list_urls) > len(blog_list_read_num):
            for i in range(len(blog_list_urls) - len(blog_list_read_num)):
                blog_list_read_num.append('0')
        # Walk the article list and pull each article's details
        for i in range(len(blog_list_titles)):
            # Fetch the article page
            page = requests.get(blog_list_urls[i])
            content_html = etree.HTML(page.text)
            # Article body
            article_list_contents = content_html.xpath(
                '//div[@class="blog-content-box"]/article/div[@class="article_content clearfix csdn-tracking-statistics"]/div[@class="htmledit_views"]')
            # Publish time
            publish_time = content_html.xpath(
                '//div[@class="blog-content-box"]/div[@class="article-header-box"]/div[@class="article-header"]/div[@class="article-info-box"]/div[@class="article-bar-top"]/span[@class="time"]//text()')
            # CSDN articles are written either in Markdown or in the rich-text
            # editor, so the body sits under different tags; try the other layout.
            if len(article_list_contents) == 0:
                article_list_contents = content_html.xpath(
                    '//div[@id="article_content"]/div[1]')
            if len(article_list_contents) == 0:
                log.logger.info('No article body found at: ' + blog_list_urls[i])
                continue
            # Extract the body
            for j in range(len(article_list_contents)):
                # Save the body and its images to the file server; returns the file path
                content_path = self.replace_content(article_list_contents[j], blog_list_urls[i])
                # Store the remaining fields in MySQL
                self.insert_mysql(str(blog_list_titles[i]).strip(), nav_title, blog_list_urls[i],
                                  str(blog_list_authors[i]).strip(), str(publish_time[0]).strip(),
                                  blog_list_read_num[i], blog_list_common_num[i], content_path)
            # Only keep the first 12 articles per category
            if i == 11:
                break
    def insert_mysql(self, title, nav_title, article_url, author, publish_time, read_num, comment_num, content_path):
        '''
        :param title:
        :param nav_title:
        :param article_url:
        :param author:
        :param publish_time:
        :param read_num:
        :param comment_num:
        :param content_path:
        :return:
        '''
        conn = pymysql.connect(
            host='127.0.0.1',
            db='csdn',
            user='root',
            passwd='********',
            charset='utf8mb4')  # set the charset, otherwise Chinese text can come back garbled
        cursor = conn.cursor()
        # Parameterized query: quotes in titles or urls cannot break the statement
        sql = """INSERT INTO csdn_demo(title, nav_title, article_url, author, publish_time, read_num, comment_num, content_path)
                 VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"""
        try:
            cursor.execute(sql, (title, nav_title, article_url, author, publish_time,
                                 read_num, comment_num, content_path))
            conn.commit()
        except Exception as e:
            log.logger.warning('MySQL insert failed: %s' % e)
        finally:
            cursor.close()
            conn.close()
    def replace_content(self, content, article_url):
        '''
        :param content:
        :param article_url:
        :return: path of the saved article file
        '''
        reg = r'src="(.+?)"'
        content = etree.tostring(content).decode('utf8')
        content = unescape(content)
        imgre = re.compile(reg)
        imglist = re.findall(imgre, content)
        if len(imglist) != 0:
            # Download every image the body references
            for i in range(len(imglist)):
                # MD5 the image url so the local file name is unique
                img_name = hashlib.md5(imglist[i].encode('utf8'))
                # Hex digest of the md5
                img_name = img_name.hexdigest()
                path_name = ''
                # Guard the download so incomplete transfers and bad links
                # cannot hang the loop
                try:
                    # Skip scripts, non-http links and youdao note images
                    if not imglist[i].endswith('.js') and imglist[i].startswith('http') and 'note.youdao.com' not in imglist[i]:
                        if imglist[i].endswith('.gif'):
                            path_name = 'csdn/img/%s.gif' % img_name
                            urllib.request.urlretrieve(imglist[i], path_name)
                            content = content.replace(imglist[i], '/home/zhaojia/csdn_demo/csdn/img/%s.gif' % img_name)
                        else:
                            path_name = 'csdn/img/%s.jpg' % img_name
                            # Save the image (the csdn/img directory must exist)
                            urllib.request.urlretrieve(imglist[i], path_name)
                            # Rewrite this img src to the local path
                            content = content.replace(imglist[i], '/home/zhaojia/csdn_demo/csdn/img/%s.jpg' % img_name)
                    else:
                        continue
                except urllib.error.HTTPError as e:
                    # urlretrieve raises HTTPError for 4xx/5xx responses. Note the
                    # original caught the bare urllib.error module, which is not an
                    # exception class and was dead code behind `except Exception`.
                    log.logger.warning('Image download failed, url: ' + imglist[i] + '\n' + str(e))
                    if 'HTTP Error 403: Forbidden' in str(e):
                        log.logger.info('Got 403, retrying with browser headers')
                        res = requests.get(imglist[i], headers=self.headers, verify=False)
                        if res.status_code == 200:
                            log.logger.info('Retry succeeded')
                            with open(path_name, 'wb') as f:
                                f.write(res.content)
                            content = content.replace(imglist[i], '/home/zhaojia/csdn_demo/%s' % path_name)
                        else:
                            log.logger.error('Retry failed, logged ' + imglist[i] + ', article url: ' + article_url)
                            continue
                    else:
                        continue
                except urllib.error.URLError as e:
                    # Bad address, no response at all
                    log.logger.warning('Bad image address, no response: %s' % e)
                    continue
        # MD5 the article url to use as a unique file name
        article_name_md5 = hashlib.md5(article_url.encode('utf8'))
        # Hex digest of the md5
        article_name_md5 = article_name_md5.hexdigest()
        file_name = '/home/zhaojia/csdn_demo/csdn/article/' + article_name_md5 + '.txt'
        # Write the body to disk
        with open(file_name, 'w', encoding='utf8') as f:
            f.write(content)
        return file_name
if __name__ == '__main__':
    log = Logger('csdn_all.log', level='info')
    CSDNSpider()
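One thing the listing leaves implicit: insert_mysql assumes a csdn_demo table already exists. For reference, a one-off setup script might look like the sketch below; the column types and lengths are my own guesses, not taken from the original project:

import pymysql

# Hypothetical schema matching the INSERT in insert_mysql; adjust types to taste.
conn = pymysql.connect(host='127.0.0.1', db='csdn', user='root',
                       passwd='********', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS csdn_demo (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            nav_title VARCHAR(64),
            article_url VARCHAR(512),
            author VARCHAR(128),
            publish_time VARCHAR(32),
            read_num VARCHAR(16),
            comment_num VARCHAR(16),
            content_path VARCHAR(512)
        ) DEFAULT CHARSET = utf8mb4
    """)
conn.commit()
conn.close()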
The comments in the code are fairly detailed, so I won't belabor them here.

The log module:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/11/16 11:20
# @Author : jia.zhao
# @Desc :
# @File : log_content.py
# @Software: PyCharm
import logging
from logging import handlers
class Logger(object):
    # Mapping from level names to logging levels
    level_relations = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'crit': logging.CRITICAL
    }

    def __init__(self, filename, level='info', when='D', backCount=3,
                 fmt='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'):
        self.logger = logging.getLogger(filename)
        format_str = logging.Formatter(fmt)  # set the log format
        self.logger.setLevel(self.level_relations.get(level))  # set the log level
        sh = logging.StreamHandler()  # handler that prints to the console
        sh.setFormatter(format_str)  # console output format
        # TimedRotatingFileHandler writes to a file and rotates it automatically:
        # interval is the rotation interval, backupCount is how many backup files
        # to keep (older ones are deleted), and when is the interval unit:
        #   S        seconds
        #   M        minutes
        #   H        hours
        #   D        days
        #   W        week (interval==0 means Monday)
        #   midnight every day at midnight
        th = handlers.TimedRotatingFileHandler(filename=filename, when=when,
                                               backupCount=backCount, encoding='utf-8')
        th.setFormatter(format_str)  # file output format
        self.logger.addHandler(sh)  # attach both handlers to the logger
        self.logger.addHandler(th)
# if __name__ == '__main__':
#     log = Logger('all.log', level='info')
#     log.logger.info('info')
#     log.logger.warning('warning')
#     log.logger.error('error')
#     log.logger.critical('critical')
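To watch the time-based rotation in action without waiting a day, you can shrink the unit to seconds; the file name and messages below are demo values only:

import time
from log_content import Logger

# when='S' makes the handler roll the file every second (the handler's default
# interval is 1), keeping at most 2 old files before deleting the oldest.
demo = Logger('rotate_demo.log', level='info', when='S', backCount=2)
for i in range(5):
    demo.logger.info('demo message %d', i)
    time.sleep(1)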