首先需要下载requests库和BeautifulSoup库
pip install requests
pip install beautifulsoup4
代码有时会报如下错误:'NoneType' object is not callable。原因是页面元素未找到时 find() 返回 None,需要检查空值;这里暂未做判断,后续会补充异常处理。
代码如下:
# requests 模块请求csdn
import requests
from bs4 import BeautifulSoup
import random
from lxml import *
# 获取一个随机的请求头
# def getUserAgent():
# agent = '';
# list = [];
# list.append(
# 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36');
# # list.append('Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1');
# agent = list[random.randint(0, len(list))];
# return agent;
# 请求页数数据
def getPage():
    """Fetch the CSDN blog home page and compute the number of
    article-list pages (40 articles per page).

    Returns:
        int: total page count, or 0 when the request fails or the
        article-count element cannot be found.
    """
    print("获取CSDN的页数数据!")
    totalPage = 0
    url = 'https://blog.csdn.net/h_j_c_123'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    code = response.status_code
    if code == 200:
        print("总页数状态码返回正常")
        # Some sites return only the status code (no body) when no
        # User-Agent header is sent, hence the headers above.
        text = response.text
        html = BeautifulSoup(text, 'html.parser')
        totalCountNode = html.find('span', {'class': 'count'})
        # BUG FIX: find() returns None when the element is missing and the
        # original then crashed on None.text — guard before dereferencing.
        if totalCountNode is None:
            print("未找到文章总数节点")
            return totalPage
        totalCount = int(totalCountNode.text)
        # // is integer (floor) division in Python; / always yields a float.
        if totalCount % 40 == 0:
            totalPage = totalCount // 40
        else:
            totalPage = totalCount // 40 + 1
    else:
        # BUG FIX: the original did "字符串" + code, concatenating str with
        # an int status code, which raises TypeError; use %d formatting.
        print("状态码返回不正常====>%d" % code)
    return totalPage
# 获取列表数据
def getList():
    """Walk every article-list page of the blog and pass each article
    row (a BeautifulSoup Tag) to resolverList() for parsing."""
    print("获取列表数据")
    totalPage = getPage()
    # BUG FIX: range's upper bound is exclusive, so range(1, totalPage)
    # skipped the last page — iterate up to totalPage inclusive.
    for i in range(1, totalPage + 1):
        url = 'https://blog.csdn.net/h_j_c_123/article/list/%d' % i
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        code = response.status_code
        if code == 200:
            # %d placeholder: str + int concatenation would raise TypeError.
            print("循环状态码返回正常====>%d" % i)
            text = response.text
            html = BeautifulSoup(text, 'html.parser')
            rows = html.findAll('div', {'class': 'article-item-box'})
            for row in rows:
                resolverList(row)
# 解析列表
def resolverList(row):
    """Parse one article row (a BeautifulSoup Tag from the list page) and,
    when a detail URL is present, fetch the article detail page.

    Args:
        row: the 'article-item-box' div Tag for a single article.
    """
    titleNode = row.select('h4 > a')
    source_url = titleNode[0].attrs['href']  # detail-page URL
    originalNode = titleNode[0].select('span')[0]
    original = originalNode.string  # "原创" (original-post) badge text
    # Remove the badge node so it does not pollute the title text below.
    originalNode.clear()
    title = titleNode[0].text.strip()  # strip surrounding whitespace/newlines
    contentNode = row.select('p > a')[0]
    content = contentNode.text.strip()  # article summary
    # BUG FIX: the originals passed set literals like {'class', 'date'}
    # (comma instead of colon) where an attrs dict is intended; the
    # attribute filter must be {'class': 'date'} / {'class': 'read-num'}.
    publish_time = row.find('span', {'class': 'date'}).text.strip()  # publish time
    readNodes = row.findAll('span', {'class': 'read-num'})
    read_num = readNodes[0].text.strip()   # view count
    chat_num = readNodes[1].text.strip()   # comment count
    # PEP 8: compare with None using identity, not equality.
    if source_url is not None and source_url != "":
        getDetail(source_url)
    else:
        print("详情地址为空,不能抓取详情")
# 获取详情
def getDetail(source_url):
    """Fetch one article detail page and print its body text.

    Args:
        source_url (str): absolute URL of the article detail page.
    """
    print("获取详情数据=====>" + source_url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }
    response = requests.get(source_url, headers=headers)
    code = response.status_code
    if code == 200:
        text = response.text
        html = BeautifulSoup(text, 'html.parser')
        contentNode = html.find('div', {'id': 'article_content'})
        # BUG FIX: the original called .strip() directly on the Tag, but
        # Tag has no strip() method (AttributeError) — take .text first.
        # Also guard None: find() returns None when the div is missing.
        if contentNode is None:
            print("未找到正文节点")
            return
        content = contentNode.text.strip()
        print(content)
    else:
        print("详情数据返回失败")
# Script entry point: crawl the whole article list (which in turn fetches
# each article's detail page).
if __name__ == '__main__':
    getList()