python爬虫学习:爬取CSDN 采用requests + lxml 进行xpath解析 (二)

之前讲了用 BeautifulSoup 解析页面,今天来看一下用 xpath 解析页面。首先需要安装 lxml 模块:pip install lxml

代码如下:

from lxml import etree
import requests


# 请求页数数据
def getPage():
    """Fetch the blog home page and derive how many article-list pages exist.

    The article count is read from the profile sidebar and divided by the
    page size (40, the value the pagination math in this script assumes),
    rounding up.

    Returns:
        int: number of list pages. 0 when the response is not HTTP 200, or
        when the article count cannot be located or parsed.
    """
    print("获取CSDN的页数数据!")
    page_size = 40
    url = 'https://blog.csdn.net/h_j_c_123'
    # Some sites answer with a status code but no body unless a browser-like
    # User-Agent header is sent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the crawler forever.
    response = requests.get(url, headers=headers, timeout=10)
    code = response.status_code
    if code != 200:
        # Use %d formatting: concatenating str + int raises TypeError.
        print("状态码返回不正常====>%d" % code)
        return 0

    print("总页数状态码返回正常")
    html = etree.HTML(response.text)
    # xpath() returns a list; it is empty when the sidebar markup differs.
    countNodes = html.xpath('//*[@id="asideProfile"]/div[2]/dl[1]/dt/a/span')
    if not countNodes:
        print("未找到文章总数节点")
        return 0
    try:
        # .text is None for an empty element, hence the `or ''` fallback.
        totalCount = int((countNodes[0].text or '').strip())
    except ValueError:
        print("文章总数不是数字")
        return 0

    # Ceiling division with integers only (`/` would yield a float).
    return (totalCount + page_size - 1) // page_size


# 获取列表数据

def getList():
    """Walk every article-list page and pass each row to resolverList.

    The number of pages comes from getPage(); pages 1..N are requested in
    order. A page that does not answer with HTTP 200 is skipped without any
    output.
    """
    print("获取列表数据")
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }
    last_page = getPage()
    for page_no in range(1, last_page + 1):
        page_url = 'https://blog.csdn.net/h_j_c_123/article/list/%d' % page_no
        reply = requests.get(page_url, headers=request_headers)
        if reply.status_code != 200:
            continue
        # A number cannot be appended to a string with `+`; the %d
        # placeholder formats it instead.
        print("循环状态码返回正常====>%d" % page_no)
        document = etree.HTML(reply.text)
        for entry in document.xpath('//*[@id="mainBox"]/main/div[2]/div'):
            resolverList(entry)


def resolverList(row):
    """Print the title of one list row and crawl the article it links to.

    Args:
        row: element for a single list entry; only its ``xpath`` method is
            used (getList passes lxml elements).

    A row without an ``h4/a`` link is reported and skipped rather than
    raising IndexError, so one unexpected row does not abort the crawl.
    """
    # xpath() always returns a list, so check it before indexing.
    titleNodes = row.xpath('h4/a')
    if not titleNodes:
        print("未找到标题节点,跳过该行")
        return
    titleNode = titleNodes[0]

    # '@href' selects the attribute value; the list is empty if it is absent.
    hrefs = titleNode.xpath('@href')
    source_url = hrefs[0] if hrefs else ''

    # text() yields only the link's own text nodes, so text inside the child
    # <span> (presumably the "original/repost" badge) is left out, whereas
    # string(.) would include it. strip() trims surrounding whitespace.
    title = ''.join(titleNode.xpath('text()')).strip()
    print(title)

    if source_url:
        getDetail(source_url)
    else:
        print("详情地址为空,不能抓取详情")



# 获取详情
def getDetail(source_url):
    """Download one article page and print the text of its body.

    Args:
        source_url: URL of the article detail page.

    Prints a failure message when the response is not HTTP 200 or when the
    page has no element with id ``article_content``.
    """
    print("获取详情数据=====>" + source_url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the crawler forever.
    response = requests.get(source_url, headers=headers, timeout=10)
    if response.status_code != 200:
        print("详情数据返回失败")
        return

    html = etree.HTML(response.text)
    # xpath() returns a list; it is empty when the article container is missing.
    contentNodes = html.xpath('//*[@id="article_content"]')
    if not contentNodes:
        print("未找到正文节点=====>" + source_url)
        return
    # string(.) concatenates the text of the node and all its descendants.
    print(contentNodes[0].xpath('string(.)'))


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    getList()

xpath 查询节点时返回的都是一个 list,所以需要通过索引下标取出其中的某个元素。把 html 文本解析成可查询对象的语句是 etree.HTML(text),使用前需要先引入 etree。当取出的文本是 list 形式时,有两种办法可以处理:第一种是循环 list 进行拼接,第二种是使用 string(.) 方法。取出属性的办法是 xpath('@href'),具体使用办法看代码。

你可能感兴趣的:(python,爬虫,lxml,requests,xpath,python,爬虫)