Xpath实战之爬取学习猿地的猿著(上)

爬取猿著(代码篇)

1、爬取地址:url = https://www.lmonkey.com/essence/
2、请求头池可参考这篇博客的分享:https://www.cnblogs.com/huangyuechujiu/p/12893982.html
3、代码部分:(爬取的数据写入json文件中)

import requests, json
from lxml import etree

# Wrap the scraper in a class so it is easier to manage
class Xp_test():
    """Scrape the article list of the lmonkey "essence" page into a JSON file.

    Instantiating the class runs the whole pipeline: download the page, save
    the raw HTML to ``./yq.html``, extract author/title/link records from it
    and dump them to ``filepath``.
    """

    # Address of the page to scrape.
    url = 'https://www.lmonkey.com/essence'
    headers = {
        'user-Agent': "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"
    }

    # Scraped records; replaced by a list of dicts once parth_data() has run.
    data = ''
    # Destination of the JSON output.
    filepath = './yq.json'

    def __init__(self):
        """Download the page and, on HTTP 200, parse it and write the JSON file.

        Raises:
            requests.exceptions.RequestException: if the request fails or
                does not complete within the timeout.
        """
        # Without a timeout a stalled connection would block the script forever.
        res = requests.get(url=self.url, headers=self.headers, timeout=10)
        if res.status_code != 200:
            # Any other status is ignored: nothing is saved, parsed or written.
            return

        # Keep a copy of the raw response on disk; parth_data() reads it back.
        with open('./yq.html', 'wb') as fp:
            fp.write(res.content)
        if self.parth_data():
            self.write_data()

    def parth_data(self):
        """Parse ``./yq.html`` and store the extracted records in ``self.data``.

        Each record is a dict with the keys ``author``, ``title`` and
        ``titleurl``.

        Returns:
            bool: True when at least one record was extracted, False when the
            page yielded nothing (so the caller can skip writing an empty file).
        """
        html = etree.parse('./yq.html', etree.HTMLParser())

        # Common prefix selecting every list entry inside the content area.
        entry = ('//div[contains(@class,"old_content")]'
                 '//div[contains(@class,"list-group-item-action")]')
        body = entry + '//div[contains(@class,"flex-fill")]'

        # Author names, titles and article links, each as a flat list.
        authors = html.xpath(entry + '//strong/a/text()')
        titles = html.xpath(body + '//div/text()')
        titleurl = html.xpath(body + '//a/@href')

        # zip() stops at the shortest list, so a page where the three queries
        # return different numbers of nodes no longer raises IndexError.
        # NOTE(review): the lists are paired purely by position; if an entry
        # contributes more than one link or text node the columns drift apart.
        # Confirm against the page markup.
        self.data = [
            {'author': author, 'title': title, 'titleurl': link}
            for author, title, link in zip(authors, titles, titleurl)
        ]
        return bool(self.data)

    def write_data(self):
        """Print ``self.data`` and dump it as UTF-8 JSON to ``self.filepath``."""
        print(self.data)
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        with open(self.filepath, 'w', encoding='utf-8') as fp:
            json.dump(self.data, fp, ensure_ascii=False)

# Instantiate the scraper; the constructor itself performs the request and,
# on success, the parsing and the JSON output.
Xp_test()

4、爬取结果:(yq.json)

[{"author": "xxyd_h5x", "title": "JetBrains开发工具正版授权领取", "titleurl": "https://www.lmonkey.com/t/lpLmQeKLg"}, 
{"author": "IT头条", "title": "面向回家编程!GitHub标星两万的”Python抢票教程”,我们先帮你跑了一遍", "titleurl": "https://www.lmonkey.com/t/lpLmQeKLg"}, 
{"author": "duke", "title": "Python教程-一文读懂运算和运算符", "titleurl": "https://www.lmonkey.com/t/lpLmQeKLg"}, 
{"author": "dragonsz", "title": "CentOS7 下使用 rsync+sersync 配置文件自动同步", "titleurl": "https://www.lmonkey.com/t/user/15"}, 
{"author": "qingqi", "title": "Python 教程-代码测试", "titleurl": "https://www.lmonkey.com/t/2zLAPzMyW"}, 
{"author": "jhxspy", "title": "Python教程-强制数据类型转换", "titleurl": "https://www.lmonkey.com/t/2zLAPzMyW"}, 
{"author": "xxyd_python", "title": "Python 教程-从变量开始", "titleurl": "https://www.lmonkey.com/t/2zLAPzMyW"}, 
{"author": "IT头条", "title": "Python 教程-Python 安装", "titleurl": "https://www.lmonkey.com/t/2zLAPzMyW"}, 
{"author": "IT头条", "title": "Python 教程-了解Python", "titleurl": "https://www.lmonkey.com/t/2zLAPzMyW"},
{"author": "GaiJoon", "title": "喊话 JavaScript 开发者:玩 DOM 也要专业范儿", "titleurl": "https://www.lmonkey.com/t/2zLAPzMyW"},
 {"author": "IT头条", "title": "1000 行 Python 代码脚本 bug,或影响上百篇学术论文", "titleurl": "https://www.lmonkey.com/t/2zLAPzMyW"}, 
 {"author": "IT头条", "title": "生产环境下的LAMP环境搭建", "titleurl": "https://www.lmonkey.com/t/user/168547"}, 
 {"author": "王炸", "title": "Golang语言的主要特性与发展的环境和影响因素", "titleurl": "https://www.lmonkey.com/t/G5yvRWXyp"},
 {"author": "王炸", "title": "分享 10 个有用的 Laravel 5.8 集合辅助方法", "titleurl": "https://www.lmonkey.com/t/G5yvRWXyp"}]

你可能感兴趣的:(Python,python,xpath)