Scraping the runoob.com programming tutorials with Python

runoob.com hosts a lot of solid beginner programming tutorials. They're quite good, so why not grab a copy for ourselves?



For this walkthrough we'll only scrape the tutorials listed under the server-side programming languages category.

Step 0: Environment setup

    # Create a Python virtual environment; I'm on Python 3.6.8, but other versions work too
    python3 -m venv venv
    source venv/bin/activate

    # Install requests, used to fetch pages and retrieve their HTML
    pip install requests

    # Install beautifulsoup4 (bs4), used to parse the fetched HTML
    pip install beautifulsoup4
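
If you want to double-check that both packages installed cleanly, a one-liner is enough (just a sanity check):

    python -c "import requests, bs4; print(requests.__version__, bs4.__version__)"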

Step 1: Fetch the homepage HTML

#!/usr/bin/env python
# coding: utf-8
# Gao Ming Ming Create At 2021-06-21 16:25:09
# Description: a first taste of web scraping
import requests
import os
from urllib import parse
from bs4 import BeautifulSoup

url = "https://www.runoob.com"

response = requests.get(url)
print(response.text)

This prints the entire HTML document. It's long and noisy, so I won't reproduce it here; a quick look at your own terminal shows the same thing.
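
If the response ever comes back empty or garbled, it's worth checking the status code and encoding before parsing. A minimal sketch using standard requests attributes:

response.raise_for_status()  # raise an exception on a 4xx/5xx response
response.encoding = response.apparent_encoding  # let requests guess the charset from the body
print(response.status_code, response.encoding)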



Step 2: Find the server-side language categories and their links


Right-click and Inspect: the links we want sit under a div with class cate3. We use bs4 to collect all the <a> tags under it. The code:

# Build the BeautifulSoup object
soup = BeautifulSoup(response.text, features="html.parser")
# Find every <a> tag under the <div> whose class is cate3
links = soup.find("div", class_="cate3").find_all("a")

# Pull each link's name and URL into a dict for the next steps
papers = dict()
for link in links:
    key = link.h4.string.strip().replace("】", "").replace("【", "")
    papers[key] = link['href']

At this point papers looks like this:

{
    '学习 Python': '//www.runoob.com/python3/python3-tutorial.html',
    '学习 Python2.x': '//www.runoob.com/python/python-tutorial.html',
    '学习 Linux': '//www.runoob.com/linux/linux-tutorial.html',
    '学习 Docker': '//www.runoob.com/docker/docker-tutorial.html',
    '学习 Ruby': '//www.runoob.com/ruby/ruby-tutorial.html',
    '学习 Java': '//www.runoob.com/java/java-tutorial.html',
    '学习 C': '//www.runoob.com/c/c-tutorial.html',
    '学习 C++': '//www.runoob.com/cplusplus/cpp-tutorial.html',
    '学习 Perl': '//www.runoob.com/perl/perl-tutorial.html',
    '学习 Servlet ': '//www.runoob.com/servlet/servlet-tutorial.html',
    '学习 JSP': '//www.runoob.com/jsp/jsp-tutorial.html',
    '学习 Lua': '//www.runoob.com/lua/lua-tutorial.html',
    '学习 Rust': '//www.runoob.com/rust/rust-tutorial.html',
    '学习 R': '//www.runoob.com/r/r-tutorial.html',
    '学习 Scala': '//www.runoob.com/scala/scala-tutorial.html',
    '学习 Go': '//www.runoob.com/go/go-tutorial.html',
    '学习 PHP': '//www.runoob.com/php/php-tutorial.html',
    '学习 Django': '//www.runoob.com/django/django-tutorial.html',
    '学习 Zookeeper': '//www.runoob.com/w3cnote/zookeeper-tutorial.html',
    '设计模式': '//www.runoob.com/design-pattern/design-pattern-tutorial.html',
    '正则表达式': '//www.runoob.com/regexp/regexp-tutorial.html',
    '学习 Maven': '//www.runoob.com/maven/maven-tutorial.html',
    '学习 NumPy': '//www.runoob.com/numpy/numpy-tutorial.html',
    '学习 Verilog': '//www.runoob.com/w3cnote/verilog-tutorial.html',
    'Verilog 进阶': '//www.runoob.com/w3cnote/verilog2-tutorial.html',
    '学习 ASP': '//www.runoob.com/asp/asp-tutorial.html',
    '学习 AppML': '//www.runoob.com/appml/appml-tutorial.html',
    '学习 VBScript': '//www.runoob.com/vbscript/vbscript-tutorial.html'
}
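
Note that every href is protocol-relative (it starts with //), so it can't be requested as-is. The loop in the next step prefixes "https:" by hand; urljoin from the already-imported urllib.parse does the same thing more generally:

full_url = parse.urljoin(url, papers['学习 Python'])
# -> 'https://www.runoob.com/python3/python3-tutorial.html'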

Step 3: Get each language's lesson list and links

for key, link in papers.items():

    # The hrefs are protocol-relative, so prepend the scheme
    response = requests.get("https:" + link)
    soup = BeautifulSoup(response.text, features="html.parser")

    key = key.replace("学习", "").strip()  # drop the "学习" (learn) prefix from the folder name

    left_menus = soup.find("div", id="leftcolumn").find_all("a")

For Python, the result is a list of <a> tags, one per lesson, each carrying the lesson's name and its link. It looks messy at first glance, but that's all it is.
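
To see it without the noise, peek at the first few entries (illustrative; the actual titles and hrefs come from the site):

for menu in left_menus[:5]:
    print(menu.string, menu['href'])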



Step 4: Download each page. We only want the tutorial body, which sits in the div with id content.


The code:

# Walk the lesson list we just collected
for index, menu in enumerate(left_menus):
    try:
        # Each lesson's title and (relative) address
        title, address = menu.string, menu['href']

        file_path = "{}/{}.{}.html".format(language_dir, index, title.strip().replace("/", ""))

        # fix python3 url problems: some hrefs are bare filenames
        if address.find("/") < 0:
            address = "python3/" + address

        # Build the absolute URL of the detail page
        detail_url = parse.urljoin(url, address.replace("///", ""))
        print(detail_url)
        response = requests.get(detail_url)
        # Use bs4 to pull out the main content block
        soup = BeautifulSoup(response.text, features="html.parser")
        content = soup.find("div", id="content")
        # Save the page
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content.prettify())
    except Exception as e:
        # Log pages that failed to download (note: if the failure happened
        # before detail_url was assigned, this logs a stale URL)
        with open("error.log", "a+", encoding="utf-8") as f:
            print(e)
            f.write(detail_url + str(e) + "\n")
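
One adjustment worth considering for a crawl this size is an explicit timeout and a short pause between requests, so one slow page can't hang the script and the server isn't hammered. A sketch (the 10s timeout and 0.5s delay are arbitrary choices, not anything runoob specifies):

import time

response = requests.get(detail_url, timeout=10)  # fail fast instead of hanging forever
time.sleep(0.5)  # brief pause between pages, easy on the server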

Full code:

#!/usr/bin/env python
# coding: utf-8
# Gao Ming Ming Create At 2021-06-21 16:25:09
# Description: a first taste of web scraping
import requests
import os
from urllib import parse
from bs4 import BeautifulSoup

url = "https://www.runoob.com"

response = requests.get(url)


soup = BeautifulSoup(response.text, features="html.parser")
links = soup.find("div", class_="cate3").find_all("a")

papers = dict()
for link in links:
    key = link.h4.string.strip().replace("】", "").replace("【", "")
    papers[key] = link['href']



base_dir = "runoob"
if not os.path.exists(base_dir):
    os.mkdir(base_dir)

for key, link in papers.items():

    response = requests.get("https:" + link)
    soup = BeautifulSoup(response.text, features="html.parser")

    key = key.replace("学习", "").strip()
    print("Downloading the {} tutorial...".format(key))

    # One folder per language
    language_dir = "{}/{}".format(base_dir, key)
    if not os.path.exists(language_dir):
        os.mkdir(language_dir)

    left_menus = soup.find("div", id="leftcolumn").find_all("a")

    for index, menu in enumerate(left_menus):
        try:
            title, address = menu.string, menu['href']

            file_path = "{}/{}.{}.html".format(language_dir, index, title.strip().replace("/", ""))

            # fix python3 url problems
            if address.find("/") < 0:
                address = "python3/" + address

            detail_url = parse.urljoin(url, address.replace("///", ""))
            print(detail_url)
            response = requests.get(detail_url)
            soup = BeautifulSoup(response.text, features="html.parser")
            content = soup.find("div", id="content")
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(content.prettify())
        except Exception as e:
            # Log pages that failed to download
            with open("error.log", "a+", encoding="utf-8") as f:
                print(e)
                f.write(detail_url + str(e) + "\n")

print("Done.")

Run it:
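
Assuming the script is saved as crawl_runoob.py (any name works), run it inside the virtualenv:

    source venv/bin/activate
    python crawl_runoob.py

As it runs, it prints each tutorial's name and every detail-page URL it fetches.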



The result: a runoob/ folder with one subfolder per language, and inside each subfolder the numbered .html pages of that tutorial.




Remember, this is for learning only. Don't use it for anything shady~
