Scraping Middle and High School Knowledge Points

For work I needed to scrape a large amount of middle and high school knowledge-point content from the web. I had never worked with crawlers before, so I adapted code found online and used it to extract the knowledge points for the relevant middle and high school subjects from the 51edu site. (Recorded for my own reference only; not recommended as a model.)

Website: http://www.51edu.com/

Tools used: requests + BeautifulSoup 4 + Google Chrome (for inspecting the pages)

Of the two libraries, requests is simple and convenient and is used to issue the HTTP requests; BeautifulSoup is used to extract data from the returned pages.
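
As a minimal sketch of how the two libraries fit together (fetching the site homepage and printing its <title> is purely for illustration, not part of the scraper):

import requests
from bs4 import BeautifulSoup

resp = requests.get("http://www.51edu.com/")           # issue the HTTP request
html = resp.content.decode('gb2312', errors='ignore')  # the site declares gb2312
soup = BeautifulSoup(html, 'html.parser')              # parse the HTML
print(soup.title.string)                               # pull out one piece of data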

The code follows. (It was written for a one-off task, so it is fairly rough and has not been optimized.)


# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup

# Fetch the knowledge-point text from a given page
def gethtml(url):
    """
    :param url: URL of the page to fetch
    :return: the extracted text content
    """
    req = requests.get(url=url)
    # Check the encoding declared inside the page itself
    encodings = requests.utils.get_encodings_from_content(req.text)
    # print(encodings)  # ['gb2312'] for this site
    content = req.content.decode('gb2312', errors='ignore')
    # print(content)
    # Locate the block that holds the content we want
    bf = BeautifulSoup(content, 'html.parser')
    texts = bf.find_all('div', class_='conL-1-2')
    try:
        # Strip spaces and newlines from the first matching div's text
        texts = texts[0].text.replace(' ', '').replace('\n', '')
    except IndexError:
        # Page without the expected div: fall through and return the raw result
        pass
    # print(texts)
    return texts
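

# Example call (the topic URL below is a made-up placeholder, shown only to
# illustrate the usage; real topic URLs come from geturl below):
#   text = gethtml("http://www.51edu.com/gaozhong/gaoyi/huaxue/zhishidian/12345.html")
#   print(text[:100])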


def geturl(web_link, server):
    """
    :param web_link: URL of the subject's listing page
    :param server: base URL of the site
    :return: dict mapping topic titles to their URLs
    """
    dic_url = {}
    # Link texts to skip: category/navigation links on the listing page (site text, kept verbatim)
    exclude_list = ["高一数学知识点", "高二数学知识点", "高三数学知识点", "高一英语知识点", "高二英语知识点", "高三英语知识点", "高一语文知识点", "高二语文知识点", "高三语文知识点", "初一数学知识点", "初二数学知识点",
                    "初三数学知识点", "初一英语知识点", "初二英语知识点", "初三英语知识点", "英语知识点", "初一语文知识点", "初二语文知识点", "初三语文知识点", "物理知识点", "化学知识点", "【详情】", "分享", "首页", "上一页", "尾页"]
    req = requests.get(url=web_link)
    content = req.content.decode('gb2312', errors='ignore')
    bf = BeautifulSoup(content, 'html.parser')
    div = bf.find_all('div', class_='lb-lt')
    # print(div[0])
    a = div[0].find_all('a')
    # The first five links are site navigation, so skip them
    for each in a[5:]:
        if each.string in exclude_list:
            continue
        full_url = server + each.get('href')
        title = each.string
        dic_url[title] = full_url
    print(dic_url)
    return dic_url
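
# geturl returns a title -> URL mapping roughly like this (values are placeholders):
#   {"某专题标题": "http://www.51edu.com/gaozhong/.../12345.html",
#    "下一页": "http://www.51edu.com/gaozhong/.../index_2.html"}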

def contents_save(file_path, content):
    """
    :param file_path: path of the file the scraped text is saved to
    :param content: scraped text content
    :return: None
    """
    with open(file_path, 'a', encoding="utf-8") as f:
        f.write(str(content))
        f.write('\n')
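
# Optional: pathlib builds the save path more robustly and can create the
# directory first (a sketch only, not part of the original script):
#   from pathlib import Path
#   path = Path(save_dir) / subject_file
#   path.parent.mkdir(parents=True, exist_ok=True)
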
# Check whether a next listing page exists; if so, the caller keeps paging through it
def judge_nextweb(dic_url):
    """
    :param dic_url: URLs of all topics on the current listing page
    :return: URL of the next listing page
    """
    for title in dic_url.keys():
        if title != "下一页":
            url = dic_url[title]
            print(title)
            # Scrape every extra page of the topic first, then the topic page itself
            in_dic_url = next_url_judge(url)
            for titles in in_dic_url:
                urls = in_dic_url[titles]
                content = gethtml(urls)
                contents_save(file_path, content)  # file_path is the global set in __main__
            content = gethtml(url)
            contents_save(file_path, content)
    # NOTE: raises KeyError when no "下一页" (next page) link exists, so the page
    # count in __main__ must not exceed the real number of listing pages
    return dic_url["下一页"]
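
# If a listing page might lack a "下一页" link, a guarded variant could return
# None instead of raising KeyError (sketch only; the caller would then have to
# check for None before continuing):
#   return dic_url.get("下一页")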

# Check whether a topic spans several pages; if so, collect each page's URL
def next_url_judge(in_url):
    """
    :param in_url: URL of the topic page
    :return: dict of page-number links if the topic is paginated, otherwise an empty dict
    """
    in_dic_url = {}
    # Page-number link labels "1".."19": links whose text matches these are the extra pages
    page_labels = [str(i) for i in range(1, 20)]
    req = requests.get(url=in_url)
    # Check the encoding declared inside the page itself
    encodings = requests.utils.get_encodings_from_content(req.text)
    # print(encodings)
    content = req.content.decode('gb2312', errors='ignore')
    # print(content)
    # Locate the pagination block
    bf = BeautifulSoup(content, 'html.parser')
    text = bf.find_all('div', id="pages")
    a = text[0].find_all('a')
    for each in a:
        if each.string in page_labels:
            full_url = server + each.get('href')  # server is the global set in __main__
            title = each.string
            in_dic_url[title] = full_url
    print(in_dic_url)
    return in_dic_url
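
# For a three-page topic, next_url_judge returns something like (placeholder URLs):
#   {"2": "http://www.51edu.com/.../index_2.html",
#    "3": "http://www.51edu.com/.../index_3.html"}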

if __name__ == '__main__':
    save_dir = "E:/实习/data/subject_dataset"
    subject_file = "senior_chemistry1.txt"
    file_path = save_dir + '/' + subject_file
    server = "http://www.51edu.com"
    web_link = "http://www.51edu.com/gaozhong/gaoyi/huaxue/zhishidian/"  # starting listing page to scrape
    web_transfer = web_link
    cal = 0
    while cal < 9:  # 9 = number of listing pages to scrape
        dic_url = geturl(web_transfer, server)
        web_transfer = judge_nextweb(dic_url)
        cal += 1
        print(cal)
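
One obvious improvement, absent from the original script, would be pausing between listing pages so the site is not hit too quickly. A minimal sketch of the same main loop with a delay added:

import time

while cal < 9:
    dic_url = geturl(web_transfer, server)
    web_transfer = judge_nextweb(dic_url)
    cal += 1
    time.sleep(1)  # be polite: wait a second between listing pages
    print(cal)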
