Python爬虫——爬取百度文库文章

爬取-百度文库中的文章

爬取大多数百度文库的文章或图片数据

创建了两个调用方法,分别获取文字和图片内容
输入文章的url链接尝试获取数据
使用set进行了简单去重
文档以doc格式,图片使用jpg格式进行保存

思路来自 https://www.52pojie.cn/thread-1012567-1-1.html
仅限学习使用!

import os
import re
import json
import requests
from urllib.request import urlretrieve


class BaiduWk:
    def __init__(self):
        self.list_info = []
        self.session = requests.session()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 \
            (KHTML, like Gecko) Chrome/80.0.3987.87 Mobile Safari/537.36'}

    # 获取网页源代码的数据
    def get_html(self, start_url):
        response = self.session.get(start_url, headers=self.headers)
        response.encoding = response.apparent_encoding
        return response.text

    # 获取文档标题, 提取请求参数
    def parse_html(self, data):     
        re_title = re.findall("'title': '(.*?)',", data)
        title = re_title[0] if re_title else re.findall('(.*?)', data)[0]
        params = {
            'bucketNum': re.findall(r'"bucketNum":(\d+)', data)[0],
            'md5sum': re.findall('md5sum=(.*?)&', data)[0],
            'sign': re.findall('sign=(.*?)&', data)[0],
            'rtcs_flag': re.findall('rtcs_flag=(.*?)&', data)[0],
            'rtcs_ver': re.findall('rtcs_ver=(.*?)&', data)[0],
            'rsign': re.findall('"rsign":"(.*?)"', data)[0], }
        # 提取页码列表
        page_range = re.findall(r'{"page":\d+,"range":"(.*?)"}', data)
        return params, page_range, title
	
	# 以页码列表依次迭代
    def words_data(self, params, page_range):
        pages = len(page_range) + 1
        url = r'https://wkrtcs.bdimg.com/rtcs/webapp'
        for i in range(1, pages):
            print(f'正在解析第{i}页数据,飞速读取中...')
            # 添加所需的页码信息
            params['pn'] = i
            params['range'] = page_range[i - 1]
            response = self.session.get(url, params=params).text
            yield response

    # 解析文章数据
    def get_words(self, response):
        pages = 1
        for data in response:
            # 转化为json数据
            a = data[5:-1]
            text = ''
            d = json.loads(a)
            # 提取 c键 的文本数据
            for j in d['document.xml']:
                for c in j['c']:
                    text += '\n'
                    for c2 in c['c']:
                        try:
                            text += c2['c'] + '\n'
                        except:
                            continue
            text += f'\n------------------------当前第{pages}页-------------------------\n'
            pages += 1
            self.list_info.append(text)

    # 保存文件
    def save_info(self, title, path):   
        os.makedirs('百度文库', exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            f.writelines(self.list_info)

    def get_img(self, start_url):
        print('开始尝试解析百度文库图片...\n')
        r = self.session.get(start_url)
        r.encoding = r.apparent_encoding
        title = re.findall("'title': '(.*?)'", r.text)[0]
        print(title)
        docId = re.findall("'docId': '(.*?)'", r.text)[0]
        totalPageNum = re.findall("'totalPageNum': '(.*?)'", r.text)[0]
        totalPageNum = int(totalPageNum) + 1

        return totalPageNum, title, docId

    def download_img(self, totalPageNum, title, docId):
        for pn in range(1, totalPageNum):
            params = {'doc_id': docId, 'pn': pn, 'rn': 1, 'type': 'ppt', }
            api_url = 'https://wenku.baidu.com/browse/getrequest'
            r = self.session.get(api_url, params=params, headers=self.headers)
            src = r.json()[0].get('zoom')
            os.makedirs(title, exist_ok=True)
            path = title + '/' + str(pn) + '.jpg'
            urlretrieve(src, path)
            print(f'正在提取第{pn}页,请稍等...')
    
    # 文章去重
    def set_word(self, path):
        word_set = list()
        with open(path, 'r', encoding='utf-8') as f:
            for each_line in f:
                word_set.append(each_line)
        result = list(set(word_set))
        result.sort(key=word_set.index)
        with open(path, 'w', encoding='utf-8') as f:
            f.writelines(result)
            print('done')

	# 获取文字内容
    def run_word(self):
        print('开始尝试解析百度文库页面...\n')
        start_url = input('输入百度文库中的连接:')
        print('running...\n')
        start_url = re.sub('wenku', 'wk', start_url)
        html = self.get_html(start_url)
        param, ranges, title = self.parse_html(html)
        print(f'当前文章:{title}\n')
        path = '百度文库/' + title + '.doc'
        response = self.words_data(param, ranges)
        self.get_words(response)
        self.save_info(title, path)
        self.set_word(path)
        print('done!!!')
        print('程序执行完毕!')

	# 获取图片数据
    def run_img(self):
        print('开始尝试解析百度文库图片信息...\n')
        start_url = input('输入百度文库中的连接:')
        print('running...\n')
        totalPageNum, title, docId = self.get_img(start_url)
        self.download_img(totalPageNum, title, docId)
        print('done!!!')
        print('程序执行完毕!')


if __name__ == '__main__':
    wk = BaiduWk()
    wk.run_word()
    # wk.run_img()

你可能感兴趣的:(python爬虫)