Python爬取ppt工作项目模板

前言

本文介绍如何用 Python 爬取工作项目 PPT 模板(大约有一百多套),需要的小伙伴可以通过以下程序来下载!

(1)爬取程序

# author:爱分享的山哥
import requests
from bs4 import BeautifulSoup
import random
import os
import time


def getHeaders():
    """Build HTTP request headers with a randomly chosen browser User-Agent.

    Returns:
        dict: A one-entry mapping ``{'User-Agent': <string>}`` whose value is
        picked at random from the built-in list below.
    """
    # Every entry needs its own trailing comma: Python concatenates adjacent
    # string literals, so a missing comma silently merges two User-Agents into
    # one invalid value.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
    return {'User-Agent': random.choice(user_agent_list)}
# Make sure the download directory exists. exist_ok=True avoids the
# check-then-create race of testing os.path.exists() before os.mkdir().
os.makedirs('./PPT/', exist_ok=True)


headers = getHeaders()
# Seconds to wait on any single HTTP request, so a stalled connection cannot
# hang the crawl forever (requests applies no timeout by default).
REQUEST_TIMEOUT = 15
# Errors that mean one page/item could not be scraped: network failures, or an
# expected tag/attribute missing from the HTML (e.g. find() returned None).
SCRAPE_ERRORS = (requests.RequestException, AttributeError, TypeError, KeyError)

# Walk the paginated search results (page indices 0-9).
for i in range(0, 10):
    page_url = "http://www.51pptmoban.com/e/search/result/index.php?page={}&searchid=2194".format(i)
    try:
        res = requests.get(url=page_url, headers=headers, timeout=REQUEST_TIMEOUT).text
    except requests.RequestException:
        print('无法爬取%s!' % page_url)
        continue

    # Step 1: parse the listing page; each matched <dd> links to one template.
    soup = BeautifulSoup(res, 'lxml')
    for dd in soup.select('.pptlist > dl dd'):
        # Best available identifier for error messages; refined below as soon
        # as the detail URL and then the template title are known.
        label = page_url
        try:
            dowm_url = 'http://www.51pptmoban.com' + dd.div.a['href']
            label = dowm_url

            # Step 2: fetch the template's detail page.
            detail = requests.get(url=dowm_url, headers=headers, timeout=REQUEST_TIMEOUT)
            # Decode the page as GBK directly. This replaces the
            # encode("iso-8859-1").decode("gbk") round-trip, which only works
            # when requests happened to fall back to ISO-8859-1 and raises
            # UnicodeEncodeError otherwise.
            # NOTE(review): assumes the site serves GBK, as the previous
            # decode("gbk") did — confirm.
            detail.encoding = 'gbk'
            detail_soup = BeautifulSoup(detail.text, 'lxml')

            # Link to the intermediate "download" page.
            node = detail_soup.find('div', class_='ppt_xz')
            new_url = 'http://www.51pptmoban.com/' + node.a['href']

            # Template title, used as the output file name.
            title_div = detail_soup.find('div', class_='title')
            name = title_div.div.h1.get_text()
            label = name

            # Step 3: fetch the download page and extract the archive link.
            down_html = requests.get(url=new_url, headers=headers, timeout=REQUEST_TIMEOUT).text
            down_soup = BeautifulSoup(down_html, 'lxml')
            dowm = down_soup.find('div', class_='down')
            href = dowm.a['href']
            # Drop the first two '/'-separated segments of the href and append
            # the remainder to the fixed GetDown endpoint.
            dowm_rar_url = 'http://www.51pptmoban.com/e/DownSys/GetDown/' + ''.join(href.split('/')[2:])

            # Step 4: download the archive bytes.
            archive = requests.get(url=dowm_rar_url, headers=headers, timeout=REQUEST_TIMEOUT)
            # Do not save an HTTP error page as if it were the archive.
            archive.raise_for_status()
            ppt_data = archive.content
        except SCRAPE_ERRORS:
            # Skip this template but keep crawling the rest.
            print('无法爬取%s!' % label)
            continue

        path = './PPT/' + name + '.zip'
        try:
            with open(path, 'wb') as fp:
                fp.write(ppt_data)
        except OSError:
            # E.g. the title contains characters that are invalid in a file name.
            print('无法爬取%s!' % name)
            continue
        print("%s爬取完成!" % name)
        time.sleep(1)

    # Report the page that was actually finished (1-based), not always "first".
    print('第%d页爬取完成!' % (i + 1))
    time.sleep(5)  # slow down between pages to reduce the chance of being blocked

(2)爬取结果
Python爬取ppt工作项目模板_第1张图片

你可能感兴趣的:(爬虫项目,ppt模板,python爬虫,ppt爬取)