python之简单的爬取下载

文章目录

  • 一、简单的爬取
  • 二、配合多线程
    • 1. 超时汇报的
    • 2. 一直等待汇报进度的


一、简单的爬取

python之简单的爬取下载_第1张图片

# FIX: `import urllib` alone does not guarantee the `urllib.request`
# submodule is importable as an attribute; import it explicitly.
import urllib.request

import requests
from bs4 import BeautifulSoup

# Mimic a normal browser's request header (empty here; uncomment to spoof).
headers = {
    # 'user-agent': 'Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 81.0.4044.122Safari / 537.36'
}

# URL of the page to fetch. It must carry the http:// or https:// scheme;
# without a scheme it would be treated as a local file path.
url = 'http://www.baidu.com'

# Build the request object.
request = urllib.request.Request(url, headers=headers)

# Send it; the return value is a response object.
response = urllib.request.urlopen(request)

# HTTP status of the response.
print('[response.status]', response.status)

# Raw body as bytes.
html = response.read()

# Decode bytes to str (UTF-8 by default).
html_str = html.decode()

# print(html_str)


# Parse the page into a BeautifulSoup document tree.
bs = BeautifulSoup(html_str, 'html.parser')

print(bs.__class__)

# Dump every <img> tag...
for item in bs.find_all('img'):
    print(item)

print('-----------')
# ...and just their src attributes.
for item in bs.find_all('img'):
    print(item.get('src'))


# Download a single image file.
# NOTE(review): this URL looks mangled (likely e-mail obfuscation applied by
# the page this snippet was copied from) — verify it before relying on it.
response_file = requests.get('http://ss.bdimg.com/static/superman/img/topnav/[email protected]')

# FIX: only create/write the local file on success. Previously the file was
# opened (and truncated) before checking the status code, leaving an empty
# test.png behind on a 404.
if response_file.status_code == 404:
    print(['response_file.status_code'], response_file.status_code)
else:
    with open('test.png', 'wb') as fp:
        fp.write(response_file.content)

二、配合多线程

1. 超时汇报的

import requests
import threading
import time

# 要下载的链接前缀
# Prefix shared by every download link.
baseUrl = 'https://ai-ecu.github.io/Xidian/'

# Suffixes appended to the prefix, one per file to download.
# One way to generate ascending numeric suffixes instead:
# list = ['%03d' % (i,) + '.ts' for i in range(range_a, range_b)]
list = ['lecture3.pdf', 'lecture4.pdf', 'lectures_5and6.pdf', 'lectures_7and8.pdf']

# Per-file success flag, keyed by suffix: 0 = failed, 1 = succeeded.
map = dict.fromkeys(list, 0)


# 多线程下载
# Worker thread: downloads one file and records its success in the shared map.
class myThread(threading.Thread):
    def __init__(self, threadID, name, item):
        # threadID: numeric id for logging; name: thread name;
        # item: file-name suffix appended to baseUrl.
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.item = item

    def run(self):
        headers = {'Proxy-Connection': 'keep-alive'}
        response_file = requests.get(baseUrl + self.item, headers=headers)
        print('[url]', baseUrl + self.item)

        # Save location: ./video/ (directory must already exist)
        with open('./video/' + self.item, 'wb') as fp:
            # Check the status code first
            if response_file.status_code == 404:
                print(['response_file.status_code'], response_file.status_code, self.item)
            # Write the payload
            else:
                fp.write(response_file.content)
                # BUG FIX: was `map[item] = 1`, which read the module-level
                # loop variable `item` (whatever it happened to be when this
                # thread got here) instead of this thread's own file, so the
                # success flag went to the wrong key or raised KeyError.
                map[self.item] = 1
                print('[download]', self.item)


if __name__ == '__main__':
    # Create one thread per file and start them all.
    i = 0
    threadList = []
    for item in list:
        i += 1
        thread = myThread(i, item, item)
        thread.start()
        threadList.append(thread)
        print('[thread]', i)
    print('[-----------------all thread have started--------------]')

    # Main thread waits for each worker before proceeding.
    # NOTE(review): `item` is rebound by this loop; myThread.run() reads the
    # module-level `item` when flagging success, so these two loops interact
    # with the worker threads in a racy way — verify before reuse.
    for item in threadList:
        # Timeout: a worker still running after this many seconds (here 60)
        # is treated as failed — join() simply returns and we move on.
        item.join(60)
    print('[-----------------all thread have joined--------------]')

    # Report files flagged as downloaded successfully.
    print('-------success---------')
    for i in list:
        if map[i] == 1:
            print(i)

    # Report files still flagged as failed.
    print('-------failure---------')
    for i in list:
        if map[i] == 0:
            print(i)

2. 一直等待汇报进度的

import requests
import threading
import time

# 要下载的链接前缀
# Prefix shared by every download link.
baseUrl = 'https://ai-ecu.github.io/Xidian/'

# Suffixes appended to the prefix, one per file to download.
# One way to generate ascending numeric suffixes instead:
# list = ['%03d' % (i,) + '.ts' for i in range(range_a, range_b)]
list = ['lectures_5and6.pdf', 'lectures_7and8.pdf']

# Running count of files downloaded so far (shared progress counter).
processCount = 0


# 多线程下载
# Worker thread: downloads one file and bumps the shared progress counter.
class myThread(threading.Thread):
    # FIX: guards processCount — `+=` on a shared global is a non-atomic
    # read-modify-write, so concurrent threads could lose increments.
    _progress_lock = threading.Lock()

    def __init__(self, threadID, name, item):
        # threadID: numeric id for logging; name: thread name;
        # item: file-name suffix appended to baseUrl.
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.item = item

    def run(self):
        headers = {'Proxy-Connection': 'keep-alive'}
        response_file = requests.get(baseUrl + self.item, headers=headers)
        print('[url]', baseUrl + self.item)

        # Save location: ./video/ (directory must already exist)
        with open('./video/' + self.item, 'wb') as fp:
            # Check the status code first
            if response_file.status_code == 404:
                print(['response_file.status_code'], response_file.status_code, self.item)
            # Write the payload
            else:
                fp.write(response_file.content)
                # Count the success under the lock and snapshot the value so
                # the progress line printed below is internally consistent.
                global processCount
                with myThread._progress_lock:
                    processCount += 1
                    done = processCount
                print('[download]', self.item, '[process]', str(done) + '/' + str(len(list)))


if __name__ == '__main__':
    # Spawn one download thread per file, starting each immediately.
    workers = []
    for idx, filename in enumerate(list, start=1):
        worker = myThread(idx, filename, filename)
        worker.start()
        workers.append(worker)
        print('[thread-start]', idx)
    print('[-----------------all thread have started--------------]')

    # Block until every worker has finished (no timeout), logging each join
    # so the console shows progress as threads complete.
    for worker in workers:
        worker.join()
        print('[thread-join]', worker.name)
    print('[-----------------all thread have joined--------------]')


你可能感兴趣的:(python库)