Python-urllib

urllib提供了一系列用于操作URL的功能。

Get

# Bypass certificate verification (demo only — ssl._create_unverified_context
# is a private helper that disables ALL cert checks; never do this in production).
context = ssl._create_unverified_context()
# Use a context manager so the HTTP response is always closed (the original
# leaked the connection by never calling response.close()).
with urllib.request.urlopen('https://bbs.csdn.net/topics/390978001', context=context) as response:
    read = response.read()  # body as raw bytes
    decode = read.decode('utf-8')  # decode bytes to text
    print(decode)
    print('code = ', response.getcode())  # HTTP status code
    print('url = ', response.geturl())  # final URL actually fetched
    print('info = ', response.info())  # response headers (not request headers)

Post+添加Head

url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc'
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'

# Form fields expected by the Youdao translate endpoint.
data = {
    "type": "AUTO",
    "i": 'Hello',
    "doctype": "json",
    "xmlVersion": "1.8",
    "keyfrom": "fanyi.web",
    "ue": "UTF-8",
    "action": "FY_BY_CLICKBUTTON",
    "typoResult": "true"
}
head = {
    'User-Agent': user_agent
}

# Both steps are required: urlencode builds the query string, encode turns it
# into the bytes urlopen expects for a POST body.
data = urllib.parse.urlencode(data).encode('utf-8')

# Passing a data argument makes urlopen send a POST request.
# Without headers:
# response = urllib.request.urlopen(url, data)

# Two ways to attach headers:
# 1. pass a dict to the Request constructor:
# request = urllib.request.Request(url, data, head)
# 2. add them one at a time:
request = urllib.request.Request(url, data)
request.add_header('User-Agent', user_agent)

# Context manager guarantees the response is closed (original leaked it).
with urllib.request.urlopen(request) as response:
    html = response.read().decode('utf-8')

j = json.loads(html)  # parse the JSON payload into Python objects
print(html)

代理

user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'

url = 'http://ip.tool.chinaz.com/'

# NOTE: these public proxy IPs may be stale and no longer reachable.
iplist = ['116.204.152.110:8080', '110.86.136.118:9999', '163.204.240.241:9999', '59.57.148.242:9999']
# Set up the proxy:
# 1. ProxyHandler takes a dict of {'scheme': 'proxy_ip:port'}
proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})
# 2. build a custom opener around that handler
opener = urllib.request.build_opener(proxy_support)

# opener.addheaders = [('User-Agent', user_agent)]
# 3. install it as the process-wide default opener so plain urlopen() uses it
urllib.request.install_opener(opener)

# Context manager guarantees the response is closed (original leaked it).
with urllib.request.urlopen(url) as response:
    html = response.read().decode('utf-8')
print(html)

下载图片

def url_open(url):
    """Fetch *url* with a desktop Chrome User-Agent and return the raw bytes."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    }
    request = urllib.request.Request(url, headers=headers)
    return urllib.request.urlopen(request).read()


def get_page(url):
    """Return the newest page number (as a string) scraped from *url*.

    The page embeds the value as: current-comment-page">[123]
    """
    html = url_open(url).decode('utf-8')
    # Offset 23 skips past 'current-comment-page">[' to the first digit.
    start = html.find('current-comment-page') + 23
    # The closing ']' marks the position just after the last digit.
    end = html.find(']', start)
    return html[start:end]


def find_imgs(url):
    """Scan *url*'s HTML for 'img src=' tags and return the .jpg addresses."""
    html = url_open(url).decode('utf-8')
    addrs = []
    pos = html.find('img src=')
    while pos != -1:
        # Only look for the '.jpg' extension within 255 chars of the tag.
        end = html.find('.jpg', pos, pos + 255)
        if end == -1:
            # Not a jpg — skip past 'img src=' and keep scanning.
            resume = pos + 9
        else:
            # pos + 9 skips 'img src="'; end + 4 includes '.jpg'.
            addrs.append(html[pos + 9:end + 4])
            resume = end
        pos = html.find('img src=', resume)
    return addrs


def save_img(img_addrs):
    """Download each image address and save it under its basename in the cwd."""
    for img in img_addrs:
        img_name = img.split('/')[-1]  # last '/'-separated component is the file name
        img_url = 'http:' + img  # addresses are protocol-relative; make them absolute
        print(img_url)
        # Fetch FIRST, open the file SECOND: the original opened the file
        # before downloading, so a failed fetch left an empty file behind.
        img_byte = url_open(img_url)
        with open(img_name, 'wb') as f:
            f.write(img_byte)


def download_mm(folder='OOXX', pages=3):
    """Download images from the newest *pages* pages into *folder*.

    Creates *folder* if needed; NOTE the os.chdir() into it persists after
    this function returns.
    """
    if not os.path.exists(folder):
        os.makedirs(folder)  # create the target directory

    # Show the working directory before and after the switch.
    retval = os.getcwd()
    print("当前工作目录为 %s" % retval)

    os.chdir(folder)

    retval = os.getcwd()
    print("目录修改成功 %s" % retval)

    url = 'http://jandan.net/ooxx/'

    page_num = int(get_page(url))
    print('page_num:', page_num)
    for i in range(pages):
        current_page = page_num - i
        # Bug fix: the original tested `page_num == 0`, which never changes
        # inside the loop, so nonsense URLs like 'page-0'/'page--1' could be
        # requested when pages > page_num. Stop once we run past page 1.
        if current_page < 1:
            break
        page_url = url + 'page-' + str(current_page) + '#comments'
        print(page_url)
        img_addrs = find_imgs(page_url)
        save_img(img_addrs)


# Script entry point: with defaults, downloads 3 pages of images into ./OOXX.
if __name__ == '__main__':
    download_mm()

你可能感兴趣的:(Python-urllib)