01_urllib

1. Basic use of urllib

import urllib.request

#(1) Define the URL, i.e. the address we want to visit
url = 'http://www.baidu.com'

#(2) Simulate a browser sending a request to the server; response holds the server's reply
response = urllib.request.urlopen(url)

#(3) Get the page source from the response ("content" = the body)
# The read method returns the raw body as bytes (printed with a b prefix)
# To turn the bytes into a string, decode them: bytes -> str via decode('encoding')
content = response.read().decode('utf-8')

#(4) Print the data
print(content)
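
The response object also supports the context-manager protocol, so the same request can be written so that the connection is closed automatically. A minimal sketch, using nothing beyond the standard library:

import urllib.request

url = 'http://www.baidu.com'
# Using the response as a context manager closes the connection when the block exits
with urllib.request.urlopen(url) as response:
    content = response.read().decode('utf-8')
print(content[:200])   # print only the first 200 characters as a sanity check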

2. urllib: one type and six methods

import urllib.request
url = 'http://www.baidu.com'
response = urllib.request.urlopen(url)

# The response is an HTTPResponse object
# content = type(response)
# print(content)

# read() returns the entire body as bytes
# content = response.read()
# print(content)

# read(n) returns at most n bytes
# content = response.read(5)
# print(content)

# readline() reads a single line
# content = response.readline()
# print(content)

# readlines() reads all remaining lines into a list
# content = response.readlines()
# print(content)

# getcode() returns the HTTP status code
# print(response.getcode())

# geturl() returns the URL that was fetched
# print(response.geturl())

# getheaders() returns the response headers as (name, value) pairs
# print(response.getheaders())
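
Note that the read-style calls above all consume the response body, so they cannot all be run against a single response object. A small runnable sketch that separates the metadata calls (which do not consume the body) from the body-reading calls:

import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')

# Metadata calls -- these do not consume the body
print(type(response))             # <class 'http.client.HTTPResponse'>
print(response.getcode())         # 200 on success
print(response.geturl())          # the URL that was actually fetched
print(response.getheaders()[:3])  # first few (name, value) header pairs

# Body-reading calls -- each continues where the previous one stopped
print(response.read(5))           # the first 5 bytes
print(response.readline())        # the rest of the first line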

3. urllib: downloading files

import urllib.request

# Download a web page
# url_page = 'http://www.baidu.com'

# urlretrieve(url, filename): url is the address to download, filename names the saved file
# In Python, either a variable or a literal value can be passed as the argument
# urllib.request.urlretrieve(url_page,'baidu.html')

# Download an image
# url_img = 'https://gimg2.baidu.com/image_search/src=http%3A%2F%2Fc-ssl.duitang.com%2Fuploads%2Fitem%2F202005%2F22%2F20200522131408_obupx.jpeg&refer=http%3A%2F%2Fc-ssl.duitang.com&app=2002&size=f9999,10000&q=a80&n=0&g=0n&fmt=auto?sec=1655520388&t=d7f6d94a25d1d6e3cdbaa03a87fb1c59'
# urllib.request.urlretrieve(url=url_img,filename='lisa.jpg')

# Download a video
url_video = 'https://vd4.bdstatic.com/mda-kfsm974k6xwx9y8j/v1-cae/sc/mda-kfsm974k6xwx9y8j.mp4?v_from_s=hkapp-haokan-nanjing&auth_key=1652930638-0-0-dce2ef63e5e2c4d4cbafeea2dd68cff1&bcevod_channel=searchbox_feed&pd=1&cd=0&pt=3&logid=3238681392&vid=2292765971694816622&abtest=102148_2-17451_1&klogid=3238681392'
urllib.request.urlretrieve(url_video,'news.mp4')
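
The Python docs flag urlretrieve as a legacy interface that may be deprecated, so it is worth knowing the equivalent spelled out with urlopen plus a binary write. A sketch (download is a helper name chosen here, not a urllib function):

import urllib.request

def download(url, filename):
    # Stream the response body into a local file -- same effect as urlretrieve
    with urllib.request.urlopen(url) as response, open(filename, 'wb') as fp:
        fp.write(response.read())

# download(url_img, 'lisa.jpg')   # e.g. with the image URL above uncommented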

4. urllib: customizing the request object

import urllib.request
url = 'https://www.baidu.com'

# Anatomy of a URL
# https://www.baidu.com/s?ie=UTF-8&wd=周杰伦
# http/https    www.baidu.com       80/443       s      ie=UTF-8&wd=周杰伦       #
# scheme             host            port       path        query              fragment
# Common default ports:
# http    80
# https   443
# mysql   3306
# oracle  1521
# redis   6379
# mongodb 27017
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}   # a plain dict
# urlopen() cannot take a dict, so the headers cannot be passed to it directly
# Customize a Request object instead
# Note: Request's positional order is (url, data, headers) -- data sits in between --
# so url and headers must be passed as keyword arguments
request = urllib.request.Request(url = url,headers = headers)

response = urllib.request.urlopen(request)
content = response.read().decode('utf8')
print(content)
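
The anatomy described in the comments can be verified with urllib.parse.urlparse, which splits a URL into exactly these components:

from urllib.parse import urlparse

parts = urlparse('https://www.baidu.com/s?ie=UTF-8&wd=周杰伦')
print(parts.scheme)    # https
print(parts.netloc)    # www.baidu.com (an explicit port would appear here too)
print(parts.path)      # /s
print(parts.query)     # ie=UTF-8&wd=周杰伦
print(parts.fragment)  # '' -- this URL has no anchor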

5. GET requests

(1) The quote method: percent-encoding a single parameter
# https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6
# %E5%91%A8%E6%9D%B0%E4%BC%A6 is 周杰伦 percent-encoded (the UTF-8 bytes, URL-escaped)
# Goal: fetch the page source of https://www.baidu.com/s?wd=周杰伦

import urllib.request
import urllib.parse

url = 'https://www.baidu.com/s?wd='
# Customizing the request object is the first counter-measure against anti-scraping checks
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}

# Percent-encode the three characters 周杰伦
# This relies on urllib.parse
name = urllib.parse.quote('周杰伦')

url = url + name


# Customize the request object
request = urllib.request.Request(url=url,headers=headers)

# Simulate a browser sending the request to the server
response = urllib.request.urlopen(request)

content = response.read().decode('utf-8')

print(content)
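
A quick round-trip shows what quote actually produces: it percent-encodes the UTF-8 bytes of the string, and unquote reverses the process:

import urllib.parse

encoded = urllib.parse.quote('周杰伦')
print(encoded)                         # %E5%91%A8%E6%9D%B0%E4%BC%A6
print(urllib.parse.unquote(encoded))   # 周杰伦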
(2) The urlencode method: for multiple parameters
# urlencode's use case: several parameters at once
# https://www.baidu.com/s?wd=周杰伦&sex=男

# import urllib.parse
# data = {
#     'wd':'周杰伦',
#     'sex':'男',
#     'location':'中国台湾省'
# }
# a = urllib.parse.urlencode(data)
# print(a)

# Goal: fetch the page source of https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7&location=%E5%8F%B0%E6%B9%BE%E7%9C%81

import urllib.request
import urllib.parse
base_url = 'https://www.baidu.com/s?'

data = {
    'wd': '周杰伦',
    'sex': '男',
    'location': '中国台湾省'
}

new_data = urllib.parse.urlencode(data)
# print(new_data)

# Build the full request URL
url = base_url + new_data
print(url)
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
# Customize the request object
request = urllib.request.Request(url=url,headers=headers)
# Simulate a browser sending the request to the server
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
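
urlencode also has an inverse, urllib.parse.parse_qs, which is handy for double-checking what a query string actually contains:

import urllib.parse

query = urllib.parse.urlencode({'wd': '周杰伦', 'sex': '男'})
print(query)                         # wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7
print(urllib.parse.parse_qs(query))  # {'wd': ['周杰伦'], 'sex': ['男']}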

6. POST requests

(1) Requesting Baidu Translate (the sug endpoint)
import urllib.request
import urllib.parse

url = 'https://fanyi.baidu.com/sug'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
data = {
    'kw':'spider',
}
# POST parameters must be urlencoded and then encoded to bytes
data = urllib.parse.urlencode(data).encode('utf-8')

# POST parameters are not appended to the URL; they go in the data argument of the Request
# POST parameters must be encoded to bytes
request = urllib.request.Request(url=url,data=data,headers=headers)
# Simulate a browser sending the request to the server
response = urllib.request.urlopen(request)
# Get the response data
content = response.read().decode('utf-8')
print(content)
print(type(content))

import json
obj = json.loads(content)
print(obj)

"""
总结:
1、post请求方式的参数必须编码 data = urllib.parse.urlencode(data)
2、编码之后必须调用encode的方法data=urllib.parse.erlencode(data).encode('utf-8')
3、参数是放在请求对象定制的方法中 :request = urllib.request.Request(url=url,data=data,headers=headers)

"""
(2) Requesting Baidu Translate in detail (the v2transapi endpoint)
import urllib.request
import urllib.parse
url =  'https://fanyi.baidu.com/v2transapi?from=en&to=zh'
headers = {   # all headers except Cookie are commented out -- the Cookie alone turns out to be enough here
    # 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36',
    # 'Accept':'*/*',
    # 'Accept-Encoding': 'gzip, deflate, br',
    # 'Accept-Language': 'zh-CN,zh;q=0.9',
    # 'Connection': 'keep-alive',
    # 'Content-Length': '136',
    # 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': 'BIDUPSID=9C33D9AF842C65882B22D090B46042A7; PSTM=1610940896; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; __yjs_duid=1_0eb9831c9922fbeed75979b5e0cfd7231619776633784; BAIDUID=9139193BF8529188B5B5F4741D0FD50D:FG=1; APPGUIDE_10_0_2=1; BDSFRCVID_BFESS=tP8OJeC62mn-gd3DoJHSKm8DOg3F4AQTH6aoOpQpj2wA39Ni7uPrEG0PHU8g0KubT5mFogKKy2OTH9DF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=tbuJ_KDyJKP3fP36qR6VMPIHqxby26nL3jn9aJ5nJDoCVnTojTJUXfk_jNoThlvMtDCj2qkaQpP-HJ7yM-5HbfPwbbbTJbcz3CrPKl0MLPOYbb0xynoDLT0B5xnMBMnGamOnanra3fAKftnOM46JehL3346-35543bRTLnLy5KJtMDFljTu2DjQyeU5eetjK2CntsJOOaCvVHlQOy4oWK441Dh7MQt6R36chWqvEfp-WDqvoD-Jc3M04X-o9-hvT-54e2p3FBUQJHC33Qft20b0m3gFq3q8La5bD-R7jWhvBhl72y5rUQlRX5q79atTMfNTJ-qcH0KQpsIJM5-DWbT8IjHCDt5FjtRIeV-35b5rfjJrT5-QSMICShUFsWlOCB2Q-XPoO3KJWeUokQMo-b4CUhH7RqPriWbRM2MbgylRpjM3K0462XjKBKPOpK-DfX2TxoUJ2XM0VMloMqtnWKqLebPRih6j9Qg-8KpQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0hC09j68MjjoM5h5K5-LXJR6BWb_8Kb7VbPbMeMnkbfJBDxcXe4bbt2nyaqnRWbCWsh7TDUDMLq_7yajK2MRjL6RBot3ptKTFjD3G0b5pQT8ryb_OK5OibCrn_R6Iab3vOPI4XpO1ef0zBN5thURB2DkO-4bCWJ5TMl5jDh3Mb6ksD-FtqtJHKbDt_II-JfK; BDUSS=NOUXhyMjAwblBqV2NpfkF0dU9Ca1Z3cGNhOTh0R0lBMHU1TWl-bjU0ck1aNUJpRVFBQUFBJCQAAAAAAAAAAAEAAAAe0iJm0ru49tXm1qqw9AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMzaaGLM2mhiV3; BDUSS_BFESS=NOUXhyMjAwblBqV2NpfkF0dU9Ca1Z3cGNhOTh0R0lBMHU1TWl-bjU0ck1aNUJpRVFBQUFBJCQAAAAAAAAAAAEAAAAe0iJm0ru49tXm1qqw9AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMzaaGLM2mhiV3; BAIDUID_BFESS=875209CD6EF1B6A11342D5227CD66391:FG=1; delPer=0; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1650515716,1650872384,1651827471,1652925735; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; av1_switch_v3=0; PSINO=7; BA_HECTOR=a4210hal8la101ag821h8brmv0r; RT="z=1&dm=baidu.com&si=6304f9glu2&ss=l3cf5iav&sl=0&tt=0&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=46b&ul=c3gs0&hd=c3gvq"; H_PS_PSSID=36425_36367_34812_35914_36166_34584_35978_36055_26350_36315_36447; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1652952606; ab_sr=1.0.1_ZWI2ZWRiZWI1Y2JiODRlYWRjZTlkNThjZjAyNDI5NDFhZDg1MGFmN2VkYzI1ZmVlMGIxYjVmZGNhOTEyOWU5ODJlMDI5ZDUxYjc2ODJhOGRlZWZmMzk1NmZjM2U2NDVjNWE3YWIwNjVkOTQ3Zjc2Mjk3Nzg0ZmQ3MmZmNmJlODkzMTQzMjE4OWZiMWExODJiMzZlOTA2NGJkMTkxZmNhZDZjNWQxMzgwZWE1YmM4ZjI5OGViNzM0Y2EyNmRjMjVl',
    # 'Host': 'fanyi.baidu.com',
    # 'Origin': 'https://fanyi.baidu.com',
    # 'Referer': 'https://fanyi.baidu.com/translate?aldtype=16047&query=&keyfrom=baidu&smartresult=dict&lang=auto2zh',
    # 'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="101", "Google Chrome";v="101"',
    # 'sec-ch-ua-mobile': '?0',
    # 'sec-ch-ua-platform': '"Windows"',
    # 'Sec-Fetch-Dest': 'empty',
    # 'Sec-Fetch-Mode': 'cors',
    # 'Sec-Fetch-Site': 'same-origin',
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36',
    # 'X-Requested-With': 'XMLHttpRequest'
}
data = {
    'from': 'en',
    'to': 'zh',
    'query': 'spider',
    'transtype': 'realtime',
    'simple_means_flag': '3',
    'sign': '63766.268839',
    'token': '92389bc1e4d32b64ec36f56fb41f03db',
    'domain': 'common'
}

# POST parameters must be urlencoded and then byte-encoded with encode()
data = urllib.parse.urlencode(data).encode('utf-8')

request = urllib.request.Request(url=url,data=data,headers=headers)

response = urllib.request.urlopen(request)

content = response.read().decode('utf-8')

import json
obj = json.loads(content)
print(obj)
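
Hard-coding a live Cookie is fragile: it expires and ties the script to one browser session. urllib can capture cookies automatically with http.cookiejar; a sketch of the mechanism (not a drop-in replacement here, since v2transapi also validates the sign and token fields):

import urllib.request
import http.cookiejar

# An opener that stores Set-Cookie headers and sends them back on later requests
cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))

opener.open('https://fanyi.baidu.com/')   # the first visit populates the jar
for cookie in cookie_jar:
    print(cookie.name, cookie.value[:20])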

7. AJAX GET request: Douban movies, page one


# GET request
# Fetch the first page of Douban movie data and save it locally

import urllib.request
url = 'https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start=0&genres=%E5%96%9C%E5%89%A7'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
#(1) Customize the request object
request = urllib.request.Request(url=url,headers=headers)
#(2) Get the response data
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
#(3) Save the data locally
# open() defaults to the platform encoding (gbk on Chinese Windows); to save Chinese
# text correctly, pass encoding='utf-8' explicitly

# fp = open('douban.json','w',encoding='utf-8')
# fp.write(content)

with open('douban1.json','w',encoding='utf-8') as fp:
    fp.write(content)
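
Since the endpoint returns JSON, a quick validation catches the case where Douban returns an error page instead. A sketch; the 'data' key is assumed from the observed response shape:

import json

try:
    movies = json.loads(content)
    print('got', len(movies.get('data', [])), 'entries')   # 'data' key assumed
except json.JSONDecodeError:
    print('response was not valid JSON -- the request may have been blocked')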

8. AJAX GET request: the first 10 pages of Douban movies

# Page 1: https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start=0&genres=%E5%96%9C%E5%89%A7
# Page 2: https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start=20&genres=%E5%96%9C%E5%89%A7
# Page 3: https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start=40&genres=%E5%96%9C%E5%89%A7
# Page 5: https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start=80&genres=%E5%96%9C%E5%89%A7
# Pattern: page n uses start=(n-1)*20

# Download the first ten pages:
#(1) customize the request object
#(2) get the response data
#(3) save the data
import urllib.parse
import urllib.request
def create_request(page):
    base_url = 'https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&'
    data = {
        'start':(page - 1) * 20,
        'limit':20
    }
    data = urllib.parse.urlencode(data)
    url = base_url + data
    print(url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
    }
    request = urllib.request.Request(url=url,headers=headers)
    return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def down_load(page,content):
    with open('douban_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
        fp.write(content)


if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))

    for page in range(start_page,end_page+1):
        # Each page gets its own customized request object
        request = create_request(page)
        # Get the response data
        content = get_content(request)
        # Save it
        down_load(page,content)
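
The only subtle part is the page-to-offset arithmetic: page n maps to start=(n-1)*20, which a two-line check confirms:

for page in range(1, 5):
    print('page', page, '-> start =', (page - 1) * 20)
# page 1 -> start = 0
# page 2 -> start = 20
# page 3 -> start = 40
# page 4 -> start = 60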

9. AJAX POST request: the KFC store locator

# Page 1
# http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword
# POST form data:
# cname:
# pid:
# keyword: 北京
# pageIndex: 1
# pageSize: 10

# Page 2
# http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword
# POST form data:
# cname:
# pid:
# keyword: 北京
# pageIndex: 2
# pageSize: 10

import urllib.request
import urllib.parse
# base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
def create_request(page):
    base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    data = {
        'cname':'',
        'pid':'',
        'keyword': '北京',
        'pageIndex': page,
        'pageSize': '10'
    }
    data = urllib.parse.urlencode(data).encode('utf-8')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
    }

    request = urllib.request.Request(url=base_url,data=data,headers=headers)
    return request
def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def down_load(page,content):
    with open('kfc_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
        fp.write(content)

if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page,end_page+1):
        # Customize the request object
        request = create_request(page)
        # Get the page source
        content = get_content(request)
        # Save it
        down_load(page,content)

10. urllib exceptions

import urllib.request
import urllib.error
# url = 'https://blog.csdn.net/JHXL_/article/details/1246767841'
url = 'http://www.goudan111.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
try:
    request = urllib.request.Request(url=url,headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    print(content)
except urllib.error.HTTPError:
    # HTTPError is a subclass of URLError, so it must be caught first
    print('The system is being upgraded..')
except urllib.error.URLError:
    print('I already said: the system is being upgraded...')
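
The exception objects carry useful detail beyond their type: HTTPError has the status code, and URLError has a reason. A sketch printing both (same unreachable host as above):

import urllib.request
import urllib.error

try:
    urllib.request.urlopen('http://www.goudan111.com')
except urllib.error.HTTPError as e:
    print('HTTP error, status code:', e.code)   # e.g. 404
except urllib.error.URLError as e:
    print('URL error, reason:', e.reason)       # e.g. a DNS failure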

11. Basic use of handlers

import urllib.request
url = 'http://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}

request = urllib.request.Request(url=url,headers=headers)
# (1) Create a handler object
handler = urllib.request.HTTPHandler()

# (2) Build an opener from the handler
opener = urllib.request.build_opener(handler)

# (3) Call the opener's open method
response = opener.open(request)

content = response.read().decode('utf-8')
print(content)
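
For a plain request this is no different from urlopen; one concrete payoff of the handler/opener split is that HTTPHandler accepts a debuglevel parameter, and setting it to 1 prints the raw HTTP traffic:

import urllib.request

# debuglevel=1 makes the handler dump request/response headers to stdout
handler = urllib.request.HTTPHandler(debuglevel=1)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')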

12. Proxies

import urllib.request
url = 'https://www.baidu.com/s?ie=UTF-8&wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
# Customize the request object
request = urllib.request.Request(url=url,headers=headers)
# Simulate a browser visiting the server
# response = urllib.request.urlopen(request)

# A proxy we found online; paid proxies are also an option
proxies = {
    'http':'118.24.219.151:16817'
}
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
# Get the response data
content = response.read().decode('utf-8')

with open('daili.html','w',encoding='utf-8') as fp:
    fp.write(content)
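
If every request in the program should go through the proxy, the opener can be installed globally, after which plain urlopen routes through it too:

import urllib.request

proxies = {'http':'118.24.219.151:16817'}   # same sample proxy as above
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
urllib.request.install_opener(opener)   # from now on urlopen uses the proxy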

13. A proxy pool

import random
import urllib.request

# Sample pool (the entries here happen to be identical; a real pool would list different proxies)
proxies_pool = [
    {'http':'14.215.212.37:9168'},
    {'http':'14.215.212.37:9168'},
    {'http':'14.215.212.37:9168'}
]

# Pick one proxy at random for this request
proxies = random.choice(proxies_pool)

url = 'https://www.baidu.com/s?ie=UTF-8&wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}

request = urllib.request.Request(url=url,headers=headers)
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')

with open('dailichi.html','w',encoding='utf-8') as fp:
    fp.write(content)
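
random.choice picks one proxy per run; a natural extension (a sketch built on the same pool) retries with another proxy when the chosen one fails:

import random
import urllib.error
import urllib.request

def open_with_pool(request, pool, attempts=3):
    # Try up to `attempts` randomly chosen proxies before giving up
    for _ in range(attempts):
        handler = urllib.request.ProxyHandler(proxies=random.choice(pool))
        opener = urllib.request.build_opener(handler)
        try:
            return opener.open(request, timeout=10)
        except urllib.error.URLError:
            continue   # dead proxy -- try another one
    raise RuntimeError('all proxies in the pool failed')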
