urllib提供了一系列用于操作URL的功能。
Get
# --- GET request ---
# Build an SSL context that skips certificate verification so the
# https URL loads without a certificate error.
context = ssl._create_unverified_context()
response = urllib.request.urlopen('https://bbs.csdn.net/topics/390978001', context=context)
raw_bytes = response.read()          # the body arrives as bytes
decode = raw_bytes.decode('utf-8')   # decode to text
print(decode)
print('code = ', response.getcode())  # HTTP status code
print('url = ', response.geturl())    # URL that was actually requested
print('info = ', response.info())     # response headers
Post+添加Head
# --- POST with a custom header ---
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc'
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
# Form fields expected by the Youdao translate endpoint.
data = {
    "type": "AUTO",
    "i": 'Hello',
    "doctype": "json",
    "xmlVersion": "1.8",
    "keyfrom": "fanyi.web",
    "ue": "UTF-8",
    "action": "FY_BY_CLICKBUTTON",
    "typoResult": "true"
}
head = {
    'User-Agent': user_agent
}
# Both urlencode and encode are required: urlopen wants the body as bytes.
data = urllib.parse.urlencode(data).encode('utf-8')
# Supplying a data argument turns the request into a POST.  The header
# dict is passed straight to the Request constructor (equivalent to
# calling add_header afterwards).
request = urllib.request.Request(url, data, head)
response = urllib.request.urlopen(request)
html = response.read().decode('utf-8')
j = json.loads(html)  # parse the JSON reply
print(html)
代理
# --- Requests through a proxy ---
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
url = 'http://ip.tool.chinaz.com/'
# NOTE: free proxy IPs expire quickly; these entries may no longer work.
iplist = ['116.204.152.110:8080', '110.86.136.118:9999', '163.204.240.241:9999', '59.57.148.242:9999']
# 1. ProxyHandler takes a dict of {'scheme': 'proxy-ip:port'}.
proxy_handler = urllib.request.ProxyHandler({'http': random.choice(iplist)})
# 2. Build a customized opener around that handler.
opener = urllib.request.build_opener(proxy_handler)
# 3. Install it globally so every plain urlopen() goes via the proxy.
urllib.request.install_opener(opener)
response = urllib.request.urlopen(url)
html = response.read().decode('utf-8')
print(html)
下载图片
def url_open(url):
    """Fetch *url* with a browser User-Agent and return the raw bytes."""
    request = urllib.request.Request(url)
    request.add_header('User-Agent',
                       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36')
    resp = urllib.request.urlopen(request)
    return resp.read()
def get_page(url):
    """Return the newest page number (as a string) scraped from the page."""
    html = url_open(url).decode('utf-8')
    # The count appears after the 'current-comment-page' marker; the
    # digits start 23 chars past it and run up to the closing ']'.
    start = html.find('current-comment-page') + 23
    end = html.find(']', start)
    return html[start:end]
def find_imgs(url):
    """Collect every 'img src=...jpg' address found in the page HTML."""
    html = url_open(url).decode('utf-8')
    img_addrs = []
    pos = html.find('img src=')
    while pos != -1:
        # Look for the '.jpg' extension within a bounded window so one
        # broken tag cannot swallow the rest of the document.
        end = html.find('.jpg', pos, pos + 255)
        if end != -1:
            # Skip the 9 chars of 'img src="' and keep through '.jpg'.
            img_addrs.append(html[pos + 9:end + 4])
        else:
            end = pos + 9  # no .jpg nearby; resume scanning past this tag
        pos = html.find('img src=', end)
    return img_addrs
def save_img(img_addrs):
    """Download each address in *img_addrs* and save it in the cwd.

    Addresses are protocol-relative ('//host/path/name.jpg'); 'http:' is
    prepended before fetching.  The saved file name is the last path
    component of the URL.
    """
    for img in img_addrs:
        img_name = img.split('/')[-1]  # last path component of the URL
        img_url = 'http:' + img        # make the protocol-relative URL absolute
        print(img_url)
        # Fetch BEFORE opening the output file: the original opened the
        # file first, so a failed download left an empty file behind.
        img_byte = url_open(img_url)
        with open(img_name, 'wb') as f:
            f.write(img_byte)
def download_mm(folder='OOXX', pages=3):
    """Download image pages from jandan.net/ooxx into *folder*.

    folder -- directory the images are saved into (created if missing);
              the process chdir()s into it.
    pages  -- how many pages to walk backwards from the newest one.
    """
    # exist_ok=True handles both "create" and "reuse" without a separate
    # exists() check (race-free, and chdir below always runs on reruns).
    os.makedirs(folder, exist_ok=True)
    print("当前工作目录为 %s" % os.getcwd())
    os.chdir(folder)
    print("目录修改成功 %s" % os.getcwd())
    url = 'http://jandan.net/ooxx/'
    page_num = int(get_page(url))
    print('page_num:', page_num)
    for i in range(pages):
        current = page_num - i
        # Stop before requesting page 0 or a negative page.  The
        # original tested the constant page_num, so asking for more
        # pages than exist generated invalid page URLs.
        if current < 1:
            break
        page_url = url + 'page-' + str(current) + '#comments'
        print(page_url)
        img_addrs = find_imgs(page_url)
        save_img(img_addrs)
# Script entry point: run the image downloader with its defaults.
if __name__ == '__main__':
download_mm()