import urllib.request

# Fetch the Baidu homepage with a browser-like User-Agent header
# (a bare urlopen('https://www.baidu.com/') works too, but gets the bot page).
url = 'https://www.baidu.com/'
headers = {
    'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
}
# 1. Build the request object carrying the custom headers.
request = urllib.request.Request(url, headers=headers)
# 2. Send the request and obtain the response object.
response = urllib.request.urlopen(request)
# 3. Read the response body and decode it to text.
page = response.read().decode('utf-8')
print(page)
import urllib.parse

# Demonstrate percent-encoding: the 'wd' parameter below encodes to the same
# escaped form that appears in the search URL.
url = 'https://www.baidu.com/s?ie=UTF-8&wd=%E6%B5%B7%E8%B4%BC%E7%8E%8B'
query = {'wd': '海贼王'}
result = urllib.parse.urlencode(query)
print(result)
话不多说,直接上代码,文末附有运行成功的截图。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/7/22 19:33
# @Author : 沐白
# @Site :
# @File : 百度贴吧联系一.py
# @Software: PyCharm
# Task: 1. read the Tieba forum name to crawl
#       2. read the start and end page numbers
#       3. save each page's HTML to a local file
# URL pattern — only pn changes, in steps of 50 posts per page:
# https://tieba.baidu.com/f?kw=%E5%85%83%E5%B0%8A&ie=utf-8&pn=0
# https://tieba.baidu.com/f?kw=%E5%85%83%E5%B0%8A&ie=utf-8&pn=50
# https://tieba.baidu.com/f?kw=%E5%85%83%E5%B0%8A&ie=utf-8&pn=100
import random
import urllib.request
import urllib.parse

# Pool of desktop-browser User-Agents; one is picked at random per run so the
# requests look less like a bot.
headers_list = [
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'},
    {'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'},
]
headers = random.choice(headers_list)
name = input('请输入贴吧名:')
start = int(input('请输入起始页:'))
end = int(input('请输入结束页:'))
kw = urllib.parse.urlencode({'kw': name})
# BUG FIX: base must end at 'f?' — the urlencoded kw already contains 'kw=';
# the old 'f?kw=' base produced 'f?kw=kw=...'.
baseurl = 'https://tieba.baidu.com/f?'
for i in range(start, end + 1):
    # BUG FIX: was (i - 1*50) == i - 50; page i starts at offset (i-1)*50.
    pn = (i - 1) * 50
    req_url = baseurl + kw + '&pn=' + str(pn)
    # BUG FIX: the randomly chosen headers were never attached to the request.
    req = urllib.request.Request(req_url, headers=headers)
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    filename = '第' + str(i) + '页.html'
    with open(filename, 'w', encoding='utf-8') as f:
        print('正在爬取第{}页'.format(i))
        f.write(html)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/7/22 20:25
# @Author : 沐白
# @Site :
# @File : 百度贴吧联系二.py
# @Software: PyCharm
import random
import urllib.parse
import urllib.request
# 读取页面
def readPage(url):
    """Download *url* with a randomly chosen browser User-Agent and return
    the response body decoded as UTF-8 text."""
    agents = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
    ]
    request = urllib.request.Request(url, headers={'User-Agent': random.choice(agents)})
    response = urllib.request.urlopen(request)
    return response.read().decode('utf-8')
# 写入文件
def writePage(filename, html):
    """Write *html* to *filename* as UTF-8 and report success."""
    with open(filename, 'w', encoding='utf-8') as out:
        out.write(html)
        print('写入成功')
# 主函数
def main():
    """Prompt for a forum name and a page range, then crawl and save each page."""
    name = input('请输入贴吧名:')
    start = int(input('请输入起始页:'))
    end = int(input('请输入结束页:'))
    query = urllib.parse.urlencode({'kw': name})
    for page in range(start, end + 1):
        # Each page covers 50 posts, so page N starts at offset (N-1)*50.
        offset = (page - 1) * 50
        url = 'https://tieba.baidu.com/f?' + query + '&pn=' + str(offset)
        html = readPage(url)
        filename = '第{}页.html'.format(page)
        writePage(filename, html)
if __name__ == '__main__':
    main()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/7/22 20:41
# @Author : 沐白
# @Site :
# @File : 百度贴吧练习三.py
# @Software: PyCharm
import urllib.request
import urllib.parse
class BaiduSpider:
    """Crawl Baidu Tieba list pages for a forum and save each page locally."""

    def __init__(self):
        # Browser-like header so the request is not served a bot page.
        self.headers = {
            'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
        }
        self.beaseurl = 'https://tieba.baidu.com/f?'

    def readPage(self, url):
        """Download *url* and return its body decoded as UTF-8 text."""
        request = urllib.request.Request(url, headers=self.headers)
        response = urllib.request.urlopen(request)
        return response.read().decode('utf-8')

    def writePage(self, filename, html):
        """Save *html* to *filename* as UTF-8 and report success."""
        with open(filename, 'w', encoding='utf-8') as out:
            out.write(html)
            print('写入成功')

    def main(self):
        """Prompt for a forum name and page range, then crawl each page."""
        name = input('请输入贴吧名:')
        start = int(input('请输入起始页:'))
        end = int(input('请输入结束页:'))
        query = urllib.parse.urlencode({'kw': name})
        for page in range(start, end + 1):
            offset = (page - 1) * 50  # 50 posts per page
            url = self.beaseurl + query + '&pn=' + str(offset)
            self.writePage('第{}页.html'.format(page), self.readPage(url))
if __name__ == '__main__':
    # Create an instance of the spider class and run the interactive crawl.
    spider=BaiduSpider()
    spider.main()
**注:** 需要将 data 请求参数先用 urlencode 编码,再转换成 bytes 类型才能随 POST 请求发送。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/7/22 21:34
# @Author : 沐白
# @Site :
# @File : 有道翻译.py
# @Software: PyCharm
# Task: a tiny Youdao-translate command-line tool.
import urllib.request
import urllib.parse
import json
# Read the text to translate.
key = input('请输入要翻译的内容:')
# Form fields captured from the web client. NOTE(review): salt/sign/ts/bv are
# hard-coded from one captured session — presumably the server tolerates stale
# values on this endpoint; verify if requests start failing.
data = {
    'i': key,
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '15954251040982',
    'sign': '5728388cef63a7407571cbf1b5991361',
    'ts': '1595425104098',
    'bv': 'a9c3483a52d7863608142cc3f302a0ba',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_REALTlME'
}
# POST bodies must be bytes: urlencode the form, then encode it.
data = urllib.parse.urlencode(data)
data = bytes(data, 'utf-8')
# The '_o' suffix must be removed from the captured URL for this to work.
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
headers = {
    'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
}
req = urllib.request.Request(url, data=data, headers=headers)
res = urllib.request.urlopen(req)
html = res.read().decode('utf-8')
# Parse the JSON response into a dict.
r_dict = json.loads(html)
# BUG FIX: the old code printed translateResult[0] — the raw inner list
# [{"src": ..., "tgt": ...}] — instead of the translated text. Per the sample
# response below, the translation lives under the 'tgt' key.
content = r_dict['translateResult'][0][0]['tgt']
print(content)
# print(html)
# {"type":"ZH_CN2EN","errorCode":0,"elapsedTime":1,"translateResult":[[{"src":"你好","tgt":"hello"}]]}