文档:http://cn.python-requests.org/zh_CN/latest/
安装:pip --timeout=100 install requests
[ python ] pip 配置国内镜像源(亲测有效)
requests 模块的 get 请求
import requests
if __name__ == "__main__":
    # Fetch the Baidu homepage and save the raw HTML to a local file.
    target = "https://www.baidu.com"
    resp = requests.get(target)
    # Force UTF-8 so Chinese characters decode correctly in .text.
    resp.encoding = 'utf-8'
    print("状态码:" + str(resp.status_code))
    html = resp.text
    print("页面内容:" + html)
    with open('./baidu.html', 'w', encoding='utf-8') as out:
        out.write(html)
    print('爬取数据结束!')
requests 模块的 get 请求
import requests
if __name__ == '__main__':
    # Query Sogou web search for a user-supplied keyword and save the
    # result page as <keyword>.html.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    base_url = 'https://www.sogou.com/web'
    keyword = input('输入查询关键字:')
    # Pass the query string via the explicit params= keyword (same request
    # as the positional form: ?query=<keyword>).
    resp = requests.get(base_url, params={'query': keyword}, headers=headers)
    out_name = keyword + '.html'
    with open(out_name, 'w', encoding='utf-8') as out:
        out.write(resp.text)
    print('数据爬取结束!')
requests 模块的 post 请求
import requests
import json
if __name__ == '__main__':
    # POST a keyword to Baidu Translate's suggestion endpoint and dump the
    # JSON response to <word>.json.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    post_url = 'https://fanyi.baidu.com/sug'
    word = input('输入查询关键字:')
    data = {
        'kw': word
    }
    # Explicit data= keyword: the payload is sent as a form body.
    response = requests.post(post_url, data=data, headers=headers)
    dic_obj = response.json()
    print(dic_obj)
    fileName = word + '.json'
    # FIX: the original opened the file and never closed it; the context
    # manager guarantees the handle is flushed and closed even if
    # json.dump raises.
    with open(fileName, 'w', encoding='utf-8') as fp:
        json.dump(dic_obj, fp, ensure_ascii=False)
    print('数据爬取结束!')
requests 模块 ajax 的 get 请求(目标站点: https://movie.douban.com/)
import requests
import json
if __name__ == '__main__':
    # Call Douban's movie-ranking AJAX endpoint and save the JSON payload.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # Query-string parameters for the top-list endpoint:
    # type=24 is the category, interval_id the rating band, start/limit the page.
    param = {
        "type": "24",
        "interval_id": "100:90",
        "action": "",
        "start": "0",
        "limit": "20",
    }
    url = 'https://movie.douban.com/j/chart/top_list'
    # Explicit params= keyword (same request as the positional form).
    response = requests.get(url, params=param, headers=headers)
    dic_obj = response.json()
    print(dic_obj)
    fileName = '豆瓣电影排行榜.json'
    # FIX: the original leaked the file handle; use a context manager so
    # the file is closed even if json.dump raises.
    with open(fileName, 'w', encoding='utf-8') as fp:
        json.dump(dic_obj, fp, ensure_ascii=False)
    print('数据爬取结束!')
http://125.35.6.84:81/xk/
import requests
import json
if __name__ == '__main__':
    # Scrape company licence records from the drug-administration portal:
    # first collect company IDs from the paged list endpoint, then fetch
    # each company's detail record and dump everything to one JSON file.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/78.0.3904.108 Safari/537.36 '
    }
    url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
    # Collected company IDs and their detail records.
    id_list = []
    detail_list = []
    # Fetch the first two pages of company IDs (15 per page -> 30 IDs).
    for page in range(1, 3):
        param = {
            "on": "true",
            "page": str(page),
            "pageSize": "15",
            "productName": "",
            "conditionType": "1",
            "applyname": "",
            "applysn": "",
        }
        # Explicit data= keyword: the endpoint expects a form-encoded body.
        response = requests.post(url, data=param, headers=headers)
        json_ids = response.json()
        for dic in json_ids['list']:
            id_list.append(dic['ID'])
    post_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
    # FIX: renamed loop variable from `id`, which shadowed the builtin.
    for company_id in id_list:
        data = {
            'id': company_id
        }
        res = requests.post(post_url, data=data, headers=headers)
        detail_list.append(res.json())
    fileName = '企业信息.json'
    # FIX: the original never closed the output file; the context manager
    # guarantees it is flushed and closed even if json.dump raises.
    with open(fileName, 'w', encoding='utf-8') as fp:
        json.dump(detail_list, fp, ensure_ascii=False)
    print('数据爬取结束!')
来源:爬虫开发入门丨老男孩IT教育