The crawling workflow
- Request the site and fetch the page source: urllib, requests, selenium, pyquery
- Parse the source: regular expressions, lxml.etree, beautifulsoup4, selenium
- Store the data: pymysql, pymongo
Requesting the site -- selenium drives a real browser, so it doubles as an automation tool; a minimal sketch of the three stages follows below.
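To make the three stages concrete, here is a minimal sketch. The target URL and the regex are placeholders chosen for illustration; in a real project the list at the end would be replaced by pymysql or pymongo writes.

import re
import urllib.request

# Stage 1: request the site and fetch the source
response = urllib.request.urlopen('http://httpbin.org/html')
html = response.read().decode('utf-8')

# Stage 2: parse the source (here: grab the <h1> text with a regex)
titles = re.findall(r'<h1>(.*?)</h1>', html, re.S)

# Stage 3: store the data (a plain list stands in for pymysql/pymongo)
results = list(titles)
print(results)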
Project setup
- python -m venv 1902spiderenv (the environment name)
- Create the project folder on the E: drive, open it in PyCharm, and point the interpreter at this environment
1. urllib syntax
1.1 Syntax
- Passing data sends a POST request; omitting data sends a GET request
- urllib.request.urlopen(url, data=None, timeout=...)
import urllib.error
import urllib.parse
import urllib.request

# 1.1 Passing only the url
url = 'http://www.baidu.com'
response = urllib.request.urlopen(url)
# read() returns the response body, as bytes
text = response.read().decode('utf-8')

# 1.2 Passing the url together with its parameters
# Parameters embedded in the url itself -> the request is GET
url = 'http://www.baidu.com/s?wd=python'
response = urllib.request.urlopen(url)
text = response.read().decode('utf-8')

# 1.3 Passing the parameters via urlopen(data) -> the request is POST
url = 'http://www.baidu.com/s'
data = {
    'wd': 'python'
}
# urlencode() flattens the dict into key=value form
data = urllib.parse.urlencode(data)
# bytes() converts the str to the bytes type urlopen expects
data = bytes(data, encoding='utf-8')
response = urllib.request.urlopen(url, data)
text = response.read().decode('utf-8')
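# Note: Baidu's /s endpoint actually expects GET, so the POST above will not
# return search results. A quick sketch against httpbin.org/post, which
# accepts POST and echoes the form data back as JSON, verifies the body:
import json
post_url = 'http://httpbin.org/post'
response = urllib.request.urlopen(post_url, data)
echoed = json.loads(response.read().decode('utf-8'))
print(echoed['form'])  # should print {'wd': 'python'}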
# 1.4 Passing timeout - the response must arrive within the given seconds
url = 'http://www.baidu.com'
try:
    # 0.01s is deliberately too short, so URLError is raised
    response = urllib.request.urlopen(url, timeout=0.01)
    text = response.read().decode('utf-8')
    print(text)
except urllib.error.URLError as e:
    print('timed out')
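Beyond read(), the HTTPResponse object returned by urlopen() also exposes the status code, headers, and final URL; a quick sketch:

response = urllib.request.urlopen('http://www.baidu.com')
print(response.status)        # HTTP status code, e.g. 200
print(response.getheaders())  # list of (name, value) header pairs
print(response.geturl())      # final URL after any redirects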
2. Syntax 2 - urllib.request.Request
- request = urllib.request.Request(url, data, headers, method)
- urllib.request.urlopen(request) accepts the Request object
# Request(url): passing only the url
url = 'https://movie.douban.com/top250'
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
text = response.read().decode('utf-8')
# print(text)

# Request(url, data): passing the url plus parameters
# data must be bytes; the request is sent as POST
url = 'http://www.baidu.com/s'
data = {
    'wd': 'python'
}
data = urllib.parse.urlencode(data)
# convert the str data to bytes
data = bytes(data, encoding='utf-8')
request = urllib.request.Request(url, data)
response = urllib.request.urlopen(request)
text = response.read().decode('utf-8')

# Request(url, headers=headers): headers carries request headers such as User-Agent
# The server can read User-Agent to judge whether a request comes from a crawler or a human
url = 'http://httpbin.org/get'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
request = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(request)
text = response.read().decode('utf-8')
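# httpbin.org/get echoes the request back as JSON, so the header can be
# verified by parsing the body (json is in the standard library):
import json
echoed = json.loads(text)
print(echoed['headers']['User-Agent'])  # should print the Chrome UA above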
# Routing requests through a proxy IP
url = 'http://httpbin.org/get'
proxies = {
    'http': 'http://59.37.33.62:50686'
}
proxy_handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(proxy_handler)
# Setting the User-Agent on the opener
# Option 1: attach it to the opener itself (fill in a real UA string)
opener.addheaders = [('User-Agent', '')]
response = opener.open(url)
# Option 2: build a Request carrying the headers and open it through the opener
request = urllib.request.Request(url, headers=headers)
response = opener.open(request)
text = response.read().decode('utf-8')
print(text)
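To make every later urlopen() call go through the proxy without threading the opener object around, the opener can be installed globally (install_opener is part of urllib.request):

# once installed, plain urlopen() routes through the proxy opener above
urllib.request.install_opener(opener)
response = urllib.request.urlopen('http://httpbin.org/get')
print(response.read().decode('utf-8'))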
3. Scraping Boss直聘 with urllib
"""__author__= 雍新有"""
import urllib.request
import re
def get_html(url):
# 使用urllib库获取url所对应的源码
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}
# ip代理
proxies = {
'http': 'http://113.120.63.179:9999'
}
proxy_handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(proxy_handler)
request = urllib.request.Request(url, headers=headers)
# response = urllib.request.urlopen(request)
response = opener.open(request)
# read()返回bytes类型的源码内容
return response.read().decode('utf-8')
def parse_html(html):
    # Parse the source for the job title, salary, and similar fields.
    # NOTE: the HTML tags in the original pattern were stripped when these
    # notes were exported; this is a reconstruction sketch for the 2019-era
    # zhipin.com list page, not the original pattern verbatim.
    patterns = re.compile('job-title">(.*?)<.*?'
                          'class="red">(.*?)<.*?'
                          'company-text.*?<a.*?>(.*?)<', re.S)
    result = patterns.findall(html)
    return result
def main():
    # Job listings for Chengdu on Boss直聘
    url = 'https://www.zhipin.com/c101270100/?query=python&page=%s&ka=page-%s'
    for i in range(1, 6):
        # i runs over 1, 2, 3, 4, 5
        real_url = url % (i, i)
        print(real_url)
        # Fetch the source behind the url
        html = get_html(real_url)
        # print(html)
        # Parse the job postings out of the page source
        result = parse_html(html)
        print(result)


if __name__ == '__main__':
    main()
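The loop in main() fires five requests back to back, which a site like this tends to throttle or block. A hedged variant of that loop with a short random pause between pages (time and random are standard library; get_html and parse_html are reused from the script above):

import random
import time

for i in range(1, 6):
    real_url = 'https://www.zhipin.com/c101270100/?query=python&page=%s&ka=page-%s' % (i, i)
    html = get_html(real_url)
    print(parse_html(html))
    # pause 1-2 seconds between pages to avoid hammering the server
    time.sleep(random.uniform(1, 2))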
4. Scraping Maoyan with urllib - practice run (data not stored in a database)
"""__author__= 雍新有"""
import urllib.request
import re
def parse_html(html):
    # NOTE: the HTML tags in the original pattern were lost when these notes
    # were exported; below is the standard Maoyan top-100 pattern,
    # reconstructed to match the seven fields yielded underneath.
    patterns = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)"'
                          '.*?name.*?><a.*?>(.*?)</a>.*?star.*?>(.*?)</p>'
                          '.*?releasetime.*?>(.*?)</p>.*?integer.*?>(.*?)</i>'
                          '.*?fraction.*?>(.*?)</i>.*?</dd>', re.S)
    result = patterns.findall(html)
    # strip() removes the given characters (whitespace/newlines by default)
    # from both ends of a string only; it never touches the middle.
    for item in result:
        yield {
            'index': item[0],
            'img': item[1],
            'title': item[2],
            'actors': item[3].strip(),  # actor list has surrounding whitespace in the source
            'time': item[4],
            'score': item[5] + item[6],  # integer part + fraction part
        }
def get_html(url):
    # Fetch the page source behind url with urllib
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    return response.read().decode('utf-8')
def main():
    # Maoyan top-100 board, paged by offset
    url = 'https://maoyan.com/board/4?offset=%s'
    for i in range(0, 10):
        # offsets 0, 10, 20, ..., 90 cover the full top 100
        m = i * 10
        real_url = url % m
        print(real_url)
        # Fetch the source behind the url
        html = get_html(real_url)
        # print(html)
        # Parse the top-100 entries out of the source
        result = parse_html(html)
        for item in result:
            print(item)


if __name__ == '__main__':
    main()
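The workflow list at the top names pymongo for the storage stage, but neither script persists anything. A minimal sketch, assuming a local MongoDB instance; the spider database and maoyan collection names are made up for illustration:

import pymongo

# connect to a local MongoDB (assumed to be running on the default port)
client = pymongo.MongoClient('mongodb://localhost:27017/')
collection = client['spider']['maoyan']  # hypothetical db/collection names

def save_items(items):
    # insert every movie dict that parse_html() yields
    for item in items:
        collection.insert_one(item)

# usage inside main(): save_items(parse_html(html))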