Redis basics
- CONFIG GET * //get all configuration settings
- CONFIG SET loglevel "notice" //set a configuration value
- PING //check that Redis is up and reachable
- SET CLASS 1803 //create a key
- KEYS * //list all keys
- EXISTS CLASS //check whether a key exists
- DEL CLASS //delete a key
- EXISTS CLASS //check again after the delete
- SET CLASS 1803 EX 3 //create a key with an expiration time in seconds
- SET CLASS 1803 PX 3000 //create a key with an expiration time in milliseconds
- EXPIRE CLASS 3 //set an expiration time (seconds)
- PERSIST CLASS //remove the expiration time
- sadd student zhangsan //add a member to a set
- scard student //return the number of members in the set
- lpush //push an element onto the head (left) of a list
- lpop //pop an element from the head (left) of a list
- blpop //blocking lpop: waits until an element is available
- smembers //return all members of a set
For more commands, see: http://redisdoc.com/list/
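The same commands are available from Python through redis-py. A minimal sketch, assuming a Redis server on localhost and reusing the example key names above ('tasks' is an extra throwaway key, not from the notes):
from redis import Redis

rds = Redis('127.0.0.1', port=6379)
rds.set('CLASS', 1803, ex=3)          # SET CLASS 1803 EX 3
print(rds.exists('CLASS'))            # 1 while the key has not expired
rds.sadd('student', 'zhangsan')       # SADD student zhangsan
print(rds.scard('student'))           # number of members in the set
rds.lpush('tasks', 'url-1')           # LPUSH tasks url-1
print(rds.blpop('tasks', timeout=5))  # BLPOP blocks until an item arrives (or times out)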
Distributed crawler steps
- Write the crawler script redis_spider.py
- Deployment
  - Server-side deployment
    start redis-server
    in redis_spider.py: rds = Redis('127.0.0.1', 6379)
  - Worker-node deployment
    in redis_spider.py: rds = Redis('<server ip>', 6379)
    e.g. rds = Redis('10.31.161.59', 6379)
    (see the connection sketch after this list for one way to keep the server and node scripts identical)
  - Run
    server: python redis_spider.py
    node:   python redis_spider.py
- How to deploy distributed Redis on a LAN
  [Server configuration]:
  Edit the configuration file redis.conf and add the LAN address to the bind directive (multiple addresses go on a single bind line), for example:
      bind 127.0.0.1 10.31.161.59
  If remote clients are still refused, also check the protected-mode and requirepass settings in redis.conf (defaults vary by Redis version).
  Save the file and restart redis-server:
      ./redis-server /path/to/redis.conf
  [Client connecting to the server]
      redis-cli -h 10.31.161.59 -p 6379
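A minimal sketch of the connection setup described above. The environment variable name REDIS_HOST is an assumption (it is not part of the original script); reading the host this way lets the identical redis_spider.py run on the server and on every node, and ping() verifies that a worker can reach the server over the LAN:
import os
from redis import Redis

# Assumed convention: export REDIS_HOST=10.31.161.59 on the nodes,
# and leave it unset on the server so it falls back to localhost.
REDIS_HOST = os.environ.get('REDIS_HOST', '127.0.0.1')
rds = Redis(REDIS_HOST, port=6379)
print(rds.ping())  # True if the server is reachable; raises ConnectionError otherwise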
Extracting data from Redis
"""
假设,redis数据库中有一个键名为('spider:items'的list,现在
我们要把这个键中的数据及时地转存到其它存储介质中.('spider:items'中的数据是实时生成的)
方案:
由于'spider:items'中的数据是实时生成的,我们要及时地完成转存数据,可以每隔一段时间从
redis的'spider:items'中取出一条数据,保存到其它存储介质中,代码试下思路如下:
"""
from redis import Redis
import time

rds = Redis('127.0.0.1', port=6379)

while True:
    # Watch the 'spider:items' key in Redis and pull one item every 2 seconds.
    # Note the use of blpop here: it blocks until an item is available.
    key, value = rds.blpop('spider:items')
    time.sleep(2)
    # save value to some other store .....
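As one possible sink (purely illustrative: the file name items.jl and the assumption that each item is a UTF-8 string are mine), the loop above could append each popped item to a local JSON-lines file:
from redis import Redis
import time

rds = Redis('127.0.0.1', port=6379)
with open('items.jl', 'a', encoding='utf-8') as f:
    while True:
        key, value = rds.blpop('spider:items')  # value comes back as bytes
        f.write(value.decode('utf-8') + '\n')
        f.flush()                               # make each item durable immediately
        time.sleep(2)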
Designing a timeout/retry decorator
import time
import requests

RETRY_TIME = 3
DOWNLOAD_DELAY = 2

class Retry(object):
    """Retry the wrapped function up to `retries` times, sleeping `delay` seconds between attempts."""
    def __init__(self, retries=3, delay=0):
        self.retries = retries
        self.delay = delay

    def __call__(self, func):
        def wrapper(*args, **kwargs):
            for i in range(self.retries):
                try:
                    result = func(*args, **kwargs)
                except Exception as e:
                    # Log the error, wait, then try again.
                    print(e)
                    time.sleep(self.delay)
                    continue
                else:
                    return result
            # Every attempt failed: fall through and return None implicitly.
        return wrapper

@Retry(RETRY_TIME, DOWNLOAD_DELAY)
def fetch(url):
    print(f'Start fetch {url}')
    resp = requests.get(url, timeout=5)
    print(resp.status_code)

fetch('http://www.baidu.com')
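Because the wrapper returns None once every attempt fails, callers should be prepared for that. A small illustrative example of decorating a function that actually returns a value (get_html and the raise_for_status() call are my additions, not part of the original notes):
@Retry(retries=2, delay=1)
def get_html(url):
    resp = requests.get(url, timeout=5)
    resp.raise_for_status()  # turn HTTP errors into exceptions so Retry kicks in
    return resp.text

html = get_html('http://www.baidu.com')
if html is None:
    print('all retries failed')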
Complete distributed crawler code
"""
爬虫:
for url in urls:
url -> 发送请求 -> 获得response -> 解析response -> 保存数据
多线程爬虫:
urls 保存在本地内存中
work(url -> 发送请求 -> 获得response -> 解析response -> 保存数据)
启用多个work
多线程分布式爬虫:
urls 保存在redis内存数据库中
多台电脑 从redis内存数据库取url,
每台电脑执行的操作是:work(url -> 发送请求 -> 获得response -> 解析response -> 保存数据)
"""
from redis import Redis
import threading
from lxml import etree
import requests
import time
import sys

IDLE = 0      # thread is idle
WORKING = 1   # thread is working
REDIS_SPIDER_URLS_KEY = 'spider:urls'
start_url = 'https://m.sogou.com/'
ALLOW_DOMAIN = 'sogou.com'
MAX_DOWNLOAD_THREAD = 10
RETRY_TIME = 3
DOWNLOAD_DELAY = 10
crawled_url = set()   # urls already crawled (the actual dedup lives in Redis, see REDIS_CRAWLED_URL)
REDIS_CRAWLED_URL = 'crawled_url'
rds = Redis('127.0.0.1', 6379)

class Retry(object):
    def __init__(self, retries=3, delay=0):
        self.retries = retries
        self.delay = delay

    def __call__(self, func):
        def wrapper(*args, **kwargs):
            for i in range(self.retries):
                try:
                    result = func(*args, **kwargs)
                except Exception as e:
                    print(e)
                    time.sleep(self.delay)
                    continue
                else:
                    return result
        return wrapper

@Retry(RETRY_TIME, DOWNLOAD_DELAY)
def fetch(url):
    """
    Download a page. On success return the page source as a str, otherwise return None.
    :param url:
    :return: the page content as a str, or None
    """
    # Only download urls we have not seen before; sadd returns 1 for a new member.
    print(url)
    print(rds.smembers(REDIS_CRAWLED_URL))
    if rds.sadd(REDIS_CRAWLED_URL, url) == 1:
        print(f'Start fetching .. {url}')
        # consider passing a timeout here so a stuck request cannot hang the thread
        resp = requests.get(url)
        print(resp.status_code)
        if resp.status_code == 200:
            return resp.text
    # Already-crawled urls (and non-200 responses) fall through and return None.
    return None

def parse(html):
    """
    Parse the html and extract every url in it.
    :param html: the page source
    :return: a list of urls, or None
    """
    if html is None:
        return None
    tree = etree.HTML(html)
    urls = tree.xpath('//a/@href')
    new_urls = []
    print(urls)
    for url in urls:
        if ALLOW_DOMAIN in url:
            # keep only absolute links (startswith('http') covers both http and https)
            if url.startswith('http'):
                print(url)
                new_urls.append(url)
    if not new_urls:
        return None
    return new_urls

class Spider(threading.Thread):
    """Crawler worker thread."""
    def __init__(self):
        super(Spider, self).__init__()
        self.status = IDLE

    @property
    def is_idle(self):
        """
        Return True if this thread is currently idle.
        :return: bool
        """
        return self.status == IDLE

    def run(self):
        while True:
            # Pop a url from the Redis url queue; block until one is available.
            # blpop returns a (key, value) tuple and value is bytes, so decode it.
            url = rds.blpop(REDIS_SPIDER_URLS_KEY)[1].decode('utf-8')
            print(url)
            # Start crawling the page for this url.
            self.status = WORKING
            # Download the page source.
            html = fetch(url)
            # Parse it.
            urls = parse(html)
            print(urls)
            if urls is not None:
                new_urls = set(urls)  # deduplicate the urls found on the page
                rds.lpush(REDIS_SPIDER_URLS_KEY, *new_urls)
                # (equivalent: lpush the urls one by one in a loop)
            # This crawl is done.
            self.status = IDLE

def all_is_idle(spiders):
    """
    Return True if every crawler thread is idle, otherwise False.
    :param spiders: all crawler threads
    :return: bool
    """
    # (equivalent explicit loop: return False as soon as one spider is not idle)
    spiders_is_idle = [spider.is_idle for spider in spiders]
    return all(spiders_is_idle)

def main(n):
    # Seed the queue and create n crawler threads.
    rds.lpush(REDIS_SPIDER_URLS_KEY, start_url)
    spider_pools = []
    for i in range(n):
        spider = Spider()
        spider_pools.append(spider)
    for spider in spider_pools:
        spider.start()
    while True:
        # Watch the pending-url queue in Redis.
        # When the queue is empty and every thread is idle, the crawl is finished.
        if rds.llen(REDIS_SPIDER_URLS_KEY) == 0 and all_is_idle(spider_pools):
            print('All threads have finished crawling.')
            sys.exit(0)
        time.sleep(2)

if __name__ == '__main__':
    main(MAX_DOWNLOAD_THREAD)
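While the crawler is running, progress can be checked from any machine with redis-cli (using the server IP from the deployment section above):
    redis-cli -h 10.31.161.59 -p 6379 llen spider:urls    # urls still waiting to be crawled
    redis-cli -h 10.31.161.59 -p 6379 scard crawled_url   # urls crawled so far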