Crawling the Python 3 tutorial on Runoob (菜鸟教程)
(http://www.runoob.com/python3/python3-tutorial.html)
The goal is to crawl the tutorial pages linked from the page above.
Preparation (all packages used)
import hashlib
import pickle
import queue
import random
import re
import zlib
from os import path
import os
from threading import Thread
from urllib import robotparser
import requests
import time
from datetime import datetime
from urllib.parse import urlparse, urlsplit, urljoin
from bson import Binary
from fake_useragent import UserAgent
from lxml import etree
from pymongo import MongoClient
from retrying import retry
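These imports rely on a handful of third-party packages: requests, fake-useragent, lxml, pymongo (which also provides the bson module used above), and retrying. The code below also assumes a MongoDB server listening on localhost:27017.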
Random proxy (thanks to Kuaidaili (快代理) for the free proxies =3=)
class RandomProxy(object):
    def __init__(self):
        self.proxies = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
        }

    # Scrape free proxies from Kuaidaili
    def crawl_proxies(self):
        ua = UserAgent()
        headers = {
            "User-Agent": ua.random,  # random User-Agent
        }
        base_url = "https://www.kuaidaili.com/free/inha/{}/"
        for i in range(1, 6):
            url = base_url.format(i)
            response = requests.get(url=url, headers=headers)
            tree = etree.HTML(response.text)
            ip_list = tree.xpath('//div[@id="list"]//tbody//tr')
            for proxy in ip_list:
                # lower-case the protocol so the dict key matches the scheme requests looks up
                protocol = proxy.xpath('./td[4]/text()')[0].lower()
                ip = proxy.xpath('./td[1]/text()')[0]
                port = proxy.xpath('./td[2]/text()')[0]
                ip_port = ip + ":" + port
                if self.verify_proxies({protocol: ip_port}):
                    self.proxies.append({protocol: ip_port})
                else:
                    continue
        # with open('proxies.json', 'w') as fp:
        #     fp.write(self.proxies)

    # Verify that a proxy is usable
    def verify_proxies(self, proxy):
        url = "http://www.baidu.com"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
        }
        try:
            # a dead proxy raises a connection error; treat that as "not usable"
            response = requests.get(url=url, headers=headers, proxies=proxy, timeout=5)
        except requests.RequestException:
            return False
        return response.status_code == 200

    # Get one usable proxy, refilling the pool when it runs low
    def get_proxy(self):
        if len(self.proxies) < 2:
            self.crawl_proxies()
        return random.choice(self.proxies)
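To try the proxy pool on its own, a minimal sketch (run separately from the crawler; it assumes the Kuaidaili page layout scraped above is still current and that at least one free proxy passes verification):

rp = RandomProxy()
proxy = rp.get_proxy()   # fills the pool on first use, then returns one entry at random
print(proxy)             # e.g. {'http': '1.2.3.4:8080'}, ready to pass to requests via proxies=proxy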
Download throttle
class Throttle(object):
    def __init__(self, delay):
        self.client = MongoClient('localhost', 27017)
        self.db = self.client.cache
        self.domains = self.db.domain  # collection storing the last access time per domain
        self.delay = delay

    def wait_url(self, url_str):
        domain_url = urlparse(url_str).netloc  # extract the domain part of the URL
        record = self.domains.find_one({"_id": domain_url})  # last download time for this domain, if any
        last_accessed = record['time'] if record else None
        if self.delay > 0 and last_accessed is not None:
            # Subtract the time elapsed since the last download from the delay;
            # if the result is positive, sleep for that long, otherwise continue straight away.
            sleep_interval = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_interval > 0:
                time.sleep(sleep_interval)
        # Record the current time for this domain (upsert: insert if missing, update otherwise)
        self.domains.update_one({'_id': domain_url}, {'$set': {'time': datetime.now()}}, upsert=True)
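The throttle can be checked in isolation, as a rough sketch (it assumes MongoDB is running locally, as in the constructor; example.com is only a stand-in domain):

throttle = Throttle(5.0)
throttle.wait_url("http://example.com/a.html")   # no record for this domain yet, returns immediately
throttle.wait_url("http://example.com/b.html")   # sleeps until 5 seconds have passed for example.com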
A home-grown utility class
class Help(object):
    def __init__(self):
        self.client = MongoClient('localhost', 27017)
        self.db = self.client.cache
        self.urls = self.db.urls  # collection used as the deduplication cache

    # Parse the site's robots.txt and check whether this user agent may fetch the URL
    def get_robots(self, url, ua):
        rp = robotparser.RobotFileParser()
        robots_url = 'http://' + urlsplit(url).netloc + '/robots.txt'
        rp.set_url(robots_url)
        rp.read()
        return rp.can_fetch(ua, url)

    # Save downloaded content to a local file
    def save_url(self, html_content, url_str):
        filename = urlsplit(url_str).path.split("/")[-1]
        if not os.path.exists('./download/'):
            os.makedirs('./download/')
        filepath = path.join("./download/", filename)
        with open(filepath, 'wb') as fp:
            fp.write(html_content)

    # Extract the relevant links from a page
    def extractor_url_lists(self, html_content, keyword):
        # re.IGNORECASE makes the match case-insensitive
        url_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
        url_list = url_regex.findall(html_content.decode('utf8'))
        filter_urls = [link for link in url_list if re.search(keyword, link)]
        return set(filter_urls)

    # Deduplicate by URL: the URL is the key and the compressed content is stored in MongoDB;
    # the MD5 of the content decides whether a page is new, changed, or unchanged.
    def distinct(self, url_str, html_content):
        content = hashlib.md5(str(html_content).encode(encoding='utf8')).hexdigest()
        record = self.urls.find_one({'_id': url_str})
        if record:
            before_temp = pickle.loads(zlib.decompress(record["html_content"]))
            before = hashlib.md5(str(before_temp).encode(encoding='utf8')).hexdigest()
            if before == content:
                return False  # unchanged, skip it
        # Compress the content and attach a timestamp
        result = {"html_content": Binary(zlib.compress(pickle.dumps(html_content))), "timestamp": datetime.utcnow()}
        # update_one with upsert=True inserts when the key is missing and updates otherwise;
        # the $set operator overwrites the stored fields.
        self.urls.update_one({"_id": url_str}, {'$set': result}, upsert=True)
        return True
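The helper methods can be exercised on their own as well; a hedged sketch (the HTML snippet and test URL are made up for illustration, and MongoDB must be running for distinct):

helper = Help()
page = b'<html><a href="/python3/python3-string.html">str</a><a href="/python/python-intro.html">old</a></html>'
print(helper.extractor_url_lists(page, "python3"))               # {'/python3/python3-string.html'}
print(helper.distinct("http://www.runoob.com/demo.html", page))  # True the first time, False while unchanged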
Define the maximum crawl depth
MAX_DEP = 2
The generic crawler
class CrawlerCommon(Thread):
    def __init__(self, init_url):
        super().__init__()
        self.throttle = Throttle(5.0)       # instantiate the throttle
        self.random_proxy = RandomProxy()   # instantiate the random proxy pool
        self.help = Help()                  # instantiate the utility class
        __ua = UserAgent()                  # random User-Agent
        self.headers = {"User-Agent": __ua.random}
        self.seed_url = init_url            # seed URL to start crawling from
        self.crawler_queue = queue.Queue()  # a FIFO queue gives BFS, a LIFO queue would give DFS
        self.crawler_queue.put(init_url)    # put the seed URL into the queue
        self.client = MongoClient('localhost', 27017)  # create a MongoDB connection
        self.visited = {init_url: 0}        # the seed URL starts at crawl depth 0

    # Download with retries, driven by the retrying decorator
    @retry(stop_max_attempt_number=3)
    def retry_download(self, url_str, data, method, proxies):
        self.throttle.wait_url(url_str)
        if method == "POST":
            result = requests.post(url_str, data=data, headers=self.headers, proxies=proxies)
        else:
            result = requests.get(url_str, headers=self.headers, timeout=3, proxies=proxies)
        # Assert that the status code is 200; a failed assertion triggers another retry
        assert result.status_code == 200
        return result.content

    # The actual download method
    def download(self, url_str, data=None, method="GET", proxies={}):
        print("download url is ::::", url_str)
        try:
            result = self.retry_download(url_str, data, method, proxies)
        except Exception as e:
            print(e)
            result = None
        return result

    # The main crawl loop
    def run(self):
        keyword = input("Enter a keyword to filter links (e.g. python3): ")
        while not self.crawler_queue.empty():
            url_str = self.crawler_queue.get()
            if self.help.get_robots(url_str, self.headers["User-Agent"]):
                depth = self.visited[url_str]
                if depth < MAX_DEP:
                    proxy = self.random_proxy.get_proxy()
                    html_content = self.download(url_str, proxies=proxy)
                    if html_content is not None:
                        if self.help.distinct(url_str, html_content):   # deduplicate by content
                            self.help.save_url(html_content, url_str)   # save new or changed content to a file
                        else:
                            continue
                        # Extract the related links, i.e. those containing the keyword (here "python3")
                        url_list = self.help.extractor_url_lists(html_content, keyword)
                        for url in url_list:
                            if "http" not in url:
                                url = urljoin("http://www.runoob.com", url)  # complete relative links
                            if url not in self.visited:
                                self.visited[url] = depth + 1
                                self.crawler_queue.put(url)
            else:
                print("Blocked by robots.txt:", url_str)
Run it
if __name__ == "__main__":
    crawler = CrawlerCommon("http://www.runoob.com/python3/python3-tutorial.html")
    crawler.start()
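Since CrawlerCommon subclasses Thread, crawler.start() runs run() in a background thread, so the keyword prompt appears right after it starts. If the entry point should block explicitly until the queue is drained, one optional variant is:

if __name__ == "__main__":
    crawler = CrawlerCommon("http://www.runoob.com/python3/python3-tutorial.html")
    crawler.start()
    crawler.join()   # wait for the crawl loop to finish before exiting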