This post implements a Python web crawler that scrapes information from the 实习僧 internship site (shixiseng.com) and stores it in MongoDB, plus a server and a client: the client sends the server a position (job) keyword, the server looks it up in the database and returns the matches. The server is built with Flask.
The project consists of three files: crawler.py, server.py and client.py. crawler.py is responsible for scraping the site.
The source code is below:
crawler.py
The crawler uses multiprocessing together with a thread pool (ThreadPoolExecutor) to improve throughput. Each page fetch spends most of its time waiting on I/O, so running several fetches on threads raises overall efficiency. Processes are used on top of that to improve CPU utilization: a Python thread must hold the GIL to run, so threads alone cannot use more than one core, while multiple processes can.
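A stripped-down sketch of that pattern, separate from the crawler itself (the fetch helper, the worker function and the repeated seed URL are only illustrative):

from multiprocessing import Process
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlopen

def fetch(url):
    # I/O-bound work: the thread releases the GIL while it waits on the network
    with urlopen(url) as resp:
        return len(resp.read())

def worker(urls):
    # each process runs its own thread pool so several downloads overlap
    with ThreadPoolExecutor(max_workers=4) as pool:
        for size in pool.map(fetch, urls):
            print(size)

if __name__ == '__main__':
    urls = ['https://www.shixiseng.com/'] * 8
    processes = [Process(target=worker, args=(urls,)) for _ in range(2)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()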
MongoDB is accessed from Python through the pymongo library.
The MongoDB collection is named 'craw'.
The MongoConnector class wraps the common database operations.
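As a quick illustration of the pymongo calls that MongoConnector wraps (the inserted document is made up; only the connection parameters and collection name match the code below):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
collection = client.mydatabase['craw']

# store one title/url pair (the values are placeholders)
collection.insert_one({'title': 'Example internship', 'url': 'https://www.shixiseng.com/example'})

# look records up by a field value, as MongoConnector.Find does
for doc in collection.find({'title': 'Example internship'}):
    print(doc['url'])

client.close()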
The logging module is wrapped in a Logger class, which creates a logger object and exposes a method for getting the logger handle.
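A stand-alone sketch of the logging setup that the Logger class performs (the log file name here is just an example):

import logging

# same format and level as in the Logger class below
logging.basicConfig(
    format='[%(asctime)s] [%(levelname)s] [%(processName)s] [%(threadName)s] : %(message)s',
    level=logging.INFO)
logger = logging.getLogger(__name__)
# additionally write the log to a file
file_handler = logging.FileHandler('example_log.txt')
file_handler.setLevel(logging.INFO)
logger.addHandler(file_handler)

logger.info('Get %d new links', 12)   # printed to stderr and appended to example_log.txt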
The MyCrawler class crawls the links found on the seed page in a loop and stores the title of each linked page in the database; links that are already stored are skipped.
getPageSource uses BeautifulSoup to fetch and parse the page source, from which the title is extracted.
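A minimal BeautifulSoup example of the parsing done in getPageSource and getTitles, using an inline HTML string instead of a downloaded page:

from bs4 import BeautifulSoup

# a tiny page standing in for the downloaded HTML
html = ('<html><head><title>Example internship - 实习僧</title></head>'
        '<body><a href="/interns/example">details</a></body></html>')
soup = BeautifulSoup(html, 'lxml')

print(soup.title.string)          # the page title that getTitles stores
for link in soup.find_all('a'):   # the hyperlinks that getHyperLinks collects
    print(link.get('href'))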
The crawl is breadth-first with a depth limit of 3. This part of the code is adapted from another blog post: https://blog.csdn.net/weixin_34613450/article/details/72810595
from pymongo import MongoClient
import multiprocessing
from multiprocessing import Process
from concurrent.futures import ThreadPoolExecutor
import time
import random
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener
from bs4 import BeautifulSoup
import logging
class MongoConnector:
def __init__(self):
# create the connection with mongodb
self.client = MongoClient('localhost', 27017)
self.db = self.client.mydatabase
    # get the collection that holds the crawled (title, url) pairs
def GetCollection(self):
self.collection = self.db['craw']
return self.collection
    # insert one record
def Insert(self,pair):
self.client.mydatabase.craw.insert_one(pair)
    # close the database connection
def Close(self):
self.client.close()
    # look up records matching {name: value} and return the cursor
def Find(self,name,value):
row = self.collection.find({name:value})
return row
class Logger():
def __init__(self,filename):
        # create a logger for this module
self.logger = logging.getLogger(__name__)
# logger.setLevel(logging.DEBUG)
# set up logging to file
logging.basicConfig(
format='[%(asctime)s] [%(levelname)s] [%(processName)s] [%(threadName)s] : %(message)s',
level=logging.INFO)
        # create a handler that also writes the log to a file
self.file_handler = logging.FileHandler(filename)
self.file_handler.setLevel(logging.INFO)
self.logger.addHandler(self.file_handler)
    # return the logger handle
def getLogger(self):
return self.logger
# custom queue class for crawl urls
class linkQuence:
def __init__(self):
        # urls that have already been visited
self.visted = []
        # urls waiting to be visited
self.unVisited = []
    # get the visited url queue
def getVisitedUrl(self):
return self.visted
    # get the unvisited url queue
def getUnvisitedUrl(self):
return self.unVisited
    # add a url to the visited queue
def addVisitedUrl(self, url):
self.visted.append(url)
    # remove a url from the visited queue
def removeVisitedUrl(self, url):
self.visted.remove(url)
    # pop a url from the unvisited queue
def unVisitedUrlDeQuence(self):
try:
return self.unVisited.pop()
except:
return None
    # make sure each url is enqueued (and visited) only once
def addUnvisitedUrl(self, url):
if url != "" and url not in self.visted and url not in self.unVisited:
self.unVisited.insert(0, url)
    # number of visited urls
def getVisitedUrlCount(self):
return len(self.visted)
    # number of unvisited urls
def getUnvistedUrlCount(self):
return len(self.unVisited)
    # check whether the unvisited url queue is empty
def unVisitedUrlsEnmpy(self):
return len(self.unVisited) == 0
class MyCrawler:
def __init__(self,seeds):
        # initialize the current crawl depth
self.current_deepth = 1
        # build a proxy-enabled opener
self.proxy = '119.28.142.148:8888'
self.proxy_handler = ProxyHandler({
'http': 'http://' + self.proxy,
'https': 'https://' + self.proxy
})
self.opener = build_opener(self.proxy_handler)
        # build a linkQuence object and initialize the url queue with the seeds
self.linkQuence = linkQuence()
if isinstance(seeds, str):
self.linkQuence.addUnvisitedUrl(seeds)
if isinstance(seeds, list):
for seed in seeds:
self.linkQuence.addUnvisitedUrl(seed)
# print("Add the seeds url %s to the unvisited url list" % str(self.linkQuence.unVisited))
# logger.info(("Add the seeds url {} to the unvisited url list").format(str(self.linkQuence.unVisited)))
    # main crawling loop
def crawling1(self, seeds):
        # extract hyperlinks
links = self.getHyperLinks(seeds)
logger.info(("Get {} new links").format(len(links)))
def crawling(self, seeds, crawl_deepth):
        # loop while the crawl depth does not exceed crawl_deepth
while self.current_deepth <= crawl_deepth:
            # loop while there are still unvisited links
while not self.linkQuence.unVisitedUrlsEnmpy():
                # pop a url from the head of the unvisited queue
visitUrl = self.linkQuence.unVisitedUrlDeQuence()
#logger.info(("Pop out one url {} from unvisited url list").format(visitUrl))
if visitUrl is None or visitUrl == "":
continue
                # extract the hyperlinks on this page
links = self.getHyperLinks(visitUrl)
logger.info(("Get {} new links").format(len(links)))
                # mark the url as visited
self.linkQuence.addVisitedUrl(visitUrl)
#logger.info(("Visited url count:{} " ).format(str(self.linkQuence.getVisitedUrlCount())))
#logger.info(("Visited deepth: {}").format(str(self.current_deepth)))
                # enqueue the newly found urls
for link in links:
self.linkQuence.addUnvisitedUrl(link)
# print("%d unvisited links:" % len(self.linkQuence.getUnvisitedUrl()))
#logger.info(("{} unvisited links:").format(len(self.linkQuence.getUnvisitedUrl())))
self.current_deepth += 1
    # extract the hyperlinks from the page source
def getHyperLinks(self, url):
got_link = None
links = set()
        soup = self.getPageSource(url)
        # skip pages that could not be downloaded or decoded
        if soup is None:
            return links
for link in soup.find_all('a'):
href = str(link.get('href')).split()
if 'None' in href or 'javascript' in href or href == '/' or 'Mailto:[email protected]' in href:
pass
else:
if len(href) == 1:
if 'http:' in href[0] or 'www' in href[0] or 'shixiseng' in href[0]:
got_link = href[0]
else:
got_link = 'https://www.shixiseng.com' + href[0]
elif len(href) == 2:
got_link = 'https://www.shixiseng.com' + href[0] + href[1]
links.add(got_link)
logger.info(('Get {} new links').format(len(links)))
self.getTitles(links)
return links
    # download and parse the page source
def getPageSource(self, url):
try:
html = self.opener.open(url)
soup = BeautifulSoup(html.read().decode('utf-8'),'lxml')
return soup
        except URLError as e:
            logger.error(('URLError:{}').format(e.reason))
            return None
        except UnicodeEncodeError as e:
            logger.error(('UnicodeEncodeError:{}').format(e.reason))
            return None
def getTitles(self,links):
for link in links:
            # check if the link is already in the database (list() works across
            # pymongo versions, unlike the removed Cursor.count())
            row = list(mongo.Find('url', link))
            if len(row) > 0:
logger.info(('URl {} already in database.').format(link))
else:
# page = request.Request(link, headers=headers)
try:
html = self.opener.open(link)
soup = BeautifulSoup(html.read().decode('utf-8'), 'lxml')
title = str(list(soup.title.children))
pair = {'title': title[2:-2], 'url': link}
mongo.Insert(pair)
logger.info(('Insert title:{},url:{} into database.').format(title[2:-2], link))
except Exception:
pass
def main(seeds):
craw = MyCrawler(seeds)
craw.crawling(seeds,3)
def TargetFunction(socket_queue, locker,seed_link):
with ThreadPoolExecutor(max_workers=4) as executor:
while True:
seconds = random.random()
time.sleep(seconds)
locker.acquire()
#get mongodb connection
locker.release()
# Submit a target function with arguments to the executor
executor.submit(main, seed_link)
# print('seed_link in target function',seed_link)
if __name__ == '__main__':
logger = Logger('crawler_log.txt').getLogger()
seed_link = "https://www.shixiseng.com/"
# create variables for multiprocessing
processes = []
socket_queue = multiprocessing.Queue()
locker = multiprocessing.Lock()
mongo = MongoConnector()
collection = mongo.GetCollection()
# create child processes
for _ in range(4):
# Process.sleep(10)
child_process = Process(target=TargetFunction, args=(socket_queue, locker, seed_link,))
child_process.start()
processes.append(child_process)
mongo.Close()
server.py
from flask import Flask
from flask import jsonify
from pymongo import MongoClient
import logging
# f = open(r'index.html','w')
# f.write(str(soup))
class MongoConnector:
def __init__(self):
# create the connection with mongodb
self.client = MongoClient('localhost', 27017)
self.db = self.client.mydatabase
    # get the collection that holds the crawled (title, url) pairs
    def GetCollection(self):
        self.collection = self.db['craw']
        return self.collection
    # insert one record
    def Insert(self,pair):
        self.client.mydatabase.craw.insert_one(pair)
    # close the database connection
def Close(self):
self.client.close()
    # look up records matching {name: value} and return the cursor
def Find(self,name,value):
row = self.collection.find({name:value})
return row
class Logger():
def __init__(self,filename):
        # create a logger for this module
self.logger = logging.getLogger(__name__)
# logger.setLevel(logging.DEBUG)
# set up logging to file
logging.basicConfig(
format='[%(asctime)s] [%(levelname)s] [%(processName)s] [%(threadName)s] : %(message)s',
level=logging.INFO)
        # create a handler that also writes the log to a file
self.file_handler = logging.FileHandler(filename)
self.file_handler.setLevel(logging.INFO)
self.logger.addHandler(self.file_handler)
    # return the logger handle
def getLogger(self):
return self.logger
app = Flask(__name__)
@app.route('/position/<position_info>', methods=['GET'])
def search(position_info):
links = set()
row = collection.find({'title': {'$regex': position_info}})
result = {}
for item in row:
if item['url'] not in links:
logger.info(('Find a record about {}, url is {}').format(item['title'],item['url']))
#print('item', item) # item is a dict
links.add(item['url'])
result[item['title']] = item['url']
#print('result',result)
response = jsonify(result)
return response
if __name__ == '__main__':
logger = Logger('server_log.txt').getLogger()
mongo = MongoConnector()
collection = mongo.GetCollection()
try:
app.run(host="localhost", port=5002)
except ValueError as e:
        logger.error('Error: {}'.format(e.args))
client.py
import sys
import json
import requests
import logging
import multiprocessing
from multiprocessing import Process
import time
import random
class Logger():
def __init__(self,file_name):
        # create a logger for this module
self.logger = logging.getLogger(__name__)
# logger.setLevel(logging.DEBUG)
# set up logging to file
logging.basicConfig(
format='[%(asctime)s] [%(levelname)s] [%(processName)s] [%(threadName)s] : %(message)s',
level=logging.INFO)
        # create a handler that also writes the log to a file
self.file_handler = logging.FileHandler(file_name)
self.file_handler.setLevel(logging.INFO)
self.logger.addHandler(self.file_handler)
    # return the logger handle
def getLogger(self):
return self.logger
def TargetFunction(socket_queue, locker,position_info):
seconds = random.random()
time.sleep(seconds)
locker.acquire()
locker.release()
try:
request = 'http://{}:{}/position/{}'.format(ipAddress, portNum,position_info)
response = requests.get(request)
result = json.dumps(response.json(), indent=4, sort_keys=True)
logger.info(('Request:/position/{}').format(position_info))
print(("Search results of '{}':").format(position_info))
try:
d = json.loads(result)
for key,value in d.items():
print(key,' You can visit the position here:',value)
logger.info(('Searched {},{} for {}').format(key,value,position_info))
except UnboundLocalError as e:
            logger.error('Error: {}'.format(e.args))
except json.decoder.JSONDecodeError as e:
        logger.error('Error: {}'.format(e.args))
if __name__ == "__main__":
logger = Logger('client_log.txt').getLogger()
position_info = str(sys.argv[1])
ipAddress = 'localhost'
portNum = 5002
# create variables for multiprocessing
processes = []
socket_queue = multiprocessing.Queue()
locker = multiprocessing.Lock()
# create child processes
for _ in range(4):
# Process.sleep(10)
child_process = Process(target=TargetFunction, args=(socket_queue, locker, position_info.strip(),))
child_process.start()
processes.append(child_process)
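To try the whole pipeline: run crawler.py for a while to populate MongoDB, start server.py (it listens on localhost:5002), and then query it with, for example, python client.py 算法, where the argument is any keyword to match against the stored titles.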