1. Single-threaded version
Python version: 3.6
#!/usr/bin/env python3
# coding=utf-8
import codecs
import os
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
class UrlManager(object):
    """Tracks the crawl frontier: new_urls waiting to be fetched, old_urls already fetched."""

    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def hasUrl(self):
        return len(self.new_urls) > 0

    def getURL(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def add_new_urls(self, links):
        for link in links:
            self.add_new_url(link)
class CrawlerManage(object):
    USER_AGENT = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) '
                  'AppleWebKit/535.11 (KHTML, like Gecko) '
                  'Chrome/17.0.963.56 Safari/535.11')

    def requestURL(self, url):
        r = requests.get(url, headers={'User-Agent': self.USER_AGENT}, timeout=10)
        if r.status_code != 200:
            return None, None  # keep the tuple shape so callers can unpack safely
        return r.encoding, r.text

    def analyzeLinks(self, page_url, content):
        """Extract all /item/... links from the page and resolve them to absolute URLs."""
        fullNewLinks = set()
        soup = BeautifulSoup(content, 'html.parser')
        links = soup.find_all('a', href=re.compile(r"/item/\S+"))
        for link in links:
            new_full_url = urljoin(page_url, link['href'])
            fullNewLinks.add(new_full_url)
        return fullNewLinks
class RunService(object):
    def __init__(self, rootUrl):
        self.urlManager = UrlManager()
        self.urlManager.add_new_url(rootUrl)
        self.crawlerManage = CrawlerManage()

    def run(self, num):
        os.makedirs("e:/baike", exist_ok=True)
        n = 0
        while self.urlManager.hasUrl():
            url = self.urlManager.getURL()
            encoding, content = self.crawlerManage.requestURL(url)
            if content is None:
                continue  # skip pages that did not return 200
            with codecs.open("e:/baike/" + str(n) + ".html", 'w+', encoding) as f:
                f.write(content)
            links = self.crawlerManage.analyzeLinks(url, content)
            self.urlManager.add_new_urls(links)
            if n == num:
                break
            n = n + 1
if __name__ == '__main__':
    # https://baike.baidu.com/item/%E5%88%98%E4%BA%A6%E8%8F%B2/136156
    rootUrl = "https://baike.baidu.com/item/刘亦菲/136156"
    runService = RunService(rootUrl)
    startTime = time.time()
    print('start server')
    runService.run(1000)
    endTime = time.time()
    print('server exit(0), used %.2f s' % (endTime - startTime))
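Each requests.get call above opens a fresh connection. As a minimal sketch (assuming the CrawlerManage class defined above; the subclass name SessionCrawlerManage is hypothetical), here is a drop-in variant that reuses a single requests.Session so connection setup is amortized across the whole crawl:

class SessionCrawlerManage(CrawlerManage):
    # A requests.Session keeps a connection pool, so repeated requests to
    # baike.baidu.com reuse the same TCP connection instead of reconnecting.
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': self.USER_AGENT})

    def requestURL(self, url):
        r = self.session.get(url, timeout=10)
        if r.status_code != 200:
            return None, None
        return r.encoding, r.text

To try it, RunService.__init__ would construct SessionCrawlerManage() instead of CrawlerManage(); nothing else needs to change.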
2. Multiprocess version
#!/usr/bin/env python3
# coding=utf-8
import codecs
import os
import re
import time
from urllib.parse import urljoin, urlparse, unquote

import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool
class UrlManager(object):
    """Tracks the crawl frontier: new_urls waiting to be fetched, old_urls already fetched."""

    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def hasUrl(self):
        return len(self.new_urls) > 0

    def getURL(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def add_new_urls(self, links):
        for link in links:
            self.add_new_url(link)
class CrawlerManage(object):
    USER_AGENT = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) '
                  'AppleWebKit/535.11 (KHTML, like Gecko) '
                  'Chrome/17.0.963.56 Safari/535.11')

    def requestURL(self, url):
        r = requests.get(url, headers={'User-Agent': self.USER_AGENT}, timeout=10)
        if r.status_code != 200:
            return None, None  # keep the tuple shape so callers can unpack safely
        return r.encoding, r.text

    def analyzeLinks(self, page_url, content):
        """Extract all /item/... links from the page and resolve them to absolute URLs."""
        fullNewLinks = set()
        soup = BeautifulSoup(content, 'html.parser')
        links = soup.find_all('a', href=re.compile(r"/item/\S+"))
        for link in links:
            new_full_url = urljoin(page_url, link['href'])
            fullNewLinks.add(new_full_url)
        return fullNewLinks
class RunService(object):
    def __init__(self, rootUrl, num=1000):
        self.urlManager = UrlManager()
        self.urlManager.add_new_url(rootUrl)
        self.crawlerManage = CrawlerManage()
        self.num = num
        self.n = 0

    def run(self):
        os.makedirs("e:/baike1", exist_ok=True)
        while self.urlManager.hasUrl():
            try:
                url = self.urlManager.getURL()
                encoding, content = self.crawlerManage.requestURL(url)
                if content is None:
                    continue
                # name the file after the (decoded) last path segment of the URL
                fileName = unquote(urlparse(url).path.split("/")[-1])
                with codecs.open("e:/baike1/" + fileName + ".html", 'w+', encoding) as f:
                    f.write(content)
                links = self.crawlerManage.analyzeLinks(url, content)
                self.urlManager.add_new_urls(links)
                if self.n >= self.num:
                    break
                self.n = self.n + 1
            except Exception as e:
                print(e)
def runTask(runService, name):
    # Note: each worker process receives its own pickled copy of runService,
    # so the URL sets are not shared across processes and pages can be
    # fetched more than once; the id() print below makes the copies visible.
    print('Run task %s (pid %s, ppid %s)...' % (name, os.getpid(), os.getppid()))
    print('runService: %s' % id(runService))
    start = time.time()
    os.makedirs("e:/baike1", exist_ok=True)
    while runService.urlManager.hasUrl():
        try:
            url = runService.urlManager.getURL()
            encoding, content = runService.crawlerManage.requestURL(url)
            if content is None:
                continue
            fileName = unquote(urlparse(url).path.split("/")[-1])
            with codecs.open("e:/baike1/" + fileName + ".html", 'w+', encoding) as f:
                f.write(content)
            links = runService.crawlerManage.analyzeLinks(url, content)
            runService.urlManager.add_new_urls(links)
            if runService.n >= runService.num:
                break
            runService.n = runService.n + 1
        except Exception as e:
            print(e)
    end = time.time()
    print('Task %s runs %0.2f seconds.' % (name, (end - start)))
if __name__ == '__main__':
    # https://baike.baidu.com/item/%E5%88%98%E4%BA%A6%E8%8F%B2/136156
    rootUrl = "https://baike.baidu.com/item/刘亦菲/136156"
    runService = RunService(rootUrl, 100)
    startTime = time.time()
    print('start server')
    p = Pool(10)
    for i in range(10):
        p.apply_async(runTask, args=(runService, i))
    p.close()
    p.join()
    endTime = time.time()
    print('server exit(0), used %.2f s' % (endTime - startTime))
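One caveat the id() prints make visible: Pool pickles runService separately into each worker, so the ten processes each hold a private copy of the URL sets and will re-fetch each other's pages. Below is a minimal sketch of one way to actually share the frontier across workers, using the standard multiprocessing.Manager. The function name sharedWorker, the 5-second idle timeout, and the page limit are illustrative assumptions; writing files to disk is omitted for brevity, and CrawlerManage is the class defined above:

import queue
from multiprocessing import Manager, Pool

def sharedWorker(task_q, seen, max_pages, name):
    # task_q and seen are Manager proxies, so every worker observes the same
    # frontier queue and the same set of already-fetched URLs.
    crawler = CrawlerManage()
    while len(seen) < max_pages:
        try:
            url = task_q.get(timeout=5)  # treat 5 s of idleness as "crawl finished"
        except queue.Empty:
            break
        if url in seen:
            continue
        seen[url] = True  # best-effort dedup; a rare race only costs one re-fetch
        try:
            encoding, content = crawler.requestURL(url)
            if content is None:
                continue
            for link in crawler.analyzeLinks(url, content):
                if link not in seen:
                    task_q.put(link)
        except Exception as e:
            print('worker %s: %s' % (name, e))

if __name__ == '__main__':
    manager = Manager()
    task_q = manager.Queue()  # Manager queues, unlike raw multiprocessing.Queue, can be passed through a Pool
    seen = manager.dict()
    task_q.put("https://baike.baidu.com/item/刘亦菲/136156")
    p = Pool(10)
    for i in range(10):
        p.apply_async(sharedWorker, args=(task_q, seen, 100, i))
    p.close()
    p.join()
    print('fetched %d pages' % len(seen))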