Scraping Baidu Baike entries with a Python crawler

1. Single-threaded version

Python version: 3.6
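Both versions depend on the third-party packages requests and beautifulsoup4 (installable with pip install requests beautifulsoup4); everything else comes from the standard library.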

#!/usr/bin/env python3
#coding=utf-8

import requests
from bs4 import BeautifulSoup
import re
import codecs
from urllib.parse import urljoin
import time


class UrlManager(object):
    """Tracks URLs waiting to be crawled and URLs already crawled."""
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        # Only queue URLs we have not seen before.
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def hasUrl(self):
        return len(self.new_urls) > 0

    def getURL(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def add_new_urls(self, links):
        for link in links:
            self.add_new_url(link)


class CrawlerManage(object):
    USER_AGENT = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) '
                  'AppleWebKit/535.11 (KHTML, like Gecko) '
                  'Chrome/17.0.963.56 Safari/535.11')

    def requestURL(self, url):
        r = requests.get(url, headers={'User-Agent': self.USER_AGENT})
        if r.status_code != 200:
            return None, None
        return r.encoding, r.text

    def analyzeLinks(self, page_url, content):
        # Collect every /item/... link on the page and resolve it
        # against the current page URL.
        fullNewLinks = set()
        soup = BeautifulSoup(content, 'html.parser')
        links = soup.find_all('a', href=re.compile(r"/item/\S+"))
        for link in links:
            new_url = link['href']
            new_full_url = urljoin(page_url, new_url)
            fullNewLinks.add(new_full_url)
        return fullNewLinks


class RunService(object):
    def __init__(self, rootUrl):
        self.urlManager = UrlManager()
        self.urlManager.add_new_url(rootUrl)
        self.crawlerManage = CrawlerManage()

    def run(self, num):
        n = 0
        while self.urlManager.hasUrl():
            url = self.urlManager.getURL()
            encoding, content = self.crawlerManage.requestURL(url)
            if content is None:
                continue
            # Save each page under a sequential file name
            # (the e:/baike directory must already exist).
            with codecs.open("e:/baike/" + str(n) + ".html", 'w+', encoding) as f:
                f.write(content)
            links = self.crawlerManage.analyzeLinks(url, content)
            self.urlManager.add_new_urls(links)
            if n == num:
                break
            n = n + 1


if __name__ == '__main__':
    # https://baike.baidu.com/item/%E5%88%98%E4%BA%A6%E8%8F%B2/136156
    rootUrl = "https://baike.baidu.com/item/刘亦菲/136156"
    runService = RunService(rootUrl)
    startTime = time.time()
    print('start crawling')
    runService.run(1000)
    endTime = time.time()
    longtime = endTime - startTime
    print('crawl finished, elapsed %s s' % str(longtime))
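For reference, the /item/... hrefs matched by analyzeLinks are root-relative, and urljoin resolves them against the current page URL. A tiny demo (the percent-encoded entry name below is just an illustrative example, not taken from the crawl):

from urllib.parse import urljoin

page = "https://baike.baidu.com/item/刘亦菲/136156"
print(urljoin(page, "/item/%E8%AF%8D%E6%9D%A1"))
# -> https://baike.baidu.com/item/%E8%AF%8D%E6%9D%A1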


2. Multi-process version

#!/usr/bin/env python3
#coding=utf-8

import requests
from bs4 import BeautifulSoup
import re
import codecs
from urllib.parse import urljoin, urlparse, unquote
import time
import os
from multiprocessing import Pool

class UrlManager(object):
    """Tracks URLs waiting to be crawled and URLs already crawled."""
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        # Only queue URLs we have not seen before.
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def hasUrl(self):
        return len(self.new_urls) > 0

    def getURL(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def add_new_urls(self, links):
        for link in links:
            self.add_new_url(link)


class CrawlerManage(object):
    USER_AGENT = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) '
                  'AppleWebKit/535.11 (KHTML, like Gecko) '
                  'Chrome/17.0.963.56 Safari/535.11')

    def requestURL(self, url):
        r = requests.get(url, headers={'User-Agent': self.USER_AGENT})
        if r.status_code != 200:
            return None, None
        return r.encoding, r.text

    def analyzeLinks(self, page_url, content):
        # Collect every /item/... link on the page and resolve it
        # against the current page URL.
        fullNewLinks = set()
        soup = BeautifulSoup(content, 'html.parser')
        links = soup.find_all('a', href=re.compile(r"/item/\S+"))
        for link in links:
            new_url = link['href']
            new_full_url = urljoin(page_url, new_url)
            fullNewLinks.add(new_full_url)
        return fullNewLinks


class RunService(object):
    def __init__(self, rootUrl, num=1000):
        self.urlManager = UrlManager()
        self.urlManager.add_new_url(rootUrl)
        self.crawlerManage = CrawlerManage()
        self.num = num
        self.n = 0

    def run(self):
        while self.urlManager.hasUrl():
            try:
                url = self.urlManager.getURL()
                encoding, content = self.crawlerManage.requestURL(url)
                if content is None:
                    continue
                # Name the file after the last (decoded) path component.
                fileName = unquote(urlparse(url).path.split("/")[-1])
                with codecs.open("e:/baike1/" + fileName + ".html", 'w+', encoding) as f:
                    f.write(content)
                links = self.crawlerManage.analyzeLinks(url, content)
                self.urlManager.add_new_urls(links)
                if self.n >= self.num:
                    break
                self.n = self.n + 1
            except Exception as e:
                print(e)

def runTask(runService, name):
    print('Run task %s (%s) (%s)...' % (name, os.getpid(), os.getppid()))
    print('runService:%s' % id(runService))
    start = time.time()
    while runService.urlManager.hasUrl():
        try:
            url = runService.urlManager.getURL()
            encoding, content = runService.crawlerManage.requestURL(url)
            if content is None:
                continue
            fileName = urlparse(unquote(url)).path.split("/")[-1]
            with codecs.open("e:/baike1/" + fileName + ".html", 'w+', encoding) as f:
                f.write(content)
            links = runService.crawlerManage.analyzeLinks(url, content)
            runService.urlManager.add_new_urls(links)
            if runService.n >= runService.num:
                break
            runService.n = runService.n + 1
        except Exception as e:
            print(e)
    end = time.time()
    print('Task %s runs %0.2f seconds.' % (name, (end - start)))

if __name__ == '__main__':
    # https://baike.baidu.com/item/%E5%88%98%E4%BA%A6%E8%8F%B2/136156
    rootUrl = "https://baike.baidu.com/item/刘亦菲/136156"
    runService = RunService(rootUrl, 100)
    startTime = time.time()
    print('start crawling')
    p = Pool(10)
    for i in range(10):
        # Note: apply_async pickles its arguments, so each worker process
        # receives its own independent copy of runService.
        p.apply_async(runTask, args=(runService, i))
    p.close()
    p.join()
    endTime = time.time()
    longtime = endTime - startTime
    print('crawl finished, elapsed %s s' % str(longtime))
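Because Pool.apply_async pickles its arguments, each of the ten workers above starts from an identical, independent copy of runService: the new_urls/old_urls sets are not shared, so every process re-crawls the same pages. A minimal sketch of one way to share the frontier between processes, using multiprocessing.Manager (the function name shared_crawl and the 4-worker pool are illustrative assumptions, not part of the original code):

from multiprocessing import Manager, Pool

def shared_crawl(new_urls, old_urls, lock, name):
    # new_urls and old_urls are manager-backed lists visible to every worker.
    while True:
        with lock:
            if len(new_urls) == 0:
                break
            url = new_urls.pop()
            old_urls.append(url)
        print('worker %s crawling %s' % (name, url))
        # ...fetch `url` and extract links as CrawlerManage does, then
        # re-queue unseen links under the lock:
        # with lock:
        #     for link in links:
        #         if link not in new_urls and link not in old_urls:
        #             new_urls.append(link)

if __name__ == '__main__':
    manager = Manager()
    new_urls = manager.list(["https://baike.baidu.com/item/刘亦菲/136156"])
    old_urls = manager.list()
    lock = manager.Lock()
    p = Pool(4)
    for i in range(4):
        p.apply_async(shared_crawl, args=(new_urls, old_urls, lock, i))
    p.close()
    p.join()

Every access to a manager proxy is a round-trip to the manager process, so this design trades raw speed for a correctly shared, deduplicated queue.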
