Web crawlers have many useful applications in data collection. The internet is an ocean of data, and mastering this tool goes a long way toward gathering more, and more comprehensive, data.
A simple crawler consists of five main parts:
1 spider_main: the scheduling logic that drives the crawl
2 url_manager: the URL manager, responsible for collecting new URLs and deduplicating them. A deeper implementation can use a Redis queue and choose between breadth-first and depth-first crawling; see the sketch after this list.
3 html_parser: the HTML parser, which extracts the target data and the child URLs from each page
4 html_downloader: the part that actually performs the download. A deeper implementation involves cookie handling, HTTPS, request headers, multithreading, scheduled runs, and strategies for coping with anti-crawler measures.
5 html_outputer: writes the results to a database and presents them on the front end
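
For item 2, the deduplicating URL frontier can be moved into Redis so that several crawler processes share one queue. Below is a minimal sketch, assuming a local Redis server and the redis-py package; the key names spider:seen and spider:queue are made up for illustration:

import redis

class RedisUrlManager(object):
    def __init__(self, host='localhost', port=6379):
        self.r = redis.Redis(host=host, port=port)

    def add_new_url(self, url):
        # SADD on the "seen" set returns 1 only for unseen members,
        # so deduplication and enqueueing happen in one step.
        if url and self.r.sadd('spider:seen', url):
            self.r.rpush('spider:queue', url)

    def has_new_url(self):
        return self.r.llen('spider:queue') > 0

    def get_new_url(self):
        # lpop + rpush gives a FIFO queue, i.e. breadth-first crawling.
        url = self.r.lpop('spider:queue')
        return url.decode('utf-8') if url else None

Switching get_new_url from lpop to rpop turns the breadth-first queue into a depth-first stack, which is the usual way to pick between the two traversal orders.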
The code for each part follows.
1
from spider1 import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    def __init__(self):
        # Wire the four collaborators together.
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownLoader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('craw %d : %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                # self.outputer.collect_data(new_data)
                if count == 10:  # stop after ten pages
                    break
                count = count + 1
            except Exception:  # a bare except would also swallow KeyboardInterrupt
                print('craw failed')
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "https://baike.baidu.com/item/Python/407313"
    spider = SpiderMain()
    spider.craw(root_url)
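
The loop above fetches one page at a time. As a sketch of the multithreading mentioned in part 4, the downloads for a batch of URLs could be parallelized with a thread pool; craw_batch below is a hypothetical helper, not part of the original design, and it ignores the thread safety of UrlManager for brevity:

from concurrent.futures import ThreadPoolExecutor

def craw_batch(spider, urls):
    # Fetch a batch of pages concurrently, then parse them sequentially.
    with ThreadPoolExecutor(max_workers=4) as pool:
        pages = list(pool.map(spider.downloader.download, urls))
    for url, html_cont in zip(urls, pages):
        if html_cont is None:
            continue
        new_urls, _ = spider.parser.parse(url, html_cont)
        spider.urls.add_new_urls(new_urls)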
2

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        # Deduplicate against both the pending and the finished sets.
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, new_urls):
        if new_urls is None or len(new_urls) == 0:
            return
        for url in new_urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
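
A quick interactive check of the manager's deduplication behavior:

manager = UrlManager()
manager.add_new_url('https://baike.baidu.com/item/Python/407313')
manager.add_new_url('https://baike.baidu.com/item/Python/407313')  # duplicate, ignored
print(manager.has_new_url())  # True: one URL is pending
print(manager.get_new_url())  # pops the URL and marks it as crawled
print(manager.has_new_url())  # False: new_urls is empty again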
3

import re

from bs4 import BeautifulSoup


class HtmlParser(object):
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return None, None
        # html.parser detects the encoding of byte input automatically.
        soup = BeautifulSoup(html_cont, 'html.parser')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        # Collect links to other Baike entries, e.g. href="/item/...".
        new_urls = set()
        url_nodes = soup.find_all('a', href=re.compile(r"/item/"))
        for url_node in url_nodes:
            if url_node.has_attr('data-lemmaid'):
                new_full_url = "https://baike.baidu.com" + url_node['href'] + "/" + url_node['data-lemmaid']
            else:
                new_full_url = "https://baike.baidu.com" + url_node['href']
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        # The entry title lives in <dd class="lemmaWgt-lemmaTitle-title"><h1>.
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        title = title_node.get_text()
        # Body paragraphs are <div class="para"> elements.
        content_nodes = soup.find_all('div', class_='para')
        content_string = ''.join(node.get_text() for node in content_nodes)
        title_content = {'title': title, 'content': content_string}
        # return title_content
        return content_string
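
A quick way to exercise HtmlParser without hitting the network is to feed it a snippet that mimics the Baike markup; the HTML below is a made-up example:

sample_html = '''
<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
<div class="para">An interpreted, high-level language.</div>
<a href="/item/Guido" data-lemmaid="42">Guido</a>
'''
parser = HtmlParser()
new_urls, new_data = parser.parse('https://baike.baidu.com/item/Python', sample_html)
print(new_urls)  # {'https://baike.baidu.com/item/Guido/42'}
print(new_data)  # 'An interpreted, high-level language.'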
4

import urllib.request


class HtmlDownLoader(object):
    def download(self, url):
        if url is None:
            return None
        # Note: URLs containing non-ASCII characters must be percent-encoded
        # (urllib.parse.quote) before being passed to urlopen; the original
        # url.encode('utf-8').decode('utf-8') was a no-op.
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            # The status check must use the response object itself;
            # response.read() returns bytes, which has no getcode().
            return None
        return response.read()
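
The deeper concerns from part 4, such as request headers and cookies, can be handled with the standard library alone. A minimal sketch, assuming nothing beyond urllib and http.cookiejar; the User-Agent string is an arbitrary example:

import http.cookiejar
import urllib.request

class HeaderedDownloader(object):
    def __init__(self):
        # The opener stores cookies set by the server and resends them
        # on subsequent requests.
        cookie_jar = http.cookiejar.CookieJar()
        self.opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(cookie_jar))

    def download(self, url):
        if url is None:
            return None
        # Send a browser-like User-Agent, a common first step against
        # simple anti-crawler checks.
        request = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (compatible; demo-spider/0.1)',
        })
        response = self.opener.open(request, timeout=10)
        if response.getcode() != 200:
            return None
        return response.read()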
5

class HtmlOutputer(object):
    def collect_data(self, new_data):
        # Left as a stub: accumulate the parsed records here.
        pass

    def output_html(self):
        # Left as a stub: write the collected data to a database or an HTML page.
        pass

The output after running: