Based on the example from the imooc course at http://www.imooc.com/learn/563.
What follows is my own step-by-step analysis, which eventually came together into a cleanly modularized program. The analysis felt a bit confusing at first, since I'm not yet used to this kind of work; more practice will fix that. Every bit of progress is hard-won, which makes it all the more rewarding. Now, on to the program.
0. Crawl targets
1) the title and summary of a Baidu Baike entry;
2) the titles and summaries of the entry pages it links to;
3) 1,000 pages in total.
1. Tools and environment
PyCharm, Windows 10, Python 3.6; libraries: requests + BeautifulSoup + re
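The third-party libraries can be installed with pip; this assumes a standard Python 3.6 setup (lxml is the parser handed to BeautifulSoup below):

pip install requests beautifulsoup4 lxml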
2. Structure of the crawler
Following object-oriented design, the crawler is split into the following modules:
1) the main scheduler, spider_main.py;
2) the URL manager, url_manager.py;
3) the HTML downloader, html_downloader.py;
4) the HTML parser (BeautifulSoup + regular expressions), html_parser.py;
5) the result outputer, html_outputer.py.
3. Python code
1) The main scheduler, spider_main.py:
# coding=utf-8
import url_manager
import html_downloader
import html_parser
import html_outputer


class SpiderMain(object):
    # Wire up the four collaborators: UrlManager, HtmlDownloader, HtmlParse, HtmlOutputer.
    def __init__(self):
        self.urls = url_manager.UrlManager()                # URL manager
        self.downloader = html_downloader.HtmlDownloader()  # downloader
        self.parser = html_parser.HtmlParse()               # parser
        self.outputer = html_outputer.HtmlOutputer()        # outputer

    def craw(self, root_url):
        count = 1
        # Seed the URL manager with the root URL.
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('craw %d: %s' % (count, new_url))
                # Have the downloader fetch the page behind this URL.
                html_page = self.downloader.download(new_url)
                # Parse the page into outgoing URLs and a data dict
                # ({'url': ..., 'title': ..., 'summary': ...}).
                new_urls, new_data = self.parser.parse(new_url, html_page)
                # Feed the new URLs back to the manager and hand the data to the outputer.
                self.urls.add_new_urls(new_urls)
                self.outputer.output_html(new_data)
                if count == 10:  # kept small for testing; raise to 1000 for the full crawl
                    break
                count = count + 1
            except Exception:
                print('craw failed')


if __name__ == '__main__':
    root_url = 'http://baike.baidu.com/link?url=iKhdUIHVpllyG6H-jGntGa3wfibBxAxWkLxev-Ekt2kNL6Tyte9w5-59CZNbCyCkTB8u5Aqp89j3P9yjYB97pq'
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
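The root_url above is an opaque redirect link copied from the browser's address bar; any plain entry URL should work as a seed too, for example (a hypothetical entry of my choosing):

root_url = 'https://baike.baidu.com/item/Python'  # hypothetical seed entry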
2) The URL manager, url_manager.py:
# coding=utf-8
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        # Add a single URL, but only if it is in neither set yet.
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        # Parsing one page yields many URLs; add them one by one so each gets deduplicated.
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        # True while there are still URLs left to crawl.
        return len(self.new_urls) != 0

    def get_new_url(self):
        # Pop a URL from new_urls and move it into old_urls so it is never crawled twice.
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
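A quick sanity check of the deduplication behaviour (the URLs are made-up examples):

if __name__ == '__main__':
    manager = UrlManager()
    manager.add_new_url('https://baike.baidu.com/item/Python')
    manager.add_new_urls({'https://baike.baidu.com/item/Python',  # duplicate, ignored
                          'https://baike.baidu.com/item/Java'})
    print(manager.has_new_url())  # True
    print(manager.get_new_url())  # one of the two URLs
    print(manager.get_new_url())  # the other one
    print(manager.has_new_url())  # False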
3) The HTML downloader, html_downloader.py:
import requests
import random
# Pool of User-Agent strings to rotate through, so requests look less uniform.
UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
]
class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        # Pick a fresh User-Agent for every request.
        headers = {'User-Agent': random.choice(UA_LIST)}
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            return None
        response.encoding = 'utf-8'
        return response.text
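For a longer crawl it may be worth adding a timeout and a couple of retries. This is an optional sketch of my own, not part of the course code; the retry count and timeout values are arbitrary choices:

def download_with_retry(url, retries=3, timeout=10):
    # Hypothetical helper: try a few times before giving up.
    if url is None:
        return None
    for _ in range(retries):
        try:
            headers = {'User-Agent': random.choice(UA_LIST)}
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                response.encoding = 'utf-8'
                return response.text
        except requests.RequestException:
            continue  # network error: retry with a different User-Agent
    return None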
4) The HTML parser (BeautifulSoup + regular expressions), html_parser.py:
# coding=utf-8
from bs4 import BeautifulSoup
import re


class HtmlParse(object):
    def parse(self, new_url, html_page):
        if new_url is None or html_page is None:
            return
        soup = BeautifulSoup(html_page, 'lxml')
        new_urls = self.get_new_urls(new_url, soup)
        new_data = self.get_new_data(new_url, soup)
        return new_urls, new_data

    def get_new_urls(self, new_url, soup):
        base_url = 'https://baike.baidu.com/item/'
        new_urls = set()
        # Entry links inside the main content all have hrefs of the form /item/...
        links = soup.find('div', class_='main-content').find_all('a', href=re.compile(r'/item/.*?'))
        for link in links:
            # Rebuild each entry URL from the link text (the entry name).
            new_urls.add(base_url + link.get_text())
        return new_urls

    def get_new_data(self, new_url, soup):
        res_data = {}
        res_data['url'] = new_url
        # Entry title and summary, as rendered on the page.
        title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').h1.get_text()
        summary = soup.find('div', class_='lemma-summary').get_text()
        res_data['title'] = title
        res_data['summary'] = summary
        return res_data
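Building URLs from the link text works here because Baike entry URLs are just /item/ plus the entry name, but it ignores what the href actually says. A variation of my own (not from the course) resolves each href against the current page instead:

from urllib.parse import urljoin

def get_new_urls_from_href(new_url, soup):
    # Variation on get_new_urls: follow the href attribute directly.
    new_urls = set()
    links = soup.find('div', class_='main-content').find_all('a', href=re.compile(r'/item/'))
    for link in links:
        # urljoin turns the relative /item/... path into an absolute URL.
        new_urls.add(urljoin(new_url, link['href']))
    return new_urls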
5) The result outputer, html_outputer.py:
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, datas):
        # Optional: buffer parsed records instead of writing them immediately.
        if datas is None:
            return
        for data in datas:
            self.datas.append(data)

    def output_html(self, new_data):
        # Append one record to output.html as a small HTML table.
        f = open('output.html', 'a+', encoding='utf-8')
        f.write('<html>')
        f.write('<head>')
        f.write('<meta charset="utf-8">')
        f.write('</head>')
        f.write('<body>')
        f.write('<table>')
        f.write('<tr>')
        f.write('<td>URL</td>')
        f.write('<td>Title</td>')
        f.write('<td>Summary</td>')
        f.write('</tr>')
        f.write('<tr>')
        f.write('<td>%s</td>' % new_data['url'])
        f.write('<td>%s</td>' % new_data['title'])
        f.write('<td>%s</td>' % new_data['summary'])
        f.write('</tr>')
        f.write('</table>')
        f.write('</body>')
        f.write('</html>')
        f.close()
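Because output_html is called once per crawled page, output.html ends up holding one complete HTML document per record; browsers tolerate this, but it is not well-formed. A CSV outputer sidesteps the issue. This is a sketch of my own, not part of the course code:

import csv

class CsvOutputer(object):
    # Hypothetical drop-in alternative to HtmlOutputer.
    def output_html(self, new_data):
        # Append one row per record; open output.csv later in Excel or pandas.
        with open('output.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([new_data['url'], new_data['title'], new_data['summary']])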
Result: for some reason the screenshot would not upload. Oh well, I won't fuss over it.