Crawling Baidu Baike with Python

Based on an example from an imooc course; course URL: http://www.imooc.com/learn/563
The code below is what I ended up with after working through the analysis step by step. The final program is modular and clearly structured, although the analysis itself felt a little confusing at first since I am still getting used to it. More practice will help; every bit of progress is hard-won, which makes it all the more rewarding. With that said, on to the program.
0. Crawl targets

1) Baidu Baike entry titles and summaries;

2) the titles and summaries of entries on linked pages;

3) 1000 pages crawled in total.

1. Preparation and environment
PyCharm, Windows 10, Python 3.6, requests + BeautifulSoup + re
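If the third-party packages are not installed yet, pip install requests beautifulsoup4 lxml should cover them (lxml is assumed here because the parser below creates BeautifulSoup objects with the 'lxml' parser).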

2. Structure of the crawler

Following object-oriented design, the crawler is split into the following modules:
1) the main scheduler, spider_main.py;
2) the URL manager, url_manager.py;
3) the HTML downloader, html_downloader.py;
4) the HTML parser (BeautifulSoup + regular expressions), html_parser.py;
5) the output module, html_outputer.py.
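At run time the scheduler pops a URL from the manager, has the downloader fetch it, lets the parser extract new entry links plus the title/summary data, feeds the links back into the manager, and hands the data to the outputer, repeating until the page limit is reached.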

3. Python code
1) Main scheduler, spider_main.py

# coding=utf-8
import url_manager
import html_downloader
import html_outputer
import html_parser


class SpiderMain(object):
    # wire up the four collaborators: UrlManager, HtmlDownloader, HtmlParser and HtmlOutputer,
    # which are defined in the modules below
    def __init__(self):
        self.urls = url_manager.UrlManager()                  # URL manager
        self.downloader = html_downloader.HtmlDownloader()    # downloader
        self.parser = html_parser.HtmlParse()                 # parser
        self.outputer = html_outputer.HtmlOutputer()          # outputer

    def craw(self, root_url):
        count = 1
        # seed the URL manager with the root URL
        self.urls.add_new_url(root_url)

        while self.urls.has_new_url():   # loop while the new_urls set is non-empty
            try:
                new_url = self.urls.get_new_url()
                print('craw %d: %s' % (count, new_url))
                # have the downloader fetch the page for this URL
                html_page = self.downloader.download(new_url)
                # parse the downloaded page; new_data is a dict with url, title and summary
                new_urls, new_data = self.parser.parse(new_url, html_page)
                # feed the extracted URLs back into the manager and write the data out
                self.urls.add_new_urls(new_urls)
                self.outputer.output_html(new_data)

                if count == 1000:   # stop once the target of 1000 pages is reached
                    break
                count = count + 1
            except Exception:
                # skip pages that fail to download or parse and carry on
                print('craw failed')


if __name__ == '__main__':
    root_url = 'http://baike.baidu.com/link?url=iKhdUIHVpllyG6H-jGntGa3wfibBxAxWkLxev-Ekt2kNL6Tyte9w5-59CZNbCyCkTB8u5Aqp89j3P9yjYB97pq'
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
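With all five modules in the same directory, running spider_main.py starts the crawl from root_url; each successfully parsed page is appended to output.html by the outputer shown below.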

2) URL manager, url_manager.py

# coding=utf-8
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs that have already been crawled

    def add_new_url(self, url):
        # add a single URL, but only if it is in neither new_urls nor old_urls
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        # parsing one page yields many URLs; add them to new_urls one by one
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        # True while the new_urls set is non-empty
        return len(self.new_urls) != 0

    def get_new_url(self):
        # take one URL out of new_urls and move it into old_urls
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
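A quick interactive check of the manager's deduplication (a minimal sketch; the two entry URLs are made up for illustration):

manager = UrlManager()
manager.add_new_url('https://baike.baidu.com/item/Python')
manager.add_new_urls(['https://baike.baidu.com/item/Python',   # duplicate, silently ignored
                      'https://baike.baidu.com/item/爬虫'])
print(manager.has_new_url())    # True
url = manager.get_new_url()     # the URL is moved into old_urls
manager.add_new_url(url)        # already crawled, so it is not re-added
print(len(manager.new_urls))    # 1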

3) HTML downloader, html_downloader.py

import requests
import random

# pool of User-Agent strings; one is chosen at random so requests look less uniform
UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
headers = {'User-Agent': random.choice(UA_LIST)}


# returns the page content for a given URL
class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            return None
        response.encoding = 'utf-8'
        return response.text
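Note that headers is built once at import time, so every request in a run reuses the same randomly chosen User-Agent; calling random.choice inside download would vary it per request. A minimal smoke test of the downloader, assuming network access (the entry URL is only an example):

downloader = HtmlDownloader()
page = downloader.download('https://baike.baidu.com/item/Python')
print(page[:200] if page else 'download failed')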

4) HTML parser (BeautifulSoup + regular expressions), html_parser.py

# coding=utf-8
from bs4 import BeautifulSoup
import re


class HtmlParse(object):

    def parse(self, new_url, html_page):
        if new_url is None or html_page is None:
            return

        soup = BeautifulSoup(html_page, 'lxml')
        new_urls = self.get_new_urls(new_url, soup)
        new_data = self.get_new_data(new_url, soup)
        return new_urls, new_data

    def get_new_urls(self, new_url, soup):
        # collect links to other entries; the entry URL is rebuilt from the link text
        base_url = 'https://baike.baidu.com/item/'
        new_urls = set()
        links = soup.find('div', class_='main-content').find_all('a', href=re.compile(r'/item/.*?'))
        for link in links:
            full_url = base_url + link.get_text()   # the actual entry URL
            new_urls.add(full_url)
        return new_urls

    def get_new_data(self, new_url, soup):
        # extract the entry title and summary into a dict
        res_data = {}
        res_data['url'] = new_url
        title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').h1.get_text()
        summary = soup.find('div', class_='lemma-summary').get_text()
        res_data['title'] = title
        res_data['summary'] = summary
        return res_data
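The parser can be exercised offline against a hand-written fragment that mimics the Baidu Baike markup it expects (a minimal sketch; assumes lxml is installed):

sample_html = '''
<html><body>
  <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
  <div class="lemma-summary">Python is a programming language.</div>
  <div class="main-content"><a href="/item/爬虫">爬虫</a></div>
</body></html>
'''
parser = HtmlParse()
urls, data = parser.parse('https://baike.baidu.com/item/Python', sample_html)
print(urls)             # {'https://baike.baidu.com/item/爬虫'}
print(data['title'])    # Python
print(data['summary'])  # Python is a programming language.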

5) Output module, html_outputer.py

class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, dataes):
        # optionally accumulate parsed records in memory
        if dataes is None:
            return
        for data in dataes:
            self.datas.append(data)

    def output_html(self, new_data):
        # append one table row per crawled page to output.html;
        # the file is opened in append mode, so rows accumulate across calls
        f = open('output.html', 'a+', encoding='utf-8')
        f.write('<table border="1">')
        f.write('<tr><th>链接</th><th>标题</th><th>简介</th></tr>')
        f.write('<tr>')
        f.write('<td>%s</td>' % new_data['url'])
        f.write('<td>%s</td>' % new_data['title'])
        f.write('<td>%s</td>' % new_data['summary'])
        f.write('</tr>')
        f.write('</table>')
        f.close()
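Calling the outputer directly shows what one record produces (hypothetical data; output.html keeps growing because of the append mode, so delete it between runs for a fresh table):

outputer = HtmlOutputer()
outputer.output_html({'url': 'https://baike.baidu.com/item/Python',
                      'title': 'Python',
                      'summary': 'Python is a programming language.'})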

Result: for some reason the screenshots of the output would not upload, so I will leave them out rather than fuss over it.
