python 爬虫抓取奥数题

__author__ = 'bloodchilde'



import  urllib
import urllib2
import  re
import os

import sys
# Python 2 only: make UTF-8 the interpreter-wide default codec used for
# implicit str <-> unicode conversions (e.g. concatenating or writing the
# decoded page text below).
# NOTE(review): reload(sys) is presumably needed because Python 2 removes
# sys.setdefaultencoding during start-up -- confirm against the sys docs.
reload(sys)
sys.setdefaultencoding( "utf-8" )


class Spider:
    """Crawler for the olympiad-math problem bank at aoshu.juren.com.

    Workflow (see ``getAoShuTi``): read the index page to find one listing
    URL per grade, walk every listing page of a grade, open each problem
    page, follow its link to the answer page and append the
    question/answer pair to ``<outputDir>/<grade>/test.txt``.

    All HTML is parsed with the regular expressions below; they are
    compiled once at class-definition time instead of on every call.
    """

    # Output root used when the caller does not supply one (the location
    # the original script always wrote to).
    DEFAULT_OUTPUT_DIR = u"C:/Users/bloodchilde/Desktop/image_python/"

    # Index page: (grade listing URL, grade name).
    _SITE_PATTERN = re.compile(
        r'<div.*?class="news".*?>.*?<h3.*?class="newdot".*?>.*?'
        r'<a.*?href="(.*?)".*?>(.*?)</a>.*?</h3>.*?</div>', re.S)

    # Listing page: (problem URL, problem title).
    _LISTING_PATTERN = re.compile(
        r'<li>.*?<span.*?class="left".*?>.*?'
        r'<a.*?href="(.*?)".*?target="_blank".*?>(.*?)</a>.*?</span>.*?</li>',
        re.S)

    # Listing page pager: content of the second <strong> inside
    # <div id="pages">, which the crawler treats as the total page count.
    _PAGER_PATTERN = re.compile(
        r'<div.*?id="pages".*?>.*?<strong>.*?</strong>.*?'
        r'<strong>(.*?)</strong>.*?</div>', re.S)

    # Problem page: (question text, answer page URL).
    _QUESTION_PATTERN = re.compile(
        r'<div.*?class="mainContent".*?>.*?<p>'
        r'.*?</p>.*?<p>(.*?)</p>.*?<div.*?id="page".*?>.*?'
        r'<span.*?class="current".*?>.*?</span>.*?'
        r'<a.*?href="(.*?)".*?>.*?</a>.*?</div>.*?</div>', re.S)

    # Answer page: text of the second <p> inside the main content div.
    _ANSWER_PATTERN = re.compile(
        r'<div.*?class="mainContent".*?>.*?<p>.*?</p>.*?<p>(.*?)</p>.*?</div>',
        re.S)

    def __init__(self, outputDir=None):
        """Set up the start URL, request headers and output directory.

        :param outputDir: root directory under which one sub-directory per
            grade is created; defaults to ``DEFAULT_OUTPUT_DIR``.
        """
        self.siteUrl = "http://aoshu.juren.com/tiku/xiaoxueaoshu/"
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
        self.headers = {'User-Agent': self.user_agent}
        self.outputDir = self.DEFAULT_OUTPUT_DIR if outputDir is None else outputDir

    def getPageContent(self, url):
        """Download ``url`` and return its body decoded as UTF-8 text."""
        request = urllib2.Request(url, headers=self.headers)
        response = urllib2.urlopen(request)
        try:
            return response.read().decode("utf-8")
        finally:
            # Release the connection even when read/decode fails.
            response.close()

    def getSiteContent(self):
        """Return ``[[listing_url, grade_name], ...]`` from the index page."""
        html = self.getPageContent(self.siteUrl)
        return [[href, grade] for href, grade in self._SITE_PATTERN.findall(html)]

    def enterAoshuTiPage(self, gradeLevel, url):
        """Parse one listing page.

        :param gradeLevel: grade name; accepted for interface compatibility
            but not used by the parser.
        :param url: URL of the listing page.
        :returns: ``(contents, pageCount)`` where ``contents`` is a list of
            ``[problem_url, problem_title]`` and ``pageCount`` is the pager
            text (a string). When the page has no pager the listing is
            assumed to consist of a single page and ``u"1"`` is returned
            instead of raising ``IndexError``.
        """
        html = self.getPageContent(url)
        contents = [[href, title]
                    for href, title in self._LISTING_PATTERN.findall(html)]
        pageCounts = self._PAGER_PATTERN.findall(html)
        pageCount = pageCounts[0] if pageCounts else u"1"
        return contents, pageCount

    def getAoshutiContent(self, url, path):
        """Append every question/answer pair found at ``url`` to a file.

        The pairs are printed and appended to ``<path>/test.txt``. The file
        is written as UTF-8 explicitly (no reliance on the process-wide
        default encoding), line endings are written exactly as given, and
        the handle is closed even if fetching an answer raises.
        """
        import io  # local import: the module header does not import io

        html = self.getPageContent(url)
        items = self._QUESTION_PATTERN.findall(html)
        target = os.path.join(path, u"test.txt")

        with io.open(target, 'a', encoding='utf-8', newline='') as f:
            for timu, daAnUrl in items:
                daAnContent = self.getDaAn(daAnUrl)
                print(u"question:" + timu)
                print(u"answer:" + daAnContent)
                f.write(u"question:" + timu + u"\r\n" +
                        u"answer:" + daAnContent + u"\r\n\r\n\r\n\r\n")

    def mk_dir(self, path):
        """Create ``path`` if missing; return True if it was created."""
        if os.path.exists(path):
            return False
        os.makedirs(path)
        return True

    def getDaAn(self, url):
        """Return the answer text from the answer page at ``url``.

        Returns an empty string when the page does not match the expected
        layout, so that one malformed page does not abort the whole crawl.
        """
        items = self._ANSWER_PATTERN.findall(self.getPageContent(url))
        return items[0] if items else u""

    def _saveListing(self, contents, path):
        """Print each problem name in ``contents`` and store its Q/A pair."""
        for problemUrl, title in contents:
            parts = title.split(":")
            # Titles normally look like "<prefix>:<name>"; fall back to the
            # whole title when there is no ASCII colon.
            name = parts[1] if len(parts) > 1 else title
            print(name)
            self.getAoshutiContent(problemUrl, path)

    def getAoShuTi(self, grade, url):
        """Crawl every listing page of one grade and save all problems.

        :param grade: grade name, used as the output sub-directory name.
        :param url: URL of the grade's first listing page; page ``k`` (k > 1)
            is fetched from ``<url>/index_<k>.html``.

        The pager value is treated as the total number of pages, so pages
        ``1..pageCount`` inclusive are processed (the last page used to be
        skipped, and a single-page grade produced no output at all).
        """
        path = os.path.join(self.outputDir, grade)
        self.mk_dir(path)

        contents, pageCount = self.enterAoshuTiPage(grade, url)
        totalPages = max(int(pageCount), 1)
        # Avoid a doubled slash when the grade URL already ends with "/".
        baseUrl = url.rstrip("/")

        for pageIndex in range(1, totalPages + 1):
            if pageIndex > 1:
                pageUrl = baseUrl + u"/index_" + str(pageIndex) + u".html"
                contents, _ = self.enterAoshuTiPage(grade, pageUrl)
            self._saveListing(contents, path)



# Entry point: crawl every grade listed on the index page, one after another.
demo = Spider()

contents = demo.getSiteContent()

# Each entry is a [listing_url, grade_name] pair.
for url, grade in contents:
    print(grade)
    print("---------------------------------------")
    demo.getAoShuTi(grade, url)





分析:

1. 奥数题题源网页地址:http://aoshu.juren.com/tiku/xiaoxueaoshu/

2. 分析该页面源码,获取共有几个年级的奥数题,并定位到每个年级对应的URL

3. 解析步骤2中得到的URL,获取页数和题源URL

4. 解析题源URL,获取题目内容和答案的URL

5. 解析答案的URL,获取答案内容

6. 将获取的题目和答案写进文件


你可能感兴趣的:(python爬虫)