__author__ = 'bloodchilde' import urllib import urllib2 import re import os import sys reload(sys) sys.setdefaultencoding( "utf-8" ) class Spider: def __init__(self): self.siteUrl="http://aoshu.juren.com/tiku/xiaoxueaoshu/" self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko' self.headers = { 'User-Agent' : self.user_agent } def getPageContent(self,url): request = urllib2.Request(url,headers = self.headers) response = urllib2.urlopen(request) return response.read().decode("utf-8") def getSiteContent(self): siteContents = self.getPageContent(self.siteUrl) sitePattern = re.compile('<div.*?class="news".*?>.*?<h3.*?class="newdot".*?>.*?<a.*?href="(.*?)".*?>(.*?)</a>.*?</h3>.*?</div>',re.S) items = re.findall(sitePattern,siteContents) contents=[] for item in items: contents.append([item[0],item[1]]) return contents def enterAoshuTiPage(self,gradeLevel,url): curPageContent = self.getPageContent(url) curPattern = re.compile('<li>.*?<span.*?class="left".*?>.*?<a.*?href="(.*?)".*?target="_blank".*?>(.*?)</a>.*?</span>.*?</li>',re.S) items = re.findall(curPattern,curPageContent) contents =[] for item in items: url = item[0] timuName = item[1] contents.append([url,timuName]) pattent = re.compile('<div.*?id="pages".*?>.*?<strong>.*?</strong>.*?<strong>(.*?)</strong>.*?</div>',re.S) pageCounts = re.findall(pattent,curPageContent) return contents,pageCounts[0] def getAoshutiContent(self,url,path): pageContent = self.getPageContent(url) pattern = re.compile('<div.*?class="mainContent".*?>.*?<p>'+ '.*?</p>.*?<p>(.*?)</p>.*?<div.*?id="page".*?>.*?<span.*?class="current".*?>.*?</span>.*?<a.*?href="(.*?)".*?>.*?</a>.*?</div>.*?</div>',re.S) items = re.findall(pattern,pageContent) path = path +u"/test.txt" f = open(path, 'a+') for item in items: timu = item[0] daAnUrl = item[1] daAnContent = self.getDaAn(daAnUrl) print "question:"+timu print "answer:"+daAnContent fileContents = "question:"+timu+"\r\n"+"answer:"+daAnContent+"\r\n\r\n\r\n\r\n" 
f.write(fileContents) f.close() def mk_dir(self,path): isExisist = os.path.exists(path) if not isExisist: os.makedirs(path) return True else: return False def getDaAn(self,url): page = self.getPageContent(url) pattern = re.compile('<div.*?class="mainContent".*?>.*?<p>.*?</p>.*?<p>(.*?)</p>.*?</div>',re.S) items = re.findall(pattern,page) return items[0] def getAoShuTi(self,grade,url): global_url = url path = u"C:/Users/bloodchilde/Desktop/image_python/"+grade self.mk_dir(path) contents,pageCount = self.enterAoshuTiPage(grade,global_url) for pageIndex in range(1,int(pageCount)): if pageIndex != 1: url = global_url+u"/index_"+str(pageIndex)+".html" contents,count= self.enterAoshuTiPage(grade,url) for item in contents: url = item[0] name = item[1].split(":")[1] print name self.getAoshutiContent(url,path) else: for item in contents: url = item[0] name = item[1].split(":")[1] print name self.getAoshutiContent(url,path) demo = Spider() contents = demo.getSiteContent() for content in contents: url = content[0] grade = content[1] print grade print "---------------------------------------" demo.getAoShuTi(grade,url)
分析:
1.奥数题题源网页地址:http://aoshu.juren.com/tiku/xiaoxueaoshu/
2.分析源码,获取有几个年级的奥数题并且定位到那个年级的URL
3.解析步骤2中的URL,获取页数和题源URL
4.解析题源URL,获取题目内容和答案的URL
5.解析答案的URL,获取答案内容
6.将获取的题目和答案写进文件