I've been busy with driving lessons lately and haven't gotten any real work done, so here's something I wrote a few days ago for everyone to play with. The earthquake is what got me thinking about building this...
It scrapes earthquake data from the China Earthquake Networks Center (中国地震台网).
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
import urllib.request as request
import re

# URL of the advanced-search handler that serves the data
mainurl = r'http://www.ceic.ac.cn/AdvSearchHandler'
# POST body for the initial query (note: the site itself spells the
# longitude parameters "longtitude", so those names must stay as-is)
mainpostdata = b'currentPageNo=1&longtitudeMin=&longtitudeMax=&periodFrom=&periodTo=&latitudeMin=&latitudeMax=&depthMin=&depthMax=&magnitudeMin=&magnitudeMax='
# Regex that pulls the record count and page count out of the result page
pageinforegex = r'<td width="60%">共搜索到([\d]+)条符合条件的地震信息,共([\d]+)页,正显示第1页</td>'.encode('utf-8')
# POST body used when paging through the results
detailpostdata = b'advSearchActionType=pageDown'
# Regex that grabs one table row of earthquake data
detailregex = rb'<td width="150px" nowrap>[\w\W]+?</td>[\w\W]+?</tr>'


def crawler():
    pageinfo = getPageNum(mainurl, mainpostdata, pageinforegex)
    if pageinfo is not None:
        # currentPageNo appears to count from 0; range() runs through the
        # last page (the original loop stopped one page short)
        for pageno in range(1, int(pageinfo[1]) + 1):
            detailurl = (r'http://www.ceic.ac.cn/AdvSearchHandler?currentPageNo='
                         + str(pageno - 1)
                         + '&longtitudeMin=&longtitudeMax=&periodFrom=&periodTo='
                           '&latitudeMin=&latitudeMax=&depthMin=&depthMax='
                           '&magnitudeMin=&magnitudeMax=')
            resultdata = getPageData(detailurl, detailpostdata, detailregex)
            data2file(resultdata)
    return


# Get the total record count and total page count
def getPageNum(url, postdata, pageinforegex):
    req = request.Request(url, postdata)
    page = request.urlopen(req).read()
    pageinfo = re.findall(pageinforegex, page)
    # Guard against an empty match, which would otherwise raise IndexError
    if not pageinfo or len(pageinfo[0]) != 2:
        print('GetInfoError!')
        return None
    totalnum, totalpagenum = pageinfo[0]
    print('Records: %s' % totalnum.decode('utf-8'))
    print('Pages: %s' % totalpagenum.decode('utf-8'))
    return pageinfo[0]


# Fetch one result page and extract the six fields of every row
def getPageData(detailurl, detailpostdata, detailregex):
    req = request.Request(detailurl, detailpostdata)
    page = request.urlopen(req).read()
    pagedata = re.findall(detailregex, page)
    resultdata = []
    for data in pagedata:
        # Each row splits into six cells on '</td>'; strip the opening
        # <td ...> tag and surrounding whitespace instead of matching the
        # page's exact indentation, which is brittle
        for chunk in data.split(b'</td>')[:6]:
            resultdata.append(re.sub(rb'^\s*<td[^>]*>', b'', chunk).strip())
    for x in resultdata:
        print(x.decode('utf-8'))
    return resultdata


# Append the records to earthquake.txt, six fields per record
def data2file(resultdata):
    with open('earthquake.txt', 'a+', encoding='utf-8') as fp:
        # resultdata holds every row of the page; write them all, not
        # just the first record
        for i in range(0, len(resultdata), 6):
            record = resultdata[i:i + 6]
            fp.write('<time>\n' + record[0].decode('utf-8') + '\n</time>\n')
            fp.write('<level>\n' + record[1].decode('utf-8') + '\n</level>\n')
            fp.write('<latitude>\n' + record[2].decode('utf-8') + '\n</latitude>\n')
            fp.write('<longitude>\n' + record[3].decode('utf-8') + '\n</longitude>\n')
            fp.write('<depth>\n' + record[4].decode('utf-8') + '\n</depth>\n')
            fp.write('<location>\n' + record[5].decode('utf-8') + '\n</location>\n')


if __name__ == "__main__":
    print("Crawler starting to crawl...\n")
    crawler()
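
If you want to sanity-check the row regex without hitting the server, you can run it against a canned fragment. The HTML below is a made-up sample pieced together from the <td> patterns the script strips out, not a captured CEIC page, so the live markup (and the example values) may differ:

    import re

    # Hypothetical row markup, reconstructed from the tag patterns the
    # script expects; the real page may be laid out differently
    sample = (b'<tr>\r\n <td width="150px" nowrap>2012-05-28 10:22:05</td>\r\n'
              b' <td width="50px" nowrap>4.7</td>\r\n'
              b' <td width="60px" nowrap>39.7</td>\r\n'
              b' <td width="60px" nowrap>118.5</td>\r\n'
              b' <td width="60px" nowrap>8</td>\r\n'
              b' <td align="left">HEBEI TANGSHAN</td>\r\n</tr>')

    detailregex = rb'<td width="150px" nowrap>[\w\W]+?</td>[\w\W]+?</tr>'

    for row in re.findall(detailregex, sample):
        fields = []
        for chunk in row.split(b'</td>'):
            # Remove any tags, then drop chunks that are pure whitespace
            text = re.sub(rb'<[^>]+>', b'', chunk).strip()
            if text:
                fields.append(text.decode('utf-8'))
        print(fields)
        # -> ['2012-05-28 10:22:05', '4.7', '39.7', '118.5', '8', 'HEBEI TANGSHAN']

Printing the six fields like this (time, magnitude, latitude, longitude, depth, location) is a quick way to confirm the splitting logic before letting the crawler loop over every page.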