发个地震数据的爬虫

最近学车,没有做什么正事,把好几天前的东西发出来给大家玩玩吧。因为地震所以才想到要做这个……

采集中国地震台网的地震数据。

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

import urllib.request as request
import urllib
import re

# URL of the main search page, used to find out how many result pages exist.
mainurl = r'http://www.ceic.ac.cn/AdvSearchHandler'
# POST body sent to the main page: page 1 with every search filter left empty.
# ("longtitude" is the parameter spelling used in the request and is kept as-is.)
mainpostdata = b'currentPageNo=1&longtitudeMin=&longtitudeMax=&periodFrom=&periodTo=&latitudeMin=&latitudeMax=&depthMin=&depthMax=&magnitudeMin=&magnitudeMax='
# Regex capturing (total record count, total page count) from the main page.
pageinforegex = r'<td width="60%">共搜索到([\d]+)条符合条件的地震信息,共([\d]+)页,正显示第1页</td>'.encode('utf-8')

# URL template of an earthquake-data page (reference only; n is the page number):
# detailurl = r'http://www.ceic.ac.cn/AdvSearchHandler?currentPageNo='+str(n-1)+'&longtitudeMin=&longtitudeMax=&periodFrom=&periodTo=&latitudeMin=&latitudeMax=&depthMin=&depthMax=&magnitudeMin=&magnitudeMax='
# POST body sent with every earthquake-data page request.
detailpostdata = b'advSearchActionType=pageDown'
# Regex capturing one table row of earthquake data.
# Raw bytes literal so that \w and \W reach the regex engine instead of being
# parsed as (invalid) escape sequences of the bytes literal itself.
detailregex = rb'<td width="150px" nowrap>[\w\W]+?</td>[\w\W]+?</tr>'




def crawler():
    """Crawl every result page and pass the extracted fields to data2file().

    Asks getPageNum() for the total number of result pages, then requests
    each page in turn with getPageData(). Returns without crawling when the
    page count could not be determined.
    """
    pageinfo = getPageNum(mainurl, mainpostdata, pageinforegex)
    if pageinfo is None:
        return
    totalpages = int(pageinfo[1])
    # Page n is requested as currentPageNo=n-1 together with the "pageDown"
    # POST action, so pageno must run from 1 through totalpages inclusive;
    # an exclusive upper bound of totalpages would skip the last page (and
    # fetch nothing at all when there is a single page).
    for pageno in range(1, totalpages + 1):
        detailurl = r'http://www.ceic.ac.cn/AdvSearchHandler?currentPageNo='+str(pageno-1)+'&longtitudeMin=&longtitudeMax=&periodFrom=&periodTo=&latitudeMin=&latitudeMax=&depthMin=&depthMax=&magnitudeMin=&magnitudeMax='
        resultdata = getPageData(detailurl, detailpostdata, detailregex)
        # A page without any parsable row yields an empty list; skip writing.
        if resultdata:
            data2file(resultdata)
    return
    
    

# Get the total number of earthquake-data pages
def getPageNum(url, postdata, pageinforegex):
    """Request the search page and extract the record and page counts.

    Args:
        url: Address to request.
        postdata: Bytes sent as the request body.
        pageinforegex: Bytes pattern with two capture groups, the total
            record count followed by the total page count.

    Returns:
        The first match found by re.findall() -- a tuple
        (total records, total pages) of bytes -- or None when the pattern
        does not match or the match does not have exactly two items.
    """
    req = request.Request(url, postdata)
    # Close the response as soon as the body has been read.
    with request.urlopen(req) as response:
        page = response.read()
    pageinfo = re.findall(pageinforegex, page)
    # An empty match list must be checked first: indexing it would raise
    # IndexError instead of reporting the failure.
    if not pageinfo or len(pageinfo[0]) != 2:
        print('GetInfoError!')
        return None
    totalnum, totalpagenum = pageinfo[0]
    print('数据条数:%s' % totalnum.decode('utf-8'))
    print('页面数:%s' % totalpagenum.decode('utf-8'))
    return pageinfo[0]

# Fetch one page and extract its earthquake data
def getPageData(detailurl, detailpostdata, detailregex):
    """Request a result page and extract the cell values of every data row.

    Args:
        detailurl: Address of the result page.
        detailpostdata: Bytes sent as the request body.
        detailregex: Bytes pattern matching one table row of the page.

    Returns:
        A flat list of bytes: the first six cell values of each matched row,
        in page order. Empty when no row matches.
    """
    req = request.Request(detailurl, detailpostdata)
    # Close the response as soon as the body has been read.
    with request.urlopen(req) as response:
        page = response.read()
    resultdata = []
    for row in re.findall(detailregex, page):
        cells = row.split(b'</td>')
        # A row with fewer than six cells cannot be a complete record;
        # skipping it keeps the flat result aligned in groups of six.
        if len(cells) < 6:
            continue
        # Drop the leading whitespace and opening <td ...> tag of each cell,
        # whatever the exact indentation or attributes are.
        fields = [re.sub(rb'^\s*<td[^>]*>', b'', cell) for cell in cells[:6]]
        # Echo only the fields of the row just parsed, once each.
        for field in fields:
            print(field.decode('utf-8'))
        resultdata.extend(fields)
    return resultdata

# Write the data to a file
def data2file(resultdata):
    """Append earthquake records to earthquake.txt as tagged text.

    Args:
        resultdata: Flat list of UTF-8 encoded bytes, six consecutive items
            per record, written under the tags time, level, latitude,
            longtitude, deepth and location in that order. Every complete
            group of six is written; an empty list writes nothing and a
            trailing incomplete group is ignored.
    """
    # Tag spellings ("longtitude", "deepth") are kept unchanged so the output
    # format stays the same.
    tags = ('time', 'level', 'latitude', 'longtitude', 'deepth', 'location')
    width = len(tags)
    # Explicit UTF-8 so the decoded text is written the same way on every
    # platform; the with-block closes the file even if a write fails.
    with open('earthquake.txt', 'a+', encoding='utf-8') as fp:
        for start in range(0, len(resultdata) - width + 1, width):
            record = resultdata[start:start + width]
            for tag, value in zip(tags, record):
                fp.write('<%s>\n%s\n</%s>\n' % (tag, value.decode('utf-8'), tag))
if __name__ == "__main__":
    # Script entry point: print the start-up banner, then run the crawl.
    print("爬虫开始爬啦....\n")
    crawler()

采集的数据截图如下所示


你可能感兴趣的:(爬虫,地震数据)