Scraping the China Earthquake Networks Center (CEIC) and the Earthquake Science Data portal

The first scraper collects earthquake records from the past year.

The data is stored in a SQLite database; incremental updates reuse the "last 48 hours" query.
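
For reference, here is a minimal sketch of what the speedsearch endpoint returns and how to unwrap it. num=2 is the same query the incremental update below uses for the last 48 hours, and the field names ('shuju', 'O_TIME', 'M', 'LOCATION_C') are the ones the full script reads; using json.loads instead of eval assumes the payload is plain JSON wrapped in parentheses, which is what the scripts below rely on.

import json
import requests

# The endpoint returns JSONP: a JSON object wrapped in "(...)".
url = 'http://www.ceic.ac.cn/ajax/speedsearch?num=2&&page=1'
headers = {'User-Agent': 'Mozilla/5.0'}
text = requests.get(url, headers=headers, timeout=10).text
payload = json.loads(text[1:-1])   # strip the leading "(" and trailing ")"
for quake in payload['shuju']:     # 'shuju' holds the list of earthquake records
    print(quake['O_TIME'], quake['M'], quake['LOCATION_C'])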


import requests
import os
import json
import sqlite3



# Full crawl: earthquake records from the past year
def glob_add(page):
    i = 1
    count = 0
    while i < page:

        url = 'http://www.ceic.ac.cn/ajax/speedsearch?num=6&&page={}'.format(i)

        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
        html = requests.get(url, headers=headers, timeout=10)

        response = html.text
        # The response is JSONP: strip the surrounding "(" and ")" and parse the JSON
        data = json.loads(response[1:-1])
        records = data['shuju']
        for result in records:
            did = result['id']
            on_time = result['O_TIME']
            m = result['M']
            epi_lat = result['EPI_LAT']
            epi_lon = result['EPI_LON']
            epi_depth = result['EPI_DEPTH']
            location_c = result['LOCATION_C']
            count += 1
            print (count)
            cursor.execute(
                "insert into earthquake values (?,?,?,?,?,?,?,?);",
                (count, did, on_time, m, epi_lat, epi_lon, epi_depth, location_c))

        i = i + 1

    conn.commit()


# Incremental update: insert only records not yet in the database

def local_add():
    url = 'http://www.ceic.ac.cn/ajax/speedsearch?num=2&&page=1&&'
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    html = requests.get(url, headers=headers, timeout=10)
    response = html.text
    # The response is JSONP: strip the surrounding "(" and ")" and parse the JSON
    data = json.loads(response[1:-1])
    records = data['shuju']
    # Continue numbering from the current maximum id to avoid primary-key conflicts
    cursor.execute('select max(id) from earthquake;')
    count = cursor.fetchone()[0] or 0
    for result in records:
        did = result['id']
        on_time = result['O_TIME']
        m = result['M']
        epi_lat = result['EPI_LAT']
        epi_lon = result['EPI_LON']
        epi_depth = result['EPI_DEPTH']
        location_c = result['LOCATION_C']
        # Check whether the record already exists in the database; insert only if it does not
        cursor.execute('select count(*) from earthquake where did=?;', (did,))
        result_count, = cursor.fetchone()

        if result_count:
            print('no new updates')
            continue
        else:
            count += 1
            print('inserting record with id ' + str(did))
            cursor.execute(
                "insert into earthquake values (?,?,?,?,?,?,?,?);",
                (count, did, on_time, m, epi_lat, epi_lon, epi_depth, location_c))

    conn.commit()


if __name__ == "__main__":

    db_file = os.path.join(os.path.dirname(__file__), 'earth3.db')
    if os.path.isfile(db_file):
        conn = sqlite3.connect(db_file)
        cursor = conn.cursor()
        print("Database file already exists, running incremental update")
        local_add()
    else:
        # Create the database and table, then run a full crawl
        conn = sqlite3.connect(db_file)
        cursor = conn.cursor()
        cursor.execute('create table earthquake (id int primary key not null ,did varchar(50) not null , O_TIME  varchar(100),'
                       'm varchar(20), EPI_LAT varchar(30),EPI_LON varchar(30),'
                       'EPI_DEPTH varchar(30) ,LOCATION_C varchar(100));')
        print("Running full crawl")
        glob_add(58)
    conn.close()
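
Once the table has been filled, it can be queried like any other SQLite database. A minimal sketch, assuming the column layout from the create table statement above (the magnitude threshold is only an example):

import sqlite3

conn = sqlite3.connect('earth3.db')
cursor = conn.cursor()
# m is stored as text, so cast it to a real number for the comparison
cursor.execute(
    "select O_TIME, m, LOCATION_C from earthquake "
    "where cast(m as real) >= 5.0 order by O_TIME desc;")
for o_time, m, location in cursor.fetchall():
    print(o_time, m, location)
conn.close()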

Earthquake Science Data

This one is a bit trickier to handle than the previous site. Without further ado, the code:

The delay (time.sleep) can be removed if you prefer.

import requests
import os
import base64
import time
import sqlite3
from bs4 import BeautifulSoup
import random


def glob_add(page):
    i = 1
    count = 0
    while i < page:
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
        }

        pageurl = "http://data.earthquake.cn/datashare/report.shtml?DISPLAY_TYPE=1&PAGEID=websourcesearch&code=D10000&type=&reportdatasourcesearch_PAGECOUNT=49&reportdatasourcesearch_RECORDCOUNT=481&reportdatasourcesearch_ALLDATASETS_RECORDCOUNT=reportdatasourcesearch__default_default_default_key__default_default_default_key%3D481%3B&reportdatasourcesearch_PAGENO={}".format(
            i)
        # Optional proxy (requests expects a {"scheme": "proxy url"} mapping);
        # replace the address with your own proxy if needed
        proxies = {
            "http": "http://219.143.71.193:80",
        }
        rr = requests.get(pageurl, headers=headers, proxies=proxies)

        soup = BeautifulSoup(rr.text, 'lxml')
        count = (i-1) * 10
        for link in soup.find_all('a'):
            l = []
            # Only follow absolute links to the dataset detail pages
            if link.get('href', '').startswith('http://'):
                l.append(link['href'])
            for link_l in l:
                # Random delay between requests; remove it if you do not need it
                time.sleep(random.randint(0, 3))
                print(link_l)
                count += 1
                r = requests.get(link_l, headers=headers, timeout=10)

                soup_r = BeautifulSoup(r.text, 'lxml')
                dataname = soup_r.select(
                    "#WX_CONTENT_datasourcelist_guid_reportdatasourcelist > form > div > div > div:nth-child(3) > div:nth-child(1) > div:nth-child(1) > table > tbody > tr:nth-child(1) > td")
                dataclass = soup_r.select(
                    "#WX_CONTENT_datasourcelist_guid_reportdatasourcelist > form > div > div > div:nth-child(3) > div:nth-child(1) > div:nth-child(1) > table > tbody > tr:nth-child(2) > td")
                coverage = soup_r.select(
                    "#WX_CONTENT_datasourcelist_guid_reportdatasourcelist > form > div > div > div:nth-child(3) > div:nth-child(1) > div:nth-child(1) > table > tbody > tr:nth-child(3) > td")
                person = soup_r.select(
                    "#WX_CONTENT_datasourcelist_guid_reportdatasourcelist > form > div > div > div:nth-child(3) > div:nth-child(1) > div:nth-child(2) > table > tbody > tr:nth-child(1) > td")
                phone = soup_r.select(
                    "#WX_CONTENT_datasourcelist_guid_reportdatasourcelist > form > div > div > div:nth-child(3) > div:nth-child(1) > div:nth-child(2) > table > tbody > tr:nth-child(2) > td")
                email = soup_r.select(
                    "#WX_CONTENT_datasourcelist_guid_reportdatasourcelist > form > div > div > div:nth-child(3) > div:nth-child(1) > div:nth-child(2) > table > tbody > tr:nth-child(3) > td")
                unit = soup_r.select(
                    "#WX_CONTENT_datasourcelist_guid_reportdatasourcelist > form > div > div > div:nth-child(3) > div:nth-child(1) > div:nth-child(2) > table > tbody > tr:nth-child(4) > td")
                abstract = soup_r.select(
                    "#WX_CONTENT_datasourcelist_guid_reportdatasourcelist > form > div > div > div:nth-child(3) > div:nth-child(2) > div > p")
                use_intor = soup_r.select(
                    "#WX_CONTENT_datasourcelist_guid_reportdatasourcelist > form > div > div > div:nth-child(3) > div:nth-child(3) > div > p")
                img_path = soup_r.select(
                    "#WX_CONTENT_datasourcelist_guid_reportdatasourcelist > form > div > div > div:nth-child(3) > div:nth-child(1) > div:nth-child(3) > img")

                # if img_path[0]['src'] == '/datascyle/datascyle/uploadfile/wutu.png':
                #     r_img = 'no thumbnail'
                # else:
                imgpath = 'http://data.earthquake.cn{}'.format(img_path[0]['src'])

                # Download the thumbnail and store it base64-encoded (decoded to str
                # so the SQL string formatting below does not embed a b'...' prefix)
                r_img = requests.get(imgpath, headers=headers, timeout=10).content
                img_base = base64.b64encode(r_img).decode('ascii')
                # Clean up the raw "class" cell: strip whitespace and control characters
                # (the exact characters removed in the original were lost in formatting)
                b = str(dataclass[0]).replace(' ', '')
                e = b.replace('\r', '').replace('\n', '').replace('\t', '').strip()
                print(count, str(dataname[0].string).strip(), e)
                cursor.execute(
                    r"insert into dataset values ({},'{}','{}','{}',"
                    r'"{}","{}","{}","{}","{}","{}","{}");'.format(
                        count,
                        str(dataname[0].string).strip(),
                        e,
                        str(coverage[0].string).replace(' ', '').replace('\t', '').strip(),
                        str(person[0].string).strip(),
                        str(phone[0].string).strip(),
                        str(email[0].string).strip(),
                        str(unit[0].string).strip(),
                        # str() of the select() result is a list repr, so strip the brackets
                        str(use_intor).strip().replace('"', "'").replace(']', '').replace('[', ''),
                        str(abstract[0].string).strip(),
                        img_base))
        conn.commit()
        i += 1


def create_db(db_file):
    cursor.execute('create table dataset (id int primary key not null ,'
                   'dataname varchar(200) not null ,'
                   'class varchar(100) not null,'
                   'coverage varchar(200) not null ,'
                   'person varchar(100) ,'
                   'phone varchar(50) ,'
                   'email varchar(50) ,'
                   'unit varchar(100) ,'
                   'abstract varchar(1000) ,'
                   'use_instr varchar(1000) ,'
                   'picture longblob );')


if __name__ == "__main__":
    db_file = os.path.join(os.path.dirname(__file__), 'earthset1.db')
    if os.path.isfile(db_file):
        print("The database file already exists. Type 'exit' to cancel,\n"
              "or type 'yes' to delete it and rebuild it from scratch.")
        str1 = input("Please enter: ")
        if str1.lower() == 'exit':
            print('Cancelled')
        elif str1.lower() == 'yes':
            os.remove(db_file)
            conn = sqlite3.connect(db_file)
            cursor = conn.cursor()
            create_db(db_file)
            # number of pages to crawl
            glob_add(50)
            print("Crawl finished!")
        else:
            print("Invalid input!")
    else:
        conn = sqlite3.connect(db_file)
        cursor = conn.cursor()
        create_db(db_file)
        glob_add(50)
        print("Crawl finished!")
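
Since the thumbnail is stored base64-encoded in the picture column, it has to be decoded when read back. A minimal sketch, assuming the table layout above (the record id and the output file name thumbnail.png are only examples):

import base64
import sqlite3

conn = sqlite3.connect('earthset1.db')
cursor = conn.cursor()
# Fetch one record and write its stored thumbnail back to disk
cursor.execute("select dataname, picture from dataset where id = 1;")
row = cursor.fetchone()
if row and row[1]:
    dataname, picture_b64 = row
    with open('thumbnail.png', 'wb') as f:
        f.write(base64.b64decode(picture_b64))
    print('Saved thumbnail for', dataname)
conn.close()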

A side note on a Git issue I ran into:

For files that have already been committed and were not ignored, you first have to remove them from the index, and only then ignore them.

git rm -r --cached <path>
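
The full sequence looks roughly like this (logs/ is just an example path):

# stop tracking the files but keep them on disk
git rm -r --cached logs/
# ignore them from now on
echo "logs/" >> .gitignore
# commit the removal together with the updated .gitignore
git add .gitignore
git commit -m "Stop tracking logs/ and ignore it"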
