# Python 网页抓取多年水文数据，解析并存入 MySQL 数据库

import requests
import chardet
from bs4 import BeautifulSoup
import pandas as pd
import pymysql.cursors
from sqlalchemy.orm import sessionmaker
from scray.models import *

import datetime
from sqlalchemy import create_engine

# MySQL connection settings.
# NOTE(review): within the visible code these five names are never read; the
# engine URL below repeats the same values as literals (with host "localhost"
# instead of `ip`), so editing them alone does not change the connection.
# NOTE(review): credentials are hard-coded in the source.
ip, port = '127.0.0.1', 3306
user, password = 'root', '123456'
db = 'hnhydro'

# echo=True makes the engine print information about its database operations.
engine = create_engine(
    "mysql+pymysql://root:123456@localhost:3306/hnhydro?charset=utf8",
    echo=True,
)

# Hoisted out of the request loop: importing once is enough. Nothing below
# refers to a CharUtils name explicitly; the star import is kept in case the
# script relies on names or import-time effects it provides.
from util.CharUtils import *

# First hour to request, and the step between consecutive requests.
START = datetime.datetime(2017, 1, 1, 0, 0, 0)
STEP = datetime.timedelta(hours=1)

REPORT_URL = 'http://61.187.56.156/wap/hnsq_BB2.asp'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs the run

# Basin codes sent as the 'liuyu' form field:
# 2: Xiang River, 3: Zi River basin, 4: Yuan River basin,
# 5: Li River basin, 6: Yangtze River, 7: Dongting Lake.
# Code 2 is not requested, matching the original list.
BASINS = [3, 4, 5, 6, 7]

# Number of cells a table row must have to be mapped onto a DaySq record.
FIELD_COUNT = 14

# Deletes tab, form feed, newline, carriage return and comma in one pass.
_CELL_JUNK = str.maketrans('', '', '\t\f\n\r,')


def _clean_cell(text):
    """Return cell text with outer whitespace, outer '-'/'+' and junk removed.

    NOTE(review): stripping '-' also drops the sign of a negative number;
    kept because the original did the same -- confirm against real pages.
    """
    return text.strip().strip('-+').translate(_CELL_JUNK)


def _blank_to_none(text, convert=None):
    """Return None for an empty string, else the (optionally converted) text.

    Raises whatever ``convert`` raises (ValueError for float/int) when the
    text is not empty and cannot be converted.
    """
    if text == '':
        return None
    return convert(text) if convert is not None else text


def _fetch_rows(liuyu, t):
    """POST the report form for basin ``liuyu`` at hour ``t``.

    Returns the cleaned <td> texts of every <tr> of the first table on the
    page (one list per row), or an empty list when the page has no table.
    Raises the requests exception on a timeout or an HTTP error status.
    """
    post_data = {
        'printflag': 1,
        'liuyu': liuyu,
        'nian': t.year,
        'yue': t.month,
        'ri': t.day,
        'shi': t.strftime('%H:00'),
    }
    res = requests.post(REPORT_URL, data=post_data, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()

    # The raw bytes are parsed, so BeautifulSoup detects the encoding itself
    # (setting res.encoding, as the original did, has no effect on .content).
    soup = BeautifulSoup(res.content, 'lxml')
    tables = soup.select('table')
    if not tables:
        return []
    return [
        [_clean_cell(td.getText()) for td in tr.findAll('td')]
        for tr in tables[0].findAll('tr')
    ]


def _parse_row(row, t):
    """Map one cleaned table row onto DaySq keyword arguments.

    ``row`` must have at least FIELD_COUNT cells. Empty cells become None,
    except the time cell, which falls back to the requested hour. Raises
    ValueError when a numeric cell holds non-numeric text.
    """
    return {
        'watershed': row[0],
        'river': row[1],
        'station': row[2],
        # '%Y-%m-%d' instead of '%F': '%F' is a platform extension and is
        # rejected by strftime on some systems.
        'date': t.strftime('%Y-%m-%d'),
        'time': t.strftime('%H:00') if row[3] == '' else row[3],
        'wlevel': _blank_to_none(row[4], float),
        'trend': _blank_to_none(row[5]),
        'cmp8': _blank_to_none(row[6], float),
        'q': _blank_to_none(row[7], int),
        'warnwlevel': _blank_to_none(row[8], float),
        'hismaxwlevel': _blank_to_none(row[9], float),
        'occurdt': _blank_to_none(row[10]),
        'occurarea': _blank_to_none(row[11]),
        'occurcounty': _blank_to_none(row[12]),
        'controlsize': _blank_to_none(row[13], int),
    }


def _store_rows(session_factory, rows, t):
    """Insert the parsable rows of one page in a single transaction.

    Rows with fewer than FIELD_COUNT cells are skipped, and rows whose
    numeric cells cannot be converted are reported and skipped (presumably
    header or separator rows -- TODO confirm against a real page). Returns
    the number of records written. Database errors roll the page back and
    are re-raised.
    """
    records = []
    for row in rows:
        if len(row) < FIELD_COUNT:
            continue
        try:
            fields = _parse_row(row, t)
        except ValueError as err:
            print('skipping unparsable row %r: %s' % (row, err))
            continue
        records.append(DaySq(**fields))

    if not records:
        return 0

    session = session_factory()
    try:
        session.add_all(records)
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        # Always hand the connection back, even when the commit failed.
        session.close()
    return len(records)


now = datetime.datetime.now()
# One factory for the whole run instead of a new one per inserted row.
Session = sessionmaker(bind=engine)

for liuyu in BASINS:
    # Restart the clock for every basin; with a single shared `t` only the
    # first basin was ever requested, because `t` had already reached `now`.
    t = START
    while t < now:
        rows = _fetch_rows(liuyu, t)
        if not rows:
            print('no table for basin %s at %s' % (liuyu, t))
        _store_rows(Session, rows, t)
        t += STEP

# 你可能感兴趣的：(Python)