"""Scrape hourly report tables from ``hnsq_BB2.asp`` and store them in MySQL.

For every watershed id in ``WATERSHED_IDS`` the script walks hour by hour from
``START`` up to the moment the run began, POSTs the query form, parses the
first ``<table>`` of the response and inserts one ``DaySq`` record per table
row.
"""
import logging
import requests
import chardet
from bs4 import BeautifulSoup
import pandas as pd
import pymysql.cursors
from sqlalchemy.orm import sessionmaker
# The star imports are kept ahead of the explicit imports below so that an
# explicitly imported name always wins over anything a star import exports.
from scray.models import *
from util.CharUtils import *
from sqlalchemy import create_engine
import datetime

logger = logging.getLogger(__name__)

# MySQL connection settings.
# NOTE(review): the engine URL below hard-codes the same values instead of
# reading these names; keep the two in sync.
ip = '127.0.0.1'
port = 3306
user = 'root'
password = '123456'
db = 'hnhydro'

# echo=True makes the engine print the SQL it issues.
engine = create_engine("mysql+pymysql://root:123456@localhost:3306/hnhydro?charset=utf8", echo=True)
# One session factory for the whole run; sessions are opened per page.
Session = sessionmaker(bind=engine)

REPORT_URL = 'http://61.187.56.156/wap/hnsq_BB2.asp'
# Watershed ids understood by the form:
# 2: Xiang River, 3: Zi River basin, 4: Yuan River basin,
# 5: Li River basin, 6: Yangtze River, 7: Dongting Lake
WATERSHED_IDS = (3, 4, 5, 6, 7)
START = datetime.datetime(2017, 1, 1, 0, 0, 0)
STEP = datetime.timedelta(hours=1)
REQUEST_TIMEOUT = 30  # seconds; without it a stalled server blocks forever
EXPECTED_COLUMNS = 14  # record building reads cells 0..13

# Deletes tab, form feed, newline, carriage return and comma in one pass.
_STRIP_TABLE = str.maketrans('', '', '\t\f\n\r,')


def _clean_cell(text):
    """Return *text* trimmed of whitespace, edge '-'/'+' and separators."""
    # NOTE(review): strip('-+') also drops the sign of a negative number
    # (e.g. '-0.05' becomes '0.05'); behaviour kept as in the original script,
    # confirm whether that is intended.
    return text.strip().strip('-+').translate(_STRIP_TABLE)


def _optional(text, convert=None):
    """Return None for an empty cell, else *text* (passed through *convert*)."""
    if text == '':
        return None
    return text if convert is None else convert(text)


def fetch_rows(liuyu, t):
    """POST the query form for one watershed and hour; return the table rows.

    Args:
        liuyu: watershed id sent as the ``liuyu`` form field.
        t: ``datetime.datetime`` whose year/month/day/hour fill the form.

    Returns:
        A list of rows, each a list of cleaned cell strings taken from the
        ``<td>`` elements of the first ``<table>``; empty if there is no table.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    post_data = {
        'printflag': 1,
        'liuyu': liuyu,
        'nian': t.year,
        'yue': t.month,
        'ri': t.day,
        'shi': t.strftime('%H:00'),
    }
    res = requests.post(REPORT_URL, data=post_data, timeout=REQUEST_TIMEOUT)
    # The document is parsed from the raw bytes, so the parser does its own
    # charset detection; setting ``res.encoding`` would have no effect here.
    soup = BeautifulSoup(res.content, 'lxml')
    tables = soup.select('table')
    if not tables:
        logger.warning('no <table> in response for liuyu=%s at %s', liuyu, t)
        return []
    return [
        [_clean_cell(td.getText()) for td in tr.findAll('td')]
        for tr in tables[0].findAll('tr')
    ]


def build_record(row, t):
    """Build a ``DaySq`` from one cleaned table row.

    Args:
        row: list of at least ``EXPECTED_COLUMNS`` cell strings.
        t: the queried hour; supplies the date, and the time when cell 3 is
            empty.

    Raises:
        ValueError: if a numeric cell does not parse as a number.
    """
    return DaySq(
        watershed=row[0],
        river=row[1],
        station=row[2],
        # '%Y-%m-%d' is the portable spelling of the platform-specific '%F'.
        date=t.strftime('%Y-%m-%d'),
        time=row[3] if row[3] != '' else t.strftime('%H:00'),
        wlevel=_optional(row[4], float),
        trend=_optional(row[5]),
        cmp8=_optional(row[6], float),
        q=_optional(row[7], int),
        warnwlevel=_optional(row[8], float),
        hismaxwlevel=_optional(row[9], float),
        occurdt=_optional(row[10]),
        occurarea=_optional(row[11]),
        occurcounty=_optional(row[12]),
        controlsize=_optional(row[13], int),
    )


def store_rows(rows, t):
    """Insert every parsable row of one page in a single transaction.

    Rows that are too short or contain a non-numeric value in a numeric
    column (for example a header row) are logged and skipped instead of
    aborting the whole run.

    Returns:
        The number of records committed.
    """
    records = []
    for row in rows:
        if not row:
            continue  # a <tr> without any <td> carries no data
        if len(row) < EXPECTED_COLUMNS:
            logger.warning('skipping short row (%d cells): %r', len(row), row)
            continue
        try:
            records.append(build_record(row, t))
        except ValueError as exc:
            logger.warning('skipping unparsable row %r: %s', row, exc)
    if not records:
        return 0

    session = Session()
    try:
        session.add_all(records)
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()  # always hand the connection back to the pool
    return len(records)


def main():
    """Scrape every watershed from ``START`` up to the start of this run."""
    now = datetime.datetime.now()
    for liuyu in WATERSHED_IDS:
        # Restart the clock for each watershed; sharing one cursor across
        # watersheds would leave all but the first one unscraped.
        t = START
        while t < now:
            store_rows(fetch_rows(liuyu, t), t)
            t += STEP


if __name__ == '__main__':
    main()