# 8: GRU预测数据预处理 (data preprocessing for GRU prediction)

# 机器学习预测数据预处理

from pygeohash import encode, decode
import plotly
import numpy as np
import pandas as pd
import math
from matplotlib.path import Path
import numpy as np
import plotly.offline as of
import plotly.graph_objs as go
import chart_studio.plotly as py
import numpy as np
import pandas as pd
import folium
import webbrowser
from folium.plugins import HeatMap
import datetime
import time
import pymysql
import pymysql.cursors
import pandas as pd
import decimal
import geohash

def mysql(geo):
    """Fetch daily demand counts per 7-character geohash inside one 6-character cell.

    Runs a grouped query against the ``haikou_1`` table of the local
    ``hk_taxi`` MySQL database, restricted to departures dated from
    2017-06-01 to 2017-08-31.

    Args:
        geo: 6-character geohash prefix (a string); rows whose ``geo``
            column starts with it are counted.

    Returns:
        The value of ``cursor.fetchall()``: rows of
        ``(demand_time, geo, count)`` ordered by ``demand_time``, where
        ``demand_time`` is formatted as year-month-day and ``geo`` is the
        7-character prefix of the stored geohash.

    Raises:
        Any exception raised by pymysql while connecting or querying; the
        connection is closed before the exception propagates.
    """
    print('数据库开始读取')
    # '%%' is a literal '%' after the driver substitutes the %s placeholder.
    sql = "SELECT DATE_FORMAT(departure_time,'%%Y-%%m-%%d') as demand_time ,left(`geo`,7) as geo,count(0) as count FROM haikou_1 where left(`geo`,6)=%s AND DATE_FORMAT(departure_time,'%%Y-%%m-%%d') BETWEEN '2017-06-01' AND '2017-08-31' GROUP BY  DATE_FORMAT(departure_time,'%%Y-%%m-%%d') ,left(`geo`,7) ORDER BY demand_time"
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from configuration or the environment instead.
    conn = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='xu19931026',
        db='hk_taxi',
        charset='utf8'
    )
    try:
        with conn.cursor() as cursor:
            # Pass parameters as a sequence, as the DB-API expects.
            cursor.execute(sql, (geo,))
            result = cursor.fetchall()
    finally:
        # Always release the connection, even when the query fails.
        conn.close()
    print('数据库读取完成')
    return result
def zhuanhuanzidian(a):
    """Group ``(outer, inner, value)`` rows into a two-level dictionary.

    Args:
        a: Sequence of rows. Each row is read as ``row[0]`` (outer key; the
            date string in this script), ``row[1]`` (inner key; the
            geohash) and ``row[2]`` (value; the count).

    Returns:
        A tuple ``(key_list, max_date)``. ``key_list`` maps every outer key
        to ``{inner key: value}``; ``max_date`` is the first outer key whose
        inner dictionary has the most entries. Empty input yields
        ``({}, None)``.
    """
    print(a)
    key_list = {}
    for ii in a:
        # setdefault merges rows sharing an outer key even when they are not
        # adjacent in `a`, so no earlier entries are overwritten.
        key_list.setdefault(ii[0], {})[ii[1]] = ii[2]
    print('首先转换为字典')
    print(key_list)  # the demand as a two-level dictionary
    if not key_list:
        # Nothing to measure; avoid calling max() on an empty sequence.
        return key_list, None
    max_len = {key: len(inner) for key, inner in key_list.items()}
    print('*******')
    print(max_len)
    print('找出最大的length:')
    print('最大的length是%d' % max(max_len.values()))
    print('*******')
    # max() returns the first maximal key in insertion order.
    max_date = max(max_len, key=max_len.get)
    print('最大的一个日期是%s' % max_date)
    return key_list, max_date
def bijiao(dica,dicb):
    """Pad every inner dictionary of ``dicb`` with the keys of ``dica``.

    Args:
        dica: Reference dictionary; only its keys are used.
        dicb: Dictionary of dictionaries. It is modified in place: each
            inner dictionary gains a ``0`` entry for every key of ``dica``
            it lacks and is then replaced by a copy ordered by key.

    Returns:
        The same ``dicb`` object after padding and sorting.
    """
    reference_keys = sorted(dica)
    print('比较前')
    print(dicb)
    print('比较后')
    for outer_key in dicb:
        inner = dicb[outer_key]
        # Fill in every reference key this entry is missing.
        for ref_key in reference_keys:
            inner.setdefault(ref_key, 0)
        # Rebuild the entry so its keys appear in sorted order.
        dicb[outer_key] = {name: inner[name] for name in sorted(inner)}
    print('****')
    print(dicb)
    print('*******')
    return dicb
def zhuan_csv(a):
    """Turn a dictionary of dictionaries into a DataFrame and print it.

    The frame is built with ``pd.DataFrame(a)`` and then unstacked twice, so
    the outer keys of ``a`` end up as the row index and the inner keys as
    the columns. Despite the name, no CSV file is written.

    Args:
        a: Dictionary of dictionaries (outer key -> inner key -> value).

    Returns:
        The reshaped ``pandas.DataFrame``.
    """
    # First unstack: flatten to a Series indexed by (outer key, inner key).
    flattened = pd.DataFrame(a).unstack()
    # Second unstack: pivot the inner-key level back out into columns.
    table = flattened.unstack(level=-1)
    print(table)
    return table
def zhaobinji(a):
    """Build a reference dictionary holding the union of all inner keys.

    Args:
        a: Dictionary of dictionaries (outer key -> inner key -> value).
            It is not modified.

    Returns:
        A dictionary mapping every inner key that occurs anywhere in ``a``
        to ``0``. Keys appear in first-seen order, so the result is the
        same on every run. Empty input yields ``{}``.
    """
    dic_biaozhun_new = {}
    # One pass over all inner keys; setdefault keeps the first-seen position
    # of each key instead of rebuilding the union for every outer key.
    for inner in a.values():
        for k in inner.keys():
            dic_biaozhun_new.setdefault(k, 0)
    print('字典标准是')
    print(dic_biaozhun_new)
    print('字典标准长度是:')
    print(len(dic_biaozhun_new))
    return dic_biaozhun_new
# Script entry point: query one 6-character geohash cell, group the rows by
# date, pad every date to the same set of 7-character geohashes, and reshape
# the result into a table.
if __name__ == '__main__':
    result=mysql('w7w3y2') # pass a 6-character geohash; returns the daily demand of each 7-character geohash inside it
    # Group the rows by date; max_date (the date with the most geohashes) is not used below.
    key_list,max_date=zhuanhuanzidian(result)
    # Reference dictionary: the union of all geohashes seen on any date, each mapped to 0.
    dic_biaozhun=zhaobinji(key_list)
    key_list=bijiao(dic_biaozhun,key_list) # first argument is the reference dictionary; geohashes missing on a date are filled with 0
    key_list_df=zhuan_csv(key_list) # reshape into a DataFrame ("CSV" in the author's wording; nothing is written to disk here)

# 你可能感兴趣的:(论文2)