澳大利亚降雨预测(基于四种机器学习算法)

数据来源

澳大利亚降雨预测数据集来自于Kaggle网站:
https://www.kaggle.com/jsphyg/weather-dataset-rattle-package
该数据集包含来自许多澳大利亚气象站的大约10年的每日天气观测以及天气预报。“RainTomorrow”是要预测的目标变量,表示第二天是否下雨:如果那一天(即第二天)的降雨量为1mm或更大,则此列为“是”(Yes)。气象信息包括日期,城市,最低温度,最高温度,降雨量,蒸发量,阳光(一天中阳光明媚的小时数),一天中最强阵风的风向和风速,9am、3pm的风向和风速,以及9am、3pm的湿度、气压、云层(云层遮盖的天空比例)、温度,还有当日是否下雨。
该数据集的主要任务目标是根据今日的气象信息训练分类模型,根据该模型预测澳大利亚第二天的降雨。数据总量为145460行,23列。

数据预处理

# -*- coding: utf-8 -*- 
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.neighbors import KNeighborsClassifier
import graphviz
import pydotplus
import io


# Console display settings for DataFrames, passed as (option, value) pairs:
# no limit on the number of columns or on the width of a cell, at most
# 10 rows per printout, and no fixed overall display width.
pd.set_option(
    'display.max_columns', None,
    'display.max_colwidth', None,
    'display.max_rows', 10,
    'display.width', None,
)


def preprocessing(df, index):
    """Replace the column ``index`` of ``df`` with an encoded version.

    The column is removed from ``df`` in place and re-inserted as the first
    column. The encoding depends on the column name:

    * a name containing ``'RainT'`` (e.g. ``RainToday``, ``RainTomorrow``):
      integer codes starting at 0;
    * ``'Date'``: the month part of the date string, kept as a string
      (``'2008-12-01'`` -> ``'12'``, ``'2008/1/5'`` -> ``'1'``);
    * any other name: integer codes starting at 1.

    Codes follow the sorted order of the distinct labels, so the mapping is
    the same on every run (for ``'No'``/``'Yes'`` labels: No -> 0, Yes -> 1).

    Args:
        df: DataFrame containing the column ``index``; it is modified in place.
        index: Name of the column to encode.

    Returns:
        The same ``df`` object, with the encoded column at position 0.
    """
    column = df[index]
    # Sort the distinct labels: iterating a set of strings gives a different
    # order from one interpreter run to the next (hash randomization), which
    # would make the codes irreproducible. key=str keeps the sort usable
    # even if the column mixes value types.
    labels = sorted(set(column), key=str)
    print('总类别数:', len(labels))
    df.drop(axis=1, columns=index, inplace=True)
    if 'RainT' in index:
        # Dict lookup instead of list.index(): O(1) per row.
        codes = {label: code for code, label in enumerate(labels)}
        temp = column.map(lambda x: codes[x])
    elif index == 'Date':
        # Characters after the 4-digit year and its separator hold the month;
        # a '/' at position 6 means the month has a single digit.
        temp = column.map(lambda x: x[5] if x[6] == '/' else x[5:7])
    else:
        codes = {label: code for code, label in enumerate(labels, start=1)}
        temp = column.map(lambda x: codes[x])
    df.insert(0, index, temp)
    return df


def get_dataset(location):      # load and clean the dataset
    """Load the weather CSV and return a cleaned, fully numeric DataFrame.

    Steps: read the CSV, drop every row that contains a missing value,
    encode the string columns with ``preprocessing``, convert ``Date`` to
    float, then drop every column whose correlation with ``RainTomorrow``
    has an absolute value below 0.1. Intermediate frames and statistics are
    printed along the way.

    Args:
        location: Path of the CSV file, passed to ``pd.read_csv``.

    Returns:
        The cleaned DataFrame; it still contains ``RainTomorrow``.
    """
    df = pd.read_csv(location)
    print(df)
    print(df.info())  # per-column non-null counts reveal missing values
    df = df.dropna()        # remove rows that contain missing values
    # Replace string-valued columns with integer labels (month for Date).
    for label in ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'Date', 'RainToday', 'RainTomorrow']:
        df = preprocessing(df, label)
    print(df)
    # preprocessing leaves Date as a string; convert it to float and move it
    # to column position 2.
    date = df['Date'].astype(float)
    df.drop(axis=1, columns='Date', inplace=True)
    df.insert(2, 'Date', date)
    print(df.dtypes)
    print(df.describe())
    # Compute the correlations with the target once instead of rebuilding
    # the full correlation matrix for every column.
    target_corr = df.corr()['RainTomorrow']
    print(target_corr)
    # Drop weakly correlated columns (|r| < 0.1). NaN correlations compare
    # False and are therefore kept.
    weak_columns = target_corr.index[target_corr.abs() < 0.1]
    df.drop(axis=1, columns=weak_columns, inplace=True)
    # Optional correlation heatmap:
    # sns.heatmap(df.corr()[['RainToday', 'RainTomorrow']])
    plt.show()
    return df


def test(resultset, rightset):      # 计算测试集的正确率
    print('预测结果为', resultset)
    print('正确结果为', rightset)
    if len(resultset) != len(rightset):
        print("数据集长度不同, 错误!")
        return 0
    total = len(resultset)
    right = 0
    for r1, r2 in zip(resultset, rightset):
        if r1 == r2:
            right += 1
    rightrate = right/total
    print(rightrate)

KNN

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from tools import get_dataset, test

if __name__ == '__main__':
    # Train and evaluate a k-nearest-neighbours classifier.
    df = get_dataset('E:\\X\\DATA\\weatherAUS.csv')
    feature = df.drop(['RainTomorrow'], axis=1).values.tolist()
    label = df['RainTomorrow'].values.tolist()
    # Hold out 10% of the rows for evaluation.
    feature_train, feature_test, label_train, label_test = train_test_split(feature, label, test_size=0.1)
    knn = KNeighborsClassifier()
    # Fit on the training split only. Fitting on the test split and then
    # predicting the same rows would report a training-set score.
    knn.fit(feature_train, label_train)
    result = knn.predict(feature_test)
    test(result, label_test)

决策树

import io

import pydotplus
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz

from tools import get_dataset, test

if __name__ == '__main__':
    # Train and evaluate a depth-limited decision tree, then export it to PDF.
    df = get_dataset('E:\\X\\DATA\\weatherAUS.csv')
    print(df)
    features = df.drop(['RainTomorrow'], axis=1)
    feature = features.values.tolist()
    label = df['RainTomorrow'].values.tolist()
    # Hold out 10% of the rows for evaluation.
    feature_train, feature_test, label_train, label_test = train_test_split(feature, label, test_size=0.1)
    clf = DecisionTreeClassifier(max_depth=3)
    clf = clf.fit(feature_train, label_train)
    test(clf.predict(feature_test), label_test)
    # Visualisation: write the tree as DOT text into an in-memory buffer and
    # render it to PDF. The model was fitted on plain lists, so the column
    # names are passed explicitly to label the split nodes.
    dot_data = io.StringIO()
    export_graphviz(clf, out_file=dot_data, feature_names=list(features.columns))
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("tree.pdf")

随机森林

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from tools import get_dataset, test

if __name__ == '__main__':
    # Train and evaluate a random forest classifier with default settings.
    df = get_dataset('E:\\X\\DATA\\weatherAUS.csv')
    feature = df.drop(['RainTomorrow'], axis=1).values.tolist()
    label = df['RainTomorrow'].values.tolist()
    # Hold out 10% of the rows for evaluation.
    feature_train, feature_test, label_train, label_test = train_test_split(feature, label, test_size=0.1)
    clf = RandomForestClassifier()
    clf = clf.fit(feature_train, label_train)
    print("随机森林分类")
    test(clf.predict(feature_test), label_test)

贝叶斯

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

from tools import get_dataset, test

if __name__ == '__main__':
    # Train and evaluate a Gaussian naive Bayes classifier.
    df = get_dataset('E:\\X\\DATA\\weatherAUS.csv')
    print(df)
    feature = df.drop(['RainTomorrow'], axis=1).values.tolist()
    label = df['RainTomorrow'].values.tolist()
    # Hold out 10% of the rows for evaluation.
    feature_train, feature_test, label_train, label_test = train_test_split(feature, label, test_size=0.1)
    clf = GaussianNB()
    clf.fit(feature_train, label_train)
    test(clf.predict(feature_test), label_test)

你可能感兴趣的:(python,机器学习,python)