首先感谢我的队友,给出了很多关键的思路和想法,论文也基本没让我上手
先上个思路,不好意思让你们在这里还要看我手写的东西
特别是发现异常的过程,数据,作图,分组,发现问题,删一部分;数据,作图,分组,发现问题,删一部分……
这个过程跟队友一起搞了好久
这部分内容多且杂,而这篇文章想写得系统一些;为了保持思路清晰,数据处理中发现问题的过程(以及对应的代码)留到明天再单独总结
大佬说,最重要的是数据。数据和特征决定了机器学习的上限,而模型和算法只是逼近这个上限
我对这句话还懵懵懂懂:第四题的模型效果辣眼睛,也不知道是不是数据的问题;但如果真是数据的问题,第三题的效果明明还可以啊!!!
没事,听说哈梅巴赫也是在华尔街做宽客时,经历了一次公司停电丢数据,才意识到数据的重要性
但愿有生之年也有机会能像他一样潇洒
图片还转不过来真不好意思
1. #导包,导入数据
2. import pandas as pd
3. import numpy as np
4. import matplotlib.pyplot as plt
5. from pandas import DataFrame,Series
6. import seaborn as sns
7. from sklearn.linear_model import LinearRegression
8. from sklearn.model_selection import train_test_split
def _without(frame, mask):
    """Return *frame* minus the rows flagged True in the boolean *mask*."""
    return frame.drop(frame[mask].index)


# Load the raw records from list1.csv (the file is read as GBK).
data = pd.read_csv(
    r"D:/markdown/数学建模/附件3.3:2020校赛C题/2020校赛C题/附件/list1.csv",
    encoding="gbk",
)
data.shape


# Discard the five columns "Unnamed: 16" .. "Unnamed: 20".
data = data.drop(columns=["Unnamed: %d" % col_no for col_no in range(16, 21)])
data


# Remove every row that contains at least one missing value.
data = data.dropna(how="any", axis=0)
data.shape


# Remove rows whose temperature difference (dtemp) is zero or negative.
data = _without(data, data["dtemp"] <= 0)
data.shape


# Remove rows with non-positive POWER (the author notes that zero flow and
# zero power select the same rows).
data = _without(data, data["POWER"] <= 0)
data.shape


# Remove rows where both the room temperature and its set point are 0.
data = _without(
    data,
    (data["ROOM_TEMPERATURE"] == 0) & (data["ROOM_SET_TEMPERATURE"] == 0),
)
data.shape


# Remove rows where the valve status is '全关' (fully closed) yet the flow
# speed is non-zero.
data = _without(
    data,
    (data["OPEN_STATUS"] == '全关') & (data["FLOW_SPEED"] != 0),
)
data.shape


# Room temperature: anything outside mean +/- 3 standard deviations is
# treated as an outlier.
room_temp = data["ROOM_TEMPERATURE"]
mean = room_temp.mean()
std = room_temp.std()
lower = mean - 3 * std
upper = mean + 3 * std
print("均值:", mean)
print("标准差:", std)
print("下限:", lower)
print("上限:", upper)
room_temp[(room_temp < lower) | (room_temp > upper)]


# Box plot of the room temperature.
sns.boxplot(data=room_temp)


# Drop the room-temperature outliers.
data = _without(
    data,
    (data["ROOM_TEMPERATURE"] < lower) | (data["ROOM_TEMPERATURE"] > upper),
)


# Power: same 3-sigma rule, computed on the frame left after the previous drop.
power = data["POWER"]
mean = power.mean()
std = power.std()
lower = mean - 3 * std
upper = mean + 3 * std
print("均值:", mean)
print("标准差:", std)
print("下限:", lower)
print("上限:", upper)
power[(power < lower) | (power > upper)]


sns.boxplot(data=power)


data = _without(data, (data["POWER"] < lower) | (data["POWER"] > upper))
data


# Order the rows by reading time.
data = data.sort_values("SYS_READ_TIME")
data


# Save the cleaned table (GBK, index column included).
data
data.to_csv('./data.csv', encoding="gbk")
1. # 导包,导入文件
2. import pandas as pd
3. import numpy as np
4. import matplotlib.pyplot as plt
# Load the time/temperature table from list2.csv (read as GBK).
data2 = pd.read_csv(
    r"D:/markdown/数学建模/附件3.3:2020校赛C题/2020校赛C题/附件/list2.csv",
    encoding="gbk",
)
data2.shape


# Remove the 'Unnamed: 1' column (described by the author as the empty
# column in the middle of the sheet).
data2 = data2.drop(columns='Unnamed: 1')
11.
12.
13. # 时间转化为整形
def time_trans(time_ch):
    """Encode a 'YYYY/M/D H:MM' timestamp string as the integer YYYYMMDDHHMM.

    Month, day, hour and minute are each zero-padded to two digits, so
    '2017/12/1 5:30' becomes 201712010530. A trailing ':SS' seconds field,
    if present, is ignored.

    Args:
        time_ch: Timestamp text with '/'-separated date fields, whitespace,
            then ':'-separated clock fields.

    Returns:
        The timestamp as an int.

    Raises:
        ValueError: If the text does not have a date part with exactly three
            fields followed by a clock part with at least hour and minute.
    """
    # Split once into date and clock instead of re-splitting the whole string
    # for every field.
    date_part, clock_part = time_ch.split()[:2]
    year, month, day = date_part.split('/')
    hour, minute = clock_part.split(':')[:2]

    # Pad the minute as well: without it '5:5' would yield an 11-digit value
    # that no longer lines up with the other encoded timestamps.
    return int(
        year
        + month.zfill(2)
        + day.zfill(2)
        + hour.zfill(2)
        + minute.zfill(2)
    )
30.
# Quick check of the converter on a sample timestamp.
time_trans('2017/12/1 5:30')


# Integer-encoded timestamp (YYYYMMDDHHMM) for every row of data2.
new_time = [time_trans(stamp) for stamp in data2['时间']]
len(set(new_time))  # number of distinct timestamps


# Temperatures in the same row order.
new_temp = list(data2['温度'])
new_temp


# Combine both lists into a working frame.
from pandas.core.frame import DataFrame
time_temp = DataFrame({'new_time': new_time, 'new_temp': new_temp})
time_temp


# Turn the half-hourly series into a one-minute series: for every original
# timestamp t add 29 rows t+1 .. t+29 with no temperature yet.
# All filler rows are built in one shot and concatenated once. The previous
# row-by-row DataFrame.append rebuilt the whole frame for every inserted row
# and no longer exists in pandas 2.0.
# NOTE(review): adding minutes to the integer encoding is only valid while
# the minute field stays below 60, i.e. readings start at :00 .. :30 --
# confirm against the input data.
minute_steps = np.tile(np.arange(1, 30), len(new_time))
filler = DataFrame(
    {'new_time': np.repeat(np.asarray(new_time, dtype='int64'), 29) + minute_steps}
)
time_temp = pd.concat([time_temp, filler], ignore_index=True)
time_temp


# Sort by the new timestamps. mergesort is stable, so on a tie a real reading
# stays ahead of a filler row and can be forward-filled into it.
time_temp = time_temp.sort_values(by="new_time", ascending=True, kind="mergesort")
time_temp


# Discard the pre-sort index.
time_temp = time_temp.reset_index(drop=True)


# Fill each missing temperature with the closest valid value above it.
time_temp = time_temp.ffill()
time_temp
81. time_temp
82.
83.
84. # 把整形的时间转换回去,并放在一个新列表里
def time_trans1(time_ch):
    """Decode an integer YYYYMMDDHHMM timestamp into 'YYYY/MM/DD HH:MM' text.

    Args:
        time_ch: The encoded timestamp. Anything ``int()`` accepts is allowed
            (int, numpy integer, digit string, integral float), so a value
            that passed through a float column does not leak '.0' into the
            minute field.

    Returns:
        The timestamp as a string with zero-padded month, day, hour, minute.

    Raises:
        ValueError: If the value does not have exactly 12 digits (slicing a
            shorter or longer value would silently shift every field).
    """
    digits = str(int(time_ch))
    if len(digits) != 12:
        raise ValueError(
            'expected a 12-digit YYYYMMDDHHMM timestamp, got %r' % (time_ch,)
        )

    return '%s/%s/%s %s:%s' % (
        digits[:4],
        digits[4:6],
        digits[6:8],
        digits[8:10],
        digits[10:],
    )
100.
# Decode every integer timestamp back into its string form.
new_time1 = [time_trans1(stamp) for stamp in time_temp['new_time']]
len(new_time1)


# Temperatures of the expanded series, in matching row order.
new_temp1 = time_temp['new_temp'].tolist()
new_temp1


# Assemble the final two-column table under the original column names.
data = {'时间': new_time1, '温度': new_temp1}
data2 = DataFrame(data)
data2


# Save (GBK, index column included).
data2.to_csv('./data2.csv', encoding="gbk")
1. #导包,导入数据
2. import pandas as pd
3. import numpy as np
4. import matplotlib.pyplot as plt
# Load the two intermediate tables written by the earlier steps.
# NOTE(review): those steps save to './data.csv' and './data2.csv' while the
# paths below are absolute -- this only lines up if the working directory was
# this folder; confirm.
data = pd.read_csv(r"D:/markdown/数学建模/附件3.3:2020校赛C题/2020校赛C题/data.csv", encoding="gbk")
data2 = pd.read_csv(r"D:/markdown/数学建模/附件3.3:2020校赛C题/2020校赛C题/data2.csv", encoding="gbk")
data.shape
data2.shape


# Rename data2's time column so it matches the one in data.
# (This line was bare text without '#' in the listing, which is a syntax error.)
data2.rename(columns={'时间': 'SYS_READ_TIME'}, inplace=True)
data2


# Left-join on the timestamp column.
# NOTE(review): the join compares timestamp strings; data2 holds zero-padded
# 'YYYY/MM/DD HH:MM' text -- confirm SYS_READ_TIME uses the same format.
data1 = pd.merge(data, data2, how='left', on='SYS_READ_TIME')
data1.shape


# Drop rows with any missing value (per the author: rows whose outdoor
# temperature is missing).
data1 = data1.dropna(axis=0, how='any')
data1.shape


# Remove the two leftover index columns; both CSVs were saved with their
# index, and the merge suffixed the clashing 'Unnamed: 0' names with _x / _y.
del data1['Unnamed: 0_y']
del data1['Unnamed: 0_x']


# Save the merged table (GBK, index column included).
data1.to_csv('./data3.csv', encoding="gbk")
四.线性回归
1. # 导包,导入数据
2. import pandas as pd
3. import numpy as np
4. import matplotlib.pyplot as plt
5. from pandas import DataFrame,Series
6. import seaborn as sns
7. from sklearn.linear_model import LinearRegression
8. from sklearn.model_selection import train_test_split
# The error metrics printed below are not imported anywhere else in this
# listing; without this line the first metric print raises NameError.
from sklearn.metrics import mean_absolute_error, mean_squared_error

data = pd.read_csv(r"D:/markdown/数学建模/附件3.3:2020校赛C题/2020校赛C题/data3.csv", encoding="gbk")
data.shape


# Covariance and correlation coefficient between power and room temperature.
X, y = data['POWER'], data['ROOM_TEMPERATURE']
print("协方差:", X.cov(y))
print("相关系数:", X.corr(y))


# Covariance and correlation coefficient between the '温度' column (outdoor
# temperature, per the author) and room temperature.
X, y = data['温度'], data['ROOM_TEMPERATURE']
print("协方差:", X.cov(y))
print("相关系数:", X.corr(y))


# Pairwise correlation of all numeric columns. The frame also holds text
# columns (e.g. OPEN_STATUS), on which DataFrame.corr() raises in pandas 2.x,
# so restrict to numeric/bool columns explicitly.
corr_matrix = data.select_dtypes(include=['number', 'bool']).corr()
corr_matrix


# Heat map of the correlation matrix.
plt.figure(figsize=(15, 10))
ax = sns.heatmap(corr_matrix, cmap=plt.cm.RdYlGn, annot=True, fmt=".2f")
# Note: Matplotlib 3.1.1 has a bug that cuts off the first and last heatmap
# rows. It can be worked around by adjusting the y-axis limits by hand
# (older Matplotlib versions do not need this):
# a, b = ax.get_ylim()
# ax.set_ylim(a + 0.5, b - 0.5)


# Hold out 30% as a test set, fit a linear regression on the rest, then show
# the feature weights, the intercept and the evaluation metrics.
X, y = data[['POWER', '温度', 'IN_TEMPERATURE']], data['ROOM_TEMPERATURE']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
lr = LinearRegression()
lr.fit(X_train, y_train)

print("模型权重:", lr.coef_)
print("截距:", lr.intercept_)
y_hat = lr.predict(X_test)

mse = mean_squared_error(y_test, y_hat)
print("均方误差(MSE):", mse)
print("根均方误差(RMSE):", np.sqrt(mse))
print("平均绝对值误差(MAE):", mean_absolute_error(y_test, y_hat))
print("训练集R^2:", lr.score(X_train, y_train))
print("测试集R^2:", lr.score(X_test, y_test))
54.
55.
56. # 分户,各划出30%的测试集,线性回归训练、测试模型,并查看各特征变量的权重、截距,和检验标准
def func_all(x):
    """Fit and report a linear regression for one household.

    Selects the rows of the module-level ``data`` frame whose ``address_2nd``
    equals *x*, holds out 30% of them as a test set (``random_state=0``),
    fits ``ROOM_TEMPERATURE ~ POWER + IN_TEMPERATURE + 温度`` and prints the
    weights, intercept, MSE, RMSE, MAE and train/test R^2.

    Args:
        x: Household identifier compared against ``data.address_2nd``.

    Returns:
        None. All results are printed.
    """
    # Imported locally: the listing never imports these at module level, so
    # every metric call below used to raise NameError.
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    # Local name chosen so it does not shadow the module-level data2 frame.
    household = data[data.address_2nd == x]

    X = household[['POWER', 'IN_TEMPERATURE', '温度']]
    y = household['ROOM_TEMPERATURE']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    print("住户:", x)
    print("模型权重:", lr.coef_)
    print("截距:", lr.intercept_)
    y_hat = lr.predict(X_test)

    mse = mean_squared_error(y_test, y_hat)
    print("均方误差(MSE):", mse)
    print("根均方误差(RMSE):", np.sqrt(mse))
    print("平均绝对值误差(MAE):", mean_absolute_error(y_test, y_hat))
    # lr.score() computes this same R^2; r2_score takes (y_true, y_pred)
    # whereas score takes (X, y).
    print("训练集R^2:", r2_score(y_train, lr.predict(X_train)))
    print("测试集R^2:", r2_score(y_test, y_hat))
    print(' ')
77.
# Household identifiers to model, in reporting order.
address = [
    15311251, 15310819, 15310804, 15311234, 15311289, 15311072,
    15311061, 15310846, 15311065, 15311342, 15311245, 15310966,
    15311191, 15311361, 15310827, 15311196, 15311235, 15311233,
    15310808, 15311473, 15310839, 15310815, 15310845, 15311082,
]

# Print one regression report per household.
for x in address:
    func_all(x)
85.
86.
87. #将分户得到的数据写入csv文件
88.
89. import csv
90.
91. #python2可以用file替代open
# The metric functions are not imported anywhere else in this listing.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Household identifiers to model, in output order.
address = [
    15311251, 15310819, 15310804, 15311234, 15311289, 15311072,
    15311061, 15310846, 15311065, 15311342, 15311245, 15310966,
    15311191, 15311361, 15310827, 15311196, 15311235, 15311233,
    15310808, 15311473, 15310839, 15310815, 15310845, 15311082,
]


def _household_row(index, household_id):
    """Fit one household's regression and return its result.csv row.

    Uses the rows of the module-level ``data`` frame whose ``address_2nd``
    equals *household_id*, with a 30% test split (``random_state=0``).

    Args:
        index: Value written to the 'index' column.
        household_id: Household identifier to select.

    Returns:
        A list: index, household, weights, intercept, equation text,
        train R^2, test R^2, MSE, RMSE, MAE.
    """
    household = data[data.address_2nd == household_id]
    X = household[['POWER', 'IN_TEMPERATURE', '温度']]
    y = household['ROOM_TEMPERATURE']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_hat = lr.predict(X_test)

    weights = lr.coef_
    intercept = lr.intercept_
    equation = 'y = %f x1 + %f x2 +%f x3 +%f' % (weights[0], weights[1], weights[2], intercept)
    mse = mean_squared_error(y_test, y_hat)

    return [
        index,
        household_id,
        weights,
        intercept,
        equation,
        r2_score(y_train, lr.predict(X_train)),
        r2_score(y_test, y_hat),
        mse,
        np.sqrt(mse),
        mean_absolute_error(y_test, y_hat),
    ]


# newline="" is what the csv module requires of files it writes to; without
# it Windows output gets a blank line after every row. The encoding is made
# explicit and matches the GBK used for the other CSVs in this write-up.
with open("result.csv", "w", newline="", encoding="gbk") as csvfile:
    writer = csv.writer(csvfile)

    # Header row first (the stray double quote in the last name is removed).
    writer.writerow(['index', '住户', '模型权重', '截距', '回归方程', '训练集R^2', '测试集R^2',
                     '均方误差(MSE)', '根均方误差(RMSE)', '平均绝对值误差(MAE)'])

    # One row per household. The helper used to be called `all`, which
    # shadowed the builtin, and it read the loop counter from a global.
    # The loop that followed it in the listing is dropped: it referenced
    # lr / y_train / X_train, which exist only inside the helper, so it could
    # only raise NameError.
    for index, household_id in enumerate(address):
        writer.writerow(_household_row(index, household_id))
论文已提交,代码可以粘了黑黑黑
代码是粘的整理到word的附件,就出现了两排行号,难受,但懒得删了