课程来源:大数据分析师(第一期)(学堂在线 北邮 杨亚)
数据集分享:https://pan.baidu.com/s/197wLiuRLaB8kOxV9Ay7eTQ
提取码:hldd
注意点:
若使用 sklearn.preprocessing 中的函数(如 MinMaxScaler、StandardScaler)进行归一化,输入必须是二维数组!如果数据只有一维,可先用 reshape(-1, 1) 将其变为 n 行 1 列的二维矩阵形式。
参考资料:25、pandas的reshape(1,-1)什么意思?
import pandas as pd
import matplotlib.pyplot as plt

# Example 1: load two columns from the CSV file and prepare a figure with
# two side-by-side axes for value-frequency bar charts.
filename = 'gz3.csv'
# Only columns 11 and 12 are read; they are expected to be the
# 'PRES_new2' and 'HUMI_new2' columns accessed below.
df = pd.read_csv(filename, encoding='utf-8', usecols=[12, 11])
# Alternative data set (disabled):
# filename = 'sy3.csv'
# df = pd.read_csv(filename, encoding='utf-8', usecols=[22, 23])
print(df.head())
print(df.describe())
# DataFrame.info() writes its report itself and returns None, so wrapping
# it in print() would only append a stray "None" line.
df.info()

fig = plt.figure()
ax1 = fig.add_subplot(121)  # left half of a 1x2 grid
ax2 = fig.add_subplot(122)  # right half of a 1x2 grid
x1 = df['HUMI_new2']
y1 = df['PRES_new2']
def count_elements(scores):
    """Count how often each integer-truncated value occurs in *scores*.

    Every item is converted with ``int()`` (truncation toward zero) and the
    occurrences of each resulting integer are tallied.

    Args:
        scores: Iterable of values accepted by ``int()``.

    Returns:
        A dict mapping each truncated integer to its number of occurrences,
        in first-seen order. An empty iterable yields an empty dict.
    """
    scorescount = {}
    for i in scores:
        key = int(i)  # convert once and reuse for both lookup and store
        scorescount[key] = scorescount.get(key, 0) + 1
    return scorescount
# Tally the integer-truncated values of both columns and draw one bar chart
# per column (bar width 0.8, half-transparent).
counted1 = count_elements(x1)
counted2 = count_elements(y1)
ax1.set_title('HUMI')
ax1.bar(counted1.keys(), counted1.values(), 0.8, alpha=0.5, color='b')
# The 'PRES' title belongs to the right-hand axes; setting it on ax1 would
# overwrite 'HUMI' and leave the second chart untitled.
ax2.set_title('PRES')
ax2.bar(counted2.keys(), counted2.values(), 0.8, alpha=0.5, color='r')
import matplotlib.pyplot as plt
import pandas as pd

# Example 2: compare the raw data with two normalised versions on a 2x3 grid.
filename = 'gz3.csv'
df = pd.read_csv(filename, usecols=[12, 11], encoding='utf-8')
fig = plt.figure()

# Subplot 1: scatter plot of the untouched data.
x1 = df['PRES_new2']
y1 = df['HUMI_new2']
ax1 = fig.add_subplot(2, 3, 1)
ax1.scatter(x1, y1, s=10)
# Optional fixed axis limits (disabled):
# ax1.set_xlim(0, 1250)
# ax1.set_ylim(0, 120)
ax1.set_title('Original')
# Subplot 2: (0, 1) normalisation using MinMaxScaler.
# The bare string literal below is a disabled, hand-written variant of the
# same scaling; it is never executed.
'''
min=x1.min()
max=x1.max( )
ave=x1.mean()
std =x1.std()
x2=(x1-min)/(max-min)
'''
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# The scaler needs 2-D input, so each column is reshaped to n rows x 1 column.
x_reshape = x1.values.reshape(-1, 1)
y_reshape = y1.values.reshape(-1, 1)
# fit_transform re-fits the scaler on each call, so x and y are scaled
# independently of each other.
x2 = scaler.fit_transform(x_reshape)
y2 = scaler.fit_transform(y_reshape)
ax2 = fig.add_subplot(2, 3, 2)
ax2.scatter(x2, y2, s=10)
ax2.set_title('MinMaxScaler')
# Subplot 3: Z-score normalisation using StandardScaler.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# The scaler needs 2-D input, so each column is reshaped to n rows x 1 column.
x_reshape = x1.values.reshape(-1, 1)
y_reshape = y1.values.reshape(-1, 1)
# StandardScaler.fit_transform re-fits on each call, so x and y are
# standardised independently of each other.
x3 = scaler.fit_transform(x_reshape)
y3 = scaler.fit_transform(y_reshape)
ax3 = fig.add_subplot(2, 3, 3)
ax3.scatter(x3, y3, s=10)
ax3.set_title('StandardScaler')
def count_elements(scores):
    """Count how often each integer-truncated value occurs in *scores*.

    Every item is converted with ``int()`` (truncation toward zero) and the
    occurrences of each resulting integer are tallied.

    Args:
        scores: Iterable of values accepted by ``int()``.

    Returns:
        A dict mapping each truncated integer to its number of occurrences,
        in first-seen order. An empty iterable yields an empty dict.
    """
    scorescount = {}
    for i in scores:
        key = int(i)  # convert once and reuse for both lookup and store
        # dict.get(key, 0) returns the current tally, or 0 for a new key.
        scorescount[key] = scorescount.get(key, 0) + 1
    return scorescount
def count_elements1(scores):
    """Count values in *scores* after scaling by 100 and truncating to int.

    Multiplying by 100 before truncation widens the counting range, so that
    small values (for example normalised data) spread over many integer
    keys instead of collapsing onto a few.

    Args:
        scores: Iterable of numeric values.

    Returns:
        A dict mapping ``int(value * 100)`` to its number of occurrences,
        in first-seen order. An empty iterable yields an empty dict.
    """
    scorescount = {}
    for i in scores:
        key = int(i * 100)  # compute the scaled key once per item
        # dict.get(key, 0) returns the current tally, or 0 for a new key.
        scorescount[key] = scorescount.get(key, 0) + 1
    return scorescount
# Bottom row of the 2x3 grid: value-frequency bar charts of x for the raw,
# min-max-scaled and standardised data.
ax4 = fig.add_subplot(234)
ax5 = fig.add_subplot(235)
ax6 = fig.add_subplot(236)
ax4.set_title('ori')
ax5.set_title('minmax')
ax6.set_title('standard')
counted1 = count_elements(x1)
# x2 and x3 are (n, 1) arrays; flatten them so the counting helper iterates
# over scalars. Iterating the 2-D array would yield 1-element arrays, and
# calling int() on those is deprecated in recent NumPy releases.
counted2 = count_elements1(x2.ravel())
counted3 = count_elements1(x3.ravel())
ax4.bar(counted1.keys(), counted1.values(), 0.8, alpha=0.5, color='r')
ax5.bar(counted2.keys(), counted2.values(), 0.8, alpha=0.5, color='r')
ax6.bar(counted3.keys(), counted3.values(), 0.8, alpha=0.5, color='r')
# Discretisation of continuous values with pandas.cut.
import numpy as np
import pandas as pd

data = pd.Series(np.arange(11))  # the integers 0..10
result1 = pd.cut(data, 5)  # five equal-width bins
result2 = pd.cut(data, 4)  # four equal-width bins
# Explicit bin edges produce the intervals (0, 2], (2, 7], (7, 10]. With the
# default right-closed bins the value 0 falls outside and becomes NaN.
bins = [0, 2, 7, 10]
result3 = pd.cut(data, bins)
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print('--------------result1-----------------')
print(result1)
print('--------------result1 count-----------------')
print(result1.value_counts())
print('--------------result2-----------------')
print(result2)
print('--------------result2 count-----------------')
print(result2.value_counts())
print('--------------result3-----------------')
print(result3)
print('--------------result3 count-----------------')
print(result3.value_counts())
# Equal-width binning (cut) versus equal-frequency binning (qcut) on the
# same nine values.
data = pd.Series([0, 11, 18, 23, 40, 50, 69, 73, 88])
result1 = pd.cut(data, 4)   # four bins of equal width
result2 = pd.qcut(data, 4)  # four quantile-based bins
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print('--------------result1-----------------')
print(result1)
print('--------------result1 count-----------------')
print(result1.value_counts())
print('--------------result2-----------------')
print(result2)
print('--------------result2 count-----------------')
print(result2.value_counts())
# Bin the 'ave' column of sy3.csv, first into labelled custom intervals and
# then into four equal-width intervals.
file = "sy3.csv"
# Only column 20 is read; it is expected to be the 'ave' column used below.
df = pd.read_csv(file, encoding='utf-8', usecols=[20])
print('--------------head-----------------')
print(df.head())
print('--------------describe-----------------')
print(df.describe())
print('--------------info-----------------')
# info must be called: printing the bare attribute shows the bound method,
# not the summary. The method writes its own output and returns None.
df.info()

sections = [0, 50, 100, 150, 200, 300, 1200]  # bin edges
sections_name = ['green', 'yellow', 'orange', 'red', 'purple', 'Brownish red']
# NOTE(review): bins are right-closed by default, so a value of exactly 0
# (or anything above 1200) is left unlabelled as NaN -- confirm that this
# is intended for the data.
result = pd.cut(df['ave'], sections, labels=sections_name)
print('--------------result-----------------')
print(result)
print('--------------result type-----------------')
print(type(result))
print('--------------result count-----------------')
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print(result.value_counts())

result1 = pd.cut(df['ave'], 4)  # four equal-width bins
print('--------------result1-----------------')
print(result1.head())
# Banner corrected to name result1, which is what is counted here.
print('--------------result1 count-----------------')
print(result1.value_counts())