#2018-03-23 18:56:38 March Friday the 12 week, the 082 day SZ SSMR
https://blog.csdn.net/eastmount/article/details/60675865
【python数据挖掘学习笔记】十.Pandas、Matplotlib、PCA绘图实用代码补充
一. Pandas获取数据集并显示
采用Pandas对2002年~2014年的商品房价数据集作时间序列分析,从中抽取几个城市与贵阳做对比,并对贵阳商品房作出分析。
import matplotlib.pyplot as plt
import pandas as pd

# Read the price table straight into a DataFrame; index_col names the column
# that becomes the row index.
# Observed failure (original author's note): ValueError: Index year invalid
data = pd.read_csv("room32.csv", index_col='year')

# Report the table dimensions, then preview its first six rows.
print(data.shape)
print(data.head(6))

# Font setup so that Chinese labels and the minus sign are displayed correctly.
plt.rcParams.update({
    'font.sans-serif': ['simHei'],
    'axes.unicode_minus': False,
})

# Draw every column against the index and display the figure.
# (Saving this figure to disk is intentionally left disabled here.)
data.plot()
plt.show()
二. Pandas获取某列数据绘制柱状图
import matplotlib.pyplot as plt
import pandas as pd

# Read the price table; index_col names the column used as the row index.
# Observed failure (original author's note): ValueError: Index year invalid
data = pd.read_csv("room32.csv", index_col='year')

# Report the table dimensions, then preview its first six rows.
print(data.shape)
print(data.head(6))

# Font setup so that Chinese labels and the minus sign are displayed correctly.
plt.rcParams.update({
    'font.sans-serif': ['simHei'],
    'axes.unicode_minus': False,
})

# Plot all columns, write the figure to disk, then display it.
data.plot()
plt.savefig(u'时序图.png', dpi=500)
plt.show()

# Pull out the Guiyang column, print it, and plot it on its own.
gy = data['Guiyang']
print(u'输出贵阳数据')
print(gy)
gy.plot()
plt.show()
通过这个数据集调用bar函数可以绘制对应的柱状图,如下所示,需要注意x轴为年份,获取两列数据进行绘图。
# Imported here so this snippet runs on its own instead of relying on an
# earlier snippet having imported pyplot.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Read the price table; index_col names the column used as the row index.
# Observed failure (original author's note): ValueError: Index year invalid
data = pd.read_csv("room32.csv", index_col='year')

# Report the table dimensions, then preview its first six rows.
print(data.shape)
print(data.head(6))

# Pull out the Guiyang column and print it.
gy = data['Guiyang']
print(u'输出贵阳数据')
print(gy)

# Take the tick labels and the bar count from the data itself so the chart
# stays correct if the CSV gains or loses rows.
x = [str(year) for year in gy.index]
N = len(gy)
ind = np.arange(N)  # bar positions 0 .. N-1
width = 0.35

# Bars are explicitly centred on `ind`, so the tick labels belong at `ind`
# too; shifting them by width/2 would put each label at the bar's right edge.
plt.bar(ind, gy, width, color='r', label='sum num', align='center')
plt.xticks(ind, x, rotation=40)  # rotate the year labels by 40 degrees
plt.title('The price of Guiyang')
plt.xlabel('year')
plt.ylabel('price')
plt.show()
补充一段hist绘制直方图的代码:
import numpy as np
import pylab as pl

# Draw 1000 random samples from a normal distribution with
# mean 5.0 and standard deviation 3.0.
data = np.random.normal(loc=5.0, scale=3.0, size=1000)

# Histogram of the samples; per the original note, 'stepfilled' is used to
# get rid of the black outline.
pl.hist(data, histtype='stepfilled')

# Label the x axis and display the figure.
pl.xlabel('data')
pl.show()
三. Python绘制时间序列-自相关图
import matplotlib.pyplot as plt
import pandas as pd
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.stattools import adfuller as ADF

# Read the price table; index_col names the column used as the row index.
# Observed failure (original author's note): ValueError: Index year invalid
data = pd.read_csv("room32.csv", index_col='year')

# Report the table dimensions, then preview its first six rows.
print(data.shape)
print(data.head(6))

# Font setup so that Chinese labels and the minus sign are displayed correctly.
plt.rcParams['font.sans-serif'] = ['simHei']
plt.rcParams['axes.unicode_minus'] = False

# Time-series plot of every column.
data.plot()
plt.show()

gy = data['Guiyang']
print(gy)

# Autocorrelation plot of the Guiyang series. The figure returned by plot_acf
# is saved explicitly rather than through pyplot's implicit "current figure",
# and plt.show() is used to display it: Figure.show() does not run a GUI event
# loop, so in a plain script the window may flash by or never appear.
acf_fig = plot_acf(gy)
acf_fig.savefig(u'贵阳自相关图', dpi=300)
plt.show()

# Augmented Dickey-Fuller test on the same series.
print('ADF:', ADF(gy))
四. 聚类分析大连交易所数据集
sklearn自带一些数据集以及UCI官网提供大量的数据集。这里讲述一个大连商品交易所的数据集。
地址:http://www.dce.com.cn/dalianshangpin/xqsj/lssj/index.html#
# Part 1: load the data set.
import pandas as pd
# Observed failure (original author's note):
# pandas.errors.ParserError: Error tokenizing data. C error: Expected 1 fields in line 3, saw 3
Coke1 = pd.read_csv("coal35.csv")
print(Coke1[:4])

# Part 2: cluster the rows into three groups.
from sklearn.cluster import KMeans
clf = KMeans(n_clusters=3)
pre = clf.fit_predict(Coke1)
print(pre[:4])

# Part 3: reduce the data to two dimensions for plotting.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
newData = pca.fit_transform(Coke1)
print(newData[:4])
x1 = [row[0] for row in newData]
x2 = [row[1] for row in newData]

# Part 4: scatter plot of the two components, coloured by cluster label.
import matplotlib.pyplot as plt
# The original referenced plt.title without calling it, so no title was set.
plt.title("KMeans clusters (PCA projection)")
plt.xlabel("x feature")
plt.ylabel("y feature")
plt.scatter(x1, x2, c=pre, marker='x')
plt.savefig("bankloan.png", dpi=400)
plt.show()
五. PCA降维及绘图代码
from numpy import *
def loadDataSet(fileName, delim='\t'):
    """Read a delimited text file of numbers into a NumPy matrix.

    Every non-blank line is stripped, split on ``delim`` and each field is
    converted with ``float``.

    Args:
        fileName: Path of the text file to read.
        delim: Field separator used on each line (tab by default).

    Returns:
        A ``numpy.matrix`` with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    # The context manager closes the file even when parsing fails.
    with open(fileName) as fr:
        # Rows are built as real lists: on Python 3 ``map`` is lazy, and a
        # matrix of map objects breaks the arithmetic done downstream.
        # Blank lines are skipped so a trailing newline does not reach float().
        rows = [
            [float(field) for field in line.strip().split(delim)]
            for line in fr
            if line.strip()
        ]
    # ``asmatrix`` is used because the ``mat`` alias was removed in NumPy 2.0.
    return asmatrix(rows)
def pca(dataMat, topNfeat=9999999):
    """Project data onto its leading principal components.

    Args:
        dataMat: 2-D numeric data, one sample per row. It is converted to a
            ``numpy.matrix`` so the ``*`` operations below are matrix
            products even when an ndarray or nested list is passed.
        topNfeat: Number of leading components to keep; the default keeps
            all of them.

    Returns:
        A tuple ``(lowDDatMat, reconMat)``: the mean-centred data expressed
        in the kept components, and the data reconstructed from those
        components with the column means added back.
    """
    # Original author's note: "TypeError: unsupported operand type(s) for /:
    # 'map' and 'int'" was observed at the mean() call below.
    # NOTE(review): presumably raised when the rows are lazy map objects.
    dataMat = asmatrix(dataMat)
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals

    # Columns are the variables, hence rowvar=0.
    covMat = cov(meanRemoved, rowvar=0)
    # ``asmatrix`` replaces the ``mat`` alias, which NumPy 2.0 removed.
    eigVals, eigVects = linalg.eig(asmatrix(covMat))

    # argsort is ascending; the reversed slice picks the indices of the
    # topNfeat largest eigenvalues, largest first.
    eigValInd = argsort(eigVals)[:-(topNfeat + 1):-1]
    redEigVects = eigVects[:, eigValInd]

    lowDDatMat = meanRemoved * redEigVects
    reconMat = (lowDDatMat * redEigVects.T) + meanVals
    return lowDDatMat, reconMat
# Load the samples from '41.txt' and run pca keeping a single component.
dataMat = loadDataSet('41.txt')
lowDMat, reconMat = pca(dataMat, topNfeat=1)
def plotPCA(dataMat, reconMat):
    """Scatter the first two columns of the original and reconstructed data.

    The original points are drawn as red triangles and the reconstructed
    points as yellow circles on the same axes. The figure is titled 'PCA',
    written to 'ccc.png' and then displayed.

    Args:
        dataMat: 2-D data with at least two columns (original samples).
        reconMat: 2-D data with at least two columns (reconstructed samples).
    """
    import matplotlib.pyplot as plt

    # Plain ndarrays so that column slices come back one-dimensional.
    datArr = array(dataMat)
    reconArr = array(reconMat)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Column slices replace the manual index/append loops.
    ax.scatter(datArr[:, 0], datArr[:, 1], s=90, c='red', marker='^')
    ax.scatter(reconArr[:, 0], reconArr[:, 1], s=50, c='yellow', marker='o')
    plt.title('PCA')
    plt.savefig('ccc.png', dpi=400)
    plt.show()
# Draw the original samples together with their reconstruction.
plotPCA(dataMat=dataMat, reconMat=reconMat)