《基于Python的大数据分析基础及实战》第四章

第四章

文件下载

阿里云永不限速:https://www.aliyundrive.com/s/1nLLM4keEKS

若没有注册阿里云请使用以下链接注册,双方各领300G

我在使用超好用的「阿里云盘」,注册就领 300 GB 容量,完成新手任务再领 500 GB,快来试试吧点此链接领取福利:
https://pages.aliyundrive.com/mobile-page/web/beinvited.html?code=e212062

蓝奏链接:https://wwm.lanzouf.com/iPJ6v0302zef
原教材pdf下载链接在第一章

开始

个人信息
  • key:word
  • key:word
  • key:word

4.1 使用Python对数据进行可视化处理

4.1.1 准备工作

# Parametric curve (sin(t^2), cos(t^2)): every sample lies on the unit circle.
import numpy as np
import matplotlib.pyplot as plt

t = np.arange(1, 10, 0.0005)
t_squared = t ** 2
x = np.sin(t_squared)
y = np.cos(t_squared)

plt.figure(figsize=(8, 5))
plt.plot(x, y, 'r-*')
# Equal axis scaling keeps the circle from being drawn as an ellipse.
plt.axis('equal')
plt.xlabel(r'$\sin(t^2)$')
plt.ylabel(r'$\cos(t^2)$')
plt.title('a circle')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-myoaF0NY-1649650392318)(output_3_0.png)]

4.1.2 Matplotlib绘图示例

1. 点图和线图

# Two stacked subplots: a damped cosine on top, an undamped cosine below.
import numpy as np
import matplotlib.pyplot as plt

# num=50 is the np.linspace default, written out explicitly here.
x1 = np.linspace(0.0, 5.0, num=50)
x2 = np.linspace(0.0, 2.0, num=50)
y1 = np.exp(-x1) * np.cos(2 * np.pi * x1)
y2 = np.cos(2 * np.pi * x2)

# Upper panel: yellow circles joined by a solid line.
plt.subplot(211)
plt.plot(x1, y1, 'yo-')
plt.title('A tale of 2 subplots')
plt.ylabel('Damped oscillation')

# Lower panel: red dots joined by a solid line.
plt.subplot(212)
plt.plot(x2, y2, 'r.-')
plt.xlabel('time (s)')
plt.ylabel('Undamped')

plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-zKG3chVB-1649650392319)(output_6_0.png)]

2. 直方图

# Histogram of normally distributed samples with the matching normal pdf
# drawn on top.
import numpy as np
import scipy
from scipy import stats
# matplotlib.mlab is left disabled; scipy.stats supplies the pdf instead.
#import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

mu = 100
sigma = 15
x = mu + sigma * np.random.randn(10000)
x.shape

num_bins = 50

# density=True so the bar heights are comparable with the pdf curve.
n, bins, patches = plt.hist(x, bins=num_bins, density=True, color='green')
# Evaluate the normal density at the bin edges returned by plt.hist.
y = stats.norm.pdf(bins, loc=mu, scale=sigma)
plt.plot(bins, y, 'r--')
plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.subplots_adjust(left=0.15)
plt.show()
bins

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-hmzXZcDS-1649650392320)(output_8_0.png)]

array([ 49.1159973 ,  51.28145314,  53.44690897,  55.61236481,
        57.77782065,  59.94327648,  62.10873232,  64.27418815,
        66.43964399,  68.60509983,  70.77055566,  72.9360115 ,
        75.10146733,  77.26692317,  79.432379  ,  81.59783484,
        83.76329068,  85.92874651,  88.09420235,  90.25965818,
        92.42511402,  94.59056985,  96.75602569,  98.92148153,
       101.08693736, 103.2523932 , 105.41784903, 107.58330487,
       109.7487607 , 111.91421654, 114.07967238, 116.24512821,
       118.41058405, 120.57603988, 122.74149572, 124.90695155,
       127.07240739, 129.23786323, 131.40331906, 133.5687749 ,
       135.73423073, 137.89968657, 140.0651424 , 142.23059824,
       144.39605408, 146.56150991, 148.72696575, 150.89242158,
       153.05787742, 155.22333326, 157.38878909])

3. 等值线图

# Paraboloid z = x^2 + y^2 shown as a 3D triangulated surface (left) and as
# labelled contour lines (right).
import numpy as np
from matplotlib import cm
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

delta = 0.2
x = np.arange(-3, 3, delta)
y = np.arange(-3, 3, delta)
X, Y = np.meshgrid(x, y)
Z = X ** 2 + Y ** 2

# plot_trisurf is given 1-D coordinate arrays, so flatten the grids.
x, y, z = X.ravel(), Y.ravel(), Z.ravel()

fig = plt.figure(figsize=(12, 6))

ax1 = fig.add_subplot(1, 2, 1, projection='3d')
ax1.plot_trisurf(x, y, z, cmap=cm.autumn, linewidth=0.01)
plt.title('3D')

ax2 = fig.add_subplot(1, 2, 2)
cs = ax2.contour(X, Y, Z, levels=15, cmap='jet')
ax2.clabel(cs, inline=True, fontsize=10, fmt='%1.1f')
plt.title('Contour')

plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-7T6O79SP-1649650392325)(output_10_0.png)]

4. 三维曲面图

# Semi-transparent 3D surface with contour projections drawn on three
# planes of the axes box.
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
from matplotlib import cm

fig = plt.figure(figsize=(8, 6))
# Alternative spelling kept from the original, disabled:
#ax = fig.gca(projection='3d')
ax = fig.add_subplot(111, projection='3d')

# Sample X, Y, Z grids provided by mpl_toolkits.
X, Y, Z = axes3d.get_test_data(0.05)
ax.plot_surface(X, Y, Z, rstride=8, cstride=8, alpha=0.3)

# One contour projection per axis direction, each at the given offset.
for zdir, offset in (('z', -100), ('x', -40), ('y', 40)):
    cset = ax.contour(X, Y, Z, zdir=zdir, offset=offset, cmap=cm.coolwarm)

ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
ax.set_xlim(-40, 40)
ax.set_ylim(-40, 40)
ax.set_zlim(-100, 100)
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-zn9qrWZN-1649650392326)(output_12_0.png)]

5. 条形图

# Grouped bar chart: mean scores of men and women in five groups, with
# standard deviations drawn as error bars.
import numpy as np
import matplotlib.pyplot as plt

n_groups = 5
means_men = (20, 35, 30, 35, 27)
std_men = (2, 3, 4, 1, 2)
means_women = (25, 32, 34, 20, 25)
std_women = (3, 5, 2, 3, 3)

fig, ax = plt.subplots()
index = np.arange(n_groups)
bar_width = 0.35
opacity = 0.4
error_config = {'ecolor': '0.3'}

# Men's bars sit at index, women's bars one bar width to the right.
rects1 = plt.bar(index, means_men, bar_width,
                 alpha=opacity, color="b", yerr=std_men,
                 error_kw=error_config, label='Men')
rects2 = plt.bar(index + bar_width, means_women, bar_width,
                 alpha=opacity, color="r", yerr=std_women,
                 error_kw=error_config, label='Women')

plt.xlabel("Group")
plt.ylabel("Scores")
plt.title("Scores by group and gender")
# plt.bar centres each bar on its x value by default, so the midpoint of a
# men/women pair is index + bar_width / 2 (index + bar_width would put the
# tick under the women's bar only).
plt.xticks(index + bar_width / 2, ('A', 'B', 'C', 'D', 'E'))
plt.legend()
plt.tight_layout()
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-pG8cd4hY-1649650392326)(output_14_0.png)]

6. 饼图

# Pie chart with the second wedge ('Hogs') pulled out of the pie.
import matplotlib.pyplot as plt

labels = ('Frogs', 'Hogs', 'Dogs', 'Logs')
sizes = [15, 30, 45, 10]
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral']
# One offset per wedge; only the second wedge is exploded.
explode = (0, 0.1, 0, 0)

plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
# Equal aspect ratio so the pie is drawn as a circle.
plt.axis('equal')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-npJorIjr-1649650392327)(output_16_0.png)]

7. 气泡图(散点图)

# Bubble chart: sepal length vs. sepal width, marker area driven by
# petal length.
import matplotlib.pyplot as plt
import pandas as pd

df_data = pd.read_excel("iris.xls")

# Scale petal length by 100 so it is usable as a marker size.
sizes = df_data['petal length'] * 100

fig = plt.figure(figsize=(10, 8))
plt.scatter(
    df_data['sepal length'],
    df_data['sepal width'],
    s=sizes,
    alpha=0.6,
)
plt.xlabel('Sepal Length(cm)')
plt.ylabel('Sepal Width(cm)')
plt.title('Petal Length(cm)*100')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-iW7X3Af7-1649650392327)(output_18_0.png)]

4.1.3 Seaborn 中的图例

1. 数据分布可视化

from sklearn.datasets import load_iris
import numpy as np
iris=load_iris()
iris.data
iris
from pandas import DataFrame
df=DataFrame(iris.data,columns=iris.feature_names)
df['target']=iris.target
df
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
%config InlineBackend.figure_format='retina'
import seaborn as sns
sns.set(color_codes=True)
sns.displot(df['petal length (cm)'],bins=15)
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-kfssWRfQ-1649650392328)(output_21_0.png)]

# Joint plot of sepal length against sepal width with marginal plots.
sns.jointplot(
    data=df,
    x='sepal length (cm)',
    y='sepal width (cm)',
    height=8,
)
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-lNhEfqgC-1649650392328)(output_22_0.png)]

# Sepal length vs. sepal width scatter, one colour per target class.
grid = sns.FacetGrid(df, hue='target', height=8)
grid.map(plt.scatter, 'sepal length (cm)', 'sepal width (cm)')
grid.add_legend()

plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-z4xBIBdq-1649650392329)(output_23_0.png)]

# Hexbin joint plot drawn with the 'dark' axes style.
# sns.axes_style() only returns the style parameters; calling it as a bare
# statement changes nothing, so it is used as a context manager to apply
# the style to this one plot.
with sns.axes_style('dark'):
    sns.jointplot(x="sepal length (cm)", y='sepal width (cm)',
                  data=df, kind="hex", color='k')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-dWYvt4Lx-1649650392329)(output_24_0.png)]

# KDE joint plot with the raw observations overlaid as '+' markers.
g = sns.jointplot(
    data=df,
    x="sepal length (cm)",
    y='sepal width (cm)',
    kind="kde",
    color='m',
)
g.plot_joint(plt.scatter, c='y', s=30, linewidth=1, marker='+')
# Make the first collection on the joint axes fully transparent.
g.ax_joint.collections[0].set_alpha(0)
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-c25ogzsC-1649650392330)(output_25_0.png)]

# Pairwise grid over all columns of df: histograms on the diagonal,
# scatter plots everywhere else.
g = sns.PairGrid(data=df)
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter)
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-HFTzQCfJ-1649650392330)(output_26_0.png)]

# Same pairwise grid, coloured by the 'target' column and with a legend.
g = sns.PairGrid(data=df, hue='target')
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter)
g.add_legend()
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-b2KiPXvO-1649650392331)(output_27_0.png)]

# Pairwise grid restricted to the two sepal measurements, coloured by target.
sepal_columns = ['sepal length (cm)', 'sepal width (cm)']
g = sns.PairGrid(df, vars=sepal_columns, hue="target")
g.map(plt.scatter)
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-sTaRGkGv-1649650392331)(output_28_0.png)]

# Pairwise grid with a different plot type per region: scatter above the
# diagonal, KDE below it and on the diagonal itself.
g = sns.PairGrid(data=df)
g.map_upper(plt.scatter)
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_diag(sns.kdeplot, lw=3, legend=False)
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-egGR5SVb-1649650392332)(output_29_0.png)]

# Pair plot coloured by target; seaborn renamed `size` to `height`, so the
# current parameter name is used.
sns.pairplot(df, hue="target", height=2.5)
D:\ANACONDA\envs\py397\lib\site-packages\seaborn\axisgrid.py:2076: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)






[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-rD6FjpOU-1649650392332)(output_30_2.png)]

# Pair plot coloured by target, with KDE curves on the diagonal.
g = sns.pairplot(
    data=df,
    hue="target",
    palette="Set2",
    diag_kind="kde",
    height=2.5,
)

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-9DDGtXJZ-1649650392332)(output_31_0.png)]

2. 线性相关图

# Linear-model plot of petal width against sepal length, one colour per
# target class.
sns.lmplot(
    data=df,
    x='sepal length (cm)',
    y='petal width (cm)',
    hue='target',
)
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-iiCErT0Y-1649650392333)(output_33_0.png)]

3. 分类数据可视化

# Violin plot of sepal length per target class (inner=None), with the
# individual observations drawn on top as a white, half-transparent swarm.
sns.violinplot(data=df, x='target', y="sepal length (cm)", inner=None)
sns.swarmplot(data=df, x='target', y="sepal length (cm)",
              color='w', alpha=0.5)
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-15GTWlX9-1649650392333)(output_35_0.png)]

# Box plot of sepal length for each target class.
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, x='target', y="sepal length (cm)")
plt.title("Boxplot")
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-B3o6RZFB-1649650392334)(output_36_0.png)]

热力图

## Heat map
## Visualise the magnitude of the correlation coefficients
import numpy as np

newdata = df
# rowvar=False: each column of newdata is treated as one variable.
datacor = np.corrcoef(newdata, rowvar=False)
datacor = pd.DataFrame(
    data=datacor,
    index=newdata.columns,
    columns=newdata.columns,
)

## Form 1: hide the upper triangle (diagonal included) with a mask
mask = np.zeros_like(datacor)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize=(8, 8))
with sns.axes_style("white"):
    ax = sns.heatmap(datacor, mask=mask, square=True, annot=True)
ax.set_title("Iris data Variables Relation")
plt.show()

## Form 2: full matrix, annotations formatted with "f"
plt.figure(figsize=(8, 8))
with sns.axes_style("white"):
    ax = sns.heatmap(datacor, square=True, annot=True, fmt="f")
ax.set_title("Iris data Variables Relation")
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-PsXyduiD-1649650392334)(output_38_0.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-7KzSoIJ1-1649650392335)(output_38_1.png)]

4.1.4 pandas 的一些可视化功能

1. 绘制箱线图

# pandas box plots of every column of df, grouped by the 'target' column.
df.boxplot(by="target", figsize=(12, 6))
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-wnbiaOYF-1649650392335)(output_41_0.png)]

2. 时间序列图

# Random-walk time series: a single Series first, then four columns sharing
# the same date index.
ts = pd.Series(np.random.randn(1000),
               index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()

df0 = pd.DataFrame(np.random.randn(1000, 4),
                   index=ts.index, columns=list('ABCD'))
df0 = df0.cumsum()
# DataFrame.plot() opens its own figure when no ax is given, so a preceding
# plt.figure(figsize=...) would only leave an empty figure behind; the size
# is passed to plot() directly instead.
df0.plot(figsize=(8, 6))
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-GCCVZt69-1649650392336)(output_43_0.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-tqhnARzf-1649650392336)(output_43_2.png)]

3. 安德鲁曲线和平行坐标图

# Andrews curves of df, one colour per value of the 'target' column.
# parallel_coordinates is imported here as well.
from pandas.plotting import andrews_curves, parallel_coordinates

plt.figure(figsize=(6, 4))
andrews_curves(df, class_column='target')
plt.title('andrews curves')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-wjIoSBA9-1649650392336)(output_45_0.png)]

# Parallel-coordinates plot of df, one colour per value of 'target'.
plt.figure(figsize=(6, 4))
parallel_coordinates(df, class_column="target")
plt.title("parallel coordinates")
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-DPeyiOcm-1649650392337)(output_46_0.png)]

# RadViz plot of df, one colour per value of the 'target' column.
from pandas.plotting import radviz

plt.figure(figsize=(8, 6))
radviz(df, class_column='target')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-sRVHEAX0-1649650392337)(output_47_0.png)]

4.1.5 文本数据可视化

# Draw a random geometric graph and the nodes reachable from the node
# nearest to the centre of the unit square.
import networkx as nx
import matplotlib.pyplot as plt

G = nx.random_geometric_graph(200, 0.125)
# random_geometric_graph stores each position in the 'pos' node attribute.
pos = nx.get_node_attributes(G, 'pos')

# Find the node nearest to (0.5, 0.5), comparing squared distances.
dmin = 1
ncenter = 0
for n, (x, y) in pos.items():
    d = (x - 0.5) ** 2 + (y - 0.5) ** 2
    if d < dmin:
        ncenter, dmin = n, d

# Shortest-path length from the central node to every node it can reach.
p = nx.single_source_shortest_path_length(G, ncenter)

plt.figure(figsize=(8, 8))
nx.draw_networkx_edges(G, pos, nodelist=[ncenter], alpha=0.4)
nx.draw_networkx_nodes(G, pos, nodelist=p.keys(), node_size=80)
plt.xlim(-0.05, 1.05)
plt.ylim(-0.05, 1.05)
plt.axis('off')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-XVzkDsxh-1649650392338)(output_49_0.png)]

4.1.7 folium 绘制地图

# Build a folium heat map from 100 random points scattered around (48, 5)
# and save it as an HTML file.
import folium
from folium import plugins
import numpy as np

# Each row is three values drawn around (48, 5, 1) with unit spread.
scale = np.array([[1, 1, 1]])
offset = np.array([[48, 5, 1]])
data = (np.random.normal(size=(100, 3)) * scale + offset).tolist()

# NOTE(review): 'stamentoner' may not be available as a built-in tile set in
# newer folium releases - confirm against the installed version.
mapa = folium.Map(location=[48., 5.], tiles='stamentoner', zoom_start=6)
mapa.add_child(plugins.HeatMap(data))
mapa.save('Heatmap.html')
# Open the sample image with PIL and echo it in the notebook.
from PIL import Image

image_path = 'zhouzhou.jpg'
pil_im = Image.open(image_path)
# Grayscale ("L" mode) copy.
# NOTE(review): this is bound to `Pil_im` (capital P) and never used in the
# visible cells; it may have been meant to rebind `pil_im` - confirm.
Pil_im = Image.open(image_path).convert("L")
pil_im

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-d14fS41P-1649650392338)(output_52_0.png)]

# Shrink pil_im in place so it fits within 128x128, then echo it.
thumbnail_size = (128, 128)
pil_im.thumbnail(thumbnail_size)
pil_im

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-RdK1XHuj-1649650392338)(output_53_0.png)]

# Crop a rectangular region out of the image and rotate it by 90 degrees.
from PIL import Image

pil_im = Image.open('zhouzhou.jpg')
# Crop box given as (left, upper, right, lower).
box = (150, 350, 400, 600)
region = pil_im.crop(box).transpose(Image.ROTATE_90)
region
C:\Users\b2014\AppData\Local\Temp\ipykernel_20996\2549292298.py:5: DeprecationWarning: ROTATE_90 is deprecated and will be removed in Pillow 10 (2023-07-01). Use Transpose.ROTATE_90 instead.
  region=region.transpose(Image.ROTATE_90)

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-jTvICxzz-1649650392339)(output_54_1.png)]

# Reload the image and resize it to exactly 128x128.
pil_im = Image.open('zhouzhou.jpg')
target_size = (128, 128)
out = pil_im.resize(target_size)
out

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-DFlxo0WZ-1649650392339)(output_55_0.png)]

# Rotate the resized image by 45 degrees and echo the result.
out = out.rotate(angle=45)
out

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-0gIm23VN-1649650392339)(output_56_0.png)]

5. 图像轮廓和直方图

%reset -f
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
im=np.array(Image.open("zhouzhou.jpg").convert("L"))
im.shape
(1054, 737)
# Contour plot of the grayscale image, followed by a 128-bin histogram of
# its pixel values.
plt.figure()
# Switch the colormap to gray.
plt.gray()
plt.contour(im, origin='image')
plt.axis("equal")
plt.show()

# Histogram over all pixels, so the 2-D array is flattened first.
plt.hist(im.ravel(), bins=128)
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-KbcENCjE-1649650392339)(output_59_0.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-3sUToHiR-1649650392340)(output_59_1.png)]

4.2.2 OpenCV 图库

安装py_opencv

conda install opencv

#### 1. 读取和写入图像


```python
# Read the JPEG with OpenCV and write it back out as a PNG.
import cv2

source_path = 'zhouzhou.jpg'
im = cv2.imread(source_path)
im.shape
# The last expression is echoed by the notebook.
cv2.imwrite('zhouzhou1.png', im)
True

2. 颜色空间

# Convert the image to grayscale with the BGR-to-gray conversion code,
# print the pixel matrix and echo its shape.
im = cv2.imread('zhouzhou.jpg')
gray = cv2.cvtColor(im, code=cv2.COLOR_BGR2GRAY)
print(gray)
gray.shape
[[173 175 177 ...  31  32  32]
 [172 175 177 ...  30  30  31]
 [172 175 177 ...  29  29  30]
 ...
 [ 40  41  43 ... 123 123 124]
 [ 40  41  43 ... 123 123 123]
 [ 40  41  44 ... 122 122 123]]





(1054, 737)
# Look up three OpenCV colour-conversion codes. These are bare expressions,
# so in a notebook cell only the value of the last one is echoed.
cv2.COLOR_BGR2GRAY 
cv2.COLOR_BGR2RGB 
cv2.COLOR_GRAY2BGR
8

3. 图像显示

# Show the grayscale image next to its integral image.
import matplotlib.pyplot as plt

intim = cv2.integral(gray)
# Rescale to the 0-255 range. The multiplication uses a float (255.0) on
# purpose: multiplying the integer sums by the int 255 can overflow the
# integer dtype and wrap around for an image of this size.
intim = (255.0 * intim) / intim.max()

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.imshow(gray)
plt.title("YTZ picture")
plt.subplot(1, 2, 2)
plt.imshow(intim)
plt.title('YTZ integral')
plt.show()
    ···

你可能感兴趣的:(教材,big,data)