使用np.array()创建
使用plt创建
使用np的routines函数创建
使用array()创建一个一维数组
In [3]:
import numpy as np
arr = np.array([1,2,3,4,5,6])
arr
Out[3]:
array([1, 2, 3, 4, 5, 6])
In [4]:
np.array([[1,2,3,4],[5,6,7,8],[9,9,9,9]])
Out[4]:
array([[1, 2, 3, 4],
[5, 6, 7, 8],
[9, 9, 9, 9]])
In [6]:
# An ndarray holds a single dtype, so one float literal promotes every element to float.
mixed_values = [1, 2.2, 3, 4, 5, 6]
arr = np.array(mixed_values)
arr
Out[6]:
array([1. , 2.2, 3. , 4. , 5. , 6. ])
In [10]:
# NOTE(review): this import sits mid-notebook; consider moving it to the first cell with the other imports.
import matplotlib.pyplot as plt
# Read the image file next to the notebook into an array (type(img_arr) is numpy.ndarray)
img_arr = plt.imread('./1.jpg')
# Render the array as an image
plt.imshow(img_arr)
Out[10]:
In [11]:
# Darken the image by 100 levels per channel.
# img_arr is presumably uint8 (8-bit JPEG read by plt.imread), so a plain `img_arr - 100`
# wraps around below zero (e.g. 50 - 100 -> 206) instead of getting darker.
# Widen to a signed type, clamp to the valid 0..255 range, then cast back for display.
plt.imshow(np.clip(img_arr.astype(np.int16) - 100, 0, 255).astype(np.uint8))
Out[11]:
In [12]:
np.zeros((3,4))
Out[12]:
array([[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.]])
In [13]:
np.linspace(0,100,num=20)
Out[13]:
array([ 0. , 5.26315789, 10.52631579, 15.78947368,
21.05263158, 26.31578947, 31.57894737, 36.84210526,
42.10526316, 47.36842105, 52.63157895, 57.89473684,
63.15789474, 68.42105263, 73.68421053, 78.94736842,
84.21052632, 89.47368421, 94.73684211, 100. ])
In [14]:
np.arange(0,100,step=3)
Out[14]:
array([ 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48,
51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96, 99])
In [16]:
np.random.randint(0,100,size=(5,6))
Out[16]:
array([[71, 76, 47, 11, 7, 6],
[47, 89, 70, 44, 41, 96],
[58, 42, 36, 53, 49, 55],
[13, 32, 64, 58, 15, 7],
[78, 56, 40, 71, 45, 63]])
In [18]:
np.random.random((3,4))
Out[18]:
array([[0.24913375, 0.91988476, 0.36386714, 0.58404557],
[0.15544885, 0.73892461, 0.82189615, 0.80368295],
[0.07230386, 0.45535116, 0.75370029, 0.03377829]])
In [23]:
# Fix the seed so the random draw below is reproducible across runs
np.random.seed(10)
np.random.randint(0,100,size=(5,6))
Out[23]:
array([[ 9, 15, 64, 28, 89, 93],
[29, 8, 73, 0, 40, 36],
[16, 11, 54, 88, 62, 33],
[72, 78, 49, 51, 54, 77],
[69, 13, 25, 13, 92, 86]])
In [30]:
# Basic ndarray attributes. A notebook cell displays only its last expression,
# so only the result of type(...) appears in the output.
img_arr.shape   # length of each axis
img_arr.ndim    # number of axes
img_arr.size    # total number of elements
img_arr.dtype   # element data type
type(img_arr)   # container type (numpy.ndarray)
Out[30]:
numpy.ndarray
In [32]:
# Request the element type explicitly at creation time (unsigned 8-bit integers).
arr = np.array([1, 2, 3], dtype=np.uint8)
In [38]:
arr = np.array([1,2,3])
In [39]:
# Convert the elements to 32-bit integers.
# Assigning to arr.dtype converts nothing: it reinterprets the existing bytes in place
# (a 3-element int64 array would turn into 6 int32 values).
# astype() returns a new array with every value properly cast.
arr = arr.astype('int32')
In [40]:
arr = np.random.randint(0,100,size=(6,8))
arr
Out[40]:
array([[30, 30, 89, 12, 65, 31, 57, 36],
[27, 18, 93, 77, 22, 23, 94, 11],
[28, 74, 88, 9, 15, 18, 80, 71],
[88, 11, 17, 46, 7, 75, 28, 33],
[84, 96, 88, 44, 5, 4, 71, 88],
[88, 50, 54, 34, 15, 77, 88, 15]])
In [41]:
arr[1]
Out[41]:
array([27, 18, 93, 77, 22, 23, 94, 11])
In [44]:
arr.shape
Out[44]:
(6, 8)
In [43]:
#切出前两行
arr[0:2]
Out[43]:
array([[30, 30, 89, 12, 65, 31, 57, 36],
[27, 18, 93, 77, 22, 23, 94, 11]])
In [45]:
# Slice out the first two columns; 2-D indexing takes the form arr[rows, columns]
arr[:,0:2]
Out[45]:
array([[30, 30],
[27, 18],
[28, 74],
[88, 11],
[84, 96],
[88, 50]])
In [46]:
#切出前两行的前两列的数据
arr[0:2,0:2]
Out[46]:
array([[30, 30],
[27, 18]])
In [49]:
#数组数据翻转
plt.imshow(img_arr)
Out[49]:
In [50]:
img_arr.shape # the first two dimensions are the pixel grid, the last one holds the colour channels
Out[50]:
(426, 640, 3)
In [51]:
# Flip the image upside down by reversing the first (row) axis
plt.imshow(img_arr[::-1,:,:])
Out[51]:
In [52]:
# Mirror the image left-to-right by reversing the second (column) axis
plt.imshow(img_arr[:,::-1,:])
Out[52]:
In [53]:
# Reverse all three axes at once: rows, columns, and the order of the colour channels
plt.imshow(img_arr[::-1,::-1,::-1])
Out[53]:
In [54]:
#裁剪
plt.imshow(img_arr)
Out[54]:
In [55]:
plt.imshow(img_arr[50:200,50:300,:])
Out[55]:
In [57]:
arr = np.array([1,2,3,4,5,6])
arr
Out[57]:
array([1, 2, 3, 4, 5, 6])
In [60]:
#将一维数组变形成二维
arr.reshape((2,3))
Out[60]:
array([[1, 2, 3],
[4, 5, 6]])
In [61]:
# -1 asks NumPy to infer that dimension: 6 elements / 2 columns = 3 rows
arr.reshape(-1, 2)
Out[61]:
array([[1, 2],
[3, 4],
[5, 6]])
In [64]:
arr1 = np.array([[1,2,3],[4,5,6]])
arr1
Out[64]:
array([[1, 2, 3],
[4, 5, 6]])
In [68]:
np.concatenate((arr1,arr1),axis=1)
Out[68]:
array([[1, 2, 3, 1, 2, 3],
[4, 5, 6, 4, 5, 6]])
In [69]:
arr2 = np.array([[1,2,3,3],[4,5,6,6]])
arr2
Out[69]:
array([[1, 2, 3, 3],
[4, 5, 6, 6]])
In [71]:
# Concatenate arr1 and arr2 side by side (axis=1); this works because both have two rows
np.concatenate((arr1,arr2),axis=1)
Out[71]:
array([[1, 2, 3, 1, 2, 3, 3],
[4, 5, 6, 4, 5, 6, 6]])
In [74]:
arr = np.random.randint(0,10,size=(4,5))
arr
Out[74]:
array([[6, 6, 5, 6, 0],
[0, 6, 9, 1, 8],
[9, 1, 2, 8, 9],
[9, 5, 0, 2, 7]])
In [77]:
arr.sum(axis=1)
Out[77]:
array([23, 24, 29, 23])
In [78]:
np.sin(arr)
Out[78]:
array([[-0.2794155 , -0.2794155 , -0.95892427, -0.2794155 , 0. ],
[ 0. , -0.2794155 , 0.41211849, 0.84147098, 0.98935825],
[ 0.41211849, 0.84147098, 0.90929743, 0.98935825, 0.41211849],
[ 0.41211849, -0.95892427, 0. , 0.90929743, 0.6569866 ]])
In [81]:
arr = np.random.random(size=(3,4))
arr
Out[81]:
array([[0.07961309, 0.30545992, 0.33071931, 0.7738303 ],
[0.03995921, 0.42949218, 0.31492687, 0.63649114],
[0.34634715, 0.04309736, 0.87991517, 0.76324059]])
In [83]:
np.around(arr,decimals=2)
Out[83]:
array([[0.08, 0.31, 0.33, 0.77],
[0.04, 0.43, 0.31, 0.64],
[0.35, 0.04, 0.88, 0.76]])
In [85]:
arr = np.random.randint(0,20,size=(5,3))
arr
Out[85]:
array([[12, 18, 17],
[17, 16, 0],
[ 5, 9, 0],
[ 6, 0, 2],
[ 3, 3, 18]])
In [86]:
np.amin(arr,axis=0)
Out[86]:
array([3, 0, 0])
In [87]:
np.ptp(arr,axis=0)
Out[87]:
array([14, 18, 18])
In [88]:
np.median(arr,axis=0)
Out[88]:
array([6., 9., 2.])
In [93]:
np.std(arr,axis=0)
Out[93]:
array([5.16139516, 7.02566723, 8.28492607])
In [94]:
np.var(arr,axis=0)
Out[94]:
array([26.64, 49.36, 68.64])
NumPy 中包含了一个矩阵库 numpy.matlib,该模块中的函数返回的是一个矩阵(matrix 对象),而不是 ndarray 对象。一个 $m \times n$ 的矩阵是一个由 $m$ 行(row)$n$ 列(column)元素排列成的矩形阵列。
matlib.empty() 函数返回一个新的矩阵,语法格式为:numpy.matlib.empty(shape, dtype)。该矩阵不会被初始化,其中的值是内存中残留的任意数据(并非真正的随机数),使用前需要先赋值。
In [98]:
import numpy.matlib as matlib
# empty() only allocates the matrix and does not initialise it. The values shown are whatever
# was left in memory (in the recorded output they match the earlier np.sin(arr) result),
# so they must not be relied on.
matlib.empty(shape=(4,5))
Out[98]:
matrix([[-0.2794155 , -0.2794155 , -0.95892427, -0.2794155 , 0. ],
[ 0. , -0.2794155 , 0.41211849, 0.84147098, 0.98935825],
[ 0.41211849, 0.84147098, 0.90929743, 0.98935825, 0.41211849],
[ 0.41211849, -0.95892427, 0. , 0.90929743, 0.6569866 ]])
In [ ]:
In [101]:
# 5x5 matrix with ones on the first diagonal above the main one (k=1), zeros elsewhere
matlib.eye(n=5, M=5, k=1)
Out[101]:
matrix([[0., 1., 0., 0., 0.],
[0., 0., 1., 0., 0.],
[0., 0., 0., 1., 0.],
[0., 0., 0., 0., 1.],
[0., 0., 0., 0., 0.]])
In [99]:
matlib.identity(6)
Out[99]:
matrix([[1., 0., 0., 0., 0., 0.],
[0., 1., 0., 0., 0., 0.],
[0., 0., 1., 0., 0., 0.],
[0., 0., 0., 1., 0., 0.],
[0., 0., 0., 0., 1., 0.],
[0., 0., 0., 0., 0., 1.]])
In [103]:
arr = matlib.identity(6)
arr
Out[103]:
matrix([[1., 0., 0., 0., 0., 0.],
[0., 1., 0., 0., 0., 0.],
[0., 0., 1., 0., 0., 0.],
[0., 0., 0., 1., 0., 0.],
[0., 0., 0., 0., 1., 0.],
[0., 0., 0., 0., 0., 1.]])
In [106]:
a = np.array([[1,2,3],[4,5,6]])
a
Out[106]:
array([[1, 2, 3],
[4, 5, 6]])
In [107]:
a.T
Out[107]:
array([[1, 4],
[2, 5],
[3, 6]])
numpy.dot(a, b, out=None)
第一个矩阵第一行的每个数字(2和1),各自乘以第二个矩阵第一列对应位置的数字(1和1),然后将乘积相加( 2 x 1 + 1 x 1),得到结果矩阵左上角的那个值3。也就是说,结果矩阵第m行与第n列交叉位置的那个值,等于第一个矩阵第m行与第二个矩阵第n列,对应位置的每个值的乘积之和。
线性代数基于矩阵的推导:
In [109]:
# Left operand: 2 rows x 3 columns.
arr_1 = np.array([[1, 2, 3], [4, 5, 6]])
# Right operand: the same values transposed to 3 x 2, so the inner dimensions line up for np.dot.
arr_2 = np.array([[1, 2, 3], [4, 5, 6]]).T
In [110]:
arr_1
Out[110]:
array([[1, 2, 3],
[4, 5, 6]])
In [111]:
arr_2
Out[111]:
array([[1, 4],
[2, 5],
[3, 6]])
In [112]:
np.dot(arr_1,arr_2)
Out[112]:
array([[14, 32],
[32, 77]])
In [8]:
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
Series是一种类似于一维数组的对象,由下面两个部分组成:values(一组数据,ndarray类型)和 index(与数据相关联的索引标签)。
Series的创建
In [3]:
s = Series(data=[1,2,3,4,5])
s
Out[3]:
0 1
1 2
2 3
3 4
4 5
dtype: int64
In [4]:
# Labelled Series: the values 1..3 keyed by 'a', 'b', 'c' instead of the default 0..n-1 index.
s1 = Series([1, 2, 3], index=list('abc'))
s1
Out[4]:
a 1
b 2
c 3
dtype: int64
In [7]:
# A dict maps straight onto a Series: the keys become the index, the values become the data.
dic = {'数学': 100, '理综': 188}
s3 = Series(dic)
s3
Out[7]:
数学 100
理综 188
dtype: int64
In [10]:
s4 = Series(data=np.random.randint(0,100,size=(3,)))
s4
Out[10]:
0 9
1 91
2 24
dtype: int32
In [12]:
s1
Out[12]:
a 1
b 2
c 3
dtype: int64
In [16]:
# Three ways to read the element labelled 'a' (only the last expression is displayed).
s1['a']      # explicit label lookup
s1.iloc[0]   # positional lookup; a bare s1[0] on a string-labelled Series relies on the
             # deprecated integer-position fallback of Series.__getitem__
s1.a         # attribute-style access; works here because 'a' is a valid identifier
Out[16]:
1
In [19]:
# Positional slice: the end position is excluded, so this yields the first two elements
s1.iloc[0:2]
# Label slice: the end label is included, so 'a' through 'c' yields all three elements
s1.loc['a':'c']
Out[19]:
a 1
b 2
c 3
dtype: int64
In [24]:
s1.shape
s1.size
s1.index
s1.values
Out[24]:
array([1, 2, 3], dtype=int64)
In [26]:
s1.head(2)  # first two elements (not displayed: a cell shows only its last expression)
s1.tail(2)  # last two elements
Out[26]:
b 2
c 3
dtype: int64
In [27]:
# Two Series whose labels only partly overlap: 'c' exists only in s1, 'e' only in s2.
s1 = Series([1, 2, 3, 4], index=list('abcd'))
s2 = Series([1, 2, 3, 4], index=list('abed'))
s1
Out[27]:
a 1
b 2
c 3
d 4
dtype: int64
In [28]:
s2
Out[28]:
a 1
b 2
e 3
d 4
dtype: int64
In [29]:
s = s1+s2
s
Out[29]:
a 2.0
b 4.0
c NaN
d 8.0
e NaN
dtype: float64
In [30]:
s.isnull()
Out[30]:
a False
b False
c True
d False
e True
dtype: bool
In [33]:
# Select several elements by implicit (positional) and explicit (label) index.
# Only the last expression is displayed.
s.iloc[[0, 1, 2]]   # positions 0-2; s[[0,1,2]] on a string-labelled Series depends on the
                    # deprecated integer-position fallback of Series.__getitem__
s.loc[['a', 'c']]   # labels 'a' and 'c'
Out[33]:
a 2.0
c NaN
dtype: float64
In [35]:
s
Out[35]:
a 2.0
b 4.0
c NaN
d 8.0
e NaN
dtype: float64
In [36]:
# Use a list of booleans as the index: only the positions marked True are kept
s[[True,True,False,True,False]]
Out[36]:
a 2.0
b 4.0
d 8.0
dtype: float64
In [37]:
s.notnull()
Out[37]:
a True
b True
c False
d True
e False
dtype: bool
In [38]:
s[s.notnull()]
Out[38]:
a 2.0
b 4.0
d 8.0
dtype: float64
DataFrame是一个【表格型】的数据结构。DataFrame由按一定顺序排列的多列数据组成。设计初衷是将Series的使用场景从一维拓展到多维。DataFrame既有行索引,也有列索引。
DataFrame的创建
In [39]:
DataFrame(data=np.random.randint(0,100,size=(4,6)))
Out[39]:
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | 93 | 24 | 73 | 95 | 46 | 36 |
1 | 17 | 98 | 7 | 13 | 79 | 34 |
2 | 82 | 51 | 52 | 21 | 4 | 50 |
3 | 77 | 23 | 91 | 31 | 6 | 12 |
In [42]:
# Column-oriented construction: each dict key becomes a column, and index labels the rows.
dic = {'name': ['张三', '李四', '王老五'], 'salary': [10000, 20000, 15555]}
df = DataFrame(dic, index=list('abc'))
df
Out[42]:
name | salary | |
---|---|---|
a | 张三 | 10000 |
b | 李四 | 20000 |
c | 王老五 | 15555 |
In [46]:
df.values
df.columns
df.index
df.shape
Out[46]:
(3, 2)
============================================
练习4:
根据以下考试成绩表,创建一个DataFrame,命名为df:
张三 李四
语文 150 0
数学 150 0
英语 150 0
理综 300 0
============================================
Type *Markdown* and LaTeX: $\alpha^2$
In [49]:
df
Out[49]:
name | salary | |
---|---|---|
a | 张三 | 10000 |
b | 李四 | 20000 |
c | 王老五 | 15555 |
In [51]:
#取出第一列
df['name']
Out[51]:
a 张三
b 李四
c 王老五
Name: name, dtype: object
In [52]:
#取出多列
df[['name','salary']]
Out[52]:
name | salary | |
---|---|---|
a | 张三 | 10000 |
b | 李四 | 20000 |
c | 王老五 | 15555 |
In [56]:
#取出一行
df.loc['a']
Out[56]:
name 张三
salary 10000
Name: a, dtype: object
In [54]:
#取多行
df.loc[['a','c']]
Out[54]:
name | salary | |
---|---|---|
a | 张三 | 10000 |
c | 王老五 | 15555 |
In [58]:
df.iloc[[1,2]]
Out[58]:
name | salary | |
---|---|---|
b | 李四 | 20000 |
c | 王老五 | 15555 |
In [59]:
df
Out[59]:
name | salary | |
---|---|---|
a | 张三 | 10000 |
b | 李四 | 20000 |
c | 王老五 | 15555 |
In [61]:
# Read a single element (the salary in row 'b'), first by position and then by label.
# Only the second lookup is displayed.
df.iloc[1,1]
df.loc['b','salary']
Out[61]:
20000
In [62]:
#取多个个的元素
df.loc[['a','c'],'salary']
Out[62]:
a 10000
c 15555
Name: salary, dtype: int64
In [63]:
# First two rows, selected explicitly by position
df.iloc[0:2]
Out[63]:
name | salary | |
---|---|---|
a | 张三 | 10000 |
b | 李四 | 20000 |
In [65]:
#切出前两列
df.iloc[:,0:2]
Out[65]:
name | salary | |
---|---|---|
a | 张三 | 10000 |
b | 李四 | 20000 |
c | 王老五 | 15555 |
In [ ]:
Type *Markdown* and LaTeX: $\alpha^2$
============================================
练习:
============================================
In [ ]:
时间数据类型的转换
将某一列设置为行索引
股票:
In [70]:
# NOTE(review): third-party import placed mid-notebook; consider moving it to the top import cell.
import tushare as ts
# Fetch K-line data for stock code 600519 starting from 2000-01-01 (presumably over the network;
# confirm the tushare API is still available). This rebinds df, replacing the small
# name/salary frame used in the cells above.
df = ts.get_k_data('600519',start='2000-01-01')
In [72]:
# Persist the frame to a local CSV file. With the default settings the row index is written as
# an extra unnamed first column, which is why 'Unnamed: 0' shows up when the file is read back.
df.to_csv('./maotai.csv')
In [74]:
# Reload the saved data from disk and preview the first five rows (head() defaults to 5)
df = pd.read_csv('./maotai.csv')
df.head()
Out[74]:
Unnamed: 0 | date | open | close | high | low | volume | code | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 2001-08-27 | 5.392 | 5.554 | 5.902 | 5.132 | 406318.00 | 600519 |
1 | 1 | 2001-08-28 | 5.467 | 5.759 | 5.781 | 5.407 | 129647.79 | 600519 |
2 | 2 | 2001-08-29 | 5.777 | 5.684 | 5.781 | 5.640 | 53252.75 | 600519 |
3 | 3 | 2001-08-30 | 5.668 | 5.796 | 5.860 | 5.624 | 48013.06 | 600519 |
4 | 4 | 2001-08-31 | 5.804 | 5.782 | 5.877 | 5.749 | 23231.48 | 600519 |
In [79]:
# Remove the leftover index column that came back from the CSV.
# (For the drop family of methods axis=0 means rows and axis=1 means columns;
# columns= is the explicit spelling of axis=1.)
# Rebinding df with errors='ignore' instead of inplace=True keeps the cell safe to re-run:
# a second run no longer raises KeyError once the column is gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')
In [83]:
df.info()
RangeIndex: 4385 entries, 0 to 4384
Data columns (total 7 columns):
date 4385 non-null object
open 4385 non-null float64
close 4385 non-null float64
high 4385 non-null float64
low 4385 non-null float64
volume 4385 non-null float64
code 4385 non-null int64
dtypes: float64(5), int64(1), object(1)
memory usage: 239.9+ KB
In [88]:
# Convert the date column from strings (object dtype) to datetime64 values
df['date'] = pd.to_datetime(df['date'])
In [90]:
df.info()
RangeIndex: 4385 entries, 0 to 4384
Data columns (total 7 columns):
date 4385 non-null datetime64[ns]
open 4385 non-null float64
close 4385 non-null float64
high 4385 non-null float64
low 4385 non-null float64
volume 4385 non-null float64
code 4385 non-null int64
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 239.9 KB
In [94]:
# Promote the date column to the row index.
# Rebinding df (rather than inplace=True) and checking that the column is still present keeps the
# cell re-runnable: a second run would otherwise raise KeyError because 'date' is no longer a column.
if 'date' in df.columns:
    df = df.set_index('date')
0 | 2001-08-27 | 5.392 | 5.554 | 5.902 | 5.132 | 406318.00 | 600519 |
| 1 | 1 | 2001-08-28 | 5.467 | 5.759 | 5.781 | 5.407 | 129647.79 | 600519 |
| 2 | 2 | 2001-08-29 | 5.777 | 5.684 | 5.781 | 5.640 | 53252.75 | 600519 |
| 3 | 3 | 2001-08-30 | 5.668 | 5.796 | 5.860 | 5.624 | 48013.06 | 600519 |
| 4 | 4 | 2001-08-31 | 5.804 | 5.782 | 5.877 | 5.749 | 23231.48 | 600519 |
In [79]:
# Remove the leftover index column that came back from the CSV.
# (For the drop family of methods axis=0 means rows and axis=1 means columns;
# columns= is the explicit spelling of axis=1.)
# Rebinding df with errors='ignore' instead of inplace=True keeps the cell safe to re-run:
# a second run no longer raises KeyError once the column is gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')
In [83]:
df.info()
RangeIndex: 4385 entries, 0 to 4384
Data columns (total 7 columns):
date 4385 non-null object
open 4385 non-null float64
close 4385 non-null float64
high 4385 non-null float64
low 4385 non-null float64
volume 4385 non-null float64
code 4385 non-null int64
dtypes: float64(5), int64(1), object(1)
memory usage: 239.9+ KB
In [88]:
# Convert the date column from strings (object dtype) to datetime64 values
df['date'] = pd.to_datetime(df['date'])
In [90]:
df.info()
RangeIndex: 4385 entries, 0 to 4384
Data columns (total 7 columns):
date 4385 non-null datetime64[ns]
open 4385 non-null float64
close 4385 non-null float64
high 4385 non-null float64
low 4385 non-null float64
volume 4385 non-null float64
code 4385 non-null int64
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 239.9 KB
In [94]:
# Promote the date column to the row index.
# Rebinding df (rather than inplace=True) and checking that the column is still present keeps the
# cell re-runnable: a second run would otherwise raise KeyError because 'date' is no longer a column.
if 'date' in df.columns:
    df = df.set_index('date')