自带数据格式
数值
a1 = 100
a2 = 3.14
print(a1)
print(a2)
100
3.14
print(type(a1))
print(type(a2))
<class 'int'>
<class 'float'>
float(a1)
100.0
int(a1)
100
a1 + 50
150
a2 * 2
6.28
字符串
s1 = 'python'
s2 = 'pandas'
print(s1)
print(type(s1))
python
<class 'str'>
s1 + s2
'pythonpandas'
list(s1)
['p', 'y', 't', 'h', 'o', 'n']
tuple(s2)
('p', 'a', 'n', 'd', 'a', 's')
len(s1)
6
name = '小米'
s = '北京'
i = 5
print('%s在%s,今天气温%i度'%(name,s,i))
小米在北京,今天气温5度
a = 3.14
str(a)
'3.14'
布尔值
a = 100
b = 90
a == b
False
c = True
type(c)
bool
自带数据结构
列表
- 用中括号表示的一组元素。元素可以是数值、字符串、列表、字典等多种类型。
ls1 = [1,2,3,4,5]
ls2 = ['a','b','c','d','e']
ls3 = ['a','b',[1,2],'d','e']
print(ls1)
print(ls2)
print(ls3)
[1, 2, 3, 4, 5]
['a', 'b', 'c', 'd', 'e']
['a', 'b', [1, 2], 'd', 'e']
ls1[0]
1
ls1[2:3]
[3]
ls1[4]= 100
ls1
[1, 2, 3, 4, 100]
for i in ls1:
    print(i+100)
101
102
103
104
200
list('python数据分析基础')
['p', 'y', 't', 'h', 'o', 'n', '数', '据', '分', '析', '基', '础']
range函数
range(10)
range(0, 10)
list(range(10))
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
list(range(1,10))
[1, 2, 3, 4, 5, 6, 7, 8, 9]
list(range(3,10,2))
[3, 5, 7, 9]
进阶用法
lis1 = [1,2,3,4,5]
lis2 = [i+100 for i in lis1]
lis2
[101, 102, 103, 104, 105]
lis3 = [i*i for i in lis1 if i%2==0]
lis3
[4, 16]
lis4 = ['No.'+str(i) for i in lis1]
lis4
['No.1', 'No.2', 'No.3', 'No.4', 'No.5']
lis5 = ['No.'+str(i) for i in range(100)]
lis5[:10]
['No.0',
'No.1',
'No.2',
'No.3',
'No.4',
'No.5',
'No.6',
'No.7',
'No.8',
'No.9']
lis6 = [1,2,3,4,5]
lis7 = list('abcde')
lis8 = [list(z) for z in zip(lis6,lis7)]
lis8
[[1, 'a'], [2, 'b'], [3, 'c'], [4, 'd'], [5, 'e']]
字典
- 用大括号表示的、由键值对组成的数据。
dic1 = {
'A':1,
'B':2}
dic2 = {
'A':'中国',
'B':'美国'}
dic3 = {
'A':[1,2,3],
'B':[4,2,5]}
print(dic1)
print(dic2)
print(dic3)
{'A': 1, 'B': 2}
{'A': '中国', 'B': '美国'}
{'A': [1, 2, 3], 'B': [4, 2, 5]}
dic1['A']
1
lis1 = [1,2,3,4,5]
lis2 = ['a','b','c','d','e']
# 注意: 这里是嵌套循环, 每个键最终都会被赋值为 lis1 的最后一个元素 5;
# 如果想让键和值一一对应, 应使用 dict(zip(lis2, lis1))
dic4 = {
i:j for i in lis2 for j in lis1}
dic4
{'a': 5, 'b': 5, 'c': 5, 'd': 5, 'e': 5}
type(dic4)
dict
元组
tup1 = 4,5,6,7
print(tup1)
(4, 5, 6, 7)
list(tup1)
[4, 5, 6, 7]
集合
s1 = set([2,2,2,1,3,3,'a','a'])
print(s1)
{1, 2, 3, 'a'}
lis1 = list(s1)
lis1
[1, 2, 3, 'a']
数据结构
numpy中的数组
数组
import numpy as np
arr1 = np.array([1,2,3,4,5])
arr1
array([1, 2, 3, 4, 5])
arr2 = np.array([[1,2,3,4,5],
[6,7,8,9,10]])
arr2
array([[ 1, 2, 3, 4, 5],
[ 6, 7, 8, 9, 10]])
print(arr2.shape)
print(arr2.size)
print(arr2.dtype)
(2, 5)
10
int32
arr3 = arr2 + 100
arr3
array([[101, 102, 103, 104, 105],
[106, 107, 108, 109, 110]])
随机数
- numpy中的random模块
import numpy as np
np.random.rand(5)
array([0.46591503, 0.87362145, 0.93809249, 0.25925983, 0.63894833])
np.random.rand(2,3)
array([[0.96901492, 0.30594543, 0.22570567],
[0.42208673, 0.99117033, 0.87474965]])
np.random.randn(5)
array([-1.61944511, 0.38982079, -0.63948306, 0.36588547, 0.63185553])
np.random.randn(2,3)
array([[-0.91993249, 0.61362181, -0.67974097],
[ 1.21197129, 1.84427274, -0.63912597]])
np.random.random(5)
array([0.54859105, 0.35911591, 0.91295063, 0.04102102, 0.82447624])
np.random.random((2,3))
array([[0.61639853, 0.61691637, 0.483431 ],
[0.95587659, 0.17188186, 0.04471951]])
np.random.randint(5,20,4)
array([13, 16, 11, 17])
np.random.randint(5,20,(2,3))
array([[19, 13, 7],
[13, 14, 7]])
np.random.choice(["A","B","C"],10)
array(['B', 'A', 'B', 'C', 'A', 'B', 'B', 'C', 'A', 'B'], dtype='<U1')
np.random.normal(loc=60,scale=15,size=10000)
array([81.19617159, 48.43510876, 51.33395763, ..., 82.90992432,
61.88327572, 54.07451554])
pandas中的数据结构
import numpy as np
import pandas as pd
Series
s1 = pd.Series(np.random.random(5))
s1
0 0.720470
1 0.502933
2 0.465361
3 0.348212
4 0.317928
dtype: float64
lis1 = [100,200,300,400,500]
s2 = pd.Series(lis1)
s2
0 100
1 200
2 300
3 400
4 500
dtype: int64
s3 = pd.Series(lis1,index=list('ABCDE'))
s3
A 100
B 200
C 300
D 400
E 500
dtype: int64
s3['A']
100
s3['B':'D']
B 200
C 300
D 400
dtype: int64
s3 + 50
A 150
B 250
C 350
D 450
E 550
dtype: int64
s3.tolist()
[100, 200, 300, 400, 500]
DataFrame
data3 = pd.DataFrame(np.random.randint(5,20,(10,5)),
columns=list('ABCDE'),
index=list('abcdefghij'))
data3.head()
|   | A | B | C | D | E |
|---|---|---|---|---|---|
| a | 15 | 14 | 10 | 12 | 14 |
| b | 14 | 18 | 8 | 16 | 17 |
| c | 7 | 6 | 10 | 11 | 11 |
| d | 15 | 10 | 14 | 16 | 19 |
| e | 11 | 7 | 13 | 18 | 9 |
data4 = pd.DataFrame({
'山东':[100,200,300,400],
'青岛':[30,60,70,90]},
index=['一','二','三','四'])
data4
|   | 山东 | 青岛 |
|---|---|---|
| 一 | 100 | 30 |
| 二 | 200 | 60 |
| 三 | 300 | 70 |
| 四 | 400 | 90 |
data3.shape
(10, 5)
data3.shape[0]
10
data3.index
Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')
data3.describe()
|   | A | B | C | D | E |
|---|---|---|---|---|---|
| count | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
| mean | 10.200000 | 13.400000 | 11.600000 | 13.400000 | 11.500000 |
| std | 4.077036 | 4.671426 | 3.687818 | 3.949684 | 4.478343 |
| min | 5.000000 | 6.000000 | 5.000000 | 6.000000 | 5.000000 |
| 25% | 6.250000 | 10.250000 | 10.000000 | 11.250000 | 9.000000 |
| 50% | 10.000000 | 14.500000 | 11.500000 | 14.000000 | 11.500000 |
| 75% | 14.000000 | 17.500000 | 13.750000 | 16.000000 | 13.750000 |
| max | 15.000000 | 19.000000 | 18.000000 | 18.000000 | 19.000000 |
data3.describe().round(2)
|   | A | B | C | D | E |
|---|---|---|---|---|---|
| count | 10.00 | 10.00 | 10.00 | 10.00 | 10.00 |
| mean | 10.20 | 13.40 | 11.60 | 13.40 | 11.50 |
| std | 4.08 | 4.67 | 3.69 | 3.95 | 4.48 |
| min | 5.00 | 6.00 | 5.00 | 6.00 | 5.00 |
| 25% | 6.25 | 10.25 | 10.00 | 11.25 | 9.00 |
| 50% | 10.00 | 14.50 | 11.50 | 14.00 | 11.50 |
| 75% | 14.00 | 17.50 | 13.75 | 16.00 | 13.75 |
| max | 15.00 | 19.00 | 18.00 | 18.00 | 19.00 |
data3.dtypes
A int32
B int32
C int32
D int32
E int32
dtype: object
data3.columns.tolist()
['A', 'B', 'C', 'D', 'E']
data3.info()
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 A 10 non-null int32
1 B 10 non-null int32
2 C 10 non-null int32
3 D 10 non-null int32
4 E 10 non-null int32
dtypes: int32(5)
memory usage: 280.0+ bytes
data3['A']
a 15
b 14
c 7
d 15
e 11
f 5
g 6
h 6
i 14
j 9
Name: A, dtype: int32
data3['A'].tolist()
[15, 14, 7, 15, 11, 5, 6, 6, 14, 9]
data3[:5]
|   | A | B | C | D | E |
|---|---|---|---|---|---|
| a | 15 | 14 | 10 | 12 | 14 |
| b | 14 | 18 | 8 | 16 | 17 |
| c | 7 | 6 | 10 | 11 | 11 |
| d | 15 | 10 | 14 | 16 | 19 |
| e | 11 | 7 | 13 | 18 | 9 |
data3.loc[['d','e'],['A','B']]
data3[['A','B']]
|   | A | B |
|---|---|---|
| a | 15 | 14 |
| b | 14 | 18 |
| c | 7 | 6 |
| d | 15 | 10 |
| e | 11 | 7 |
| f | 5 | 19 |
| g | 6 | 11 |
| h | 6 | 18 |
| i | 14 | 15 |
| j | 9 | 16 |
data3.head(6)
|   | A | B | C | D | E |
|---|---|---|---|---|---|
| a | 15 | 14 | 10 | 12 | 14 |
| b | 14 | 18 | 8 | 16 | 17 |
| c | 7 | 6 | 10 | 11 | 11 |
| d | 15 | 10 | 14 | 16 | 19 |
| e | 11 | 7 | 13 | 18 | 9 |
| f | 5 | 19 | 11 | 13 | 12 |
data3.tail(2)
|   | A | B | C | D | E |
|---|---|---|---|---|---|
| i | 14 | 15 | 15 | 15 | 13 |
| j | 9 | 16 | 5 | 6 | 6 |
扫码关注微信, 赠送《pandas数据读取与清洗》视频及课程代码!