钉钉群(建立课程钉钉群)
慕课系统
http://hznu.fanya.chaoxing.com/portal
https://www.cnblogs.com/big-devil/p/7625894.html
print(1)
1
大纲,实验计划
数据分析 or 数据采集
Python的数据生态系统:numpy,scipy,pandas,beautifulsoup,scrapy,matplotlib,pyecharts…(见xmind)
主页,我们的版本是WinPython 3.7.0.2-64bit,可以自己去主页上下载:
https://winpython.github.io/
也可以去我的百度云,64位下载地址:
https://pan.baidu.com/s/1Yq2iwCMXN7jKYXIC_-6cTw
#列表
a=[9,2,1,4]
print(a[0],a[1:3])
9 [2, 1]
a.append(3)
a
[9, 2, 1, 4, 3]
d={2:4,5:1}
print(d[5])
print(d.values(),d.keys(),d.items())
1
dict_values([4, 1]) dict_keys([2, 5]) dict_items([(2, 4), (5, 1)])
print(list(d.values())[0])
# Iterate over the dictionary's values and print each one on its own line.
for i in d.values():
    print(i)
4
4
1
# range(2, 5) yields 2, 3, 4 -- the stop value is excluded.
for i in range(2, 5):
    print(i)
2
3
4
d={1:4}
d.get(2,8)
8
# Count how many times each element occurs in the list.
a = [2, 4, 1, 2, 4]
r = {}
for i in a:
    # dict.get returns 0 for an element seen for the first time,
    # so no separate "key in r" check is needed.
    r[i] = r.get(i, 0) + 1
# Print once, after the whole list has been counted.
print(r)
{2: 2, 4: 2, 1: 1}
# Accumulate the sum of the list elements with an explicit loop.
a = [3, 4, 1, 1, 4]
c = 0
for i in a:
    c = c + i
# Print the total once, after the loop has finished.
print(c)
13
import numpy as np
import pandas as pd
a=pd.Series([2,3,6],index=["i","u",3])#值必须类型一样
a
i 2
u 3
3 6
dtype: int64
a["i"]
2
a.iloc[2]  # dual indexing: a["i"] looks up by label, .iloc[2] by position (plain a[2] relies on a deprecated integer fallback)
6
#赋值
a[0]=8.32
#a[0]="aaa"
a
i 8
u 3
3 6
dtype: int64
a.values
array([8, 3, 6], dtype=int64)
a.index
Index(['i', 'u', 3], dtype='object')
b=pd.Series({"w":2,2:4,3:3,"o":5,"r":6})#字典形式
b
w 2
2 4
3 3
o 5
r 6
dtype: int64
a=pd.Series([2,4,3,5,6])#会自动添加索引
a
0 2
1 4
2 3
3 5
4 6
dtype: int64
b=pd.Series({"w":2,2:4,3:3,"o":5,"r":6},index=["r","o"])#只会选取index中的索引
b
r 6
o 5
dtype: int64
np.random.randn(6,3)
array([[ 0.93058142, 0.77019753, -0.51476983],
[-1.00178584, -0.48270254, 0.73043323],
[ 0.54292453, 0.48071071, -1.58201189],
[ 1.57962945, 0.2586103 , 0.36540852],
[ 0.47069817, 0.58140324, 0.90716359],
[-1.14667799, -2.17332435, 0.90351004]])
a=pd.DataFrame(np.random.randn(6,3),index=range(2,8), columns=["a","b","c"])#相对数组增加了横向和纵向的自定义索引,双索引机制
a
a | b | c | |
---|---|---|---|
2 | 1.271704 | 0.634301 | 0.405262 |
3 | -0.120751 | -0.879684 | -0.181473 |
4 | 0.953220 | 0.453240 | 0.149582 |
5 | 0.660946 | 0.461971 | -0.560160 |
6 | 0.152444 | 0.199540 | -0.237531 |
7 | 0.817927 | -0.364992 | -1.040131 |
a["b"]
2 0.634301
3 -0.879684
4 0.453240
5 0.461971
6 0.199540
7 -0.364992
Name: b, dtype: float64
#a[3]
a.values
array([[ 0.05118313, 0.39638064, 0.38787328],
[ 0.55483877, -1.17624348, 0.8606569 ],
[-0.90502513, 2.06223449, -0.27556449],
[-0.45554163, 0.007331 , -0.89722409],
[-1.2040777 , 0.20244219, 0.51471742],
[-0.44531058, -0.49707473, -1.94881836]])
a.index
RangeIndex(start=2, stop=8, step=1)
a.columns
Index(['a', 'b', 'c'], dtype='object')
a=pd.DataFrame(np.random.randn(6,3), columns=["a","b","c"])
a
a | b | c | |
---|---|---|---|
0 | 1.664421 | 0.495775 | -0.945343 |
1 | -0.443564 | 2.499683 | 0.145670 |
2 | -0.385458 | -0.146777 | 0.397846 |
3 | -0.307054 | 0.404264 | -0.754313 |
4 | -0.473609 | -0.243507 | -0.583231 |
5 | -2.057494 | -1.082490 | 0.376665 |
a=pd.DataFrame(np.random.randn(6,3))#会自动把隐式索引添加上去
a
0 | 1 | 2 | |
---|---|---|---|
0 | 2.750190 | -2.037230 | 1.342428 |
1 | 0.004734 | -0.160271 | -0.787664 |
2 | 0.166225 | 0.677395 | 1.055825 |
3 | -0.004122 | 0.418734 | 2.593051 |
4 | -0.641335 | -0.828338 | 0.882715 |
5 | -0.802709 | 0.019870 | 0.257846 |
b=pd.DataFrame({"a":[2,3],"b":[3,4]})#通过字典定义dataframe
b
a | b | |
---|---|---|
0 | 2 | 3 |
1 | 3 | 4 |
b=pd.Series({"w":2,2:4,3:3,"o":5,"r":6})#通过序列生成数据框
pd.DataFrame(b)
0 | |
---|---|
w | 2 |
2 | 4 |
3 | 3 |
o | 5 |
r | 6 |
pd.DataFrame({"c1":b,"c2":b},index=["w","o"])#通过series定义dataframe
c1 | c2 | |
---|---|---|
w | 2 | 2 |
o | 5 | 5 |
pd.DataFrame([b])#注意这里转置了
w | 2 | 3 | o | r | |
---|---|---|---|---|---|
0 | 2 | 4 | 3 | 5 | 6 |
r1=pd.read_csv(r"D:\t_alibaba_data3.txt",names=["user","brand","behavr","date"],sep="\t",dtype={"behavr":int})
#pandas会自己判断数据类型,但是有时也需要自己额外指定数据类型
r1.head()
user | brand | behavr | date | |
---|---|---|---|---|
0 | 10944750 | 13451 | 0 | 06/04 |
1 | 10944750 | 13451 | 2 | 06/04 |
2 | 10944750 | 13451 | 2 | 06/04 |
3 | 10944750 | 13451 | 0 | 06/04 |
4 | 10944750 | 13451 | 0 | 06/04 |
#r1["date"]
元素所处的位置,和列表的位置,字典的键类似
实现过滤的原理
a1=pd.Series([2,3,6,0],index=[7,9,3,2])
b1=pd.Index([0,1,2,3])#也可以通过Index自定义索引
a1.index
Int64Index([7, 9, 3, 2], dtype='int64')
b1
Int64Index([0, 1, 2, 3], dtype='int64')
a1.index.intersection(b1)  # set intersection; the `&` operator is element-wise (not a set operation) on pandas >= 2.0
Int64Index([3, 2], dtype='int64')
a1.index.union(b1)  # set union; the `|` operator is element-wise (not a set operation) on pandas >= 2.0
Int64Index([0, 1, 2, 3, 7, 9], dtype='int64')
a1.index.symmetric_difference(b1)  # labels present in exactly one of the two indexes; `^` is element-wise on pandas >= 2.0
Int64Index([0, 1, 7, 9], dtype='int64')
a1[a1.index.intersection(b1)]  # filter the Series down to the labels that also appear in b1
3 6
2 0
dtype: int64
b1[0]
#b1[0]=9#不可更改
0