头
import numpy as np
import pandas
import matplotlib
from pandas import Series,DataFrame
import pandas as pd
obj2
obj2['a']
obj2['d']
obj2[['a','d']]
obj[1]
obj[[1,2]]
obj[obj<2]
obj2.values
obj2.index
定义数组:
1.obj2=Series([4,7,-5,3],index=['d','b','a','c']);
2.obj2=DataFrame( {"name":["zzzz","ddfdf","dfdf"],"age":[11,22,33],"salary":[111111.11,11111111,222222]} )
3.obj2=DataFrame( {"name":["zzzz","ddfdf","dfdf"],"age":[11,22,33],"salary":[111111.11,11111111,222222]} , columns=["salary","name","age"])
4.obj2=DataFrame( {"name":["zzzz","ddfdf","dfdf"],"age":[11,22,33],"salary":[111111.11,11111111,222222]} , columns=["salary","name","age"],index=[0,1,2])
5.obj2=DataFrame(np.arange(16).reshape((4,4)),columns=["one","two","three","four"],index=["ddd","qqq","eee","rrr"])
6.obj2 = Series(range(4),index=list("dabc"))
7.obj2 = Series(["a","a","b","c"]*4)
8.string_data = Series(["Jack","articel",np.nan,"Jerry"])
9.Age_dropna=df[['Age']].dropna()
增加一列
obj2["OLDMAN"]=frame.age>60
删除行
obj2.drop(["qqq","ddd"])
修改或添加一列(列已存在则覆盖,不存在则新增)
frame["age"] = age
两个数组运算
1.s1 = Series([7.3,-2.5,3.4,1.5],index=["a","c","d","e"]);s2 = Series([-2.1,3.6,-1.5,4,3.1],index=["a","c","e","f","g"]);s1 + s2;
2.df1 = DataFrame(np.arange(12).reshape((3,4)),columns=list("abcd"));df2 = DataFrame(np.arange(20).reshape((4,5)),columns=list("abcde"));df1 + df2;
3.df1.add(df2,fill_value=0) #只有一方缺失的位置,缺失方按0参与求和;两方都缺失的位置结果仍为NaN
4.df1.add(df2).fillna(0) #先相加,任一方缺失的位置结果为NaN,再把这些NaN填充为0
5.df['Age_isna']=0; df.loc[df.Age.isnull(),'Age_isna']=1 #设置值
数组函数映射
1. f = lambda x : x.max() - x.min() # 列最大减最小;df2.apply(f)# 使用每一列的最大值减去最小值,得到一个新的Series;
2.frame = DataFrame({"name":["Mr Li","Mr Zhang","Mr Wang"], "birthday":["1993-05-20","1996-06-15","1998-09-10"], "salary":[7500.00,12000.00,9000.00]});age = frame.birthday.apply(lambda x : 2020 - int(x[:4]));#对该列(一维的Series)的每一个元素应用函数
3. f = lambda x : x.max() - x.min() # 行最大减最小;df2.apply(f,axis=1)# 使用每一行的最大值减去最小值,得到一个新的Series;
4.df2.apply(sum) #对列的求和
5.format = lambda x : "%.2f" % x # 输入一个浮点数,保留2位小数 ;test = DataFrame(np.random.randn(4,3), columns=list("bde"),index=["Utah","Ohio","Texas","Oregon"]);test.applymap(format) # apply 与 applymap的区别:apply把函数作用于整行或整列,applymap把函数作用于二维数组的每个元素(pandas 2.1起applymap更名为DataFrame.map)
6.test["d"].map(format)#该列每一个元素保留两位小数
数组排序
1.obj.sort_index() # 按照索引排序
2.obj.sort_index(axis=1) # 按照列索引名称排序
3.obj.sort_values() # 按照值排序
4.obj.sort_index(axis=1,ascending=True) # 按照列索引名称,升序
5.frame.sort_values(by="salary",ascending=False) # 按照某一列的值进行降序排序
统计
1.obj2.index.is_unique #检查索引(行标签)是否唯一,即是否没有重复的标签
2.obj2.name.value_counts() #对name一列的各个元素统计多少
3.obj2.sum() #按列求和
4.obj2.sum(axis=1) #按行求和
5.obj2.mean() #默认都是按列求平均值
6.obj2.mean(axis=1,skipna=False) # 按照行求平均,不忽略NaN
7.obj2.cumsum() #累计求和,每个位置是从开头到当前位置的和
8.obj2.describe() # 对数值/字符等型的列进行描述性统计
9.obj2.unique() #排除重复后的值
10.obj2.value_counts() #统计各元素的个数
11.mask =obj2.isin(["a","d"]) #判断每个元素是否在列表中,返回布尔数组;obj2[mask] #显示值在列表中的行
12.data.apply(pd.value_counts).fillna(0) #统计各列中每个值出现的次数,不同的值作为结果的行索引,未出现的填0
13.mask = string_data.isnull() #判断每个元素是否为空,返回布尔数组(为空的位置是True);string_data[mask] #取出为空的元素
14.string_data.dropna() #显示删除有缺失行后的内容,原内容不变
15.data.dropna(how="all") #只删除全空的行
16.data.fillna(0) #把为空的值变为0
17.data.fillna(0,inplace=True) #把为空的值变为0,且改变原数据
18.obj2.info() #查看基本情况
19.titanic_df.quantile([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]) #自定义分位数
csv篇
加载csv
1.df = pd.read_csv("students.csv") # 默认分隔符为英文的逗号
2.table = pd.read_table("table.csv") # 默认的分隔符是制表符
4.df = pd.read_csv("no_header.csv",header=None) #加载没有表头的csv
5.student = pd.read_csv("no_header.csv",names=["Id","name","gender","age"]) #给没表头的表表头
显示数据
df.head() # 默认只显示所有数据的前五行
添加一列
table["count"] = table.number * table.price
将处理后的结果保存
table.to_csv("processed_table.csv",index=False)
分组聚合
1.df = DataFrame({"key1":["1","1","2","2","1"], "key2":["male","female","male","female","male"], "data1":np.random.randn(5), "data2":np.random.randn(5)}); grouped = df["data1"].groupby(df["key1"]) # 分组
2.grouped.mean() # 聚合,使用的是求平均的聚合函数
3.grouped.sum() # 聚合,使用的是求和聚合函数
4. df["data1"].groupby([df["key1"],df["key2"]]).mean()# 相当于先按班级分,再按性别分,分完之后再使用平均函数进行聚合
加权平均
df = DataFrame({"category":list("aaaabbbb"),"data":np.random.randn(8), "weights":np.random.rand(8)});grouped = df.groupby("category") # 分组,可以分两组:a,b; get_wavg = lambda g : np.average(g["data"],weights=g["weights"]); grouped.apply(get_wavg)
两个类别行变量的相关性分析
1.pd.crosstab(titanic_df.Survived,titanic_df.Pclass,margins=True) #两个维度的各个情况频次统计
2.pd.crosstab(titanic_df.Survived,titanic_df.Pclass).apply(lambda r : r/r.sum(),axis=1) #两个维度的各个情况频率统计(按行归一化,每一行的频率之和为1)
3.titanic_df.groupby("Survived").agg({"Age":"mean","Fare":"mean"}) # 按照是否生还对乘客进行分组,然后再统计每个组别下年龄以及船票价格的平均值#按照其中一个维度的各个情况,另一个维度整体统计