python3 pandas模块

numpy能够帮助我们处理数值，但是pandas除了处理数值之外(基于numpy)，还能够帮助我们处理其他类型的数据

常用的数据类型

Series 一维，带标签数组
DataFrame 二维，Series容器

创建Series

import pandas as pd

# A Series built from a plain list receives the default integer labels 0..n-1.
t = pd.Series(data=[1, 2, 31, 12, 3, 4])
print(t)
# Printed result:
# 0     1
# 1     2
# 2    31
# 3    12
# 4     3
# 5     4
# dtype: int64


# The ``index`` argument replaces the default labels with custom ones.
t = pd.Series(data=[1, 23, 2, 2, 1], index=['a', 'b', 'c', 'd', 'e'])
print(t)
# Printed result:
# a     1
# b    23
# c     2
# d     2
# e     1
# dtype: int64


# A dict turns into a Series labelled by the dict keys; because the values
# mix str and int, the resulting dtype is object.
temp_dict = dict(name='xiaohong', age=30, tel=10086)
t = pd.Series(data=temp_dict)
print(t)
# Printed result:
# name    xiaohong
# age           30
# tel        10086
# dtype: object

Series的切片和索引

'''
@Date: 2019-09-04 15:09:42
@LastEditors: pxcoder
@LastEditTime: 2019-09-05 17:29:52
'''
import pandas as pd

# Demonstrates label-based, position-based and boolean selection on a Series.
temp_dict = {'name': 'xiaohong', 'age': 30, 'tel': 10086}
t = pd.Series(temp_dict)
# name    xiaohong
# age           30
# tel        10086
# dtype: object
print(t)

# Select a single value by its label.
# 30
print(t['age'])

# Select a single value by its position.
# Use .iloc: treating an integer key as a position on a Series with a
# non-integer index (t[1]) is deprecated since pandas 2.1.
# 30
print(t.iloc[1])

# Select a contiguous range of positions (second and third rows).
# age       30
# tel    10086
# dtype: object
print(t.iloc[1:3])


# Select non-contiguous positions (first and third rows).
# name    xiaohong
# tel        10086
# dtype: object
print(t.iloc[[0, 2]])


# Select several values by label (labels 'name' and 'tel').
# name    xiaohong
# tel        10086
# dtype: object
print(t[['name', 'tel']])


t = pd.Series([1, 2, 31, 12, 3, 4])
# Boolean indexing: keep only the values greater than 10.
# 2    31
# 3    12
# dtype: int64
print(t[t > 10])

Series的索引和值

Series对象本质上由两个数组构成，一个数组构成对象的键（index，索引），一个数组构成对象的值（values），键->值

import pandas as pd

# A Series pairs an array of labels (index) with an array of values.
temp_dict = dict(name='xiaohong', age=30, tel=10086)
t = pd.Series(data=temp_dict)
print(t)
# Printed result:
# name    xiaohong
# age           30
# tel        10086
# dtype: object

# The labels are exposed through the ``index`` attribute.
series_labels = t.index
print(series_labels)
# Printed result:
# Index(['name', 'age', 'tel'], dtype='object')

# The underlying data array is exposed through the ``values`` attribute.
series_data = t.values
print(series_data)
# Printed result:
# ['xiaohong' 30 10086]

读取外部数据

import pandas as pd

# Load the CSV file (path is relative to the current working directory)
# into a DataFrame and display it.
t = pd.read_csv(filepath_or_buffer='./dog_name.csv')
print(t)


# 还可以读取excel，mysql,mongodb等多种类型的外部数据

创建DataFrame

import pandas as pd
import numpy as np

# Building DataFrames.
# A DataFrame carries both a row index and a column index:
#   - the row index labels the rows; it is called ``index`` (axis=0)
#   - the column index labels the columns; it is called ``columns`` (axis=1)
grid = np.arange(12).reshape(3, 4)

# Without explicit labels both axes get default integer labels.
t = pd.DataFrame(data=grid)
print(t)
# Printed result:
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11


# Explicit row and column labels.
t = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=['a', 'b', 'c'],
    columns=['w', 'x', 'y', 'z'],
)
print(t)
# Printed result:
#    w  x   y   z
# a  0  1   2   3
# b  4  5   6   7
# c  8  9  10  11


# A dict of equal-length lists: each key becomes a column.
temp_dict = {
    'name': ['xiaoming', 'xiaogang'],
    'age': [20, 32],
    'tel': [10086, 10010],
}
t = pd.DataFrame(data=temp_dict)
print(t)
# Printed result:
#        name  age    tel
# 0  xiaoming   20  10086
# 1  xiaogang   32  10010


# A list of dicts: each dict becomes a row, and a key missing from a row is
# filled with NaN (which turns the 'tel' column into floats).
temp_arr = [
    {'name': 'xiaoming', 'age': 20, 'tel': 10086},
    {'name': 'xiaogang', 'age': 32},
]
t = pd.DataFrame(data=temp_arr)
print(t)
# Printed result (recent pandas keeps the keys' insertion order; older
# releases sorted the columns alphabetically as age, name, tel):
#        name  age      tel
# 0  xiaoming   20  10086.0
# 1  xiaogang   32      NaN

DataFrame的基础属性

属性	描述
df.shape	形状
df.dtypes	列数据的类型
df.ndim	数据维度
df.index	行索引
df.columns	列索引
df.values	对象值，二维ndarray数组

DataFrame整体情况查询

方法	描述
df.head(n)	显示头部n行，默认5行
df.tail(n)	显示末尾n行，默认5行
df.info()	相关信息概览：行数，列数，列索引，列非空值个数，列类型，内存占用
df.describe()	快速综合统计结果：计数，均值，标准差，最大值，四分位数，最小值
df.sort_values(by='column_name', ascending=True)	按照column_name进行排序，ascending=True升序，ascending=False降序

pandas之取行或者取列

import pandas as pd

# Sample data: one record per dog name with how often that name occurs.
temp_arr = [
    {'labels': label, 'count_animal_name': count}
    for label, count in (
        ('BELLA', 856),
        ('MAX', 852),
        ('CHARLIE', 1159),
        ('COCO', 1153),
        ('ROCKY', 823),
    )
]

df = pd.DataFrame(data=temp_arr)

# Order the rows by count_animal_name, largest first.
df_sort = df.sort_values('count_animal_name', ascending=False)

# First two rows of the sorted frame.
print(df_sort.head(2))

# A single column, selected by its name.
print(df['count_animal_name'])

# The count_animal_name column restricted to the first two rows.
print(df.iloc[:2]['count_animal_name'])

pandas之loc和iloc

df.loc 通过标签索引获取数据

df.iloc 通过位置索引获取数据

import pandas as pd
import numpy as np

# 3x4 frame holding 0..11, rows labelled A-C and columns labelled W-Z.
df = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=['A', 'B', 'C'],
    columns=['W', 'X', 'Y', 'Z'],
)

# --- .loc: selection by label ---

# Single cell: row A, column W.
# 0
print(df.loc['A', 'W'])

# Row A, columns W and Z.
print(df.loc['A', ['W', 'Z']])

# Rows A and C, columns W and Z.
print(df.loc[['A', 'C'], ['W', 'Z']])

# Rows A through C (a label slice includes the end label), columns W and Z.
print(df.loc['A':'C', ['W', 'Z']])

# --- .iloc: selection by integer position ---

# Second row.
print(df.iloc[1])

# Third column.
print(df.iloc[:, 2])

# First and third rows.
print(df.iloc[[0, 2]])

# First and fourth columns.
print(df.iloc[:, [0, 3]])

pandas之布尔索引

import pandas as pd

# Sample data: one record per dog name with how often that name occurs.
temp_arr = [{'labels': 'BELLA', 'count_animal_name': 856},
            {'labels': 'MAX', 'count_animal_name': 852},
            {'labels': 'CHARLIE', 'count_animal_name': 1159},
            {'labels': 'COCO', 'count_animal_name': 1153},
            {'labels': 'ROCKY', 'count_animal_name': 823}]

df = pd.DataFrame(temp_arr)

# Keep the rows whose count_animal_name is greater than 1000.
filter_df = df[df['count_animal_name'] > 1000]
print(filter_df)


# Keep the rows with 800 < count_animal_name < 1000.
# Conditions are combined with & (and) / | (or); each comparison needs its
# own parentheses because & and | bind tighter than > and <.
filter_df = df[(df['count_animal_name'] > 800) &
               (df['count_animal_name'] < 1000)]
print(filter_df)

# Keep the rows with count_animal_name < 800 or count_animal_name > 1000.
filter_df = df[(df['count_animal_name'] < 800) |
               (df['count_animal_name'] > 1000)]
print(filter_df)


# Keep the rows with count_animal_name < 1000 whose label is shorter than
# 5 characters (.str.len() gives the length of each string in the column).
filter_df = df[(df['count_animal_name'] < 1000) &
               (df['labels'].str.len() < 5)]
print(filter_df)

pandas之字符串方法

方法	描述
Series.str.cat()	字符串拼接，可指定分隔符
Series.str.contains()	返回各字符串是否包含指定字符串的布尔型数组
Series.str.count()	模式的出现次数
Series.str.endswith()、Series.str.startswith()	判断字符串是否以指定后缀结尾或以指定前缀开头
Series.str.findall()	计算各字符串的模式列表
Series.str.get()	获取字符串中第i个字符
Series.str.join()	按指定分隔符连接字符串
Series.str.len()	计算字符串的长度
Series.str.lower()、Series.str.upper()	将字符串转换为小写或大写
Series.str.match()	匹配字符串
Series.str.pad()	在字符串左边、右边或者左右两边添加空白符
Series.str.center()	相当于pad(side='both')
Series.str.repeat()	重复字符串
Series.str.replace()	替换字符串
Series.str.slice()	截取字符串
Series.str.split()	根据分隔符或正则表达式对字符串进行拆分
Series.str.strip()、Series.str.rstrip()、Series.str.lstrip()	去除空白符、包括换行符

缺失数据处理

pd.isnull(df) 判断数据是否为NaN

pd.notnull(df) 判断数据是否不为NaN

python3 pandas模块

python3 pandas模块

常用的数据类型

创建Series

Series的切片和索引

Series的索引和值

读取外部数据

创建DataFrame

DataFrame的基础属性

DataFrame整体情况查询

pandas之取行或者取列

pandas之loc和iloc

pandas之布尔索引

pandas之字符串方法

缺失数据处理

你可能感兴趣的:(python3 pandas模块)