pandas的使用

pandas的使用

pandas读出的数据为dataframe,dataframe的每一列为series(包括索引和值),series中的值为ndarray。所以pandas是基于numpy封装的对数据进行处理的包。
1.pandas读取csv文件,以及数据类型说明。

import numpy as np
import pandas
import pandas as pd
# Load the CSV file into a DataFrame.
food_info = pandas.read_csv("food_info.csv")
# Show the Python type of the object returned by read_csv.
print(type(food_info))

输出结果:

# Common pandas dtypes:
# object - For string values
# int - For integer values
# float - For float values
# datetime - For time values
# bool - For Boolean values
# Print the dtype of every column.
print(food_info.dtypes)

输出结果:
pandas的使用_第1张图片
2.head方法以及columns和shape属性的使用。

first_rows = food_info.head()
# head() returns the first five rows by default.
print(first_rows)
# head() also accepts the number of rows to return.
print(food_info.head(3))
# Print the column names.
print(food_info.columns)

输出结果:
pandas的使用_第2张图片

# Print the shape of the DataFrame.
print(food_info.shape)

输出结果:(8618, 36)
3.loc的使用。

 # pandas uses zero-indexing.
 # Series object representing the row at index 0.
 # Get the first row of data.
 print(food_info.loc[0])
 # Series object representing the seventh row.
 print(food_info.loc[6])
 # Will throw an error: "KeyError: 'the label [8620] is not in the [index]'"
 #food_info.loc[8620]
 # The object dtype is equivalent to a string in Python.
 # Returns a DataFrame containing the rows at indexes 3, 4, 5, and 6.
 print(food_info.loc[3:6])
 # Returns a DataFrame containing the rows at indexes 2, 5, and 10. Either of the following approaches will work.
 # Method 1: pass a list held in a variable.
 two_five_ten = [2,5,10]
 print(food_info.loc[two_five_ten])
 # Method 2: pass the list literal directly.
 print(food_info.loc[[2,5,10]])

4.单列

# Series object representing the "NDB_No" column, selected with a string literal.
ndb_col = food_info["NDB_No"]
print(ndb_col)
# Alternatively, access the same column by passing in a string variable.
col_name = "NDB_No"
ndb_col = food_info[col_name]
print(ndb_col)

输出分析:这两种方法输出的结果都是相同。
5.多列

# Select several columns by passing a list of column names held in a variable.
columns = ["Zinc_(mg)", "Copper_(mg)"]
zinc_copper = food_info[columns]
print(zinc_copper)
print(type(zinc_copper))
# Same selection, skipping the intermediate list assignment.
zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]
print(zinc_copper)
print(type(zinc_copper))

输出结果分析:这两种方法输出的结果也是相同的。
6.找出列名以(g)结尾的列,并输出这些列的值。

# Convert the column names to a plain list.
col_names = food_info.columns.tolist()
print(col_names)
# Keep only the column names that end with "(g)".
gram_columns = [name for name in col_names if name.endswith("(g)")]
# Select those columns and show their first three rows.
gram_df = food_info[gram_columns]
print(gram_df.head(3))

7.Series(dataframe的单列)与单个数字的加减乘除运算。

# Reload the data, then apply scalar arithmetic to one column.
food_info = pandas.read_csv("food_info.csv")
print(food_info["Iron_(mg)"])
# Divides each value in the column by 1000 and returns a Series object.
div_1000 = food_info["Iron_(mg)"] / 1000
print(div_1000)
# Adds 100 to each value in the column and returns a Series object.
add_100 = food_info["Iron_(mg)"] + 100
print(add_100)
# Subtracts 100 from each value in the column and returns a Series object.
sub_100 = food_info["Iron_(mg)"] - 100
print(sub_100)
# Multiplies each value in the column by 2 and returns a Series object.
mult_2 = food_info["Iron_(mg)"]*2
print(mult_2)

8.Series与Series(dataframe的两列)之间进行运算,运算结果仍为Series。
# Element-wise multiplication.

# The arithmetic operator is applied to the first value in both columns,
# the second value in both columns, and so on.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
print(water_energy)

# Element-wise addition.

# Score = 2 × (Protein_(g)) − 0.75 × (Lipid_Tot_(g))
weighted_protein = food_info["Protein_(g)"] * 2
print(weighted_protein)
weighted_fat = -0.75 * food_info["Lipid_Tot_(g)"]
print(weighted_fat)
# Adding the two weighted columns value by value gives the score.
initial_rating = weighted_protein + weighted_fat
print(initial_rating)

# Element-wise division.

# The "Vit_A_IU" column ranges from 0 to 100000, while the "Fiber_TD_(g)" column ranges from 0 to 79.
# For certain calculations, columns like "Vit_A_IU" can have a greater effect on the result,
# due to the scale of the values.
# The largest value in the "Energ_Kcal" column.
max_calories = food_info["Energ_Kcal"].max()
# Divide the values in "Energ_Kcal" by the largest value.
normalized_calories = food_info["Energ_Kcal"] / max_calories
# Normalize protein and fat the same way: divide each column by its own maximum.
normalized_protein = food_info["Protein_(g)"] / food_info["Protein_(g)"].max()
normalized_fat = food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max()
# Store the normalized protein and fat values as new columns of the DataFrame.
food_info["Normalized_Protein"] = normalized_protein
food_info["Normalized_Fat"] = normalized_fat
print(food_info)

9.sort_values的使用。

# By default, pandas sorts the data by the specified column in ascending order and returns a new DataFrame.
# inplace=True: modify the original object directly instead of creating a new one.
# inplace=False: create and return a new object that holds the modified result.
#print(food_info["Sodium_(mg)"])
# Sorts the DataFrame in-place, rather than returning a new DataFrame.
food_info.sort_values("Sodium_(mg)", inplace=True)
print(food_info["Sodium_(mg)"])
# Sorts by descending order, rather than ascending.
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)
print(food_info["Sodium_(mg)"])

10.isnull的使用。

# The pandas library uses NaN, which stands for "not a number", to indicate a missing value.
# pandas.isnull() takes a pandas Series and returns a Series of True and False values.
titanic_survival = pd.read_csv("titanic_train.csv")
age = titanic_survival["Age"]
print(age.loc[0:10])
# Boolean Series: True where the age is missing.
age_is_null = pd.isnull(age)
print(age_is_null)
# Use the boolean Series as a mask to keep only the missing entries.
age_null_true = age[age_is_null]
print(age_null_true)
# Number of missing ages.
age_null_count = len(age_null_true)
print(age_null_count)

11.sum的使用。

# The result of this is that mean_age is nan, because any calculation that
# involves a null value also results in a null value.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)

12.mean的使用。

# Missing data is so common that many pandas methods automatically filter for it.
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)
# Mean fare for each passenger class.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    # Keep only the rows belonging to the current class.
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    # Record the mean fare under the class number.
    fares_by_class[this_class] = fare_for_class
print(fares_by_class)

13. pivot_table的使用。

# Think of index as the key and values as the value of a key-value pair.
# index tells the method which column to group by.
# values is the column that we want to apply the calculation to.
# aggfunc specifies the calculation we want to perform; the string name "mean"
# is used instead of np.mean, which recent pandas versions warn about.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
print(passenger_survival)

输出结果:
在这里插入图片描述

# No aggfunc is given here, so pivot_table falls back to its default (the mean) for "Age" per "Pclass".
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_age)

输出结果:
在这里插入图片描述

# Group by embarkation port and total both "Fare" and "Survived"; the string
# name "sum" is used instead of np.sum, which recent pandas versions warn about.
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc="sum")
print(port_stats)

输出结果:
pandas的使用_第3张图片

14.dropna的使用。

# Specifying axis=1 or axis='columns' will drop any columns that have null values.
drop_na_columns = titanic_survival.dropna(axis=1)
print(drop_na_columns.head())
# axis=0 drops rows; subset restricts the null check to the "Age" and "Sex" columns.
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"])
print(new_titanic_survival.head())

15.索引获取值。

# loc[row_label, column_name] returns the single value at that row and column.
row_index_83_age = titanic_survival.loc[83,"Age"]
# Named after the row label that is actually read (766).
row_index_766_pclass = titanic_survival.loc[766,"Pclass"]
print(row_index_83_age)
print(row_index_766_pclass)

16.sort_values的使用。

# Parameter notes from the original author.
# NOTE(review): axis/level/by as described here look like sort_index parameters
# rather than sort_values — confirm against the pandas docs.
# axis: 0 sorts by row labels; 1 sorts by column labels.
# level: None by default, otherwise sort by the given level (the author was unsure about this).
# ascending: True (ascending) by default; False sorts in descending order.
# inplace: False by default; otherwise the sorted data replaces the original DataFrame.
# kind: sorting algorithm, quicksort by default.
# na_position: missing values are placed last by default {"first", "last"}.
# by: the column to sort by; the author notes this parameter seems to be discouraged.
# Sort by "Age" from oldest to youngest and show the first ten rows.
new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)
print(new_titanic_survival[0:10])

17. reset_index的使用。

# drop=True: reset the index without adding the old index as a new column.
# drop=False: keep the old index as a new column named "index" and reset the index.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed.iloc[0:10])

18.apply的使用。
(1)默认以列进行操作。

# Pick the hundredth item out of a Series.
def hundredth_row(column):
    """Return the item at position 99 (the hundredth item) of ``column``.

    ``column`` must support positional access through ``.iloc``.
    """
    return column.iloc[99]

# Return the hundredth item from each column.
# The result gets its own name so that the hundredth_row function is not
# overwritten and can still be called afterwards.
hundredth_items = titanic_survival.apply(hundredth_row)
print(hundredth_items)

(2)axis=1以行进行操作

# By passing in the axis=1 argument, DataFrame.apply() iterates over rows instead of columns.
def which_class(row):
    """Map the "Pclass" value of a row to a readable class name.

    Returns "Unknown" when the value is null, "First Class", "Second Class"
    or "Third Class" for 1, 2 and 3, and None for any other value.
    """
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    elif pclass == 1:
        return "First Class"
    elif pclass == 2:
        return "Second Class"
    elif pclass == 3:
        return "Third Class"


# Apply the function to every row; the call sits at module level (no stray
# indentation) and uses the print() function.
classes = titanic_survival.apply(which_class, axis=1)
print(classes)

(3)支持lambda匿名函数。

fandango = pd.read_csv('fandango_score_comparison.csv')
# Index the rows by film name while keeping "FILM" as a regular column too.
fandango_films = fandango.set_index('FILM', drop=False)
# Returns the data types as a Series.
types = fandango_films.dtypes
# Filter the data types to just floats; the index attribute returns just the column names.
float_columns = types[types.values == 'float64'].index
# Use bracket notation to filter the columns to just the float columns.
float_df = fandango_films[float_columns]
# `x` is a Series object representing a column.
deviations = float_df.apply(lambda x: np.std(x))

(4)先分类,再把类名作为index;一旦类名做了index,若没有指定aggfunc,则默认对同一类的值计算均值。

def is_minor(row):
    """Return True when the row's "Age" value is below 18, otherwise False."""
    return bool(row["Age"] < 18)
  
# Call is_minor once per row (axis=1) and collect the results.
minors = titanic_survival.apply(is_minor, axis=1)
print(minors)

def generate_age_label(row):
    """Label a row by its "Age" value.

    Returns "unknown" when the age is null, "minor" when it is below 18,
    and "adult" otherwise.
    """
    years = row["Age"]
    # A missing age has to be handled first: it cannot be compared with 18.
    if pd.isnull(years):
        return "unknown"
    return "minor" if years < 18 else "adult"

# Compute an age label for every row (axis=1).
age_labels = titanic_survival.apply(generate_age_label, axis=1)
print(age_labels)

# Store the labels as a new column, then group by it; no aggfunc is given,
# so pivot_table uses its default calculation on "Survived".
titanic_survival['age_labels'] = age_labels
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
print(age_group_survival)

age_group_survival的输出结果:
pandas的使用_第4张图片
19.series的使用。
(1)series的初始化。

fandango = pd.read_csv('fandango_score_comparison.csv')
# Each selected column is a Series.
series_film = fandango['FILM']
series_rt = fandango['RottenTomatoes']
# Import the Series object from pandas.
from pandas import Series
# .values gives the underlying data of a Series.
film_names = series_film.values
print(type(film_names))
print(film_names)
rt_scores = series_rt.values
print(rt_scores)
# film_names become the index and rt_scores the values for each index label.
series_custom = Series(rt_scores , index=film_names)
# Look up two films by their index labels.
print(series_custom[['Minions (2015)', 'Leviathan (2014)']])

(2)series支持切片操作。

# Integer positions are also available, even though the index holds film names.
series_custom = Series(rt_scores , index=film_names)
print(series_custom[['Minions (2015)', 'Leviathan (2014)']])
# Slice by integer position.
fiveten = series_custom[5:10]
print(fiveten)

(3)reindex的使用,重新设置index。

# Take the current index labels as a list.
original_index = series_custom.index.tolist()
print(original_index)
# Sort the labels.
sorted_index = sorted(original_index)
print(sorted_index)
# Reorder the Series so that it follows the sorted labels.
sorted_by_index = series_custom.reindex(sorted_index)
print(sorted_by_index)

(4)sort_index的使用,按照index进行升序排序。

# Sort the Series by its index and show the first ten entries.
sc2 = series_custom.sort_index()
print(sc2[0:10])

(5)sort_values的使用,按照values进行升序排序。

# Sort the Series by its values and show the first ten entries.
sc3 = series_custom.sort_values()
print(sc3[0:10])

(6)add、sin和max的使用。

# Add the Series to itself, value by value.
print(np.add(series_custom, series_custom))
# Apply the sine function to each value.
print(np.sin(series_custom))
# Return the highest value (a single value, not a Series).
print(np.max(series_custom))

(7)逻辑运算。

# A comparison returns a Series object with a boolean value for each film.
series_custom > 50
# Use the boolean Series as a mask to keep only the matching films.
series_greater_than_50 = series_custom[series_custom > 50]
criteria_one = series_custom > 50
criteria_two = series_custom < 75
# Combine both masks with & to keep values above 50 and below 75.
both_criteria = series_custom[criteria_one & criteria_two]
print(both_criteria)

(8)算数运算。

# Build two Series indexed by film name: critic scores and user scores.
rt_critics = Series(fandango['RottenTomatoes'].values, index=fandango['FILM'])
rt_users = Series(fandango['RottenTomatoes_User'].values, index=fandango['FILM'])
# Average the two Series value by value.
rt_mean = (rt_critics + rt_users)/2
print(rt_mean)

20.set_index的使用。

# set_index returns a new DataFrame that is indexed by the values in the specified column.
# drop controls whether the column used as the new index is removed from the
# columns; drop=False keeps "FILM" as a regular column as well.
fandango = pd.read_csv('fandango_score_comparison.csv')
print(type(fandango))
fandango_films = fandango.set_index('FILM', drop=False)
print(fandango_films.index)

你可能感兴趣的:(Machine,Learning,pandas,python)