pandas读出的数据为dataframe,dataframe的每一列为series(包括索引和值),series中的值为ndarray。所以pandas是基于numpy封装的对数据进行处理的包。
1.pandas读取csv文件,以及数据类型说明。
import numpy as np
import pandas
import pandas as pd
# Load food_info.csv; read_csv hands the table back as a DataFrame.
food_info = pandas.read_csv("food_info.csv")
food_info_type = type(food_info)
print(food_info_type)
输出结果:
# Column dtypes that can show up in the listing below:
#   object   - string values
#   int      - integer values
#   float    - float values
#   datetime - time values
#   bool     - Boolean values
# Show the dtype of every column.
column_types = food_info.dtypes
print(column_types)
输出结果:
2.head方法以及columns和shape属性的使用。
# head() without an argument yields the first five rows.
first_rows = food_info.head()
print(first_rows)
# An explicit count selects that many leading rows instead.
first_three_rows = food_info.head(3)
print(first_three_rows)
# Column labels of the DataFrame.
print(food_info.columns)
# Shape of the DataFrame.
print(food_info.shape)
输出结果:(8618, 36)
3.loc的使用。
# pandas uses zero-indexing, so label 0 is the first row.
# .loc with a single label returns that row as a Series.
first_row = food_info.loc[0]
print(first_row)
# Label 6 is the seventh row.
seventh_row = food_info.loc[6]
print(seventh_row)
# A label that is not in the index raises KeyError, for example:
# food_info.loc[8620]
# Side note: the object dtype is the one used for Python strings.
# A label slice returns a DataFrame with the rows at indexes 3, 4, 5 and 6.
print(food_info.loc[3:6])
# A list of labels returns a DataFrame with exactly those rows (2, 5 and 10).
# Method 1: name the list first.
two_five_ten = [2, 5, 10]
print(food_info.loc[two_five_ten])
# Method 2: pass the list literal directly.
print(food_info.loc[[2, 5, 10]])
4.单列
# Indexing with a column name yields that column as a Series.
ndb_col = food_info["NDB_No"]
print(ndb_col)
# The column name may equally be held in a string variable.
col_name = "NDB_No"
ndb_col = food_info[col_name]
print(ndb_col)
输出分析:这两种方法输出的结果都是相同。
5.多列
# Indexing with a list of column names selects several columns at once.
columns = ["Zinc_(mg)", "Copper_(mg)"]
zinc_copper = food_info[columns]
print(zinc_copper)
zinc_copper_type = type(zinc_copper)
print(zinc_copper_type)
# Same selection with the list written inline instead of assigned first.
zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]
print(zinc_copper)
zinc_copper_type = type(zinc_copper)
print(zinc_copper_type)
输出结果分析:这两种方法输出的结果也是相同的。
6.找出列名以(g)结尾的列,并输出这些列的值。
# Turn the column labels into a plain Python list.
col_names = food_info.columns.tolist()
print(col_names)
# Keep only the column names that end with "(g)".
gram_columns = [name for name in col_names if name.endswith("(g)")]
# Select those columns and preview the first three rows.
gram_df = food_info[gram_columns]
print(gram_df.head(3))
7.dataframe与单个数字加减乘除运算。
food_info = pandas.read_csv("food_info.csv")
iron_mg = food_info["Iron_(mg)"]
print(iron_mg)
# Arithmetic between a column and a scalar applies to every value and
# returns a new Series.
# Each value divided by 1000.
div_1000 = iron_mg / 1000
print(div_1000)
# Each value plus 100.
add_100 = iron_mg + 100
print(add_100)
# Each value minus 100.
sub_100 = iron_mg - 100
print(sub_100)
# Each value multiplied by 2.
mult_2 = iron_mg * 2
print(mult_2)
8.series与series之间进行运算(从dataframe中取出的单列是series),按元素一一对应计算,运算结果仍为series。
# Element-wise multiplication: the operator is applied to the first value of
# both columns, then the second value of both columns, and so on.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
print(water_energy)
# Element-wise addition.
# Score = 2 * (Protein_(g)) - 0.75 * (Lipid_Tot_(g))
protein_g = food_info["Protein_(g)"]
fat_g = food_info["Lipid_Tot_(g)"]
weighted_protein = protein_g * 2
print(weighted_protein)
weighted_fat = -0.75 * fat_g
print(weighted_fat)
initial_rating = weighted_protein + weighted_fat
print(initial_rating)
# Element-wise division, used here to normalize columns.
# The "Vit_A_IU" column ranges from 0 to 100000 while "Fiber_TD_(g)" ranges
# from 0 to 79; in some calculations a column like "Vit_A_IU" can dominate
# the result purely because of the scale of its values.
# Largest value in the "Energ_Kcal" column.
max_calories = food_info["Energ_Kcal"].max()
# Divide every "Energ_Kcal" value by that maximum.
normalized_calories = food_info["Energ_Kcal"] / max_calories
normalized_protein = protein_g / protein_g.max()
normalized_fat = fat_g / fat_g.max()
# Store the normalized columns back on the DataFrame under new names.
food_info["Normalized_Protein"] = normalized_protein
food_info["Normalized_Fat"] = normalized_fat
print(food_info)
9.sort_values的使用。
# By default sort_values orders the rows by the given column in ascending
# order and returns a new DataFrame.
# inplace=True: modify the original object directly, no new object is created.
# inplace=False: leave the original alone and return the result as a new object.
food_info.sort_values(by="Sodium_(mg)", inplace=True)
print(food_info["Sodium_(mg)"])
# ascending=False sorts in descending order instead.
food_info.sort_values(by="Sodium_(mg)", ascending=False, inplace=True)
print(food_info["Sodium_(mg)"])
10.isnull的使用。
# pandas uses NaN ("not a number") to mark a missing value.
# pd.isnull() takes a Series and returns a Series of True/False flags.
titanic_survival = pd.read_csv("titanic_train.csv")
age = titanic_survival["Age"]
first_ages = age.loc[0:10]
print(first_ages)
age_is_null = pd.isnull(age)
print(age_is_null)
# Using the boolean Series as a mask keeps only the missing entries.
age_null_true = age[age_is_null]
print(age_null_true)
# Number of missing ages.
age_null_count = len(age_null_true)
print(age_null_count)
11.sum的使用。
# mean_age comes out as nan here: any calculation that involves a null value
# also results in a null value, and the built-in sum() does not skip them.
ages = titanic_survival["Age"]
mean_age = sum(ages) / len(ages)
print(mean_age)
12.mean的使用。
# Missing data is so common that many pandas methods filter it out automatically.
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)
# Mean fare for each passenger class.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    # Rows whose Pclass equals the class currently being processed.
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    fares_by_class[this_class] = pclass_rows["Fare"].mean()
print(fares_by_class)
13. pivot_table的使用。
# Think of index as the key and values as the value of a key-value pair.
# index tells the method which column to group by.
# values is the column that the calculation is applied to.
# aggfunc specifies the calculation to perform; the aggregation is named by
# string because recent pandas versions warn when numpy callables such as
# np.mean / np.sum are passed here.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
print(passenger_survival)
# Without aggfunc the mean is computed.
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_age)
# Several value columns can be aggregated at once.
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
print(port_stats)
14.dropna的使用。
# axis=1 (or axis='columns') drops every column that has null values.
drop_na_columns = titanic_survival.dropna(axis=1)
print(drop_na_columns.head())
# axis=0 with subset drops rows, looking for nulls only in the listed columns.
required_columns = ["Age", "Sex"]
new_titanic_survival = titanic_survival.dropna(axis=0, subset=required_columns)
print(new_titanic_survival.head())
15.索引获取值。
# .loc[row_label, column_label] reads a single value.
row_index_83_age = titanic_survival.loc[83, "Age"]
# Named after the label actually read (766); the old name said 1000.
row_index_766_pclass = titanic_survival.loc[766, "Pclass"]
print(row_index_83_age)
print(row_index_766_pclass)
16.sort_values的使用。
# Parameters of sort_values referred to in these notes:
# by: the column label (or list of labels) to sort by; it is the first argument.
# ascending: True (default) sorts ascending; False sorts descending.
# inplace: False (default) returns a new object; True replaces the data in the
#          original DataFrame.
# kind: the sorting algorithm, quicksort by default.
# na_position: "last" (default) puts missing values at the end, "first" at the start.
# NOTE(review): the earlier notes on `axis` (sort by row/column names) and
# `level` describe sort_index rather than sort_values - confirm against the
# pandas documentation before relying on them.
new_titanic_survival = titanic_survival.sort_values(by="Age", ascending=False)
print(new_titanic_survival[0:10])
17. reset_index的使用。
# drop=True: replace the existing index with a fresh one; no extra column is added.
# drop=False: the old index is kept as a new column (named "index") and a
#             fresh index is generated.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed.iloc[0:10])
18.apply的使用。
(1)默认以列进行操作。
def hundredth_row(column):
    """Return the hundredth item (position 99) of a Series.

    Raises IndexError when the Series has fewer than 100 items.
    """
    # .iloc is positional, so 99 is the hundredth entry whatever the index labels are.
    hundredth_item = column.iloc[99]
    return hundredth_item
# apply() calls the function once per column by default, so this collects
# the hundredth item of every column.
# The result gets its own name so the hundredth_row function is not overwritten.
hundredth_items = titanic_survival.apply(hundredth_row)
print(hundredth_items)
(2)axis=1以行进行操作
# By passing axis=1, DataFrame.apply() iterates over rows instead of columns.
def which_class(row):
    """Map a row's 'Pclass' value to a readable class label.

    Returns "Unknown" when the value is missing, "First Class" /
    "Second Class" / "Third Class" for 1 / 2 / 3, and None for any
    other value.
    """
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    elif pclass == 1:
        return "First Class"
    elif pclass == 2:
        return "Second Class"
    elif pclass == 3:
        return "Third Class"
    # Any other value falls through, as before.
    return None
# axis=1 hands each row to which_class.
classes = titanic_survival.apply(which_class, axis=1)
print(classes)
(3)支持lambda匿名函数。
fandango = pd.read_csv('fandango_score_comparison.csv')
fandango_films = fandango.set_index('FILM', drop=False)
# dtypes is a Series: column names as index, data types as values.
types = fandango_films.dtypes
# Boolean mask that marks the float64 columns.
is_float = types.values == 'float64'
# The index of the filtered Series holds just the matching column names.
float_columns = types[is_float].index
# Bracket notation narrows the DataFrame to the float columns.
float_df = fandango_films[float_columns]
# `x` is a Series object representing one column.
deviations = float_df.apply(lambda x: np.std(x))
(4)先分类,然后把类名作为index;类名作为index之后,如果没有指定aggfunc,pivot_table默认对同一类的值计算均值。
def is_minor(row):
    """Return True when the row's "Age" is below 18, otherwise False.

    A missing (NaN) age compares as not-less-than 18 and therefore
    yields False.
    """
    # bool() keeps the return type a plain Python bool for any numeric input.
    return bool(row["Age"] < 18)
# Evaluate is_minor once per row (axis=1).
minors = titanic_survival.apply(is_minor, axis=1)
print(minors)
def generate_age_label(row):
    """Label a row by its "Age": "unknown" if missing, "minor" if below 18, else "adult"."""
    age = row["Age"]
    # The missing check comes first because a NaN age would otherwise
    # fall through to "adult".
    if pd.isnull(age):
        return "unknown"
    elif age < 18:
        return "minor"
    else:
        return "adult"
# Build one label per row (axis=1).
age_labels = titanic_survival.apply(generate_age_label, axis=1)
print(age_labels)
# Store the labels as a column so they can serve as the pivot index.
titanic_survival['age_labels'] = age_labels
# No aggfunc is given, so the default aggregation is used for each label.
age_group_survival = titanic_survival.pivot_table(
    index="age_labels",
    values="Survived",
)
print(age_group_survival)
age_group_survival的输出结果:
19.series的使用。
(1)series的初始化。
# Import the Series object from pandas
from pandas import Series
fandango = pd.read_csv('fandango_score_comparison.csv')
series_film = fandango['FILM']
series_rt = fandango['RottenTomatoes']
# .values exposes the data held by a Series.
film_names = series_film.values
print(type(film_names))
print(film_names)
rt_scores = series_rt.values
print(rt_scores)
# film_names becomes the index, rt_scores the values stored under it.
series_custom = Series(rt_scores, index=film_names)
# A list of index labels selects the matching entries.
wanted_films = ['Minions (2015)', 'Leviathan (2014)']
print(series_custom[wanted_films])
(2)series支持切片操作。
series_custom = Series(rt_scores, index=film_names)
wanted_films = ['Minions (2015)', 'Leviathan (2014)']
print(series_custom[wanted_films])
# Integer slices are also available even though the index holds film names.
fiveten = series_custom[5:10]
print(fiveten)
(3)reindex的使用,重新设置index。
# Current index labels as a plain list.
original_index = series_custom.index.tolist()
print(original_index)
# The same labels in sorted order.
sorted_index = sorted(original_index)
print(sorted_index)
# reindex rearranges the Series to follow the given label order.
sorted_by_index = series_custom.reindex(sorted_index)
print(sorted_by_index)
(4)sort_index的使用,按照index进行升序排序。
# Sort by index labels and show the first ten entries.
sc2 = series_custom.sort_index()
first_ten_by_index = sc2[0:10]
print(first_ten_by_index)
(5)sort_values的使用,按照values进行升序排序。
# Sort by values and show the first ten entries.
sc3 = series_custom.sort_values()
first_ten_by_value = sc3[0:10]
print(first_ten_by_value)
(6)add、sin和max的使用。
# Add the Series to itself, value by value.
doubled = np.add(series_custom, series_custom)
print(doubled)
# Apply the sine function to each value.
sines = np.sin(series_custom)
print(sines)
# Highest value (a single value, not a Series).
highest = np.max(series_custom)
print(highest)
(7)逻辑运算。
# A comparison returns a Series holding one boolean per film.
criteria_one = series_custom > 50
# Using that boolean Series as a mask keeps only the matching films.
series_greater_than_50 = series_custom[criteria_one]
criteria_two = series_custom < 75
# Both conditions combined with &.
both_criteria = series_custom[criteria_one & criteria_two]
print(both_criteria)
(8)算数运算。
# Two Series sharing the film names as their index.
film_index = fandango['FILM']
rt_critics = Series(fandango['RottenTomatoes'].values, index=film_index)
rt_users = Series(fandango['RottenTomatoes_User'].values, index=film_index)
# Arithmetic between the two Series gives the mean of both scores per film.
score_total = rt_critics + rt_users
rt_mean = score_total / 2
print(rt_mean)
20.set_index的使用。
# set_index returns a new DataFrame indexed by the values of the given column.
# drop controls what happens to that column, not to any rows:
# drop=False keeps FILM as a regular column as well, whereas drop=True
# (the default) removes the column once it has become the index.
fandango = pd.read_csv('fandango_score_comparison.csv')
print(type(fandango))
fandango_films = fandango.set_index('FILM', drop=False)
print(fandango_films.index)