• 读取北向.csv 指定trade_date为行索引
• 查看数据的基本信息 有无缺失值 对其缺失值进行处理
• 删除缺失值所在行
• 查看数据的基本信息 查看数据是否清洗完毕
• index列没啥用 将index列删除
• 观察数据是否有重复行
• 将重复行进行删除
• 将行索引 进行升序
• 将处理好的数据 保存至 北向(副).csv
#!/user/bin/env python
#-*-coding: utf-8-*-
#@Time : 2020/9/719:53
#@Author : GodSpeed
#@File : Pandas作业03_20200907.py
#@Software : PyCharm
'''
作业03
练习1
• 读取北向.csv 指定trade_date为 行索引
• 查看数据的基本信息 有无缺失值 对其缺失值进行处理
• 删除缺失值所在行
• 查看数据的基本信息 查看数据是否清洗完毕
• index列没啥用 将index列删除
• 观察数据是否有重复行
• 将重复行进行删除
• 将行索引 进行升序
• 将处理好的数据 保存至 北向(副).csv
'''
import pandas as pd
import numpy as np
#1.1 读取 北向.csv 指定trade_date为行索引
northward_data = pd.read_csv("北向.csv",index_col=1)
#print(northward_data.head())
'''
index ggt_ss ggt_sz hgt sgt north_money south_money
trade_date
20190624 0 -541.17 792.38 -757.96 -1153.14 -1911.10 251.21
20190621 1 -97.40 701.36 3722.36 3608.14 7330.50 603.96
20190620 2 660.05 555.23 1914.44 3650.47 5564.91 1215.28
20190619 3 -491.58 186.47 2092.51 2831.23 4923.74 -305.11
20190618 4 1667.40 832.29 974.92 617.24 1592.16 2499.69
'''
#1.2.1 查看数据的基本信息
#print(northward_data.info())
'''
Int64Index: 884 entries, 20190624 to 20190605
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 index 884 non-null int64
1 ggt_ss 850 non-null float64
2 ggt_sz 850 non-null float64
3 hgt 870 non-null float64
4 sgt 870 non-null float64
5 north_money 884 non-null float64
6 south_money 884 non-null float64
dtypes: float64(6), int64(1)
memory usage: 55.2 KB
None
'''
#1.2.2 有无缺失值
#判断缺失值做在的行
print(northward_data.isnull().sum()) #isnull.sum()返回每一列缺失值统计个数
'''
trade_date 0
ggt_ss 34
ggt_sz 34
hgt 14
sgt 14
north_money 0
south_money 0
dtype: int64
'''
#isnull().any() #是对列表对象迭代对象(True or False)进行一个再统计,
# 如果某一列有一个True则这一列的结果返回True
print(northward_data.isnull().any())
'''
trade_date False
ggt_ss True
ggt_sz True
hgt True
sgt True
north_money False
south_money False
dtype: bool
'''
#print(northward_data.isnull().values)
'''
[[False False False ... False False False]
[False False False ... False False False]
[False False False ... False False False]
...
[False False False ... False False False]
[False False False ... False False False]
[False False False ... False False False]]
'''
'''
any()一个序列中满足一个True,则返回True;
all()一个序列中所有值为True时,返回True,否则为False。
'''
# 判断数据是否存在nan
if northward_data.isnull().values.any():
print('存在nan数据')
#打印nan所在的行
#print(northward_data[northward_data.isnull().values == True])
else:
print('不存在nan数据')
'''
index ggt_ss ggt_sz hgt sgt north_money south_money
trade_date
20190510 29 1763.55 1294.88 NaN NaN 0.00 3058.43
20190510 29 1763.55 1294.88 NaN NaN 0.00 3058.43
20190430 34 NaN NaN -463.88 -83.01 -546.89 0.00
20190430 34 NaN NaN -463.88 -83.01 -546.89 0.00
20190429 35 NaN NaN 3819.12 713.61 4532.73 0.00
... ... ... ... ... ... ... ...
20190912 230 NaN NaN 2488.84 806.64 3295.48 0.00
20190911 231 NaN NaN 1804.94 627.08 2432.02 0.00
20190911 231 NaN NaN 1804.94 627.08 2432.02 0.00
20190628 283 1411.88 218.06 NaN NaN 0.00 1629.94
20190628 283 1411.88 218.06 NaN NaN 0.00 1629.94
[96 rows x 7 columns]
'''
# 1.3 删除缺失值所在行
northward_data.dropna(axis=0,inplace=True)
# 1.4 查看数据的基本信息 查看数据是否清洗完毕
print(northward_data.info())
'''
Int64Index: 836 entries, 20190624 to 20190605
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 index 836 non-null int64
1 ggt_ss 836 non-null float64
2 ggt_sz 836 non-null float64
3 hgt 836 non-null float64
4 sgt 836 non-null float64
5 north_money 836 non-null float64
6 south_money 836 non-null float64
dtypes: float64(6), int64(1)
memory usage: 52.2 KB
None
处理后不存在nan数据
Process finished with exit code 0
'''
if northward_data.isnull().values.any():
print('处理后,仍然存在nan数据')
else:
print('处理后不存在nan数据')
#1.5 index列没啥用 将index列删除
#del northward_data['index']
northward_data.drop(['index'],axis=1,inplace=True)
#print(northward_data)
#1.6 观察数据是否有重复行
if northward_data.duplicated().values.any():
print('存在重复行')
#print(northward_data[northward_data.duplicated().values == True])
else:
print('不存在重复行')
#1.7 将重复行进行删除
northward_data = northward_data.drop_duplicates()
print(northward_data)
if northward_data.duplicated().values.any():
print('处理后,仍然存在重复行')
#print(northward_data[northward_data.duplicated().values == True])
else:
print('处理后,不存在重复行') #处理后,不存在重复行
#1.8 将行索引 进行升序
northward_data = northward_data.sort_index(ascending=False) # 默认为0轴 并且默认为升序
print(northward_data)
'''
[822 rows x 6 columns]
处理后,不存在重复行
ggt_ss ggt_sz hgt sgt north_money south_money
trade_date
20200904 1977.81 2827.44 -3106.19 -3211.42 -6317.61 4805.25
20200903 80.46 1470.66 917.30 -1921.12 -1003.82 1551.12
20200902 1113.10 2915.24 -6007.57 -606.69 -6614.26 4028.34
20200901 617.59 2336.67 -1805.03 1505.10 -299.93 2954.26
20200831 2231.93 2660.40 -3414.43 -4611.59 -8026.02 4892.33
... ... ... ... ... ... ...
20170109 1175.00 337.00 -846.78 886.18 39.40 1512.00
20170106 385.00 207.00 -2103.00 644.00 -1459.00 592.00
20170105 686.00 291.00 -912.34 633.50 -278.84 977.00
20170104 593.00 258.00 -1348.85 629.70 -719.15 851.00
20170103 994.00 277.00 -582.47 615.49 33.02 1271.00
[822 rows x 6 columns]
'''
#1.9 将处理好的数据 保存至 北向(副).csv
northward_data.to_csv('北向(副).csv')
读取 FoodFacts.csv 数据,该数据是全球食品数据,需分析每个国家添加剂的平均使用。
步骤分析
• 1.读取数据
• 2.数据质量考量
• 3.清洗数据
• 4.对各个国家的使用数量进行统计
•4.1.清洗,统计国家数据
• 4.2 .通过国家统计添加剂用量
• 5.保存统计结果
#!/user/bin/env python
#-*-coding: utf-8-*-
#@Time : 2020/9/813:40
#@Author : GodSpeed
#@File : Pandas第三次作业修正版本.py
#@Software : PyCharm
import pandas as pd
import numpy as np
'''
练习2
读取 FoodFacts.csv 数据,该数据是全球食品数据,需分析每个国家添加剂的平均使用。
步骤分析
• 1.读取数据
• 2.数据质量考量
• 3.清洗数据
• 4.对各个国家的使用数量进行统计
• 1.清洗,统计国家数据
• 2.通过国家统计添加剂用量
• 5.保存统计结果
'''
# 1 读取数据
#food_facts_data = pd.read_csv('FoodFacts.csv')
# sys:1: DtypeWarning: Columns (0,3,5,27,36) have mixed types.
# Specify dtype option on import or set low_memory=False.
'''
系统:1:DtypeWarning:列(0,3,5,27,36)已混合类型。指定导入时的dtype选项或将低内存设置为False。
'''
#要把这个 low_memory 关掉
#food_facts_data = pd.read_csv('FoodFacts.csv',low_memory=False)
#print(food_facts_data.head())
'''
code ... nutrition_score_uk_100g
0 000000000000012866 ... NaN
1 0000000024600 ... NaN
2 0000000036252 ... NaN
3 0000000039259 ... NaN
4 0000000039529 ... NaN
'''
#print(food_facts_data.columns.values)
'''
['code' 'url' 'creator' 'created_t' 'created_datetime' 'last_modified_t'
'last_modified_datetime' 'product_name' 'generic_name' 'quantity'
'packaging' 'packaging_tags' 'brands' 'brands_tags' 'categories'
'categories_tags' 'categories_en' 'origins' 'origins_tags'
'manufacturing_places' 'manufacturing_places_tags' 'labels' 'labels_tags'
......
'cocoa_100g' 'chlorophyl_100g' 'carbon_footprint_100g'
'nutrition_score_fr_100g' 'nutrition_score_uk_100g']
'''
#print(food_facts_data.shape) #(65503, 159)
#删除NaN数据
def del_NaN(pandas_data):
'''
:param pandas_data: 源数据
:return: 返回删除NaN后的数据
'''
#判断数据中是否存在NaN数据
if pandas_data.isnull().values.any():
print('pandas_data存在nan数据,马上进行处理')
# 打印nan所在的行
#pandas_data.isnull().any() # 显示缺失值所在的列
# print(pandas_data.isnull().values)
else:
print('pandas_data不存在nan数据')
return pandas_data
#删除Nan数据
return pandas_data.dropna()
# 清洗数据
def clean_countries_en_data(pandas_data):
'''
:param pandas_data: 源数据
:return: 返回处理后的数据
'''
# 步骤1: 去除所有countries_en包括逗号的异常数据
# 方法1:用正则表达式 ^(?!.*字符串) 来过滤
#without_commas_data = pandas_data[pandas_data['countries_en'].str.contains(r"^(?!.*,)")]
#print(without_commas_data)
# 方法2: 用布尔索引 ~ 非符号
without_commas_data = pandas_data[~pandas_data['countries_en'].str.contains(r",")]
print('去逗号',without_commas_data)
'''
去逗号 countries_en additives_n
5 United Kingdom 0.0
6 France 0.0
8 France 0.0
10 United Kingdom 5.0
11 United Kingdom 5.0
... ... ...
65480 United States 4.0
65490 France 0.0
65494 France 0.0
65499 France 0.0
65501 France 0.0
[42319 rows x 2 columns]
'''
# 步骤2: 国家数据统一变为小写
# 方法1: df.str.lower()
#without_commas_data['countries_en'] = without_commas_data['countries_en'].str.lower().copy()
#Try using .loc[row_indexer,col_indexer] = value instead
#without_commas_data['countries_en'] = countries_en_data.str.lower()
# 关闭SettingWithCopyWarning:
# 方法2:利用函数映射
without_commas_data["countries_en"]= without_commas_data["countries_en"].map(lambda x:x.lower())
print('大小写转换without_commas_data=',without_commas_data)
'''
大小写转换without_commas_data= countries_en additives_n
5 united kingdom 0.0
6 france 0.0
8 france 0.0
10 united kingdom 5.0
11 united kingdom 5.0
... ... ...
65480 united states 4.0
65490 france 0.0
65494 france 0.0
65499 france 0.0
65501 france 0.0
[42319 rows x 2 columns]
'''
return without_commas_data
#针对研究对象,从诸多列中,提取感兴趣的列信息,提供读取效率
#本需求中获取三列信息
if __name__ == '__main__':
# 关闭SettingWithCopyWarning:
pd.set_option('mode.chained_assignment', None)
food_brief_data = pd.read_csv('FoodFacts.csv', usecols=["countries_en","additives_n"])
print('food_brief_data111',food_brief_data)
'''
countries_en additives_n
0 France NaN
1 France NaN
2 France NaN
3 France NaN
4 France NaN
... ... ...
65498 Poland NaN
65499 France 0.0
65500 France NaN
65501 France 0.0
65502 China NaN
[65503 rows x 2 columns]
'''
food_brief_data = del_NaN(food_brief_data)
print('food_brief_data222',food_brief_data)
'''
food_brief_data222 countries_en additives_n
5 United Kingdom 0.0
6 France 0.0
8 France 0.0
10 United Kingdom 5.0
11 United Kingdom 5.0
... ... ...
65480 United States 4.0
65490 France 0.0
65494 France 0.0
65499 France 0.0
65501 France 0.0
[43616 rows x 2 columns]
[43616 rows x 3 columns]
Process finished with exit code 0
'''
# 数据清洗
food_brief_data = clean_countries_en_data(food_brief_data)
print('food_brief_data222',food_brief_data)
'''
food_brief_data222 countries_en additives_n
5 united kingdom 0.0
6 france 0.0
10 united kingdom 5.0
13 france 2.0
15 spain 0.0
... ... ...
65304 new zealand 6.0
65326 new zealand 8.0
65341 new zealand 7.0
65403 burkina faso 2.0
65405 burkina faso 1.0
[411 rows x 2 columns]
'''
# 分析每个国家添加剂的平均使用
last_DataFrame = food_brief_data.groupby(['countries_en']).mean()
print(type(last_DataFrame)) #
# 更换last_DataFrame列索引的additives_n为additives_mean
last_DataFrame.rename(columns={
'additives_n':'additives_mean'}, inplace = True)
print('last_DataFrame=', last_DataFrame)
last_DataFrame = last_DataFrame.sort_values(by='additives_mean',ascending=False)
'''
last_DataFrame= additives_mean
countries_en
albania 0.000000
algeria 3.500000
andorra 0.000000
argentina 2.222222
australia 0.489871
... ...
turkey 0.300000
united arab emirates 1.000000
united kingdom 1.243810
united states 2.162905
venezuela 0.000000
[84 rows x 1 columns]
'''
# 保存文件
last_DataFrame.to_csv('按国家统计添加剂使用情况.csv', encoding='utf_8_sig')