# Import libraries
import pandas as pd
# Load the Chipotle orders dataset
chipo = pd.read_csv('./data/chipotle.csv')
chipo  # display the frame (notebook-style bare expression)
chipo.iloc[0:11,:]  # first 11 rows, all columns
chipo.shape[1]  # number of columns
————————————————————————————
Output:
5
chipo.columns  # column labels
————————————————————————————
Output:
Index(['order_id', 'quantity', 'item_name', 'choice_description',
'item_price'],
dtype='object')
chipo.index  # row index (a RangeIndex)
————————————————————————————
Output:
RangeIndex(start=0, stop=4622, step=1)
'''
1. Identify the field needed: item_name
2. Count how many times each item appears
'''
# chipo['item_name'].value_counts().head(1)
chipo['item_name'].value_counts().idxmax()  # most frequently ordered item
————————————————————————————
Output:
'Chicken Bowl'
# Method 1:
len(set(chipo['item_name'])) # a set removes duplicates
# Method 2: drop duplicate rows, then count what remains
chipo.drop_duplicates('item_name').shape[0]
# Method 3: size of the array of unique values
chipo['item_name'].unique().size
————————————————————————————
Output:
50
chipo['quantity'].sum()  # total number of items ordered
————————————————————————————
Output:
4972
'''
1. Check the dtype of "item_price"
2. Extract the characters (strip the leading "$")
3. Convert the string to float
'''
chipo['item_price'] = chipo['item_price'].str[1:].astype('float')
chipo['item_price']
————————————————————————————
Output:
0 2.39
1 3.39
2 3.39
3 2.39
4 16.98
...
4617 11.75
4618 11.75
4619 11.25
4620 8.75
4621 8.75
Name: item_price, Length: 4622, dtype: float64
# Revenue per line item = unit price x quantity
chipo['revenue'] = chipo['item_price'] * chipo['quantity']
chipo['revenue'].sum()  # total revenue
————————————————————————————
Output:
39237.02
len(chipo.groupby(by='order_id').sum())  # number of distinct orders
————————————————————————————
Output:
1834
# Group by order id and take the mean spend of each group
chipo['cost'] = chipo['quantity'] * chipo['item_price']
chipo.groupby('order_id')['cost'].mean()
————————————————————————————
Output:
order_id
1 2.890000
2 33.960000
3 6.335000
4 10.500000
5 6.850000
...
1830 11.500000
1831 4.300000
1832 6.600000
1833 11.750000
1834 9.583333
Name: cost, Length: 1834, dtype: float64
euro12 = pd.read_csv('./data/Euro2012.csv')
euro12
# euro12.Goals, or use loc / iloc
euro12['Goals']
# Deduplicate teams and count the remaining rows
euro12['Team'].unique().shape[0]
————————————————————————————
Output:
16
# euro12.shape[1]
len(euro12.columns)  # number of columns
————————————————————————————
Output:
35
# Keep only the disciplinary columns
discipline = euro12[['Team','Yellow Cards','Red Cards']]
discipline
# Sort by red cards ascending, then yellow cards descending
discipline.sort_values(by=['Red Cards','Yellow Cards'],ascending=[True,False])
discipline.groupby('Team')['Yellow Cards'].mean()  # mean yellow cards per team
————————————————————————————
Output:
Team
Croatia 9.0
Czech Republic 7.0
Denmark 4.0
England 5.0
France 6.0
Germany 4.0
Greece 9.0
Italy 16.0
Netherlands 5.0
Poland 7.0
Portugal 12.0
Republic of Ireland 6.0
Russia 6.0
Spain 11.0
Sweden 7.0
Ukraine 5.0
Name: Yellow Cards, dtype: float64
'''
1. Evaluate the condition first, producing a boolean mask
2. Map the mask back onto the original frame to filter rows
'''
euro12[euro12['Goals'] > 6]
# Method 2
# euro12[euro12['Team'].str.startswith('G')]
euro12[euro12['Team'].str[0] == 'G']  # teams whose name starts with 'G'
euro12.iloc[:,:7]  # first 7 columns
euro12.iloc[:,:-4]  # all but the last 4 columns
# Select rows matching several conditions
# Method 1:
a = euro12['Team'] == 'England'
b = euro12['Team'] == 'Italy'
c = euro12['Team'] == 'Russia'
euro12[a | b | c]['Shooting Accuracy']
# Method 2:
euro12[euro12['Team'].isin(['England','Italy','Russia'])]['Shooting Accuracy']
————————————————————————————————————————
Output:
3 50.0%
7 43.0%
12 22.5%
Name: Shooting Accuracy, dtype: object
drinks = pd.read_csv('./data/drinks.csv')
drinks
# Continent with the highest mean beer consumption
drinks.groupby('continent')['beer_servings'].mean().sort_values().tail(1)
————————————————————————————
Output:
continent
EU 193.777778
Name: beer_servings, dtype: float64
drinks.groupby('continent')['wine_servings'].describe()  # wine stats per continent
drinks.groupby('continent')[['beer_servings', 'spirit_servings', 'wine_servings']].mean()
drinks.groupby('continent')[['beer_servings', 'spirit_servings', 'wine_servings']].median()
drinks.groupby('continent')['spirit_servings'].agg(['mean','max','min'])  # several stats at once
crime = pd.read_csv('./data/US_Crime_Rates_1960_2014.csv')
crime
#crime.info()
crime.dtypes  # inspect column dtypes
————————————————————————————
Output:
Unnamed: 0 int64
Year datetime64[ns]
Population int64
Total int64
Violent int64
Property int64
Murder int64
Forcible_Rape int64
Robbery int64
Aggravated_assault int64
Burglary int64
Larceny_Theft int64
Vehicle_Theft int64
dtype: object
# Parse the bare year numbers into datetimes
crime['Year'] = pd.to_datetime(crime['Year'],format='%Y')
crime['Year']
crime.index = crime['Year']
crime
# Method 2:
# crime.set_index('Year')
crime.drop(columns=['Total']) # pass inplace=True to modify crime itself; as written the result is only displayed
# Resample into 10-year buckets (decade starts) and sum each bucket
crime.resample('10AS').sum()
# Year with the largest year-over-year population growth rate
#(crime['Population'].diff()[1:] / crime['Population'].value[:-1]).idxmax()
(crime['Population'].diff()[1:]/crime['Population'].values[:-1]).idxmax()
————————————————————————————
Output:
Timestamp('1970-01-01 00:00:00.000002')
import pandas as pd

# Three small rosters used to exercise pd.concat and pd.merge.
raw_data_1 = {
    'subject_id': ['1', '2', '3', '4', '5'],
    'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
    'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches'],
}
raw_data_2 = {
    'subject_id': ['4', '5', '6', '7', '8'],
    'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
    'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan'],
}
raw_data_3 = {
    'subject_id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],
    'test_id': [51, 15, 15, 61, 16, 14, 15, 1, 61, 16],
}

data1, data2, data3 = (pd.DataFrame(raw) for raw in (raw_data_1, raw_data_2, raw_data_3))

all_data = pd.concat([data1, data2], axis=0)      # stack the two rosters row-wise
all_data_col = pd.concat([data1, data2], axis=1)  # place them side by side
pd.merge(all_data, data3, on='subject_id')  # inner-join test scores onto the stacked roster
pd.merge(data1, data2, on='subject_id')     # subjects present in both rosters
import pandas as pd
import numpy as np
# Read the data
# \s matches whitespace (space/tab); + matches the previous token one or more times
# parse_dates=[[0,1,2]]: combine the first three columns into one datetime column while reading
wind = pd.read_csv('./data/wind.csv',sep='\s+',parse_dates=[[0,1,2]])
wind
————————————————————————————————————
# Alternative to parse_dates: build a "Yr-Mo-Dy" string column by hand.
# NOTE(review): if the CSV was read with parse_dates=[[0,1,2]] above, separate
# 'Yr'/'Mo'/'Dy' columns do not exist and this path is redundant — confirm.
wind['Yr_Mo_Dy'] = wind['Yr'].map(str) + '-' + wind['Mo'].map(str) + '-' + wind['Dy'].map(str)
wind['Yr_Mo_Dy']
# BUG FIX: the converted datetimes were previously discarded (bare expression);
# assign back so the column actually becomes datetime64.
wind['Yr_Mo_Dy'] = pd.to_datetime(wind['Yr_Mo_Dy'])
# BUG FIX: drop(columns=...) takes a flat list of labels; the original passed a
# nested list ([['Yr','Mo','Dy']]), which raises instead of dropping.
wind.drop(columns=['Yr','Mo','Dy'])
'''
Goal: repair anomalous dates by shifting them back 100 years.
Approach: a custom function.
1. Observe the data to find where the anomalies live: "Yr_Mo_Dy"
2. Check the dtype of "Yr_Mo_Dy": datetime64
3. Extract the anomalous part: the year of each date
4. Subtract 100 from the extracted year
5. Recombine the fixed year with month and day: datetime.date() joins year/month/day
6. Return the repaired date (year-month-day)
'''
import datetime
# Worked example of the repair on a single date
# pd.to_datetime(datetime.date(wind['Yr_Mo_Dy'][0].year-100,wind['Yr_Mo_Dy'][0].month,wind['Yr_Mo_Dy'][0].day))
# Wrap it up as a function
def date_solve(x):
    '''
    Repair an anomalous timestamp.

    param x: a datetime-like value
    return: x shifted back 100 years when its year is after 2000,
            otherwise x unchanged
    '''
    # Guard clause: dates up to and including year 2000 are already fine.
    if x.year <= 2000:
        return x
    repaired = datetime.date(x.year - 100, x.month, x.day)
    return pd.to_datetime(repaired)
# Repair every date, then use the repaired dates as the index
wind['Yr_Mo_Dy'] = wind['Yr_Mo_Dy'].apply(date_solve)
wind.set_index('Yr_Mo_Dy',inplace=True)
wind.isnull().sum()  # missing values per column
wind.notnull().sum()  # non-missing values per column
# One statistic across several columns
# Method 1
wind.mean()
# Method 2
wind.agg(['mean'])
# Several statistics per column (min, max, mean and std of every column)
loc_stats = wind.agg(['min','max','mean','std'])
# Same statistics per day across all locations (row-wise)
wind.agg(['min','max','mean','std'],axis=1)
wind_one = wind[wind.index.month==1]  # January rows only
wind_one.mean()
wind.asfreq('Y')  # sample at yearly frequency
wind.asfreq('MS')  # sample at month-start frequency
apple = pd.read_csv('./data/appl_1980_2014.csv')
apple
apple.dtypes
apple['Date'] = pd.to_datetime(apple['Date'])
# apple.index = apple['Date']
apple.set_index('Date',inplace=True) # preferred way to set the index
#apple.index.unique().shapea[0] == apple.shape[0]
apple.index.is_unique  # is every date unique?
apple.sort_index() # ascending by default
#apple.groupby([apple.index.year,apple.index.month])['Date'].agg(lambda x:x.index.max())
apple.groupby([apple.index.year,apple.index.month]).agg(lambda x:x.index.max())  # last date in each (year, month) group
(apple.index.max() - apple.index.min()).days  # days between first and last date
len(set([(i.year,i.month) for i in apple.index]))  # number of distinct (year, month) pairs
iris = pd.read_csv('./data/iris.csv',names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'])
iris
# iris = pd.read_csv('./data/iris.csv',header = None)
# iris.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
iris.isnull().sum() # no missing values in the raw data
iris.loc[10:19,'petal_length'] = None  # deliberately blank out rows 10-19
iris['petal_length'] = iris['petal_length'].fillna(1.0)  # refill the blanks with 1.0
iris.drop(columns=['class'])
iris.iloc[0:3,:] = None  # blank the first three rows entirely
iris = iris.dropna(axis=0,how='all')  # drop rows that are all-NaN
iris.reset_index(drop=True)
import matplotlib.pyplot as plt
import pandas as pd
pd.__version__
job_info = pd.read_csv('./data/job_info.csv',encoding='gbk',header=None) # the raw file has no header row
job_info
job_info.describe()
job_info.duplicated().sum() # 74 duplicated rows present
job_info.isnull().sum() # the salary column has 108 missing values
job_info.columns = ['公司', '岗位', '工作地点', '工资', '发布日期']
job_info['岗位'].value_counts().idxmax()  # most common job title
————————————————————————————
Output:
'数据分析师'
job_info[job_info['发布日期'] == '09-03']  # postings published on 09-03
# Inspect the location values
job_info['工作地点'].value_counts()
# NOTE(review): this blanks EVERY column of the matching rows, not just the
# location column — presumably the intent was job_info.loc[mask, '工作地点'] = ''; confirm.
job_info[job_info['工作地点']=='异地招聘'] = ''
def area_solve(x):
    '''
    Extract the province/city part of a location string.

    param x: location string, e.g. '广州-天河'
    return: the text before the first '-', or x unchanged when there is no '-'
    '''
    # '-' separates city from district; keep only the leading city part.
    return x.split('-')[0] if '-' in x else x
# Normalize locations, then filter data-analyst postings in four major cities
job_info['工作地点'] = job_info['工作地点'].apply(area_solve)
index_1 = job_info['工作地点'].str.contains('深圳|广州|北京|上海')
index_2 = job_info['岗位']=='数据分析师'
job_info[index_1 & index_2]
job_info['工资'].str[-3:].value_counts()  # distribution of salary unit suffixes
'''
千/月 (thousand per month): x*1000
万/月 (ten-thousand per month): x*10000
万/年 (ten-thousand per year): x*10000/12
other: None
'''
# Regular expression notes:
# \d : matches a digit 0-9
# * : matches the previous token zero or more times
# ? : matches the previous token zero or one time
# \ : escape character
import re
# Worked example: extract the numbers from one salary string and convert units
s = job_info['工资'][0]
float(re.findall('\d\.?\d*',s)[0]) * 10000
def get_salary(x):
    '''
    Extract the numeric bounds from a salary string and convert to yuan/month.

    param x: salary string such as '2-3.5万/月'
    return: list of floats in yuan per month, e.g. [20000.0, 35000.0];
            None when the string is empty, too short, or uses another unit
    '''
    try:  # the data contains empty strings and salaries in other units
        numbers = re.findall(r'\d\.?\d*', x)
        if x[-3] == '万':
            a = [float(i) * 10000 for i in numbers]
        elif x[-3] == '千':
            a = [float(i) * 1000 for i in numbers]
        else:
            # Unknown unit: the original fell through with `a` undefined and
            # relied on the except clause; make the None explicit instead.
            return None
        # At this point the unit is yuan/month or yuan/year.
        if x[-1] == '年':
            # BUG FIX: the original wrote [a/12 for i in a], dividing the list
            # object itself; the TypeError was swallowed by the bare except, so
            # every yearly salary silently became None.
            a = [i / 12 for i in a]
        return a
    except Exception:
        # e.g. x[-3] on strings shorter than 3 characters
        return None
# Split the parsed salary range into separate low/high columns
job_info['最低工资'] = job_info['工资'].apply(get_salary).str[0] # first element of the list (lower bound)
job_info['最高工资'] = job_info['工资'].apply(get_salary).str[1] # second element of the list (upper bound)
job_info['最低工资'].value_counts() # low: below 4000; medium: 4000-10000; high: above 10000
job_info['薪资等级'] = pd.cut(job_info['最低工资'],[0,4000,10000,job_info['最低工资'].max()],labels=['低薪','中等薪资','高薪'])
job_info['薪资等级'].value_counts()
job_info.to_csv('./data/job_info(处理后).csv',index=False)
# Available for free in my shared resources