23.04.02数据分析项目1

  1. 2015,2016,2017国内主要城市数据分析:
    1)unique(), nunique()
# Load the 2015 annual data for major domestic cities.
df = pd.read_csv('./2015年国内主要城市年度数据.csv')
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()

# unique() returns the distinct values of a Series;
# nunique() returns how many distinct values there are.
gdp = df['国内生产总值']
print(gdp.unique())
print(gdp.nunique())

2)每列数据缺失量

# Number of missing entries in every column.
missing_per_column = df.isnull().sum()
print("NaN:\n", missing_per_column)
# Number of non-missing entries in every column.
present_per_column = df.notnull().sum()
print("Not NaN:\n", present_per_column)

3)统计信息

# Index label of the row holding the largest GDP value.
print("最大值索引号:\n", df['国内生产总值'].idxmax())
# Summary statistics, computed once; .T transposes them so that every source
# column becomes one row.
summary = df.describe()
print('\n>>>', summary.T)
# DataFrame.iteritems() was removed in pandas 2.0. items() yields the same
# (column name, column Series) pairs and exists in older versions as well.
for column_name, column_data in summary.items():
    print('Column Name: ', column_name)
    # .values gives the underlying array; without it a Series is printed.
    print('Column Contents: ', column_data.values)
    print('=' * 100)

4)数据抽样

# Draw 5 random rows without replacement (no row can be picked twice).
SAMPLE_SIZE = 5
data = df.sample(n=SAMPLE_SIZE, replace=False)

5)apply()函数

# Approach 1: a named function
def fun(x):
    """Return ``x`` converted to ``int``."""
    return int(x)
# Apply the named function `fun` to every GDP value.
gdp_column = df['国内生产总值']
data1 = gdp_column.apply(fun)
print(data1.head())
# Same conversion written as an anonymous (lambda) function.
data2 = gdp_column.apply(lambda value: int(value))
print(data2.head())
# A NumPy function passed to apply() on the whole frame (all rows, all columns).
whole_frame = df.iloc[:, :]
data3 = whole_frame.apply(np.sum)
print(data3.head())

6) 纵向连接数据

# Load the three yearly data sets.
df1 = pd.read_csv('./2015年国内主要城市年度数据.csv')
df2 = pd.read_csv('./2016年国内主要城市年度数据.csv')
df3 = pd.read_csv('./2017年国内主要城市年度数据.csv')
# Stack them vertically (row-wise); the result is stored back into df1.
yearly_frames = [df1, df2, df3]
df1 = pd.concat(yearly_frames, axis=0)
# Show 7 random rows of the combined table, sampled without replacement.
combined_sample = df1.sample(n=7, replace=False)
print(combined_sample)
  2. 世界各地饮酒数据
# Which continent consumes the most beer on average?
# A bare slice such as [0:1] is interpreted as a selection of row positions.
mean_beer = drinks.groupby('continent')['beer_servings'].mean()
print(mean_beer.sort_values(ascending=False)[0:1])

# Descriptive statistics of wine_servings for every continent.
wine_by_continent = drinks.groupby('continent')['wine_servings']
print(wine_by_continent.count())
print(wine_by_continent.describe())

# Mean, maximum and minimum spirit consumption per continent (agg method).
spirit_by_continent = drinks.groupby('continent')['spirit_servings']
print(spirit_by_continent.agg(['mean', 'max', 'min']))
  3. 2012美国大选献金分析
# Replace every missing value with the placeholder string 'NOT PROVIDED'.
df.fillna(value='NOT PROVIDED', inplace=True)
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()

# Remove the rows whose donation amount is zero or negative.
non_positive = df['contb_receipt_amt'] <= 0
# Show the offending amounts before they are removed.
print(df.loc[non_positive, 'contb_receipt_amt'])
drop_index = df.index[non_positive]
df.drop(index=drop_index, inplace=True)

# Goal: add a column holding each candidate's party.
# Start by listing every distinct candidate name.
candidate_names = df['cand_nm']
cand_array = candidate_names.unique()
print(cand_array)
# Lookup table: candidate name -> party.
cand_party_map = {
    'Bachmann, Michelle': 'Republican',
    'Romney, Mitt': 'Republican',
    'Obama, Barack': 'Democrat',
    "Roemer, Charles E. 'Buddy' III": "Reform",
    'Pawlenty, Timothy': 'Republican',
    'Johnson, Gary Earl': 'Libertarian',
    'Paul, Ron': 'Republican',
    'Santorum, Rick': 'Republican',
    'Cain, Herman': 'Republican',
    'Gingrich, Newt': 'Republican',
    'McCotter, Thaddeus G': 'Republican',
    'Huntsman, Jon': 'Republican',
    'Perry, Rick': 'Republican',
}
# Translate every candidate name into a party via the lookup table.
party_column = df['cand_nm'].map(cand_party_map)
df['party'] = party_column
print(df.head())
# Total amount received by each party on each day.
daily_party_totals = df.groupby(['contb_receipt_dt', 'party'])['contb_receipt_amt'].sum()
print(daily_party_totals)

# Rewrite the receipt dates as 'yyyy-mm-dd' strings: parse them to datetimes,
# then format them back to text.
parsed_dates = pd.to_datetime(df['contb_receipt_dt'])
df['contb_receipt_dt'] = parsed_dates.dt.strftime("%Y-%m-%d")

# Which candidates do disabled veterans mainly support?
is_disabled_veteran = df['contbr_occupation'] == 'DISABLED VETERAN'
df_vet = df[is_disabled_veteran]
print(df_vet)
# Donation total per candidate, largest first.
vet_totals = df_vet.groupby('cand_nm')['contb_receipt_amt'].sum()
print(vet_totals.sort_values(ascending=False))
  4. 2012欧洲杯数据
# Rows for the teams whose name starts with the letter G.
# The selection is printed explicitly: as a bare expression its result is
# discarded when the code runs as a script.
print(euro12[euro12['Team'].str.startswith('G')])

# Every column except the last three.
all_but_last_three = euro12.iloc[:, :-3]
print(all_but_last_three)

# Shooting Accuracy of England, Italy and Russia, selected in two ways.
is_wanted_team = euro12['Team'].isin(['England', 'Italy', 'Russia'])
wanted_columns = ['Team', 'Shooting Accuracy']
# Way 1: rows and columns in a single .loc call.
print(euro12.loc[is_wanted_team, wanted_columns])
# Way 2: filter the rows first, then pick the columns.
print(euro12.loc[is_wanted_team][wanted_columns])
  5. 美国各州人口分析

1) 合并表格:

# Merge the state-abbreviation table with the population table. The outer
# join keeps rows that have no partner on the other side.
abb_population = pd.merge(
    abb,
    population,
    left_on='abbreviation',
    right_on='state/region',
    how='outer',
)
# Drop the 'abbreviation' column, treated as redundant with 'state/region'
# after the merge.
abb_population.drop(columns='abbreviation', inplace=True)
# List the state/region codes whose full state name is missing.
state_missing = abb_population['state'].isnull()
print(abb_population.loc[state_missing, 'state/region'].unique())
# Fill in the full name for every row coded 'USA'.
indice = abb_population.index[abb_population['state/region'] == 'USA']
abb_population.loc[indice, 'state'] = 'United States'
# Then do the same for every row coded 'PR'.
indice = abb_population.index[abb_population['state/region'] == 'PR']
abb_population.loc[indice, 'state'] = 'Puerto Rico'

2)条件查询:

# Conditional lookup with DataFrame.query: rows where ages is "total" and
# year is 2010.
population_2010 = abb_population_areas.query('ages == "total" & year == 2010')
print(population_2010)
  6. 股票行情分析
    1)获取数据
import tushare as ts
import pandas as pd

# The historical quotes of one stock (Air China) were fetched once with:
# df = ts.get_k_data(code='601111',start='2007-01-01')

# ...and cached on disk with:
# df.to_csv('./air_china.csv')
# Load the cached copy.
csv_path = "./air_china.csv"
df = pd.read_csv(csv_path)

2)索引修改

# Convert the 'date' column from object (text) to datetime.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

# Use the 'date' column as the row index of the data.
df.set_index(keys='date', inplace=True)

3)数值查询

# Dates on which the close was at least 3% above the same day's open.
intraday_change = (df['close'] - df['open']) / df['open']
df_dates = df[intraday_change >= 0.03].index

# Dates on which the open was more than 2% below the previous day's close.
# shift(1) moves the close column down by one row, so each row lines up with
# the close of the row before it.
previous_close = df['close'].shift(1)
overnight_change = (df['open'] - previous_close) / previous_close
df_dates2 = df[overnight_change < -0.02].index

4)计算总收益

# Scenario: starting on 2010-01-01, buy one lot (100 shares) on the first
# trading day of every month and sell everything on the last trading day of
# every year. What is the profit to date?

# Restrict the data to the analysed period; .loc makes the label-based
# date-range slice explicit.
new_df = df.loc['2010-01':'2023-03']
# First row of every calendar month, i.e. the first trading day of the month.
# NOTE: pandas >= 2.2 prefers the aliases 'ME'/'YE' over 'M'/'A'; the old
# aliases are kept here so the snippet still runs on older pandas.
df_monthly = new_df.resample('M').first()
# Total spent: 100 shares at the opening price on each of those days.
cost = df_monthly['open'].sum() * 100

# Last row of every calendar year. The final entry (2023) is dropped because
# the shares bought in that year cannot be sold yet.
df_yearly = new_df.resample('A').last()[:-1]
# Money received: the 12 lots (1200 shares) bought during a year are sold on
# its last trading day.
# NOTE(review): the sale is priced at that day's 'open' — confirm whether
# 'close' was intended.
receive = df_yearly['open'].sum() * 1200
# Value the 3 unsold lots bought in 2023 (300 shares) at the most recent close.
# .iloc[-1] selects by position explicitly; plain [-1] on a date-indexed Series
# relies on a positional fallback that is deprecated since pandas 2.1.
last_months = df['close'].iloc[-1] * 300

# Total profit.
print(round(receive + last_months - cost, 2))
  7. 泰坦尼克号幸存者分析
    1)中文设置:
# Use a font family that can render Chinese text in matplotlib figures.
CHINESE_FONT = 'Songti SC'
plt.rcParams['font.family'] = CHINESE_FONT

2)性别处理(object --> int)

# Encode sex as an integer: 1 = male, 0 = female.
# Option A — a helper function applied to the column:
# def sex_values(Sex):
#     if Sex == 'male':
#         return 1
#     else:
#         return 0
# df['Sex'] = df['Sex'].apply(sex_values)
# Option B — a dictionary lookup (the one actually used):
sex_codes = {'male': 1, 'female': 0}
df['Sex'] = df['Sex'].map(sex_codes)

3)性别对生还的影响:

#性别对生还率的影响
# 获取生还者的性别信息
# df_sex1 = df['Sex'][df['Survived']==1]
# # 获取遇难者的性别信息
# df_sex0 = df['Sex'][df['Survived']==0]
# # 绘制直方图
# plt.hist([df_sex1,df_sex0],stacked=True,label=['Rescued','Not Saved'])
# plt.xticks([-1,0,1,2],[-1,'F','M',2])
# plt.legend()
#plt.show()

4)性别和等级对生还的影响:

# Combined effect of sex and passenger class on the survival rate.
group_keys = ['Sex', 'Pclass']
# Passenger count of every (Sex, Pclass) group among all passengers.
group_all = df.groupby(group_keys)['PassengerId'].count()
# Passenger count of every (Sex, Pclass) group among the survivors.
survived_passengers_group = survived_passengers_df.groupby(group_keys)['PassengerId'].count()
# Share of survivors within each group.
survived_passengers_ratio = survived_passengers_group / group_all
print(survived_passengers_ratio)
# Draw the ratios as a bar chart through the pandas plotting interface.
bar = survived_passengers_ratio.plot.bar(title='性别和乘客等级共同对生还率的影响')
# Label every bar with its ratio as a percentage, placed slightly above the bar.
for patch in bar.patches:
    height = patch.get_height()
    bar.text(patch.get_x() * 1.05, height * 1.05, '%.2f%%' % (height * 100))
plt.show()

你可能感兴趣的:(数据分析,python,numpy)