综合练习

题目:http://datawhale.club/t/topic/579/4

【任务一】企业收入的多样性

【题目描述】一个企业的产业收入多样性可以仿照信息熵的概念来定义收入熵指标:

其中 p(xi) 是企业该年某产业收入额占该年所有产业总收入的比重。在company.csv中存有需要计算的企业和年份,在company_data.csv中存有企业、各类收入额和收入年份的信息。现请利用后一张表中的数据,在前一张表中增加一列表示该公司该年份的收入熵指标 I 。

df1 = pd.read_csv('../data/exercise/company.csv')
df2 = pd.read_csv('../data/exercise/company_data.csv')
def px(x):
    return -np.sum(x/x.sum()*np.log2(x/x.sum()))
df3 = df2.groupby(['日期','收入类型'])['收入额'].sum()
df3 = df3.reset_index()
df3 = df3.groupby('日期')['收入额'].apply(px).rename(index=lambda x: int(x[:4])).reset_index()
df1.merge(df3, on='日期',how='left').rename(columns={'收入额':'收入熵指标'}).head()

结果:

  证券代码    日期  收入熵指标
0  #000007  2014  5.633910
1  #000403  2015  5.753387
2  #000408  2016  5.704088
3  #000408  2017       NaN
4  #000426  2015  5.753387

【任务二】组队学习信息表的变换

【题目描述】请把组队学习的队伍信息表变换为如下形态,其中“是否队长”一列取1表示队长,否则为0

    是否队长    队伍名称        昵称      编号
0   1       你说的都对队  山枫叶纷飞   5
1   0       你说的都对队  蔡           6
2   0       你说的都对队  安慕希     7
3   0       你说的都对队  信仰      8
4   0       你说的都对队  biubiu  20
... ...     ...         ...         ...
141 0       七星联盟        Daisy       63
142 0       七星联盟        One Better  131
143 0       七星联盟        rain        112
144 1       应如是     思无邪     54
145 0       应如是     Justzer0    58
df = pd.read_excel('../data/exercise/组队信息汇总表(Pandas).xlsx')
import re
def replaceName(x):
    obj = re.match("队员_群昵称\.(\d)",x)
    obj2 = re.match("队员(\d+)编号",x.replace(" ",""))
    if x == '队员_群昵称':
        return '群昵称_队员1'
    if x == '队长编号':
        return '编号_队长'
    if x == '队长_群昵称':
        return '群昵称_队长'
    if obj:
        return '群昵称_队员' + str(int(obj.group(1))+1)
    else:
        if obj2:
            return '编号_队员' + str(int(obj2.group(1)))
        return x
df = df.rename(columns=lambda x: replaceName(x))
df['tmpIndex'] = df.index

df = pd.wide_to_long(df, 
                stubnames=['群昵称','编号'],
                i=['队伍名称'],
                j='是否队长',
                sep="_",
                suffix='.+').swaplevel(0,1).rename(columns={'群昵称':'昵称'})
df = df.drop('所在群', axis = 1).reset_index(['队伍名称']).rename(index=lambda x: 1 if x == '队长' else 0).reset_index()
df = df.drop(df[pd.isnull(df.编号)].index).sort_values(['tmpIndex','是否队长'],ascending=[True,False]).drop(columns='tmpIndex').reset_index().drop(columns='index')

【任务三】

#1. 有多少县满足总投票数超过县人口数的一半
df1 = pd.read_csv('../data/exercise/county_population.csv')
df2 = pd.read_csv('../data/exercise/president_county_candidate.csv')
df1 = df1.set_index('US County')
new_idx = df1.index.map(lambda x:tuple(x.split(', ')))
df1.index = new_idx
df1 = df1.reset_index(level=0).rename_axis(index={'US County': 'State'}).reset_index().set_index('US County').rename(index=lambda x:x[1:]).reset_index()
df2 = df2.rename(columns={'state':'State', 'county':'US County'})
df3 = df2.merge(df1, on=['US County','State'],how='left')
df3['percent'] = df3['total_votes']/df3['Population']
df3.query('percent > 0.5')['US County'].count()
'''
74
'''
#2.
series1 = df2.groupby(['candidate'])['total_votes'].sum().sort_values(ascending=False)
df4 = df2.pivot_table(index='State',columns=['candidate'],values='total_votes',aggfunc='sum')
df4.columns = series1.index
df4
'''
candidate   Joe Biden   Donald Trump    Jo Jorgensen    Howie Hawkins   Write-ins   Rocky De La Fuente  Gloria La Riva  Kanye West  Don Blankenship Brock Pierce    ... Tom Hoefling    Ricki Sue King  Princess Jacob-Fambro   Blake Huber Richard Duncan  Joseph Kishore  Jordan Scott    Gary Swing  Keith McCormic  Zachary Scalf
State                                                                                   
Alabama NaN 7312.0  NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Alaska  NaN 34210.0 NaN NaN NaN NaN 825.0   NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 318.0   NaN NaN NaN
Arizona NaN 2032.0  NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Arkansas    NaN NaN NaN NaN NaN 1713.0  2141.0  NaN NaN 1475.0  ... NaN 2812.0  NaN NaN NaN NaN 1321.0  NaN NaN NaN
California  NaN 80.0    NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 60155.0 NaN NaN NaN
5 rows × 38 columns
'''
#3.
df5=df2.merge(df2.groupby(['State','US County'])['total_votes'].sum().rename('county sum'), on=["State",'US County'],how='left')
df5['voteshare'] = df5['total_votes']/df5['county sum']
def test(x):
    biden = float(x.query('candidate=="Joe Biden"')['voteshare'])
    Trump = float(x.query('candidate=="Donald Trump"')['voteshare'])
    return str(biden-Trump) + "_" + x.name[1]
BTSeries = df5.groupby(['State','US County'])['candidate','voteshare'].apply(lambda x: test(x)).rename('result').reset_index().set_index('result')
new_idx =BTSeries.index.map(lambda x: tuple(x.split('_')))
BTSeries.index = new_idx
BTSeries = BTSeries.reset_index(level=0).rename_axis(index={'result': 'BT'}).reset_index()
BTSeries['result'] = BTSeries['result'].apply(lambda x:float(x))
BTSeries.groupby('State').filter(lambda x: x.median() > 0)['State'].drop_duplicates()
'''
197               California
319              Connecticut
488                 Delaware
491     District of Columbia
725                   Hawaii
1878           Massachusetts
2999              New Jersey
3536            Rhode Island
4065                 Vermont
'''

你可能感兴趣的:(综合练习)