1.列操作 apply
# `string.upper` was removed in Python 3, so `from string import upper` fails
# there. The built-in `str.upper` does the same job without any import.
df['Name'] = df.Name.apply(str.upper)
# Keep only the text after the last '@' of each e-mail address.
df['Email Provider'] = df.Email.apply(
    lambda x: x.split('@')[-1]
)
2.行操作 lambda
例子1: 40小时以下和40小时以上不同薪,计算出每个人总薪
import codecademylib
import pandas as pd
# Load employees.csv into a DataFrame (the code below reads its hourly_wage
# and hours_worked columns).
df = pd.read_csv('employees.csv')
def total_earned(row):
    """Return the total pay for one employee row.

    Hours up to 40 are paid at ``row.hourly_wage``; every hour beyond 40 is
    paid at 1.5 times that rate.
    """
    if row.hours_worked > 40:
        regular_pay = row.hourly_wage * 40
        overtime_pay = (row.hourly_wage * 1.5) * (row.hours_worked - 40)
        return regular_pay + overtime_pay
    return row.hourly_wage * row.hours_worked
# axis=1 makes apply pass each row (not each column) to total_earned; the
# per-row results are stored in a new 'total_earned' column.
df['total_earned'] = df.apply(total_earned, axis = 1)
print(df)
例子2 分别进行列操作和行操作
import codecademylib
import pandas as pd
# Load shoefly.csv and preview the first five rows.
orders = pd.read_csv('shoefly.csv')
print(orders.head(5))
# Column-wise operation
def source(x):
    """Classify a shoe material: 'animal' for 'leather', otherwise 'vegan'."""
    if x == 'leather':
        return 'animal'
    return 'vegan'
# Run `source` on every value of the shoe_material column and store the
# results in a new 'shoe_source' column.
orders['shoe_source']=orders.shoe_material.apply(source)
print(orders.head(5))
# Row-wise operation
def get_lastname(row):
    """Build a salutation for one order row.

    Uses 'Dear Mr. ' when ``row.gender`` is 'male' and 'Dear Ms. ' for any
    other value, followed by ``row.last_name``.
    """
    title = 'Dear Mr. ' if row.gender == 'male' else 'Dear Ms. '
    return title + row.last_name
# axis=1 hands each row to get_lastname; the returned strings become the
# new 'salutation' column.
orders['salutation']=orders.apply(get_lastname,axis=1)
print(orders.head(5))
例子3
import codecademylib
import pandas as pd
# Load inventory.csv and preview the first ten rows.
inventory = pd.read_csv('inventory.csv')
print(inventory.head(10))
# Integer slicing on a DataFrame selects rows by position: the first ten rows.
# NOTE(review): presumably these are the Staten Island rows — confirm against the data.
staten_island = inventory[0:10]
product_request = staten_island.product_description
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
inventory.info()
# Rows whose product_type is 'seeds' AND whose location is 'Brooklyn'.
seed_request = inventory[
    (inventory.product_type == 'seeds') & (inventory.location == 'Brooklyn')
]
print(seed_request)
# True wherever quantity is non-zero. The vectorized comparison yields the
# same booleans as the per-element `False if x == 0 else True` lambda, without
# the redundant ternary or a Python-level call per value.
inventory['in_stock'] = inventory.quantity != 0
#print(inventory.head(10))
# Column-wise multiplication computes quantity * price for every row at once
# instead of invoking a lambda per row through apply(axis=1).
inventory['total_value'] = inventory.quantity * inventory.price
#print(inventory.head(10))
def combine_lambda(row):
    """Return '<product_type> - <product_description>' for one inventory row."""
    parts = (row.product_type, row.product_description)
    return '{} - {}'.format(*parts)
# Build the 'full_description' column by passing each row (axis=1) to
# combine_lambda, then preview the first ten rows.
inventory['full_description']=inventory.apply(combine_lambda,axis=1)
print(inventory.head(10))
3.Aggregates in Pandas 聚集
| Command | Description |
|---|---|
| mean | Average of all values in column |
| std | Standard deviation |
| median | Median |
| max | Maximum value in column |
| min | Minimum value in column |
| count | Number of values in column |
| nunique | Number of unique values in column |
| unique | List of unique values in column |
得到每种鞋型的最高价
orders = pd.read_csv('orders.csv')
# Highest price per shoe_type, as a Series indexed by shoe_type. The groupby
# is computed once; previously it ran twice and the first result was
# overwritten before being used.
max_price_by_type = orders.groupby('shoe_type').price.max()
# reset_index() turns the shoe_type index back into an ordinary column,
# giving a DataFrame with shoe_type and price columns.
pricey_shoes = max_price_by_type.reset_index()
print(pricey_shoes)
import codecademylib
import numpy as np
import pandas as pd
def _first_quartile(prices):
    """Return the 25th percentile of *prices* via ``np.percentile``."""
    return np.percentile(prices, 25)


orders = pd.read_csv('orders.csv')
print(orders)
# 25th-percentile price within each shoe_color group.
prices_by_color = orders.groupby('shoe_color').price
cheap_shoes = prices_by_color.apply(_first_quartile)
print(cheap_shoes)
import codecademylib
import numpy as np
import pandas as pd
orders = pd.read_csv('orders.csv')
# Count order ids for every (shoe_type, shoe_color) combination.
group_keys = ['shoe_type', 'shoe_color']
shoe_counts = orders.groupby(group_keys).id.count().reset_index()
print(shoe_counts)
# The counted column is still called 'id'; give it a descriptive name.
shoe_counts = shoe_counts.rename(columns={'id': 'count'})
# Alternative: shoe_counts.columns = ['shoe_type', 'shoe_color','count']
print(shoe_counts)
import codecademylib
import numpy as np
import pandas as pd
orders = pd.read_csv('orders.csv')
# Long table: one row per (shoe_type, shoe_color) with the number of order ids.
ids_per_group = orders.groupby(['shoe_type', 'shoe_color']).id.count()
shoe_counts = ids_per_group.reset_index()
print(shoe_counts)
shoe_counts = shoe_counts.rename(columns={'id': 'count'})
# Wide table: one row per shoe_type, one column per shoe_color, cells hold 'count'.
wide_counts = shoe_counts.pivot(
    index='shoe_type',
    columns='shoe_color',
    values='count',
)
shoe_counts_pivot = wide_counts.reset_index()
print(shoe_counts_pivot)
shoe_type | shoe_color | ||
---|---|---|---|
0 | ballet flats | black | 2 |
1 | ballet flats | brown | 11 |
2 | ballet flats | navy | 17 |
3 | ballet flats | red | 13 |
4 | ballet flats | white | 7 |
5 | sandals | black | 3 |
6 | sandals | brown | 10 |
7 | sandals | navy | 13 |
8 | sandals | red | 14 |
9 | sandals | white | 10 |
10 | stilettos | black | 8 |