import pandas as pd
def big_countries(world: pd.DataFrame) -> pd.DataFrame:
    """Return the name, population and area of every "big" country.

    A country counts as big when its ``area`` is at least 3000000 or its
    ``population`` is at least 25000000.

    Args:
        world: Frame with at least ``name``, ``population`` and ``area`` columns.

    Returns:
        The qualifying rows restricted to ``name``, ``population`` and ``area``.
    """
    is_big = (world['area'] >= 3000000) | (world['population'] >= 25000000)
    # A single .loc call filters rows and picks columns in one step,
    # instead of chaining two separate indexing operations.
    return world.loc[is_big, ['name', 'population', 'area']]
import pandas as pd
def find_customers(customers: pd.DataFrame, orders: pd.DataFrame) -> pd.DataFrame:
    """Return the names of customers who never placed an order.

    Args:
        customers: Frame with ``id`` and ``name`` columns.
        orders: Frame whose ``customerId`` column refers to ``customers.id``.

    Returns:
        A one-column frame named ``Customers`` holding the names of the
        customers whose ``id`` does not appear in ``orders.customerId``.
    """
    has_ordered = customers['id'].isin(orders['customerId'])
    never_ordered = customers.loc[~has_ordered, ['name']]
    return never_ordered.rename(columns={"name": "Customers"})
import pandas as pd
def article_views(views: pd.DataFrame) -> pd.DataFrame:
    """Return the ids of authors who viewed at least one of their own articles.

    Args:
        views: Frame with ``author_id`` and ``viewer_id`` columns.

    Returns:
        A one-column frame named ``id`` with each qualifying author listed
        once, sorted in ascending order.
    """
    # A self-view is a row where the viewer is the author.
    self_views = views.loc[views['author_id'] == views['viewer_id'], ['author_id']]
    return (
        self_views
        .drop_duplicates()
        .sort_values(by='author_id', ascending=True)
        .rename(columns={'author_id': 'id'})
    )
import pandas as pd
def invalid_tweets(tweets: pd.DataFrame) -> pd.DataFrame:
    """Return the ids of tweets whose content is longer than 15 characters.

    Args:
        tweets: Frame with ``tweet_id`` and ``content`` columns.

    Returns:
        A one-column frame (``tweet_id``) of the tweets whose ``content``
        has strictly more than 15 characters.
    """
    too_long = tweets['content'].str.len() > 15
    return tweets.loc[too_long, ['tweet_id']]
import pandas as pd
def calculate_special_bonus(employees: pd.DataFrame) -> pd.DataFrame:
    """Compute each employee's bonus.

    The bonus equals the full ``salary`` when ``employee_id`` is odd and
    ``name`` does not start with ``'M'``; otherwise it is 0.

    The computation is vectorized, so it also works on an empty frame, and
    the caller's frame is left untouched (no ``bonus`` column is added to it).

    Args:
        employees: Frame with ``employee_id``, ``name`` and ``salary`` columns.

    Returns:
        A frame with ``employee_id`` and ``bonus`` columns, sorted by
        ``employee_id`` in ascending order.
    """
    odd_id = employees['employee_id'] % 2 != 0
    # na=False: a missing name is treated as "does not start with 'M'";
    # astype(bool) keeps the mask boolean even when the frame is empty.
    starts_with_m = employees['name'].str.startswith('M', na=False).astype(bool)
    # Keep the salary where the rule holds, fall back to 0 elsewhere.
    bonus = employees['salary'].where(odd_id & ~starts_with_m, 0)
    result = employees[['employee_id']].assign(bonus=bonus)
    return result.sort_values(by='employee_id', ascending=True)
方法一: apply(func,axis=1) .str.strip().title().lower()
import pandas as pd
def title_name(x):
    """Return the ``name`` field of a row with only its first letter upper-cased.

    The first character is upper-cased and every other character is
    lower-cased; surrounding whitespace is stripped from the result.
    ``str.capitalize`` is used rather than ``str.title`` so that letters
    following an apostrophe or hyphen stay lower-case ("o'NEIL" -> "O'neil").

    Args:
        x: A row-like object supporting ``x['name']`` that yields a string.

    Returns:
        The normalized name.
    """
    return x['name'].capitalize().strip()
def fix_names(users: pd.DataFrame) -> pd.DataFrame:
    """Normalize every user's name row by row and sort by ``user_id``.

    Each row is passed to ``title_name``; the caller's frame is not modified.

    Args:
        users: Frame with ``user_id`` and ``name`` columns.

    Returns:
        A new frame with the normalized ``name`` column, sorted by ``user_id``.
    """
    fixed = users.copy()
    # On an empty frame, apply(axis=1) does not produce a per-row Series that
    # can be assigned to a single column, so only apply when there are rows.
    if not fixed.empty:
        fixed['name'] = fixed.apply(title_name, axis=1)
    return fixed.sort_values('user_id')
方法二:users['name']=users['name'].str.capitalize()
import pandas as pd
def fix_names(users: pd.DataFrame) -> pd.DataFrame:
    """Capitalize every user's name and sort by ``user_id``.

    ``Series.str.capitalize`` upper-cases the first character and lower-cases
    the rest. The caller's frame is not modified.

    Args:
        users: Frame with ``user_id`` and ``name`` columns.

    Returns:
        A new frame with the capitalized ``name`` column, sorted by ``user_id``.
    """
    fixed = users.assign(name=users['name'].str.capitalize())
    return fixed.sort_values('user_id')
import pandas as pd
def valid_emails(users: pd.DataFrame) -> pd.DataFrame:
    """Return the users whose ``mail`` is a valid ``@leetcode.com`` address.

    A valid address has a prefix that starts with a letter and continues with
    letters, digits, ``_``, ``.`` or ``-``, followed by ``@leetcode.com``.

    Args:
        users: Frame with a ``mail`` column of strings.

    Returns:
        The rows of ``users`` whose ``mail`` is valid, with all columns kept.
    """
    # fullmatch requires the whole string to match; unlike a trailing "$" it
    # does not accept an address followed by a newline. na=False turns
    # missing mails into False so the mask stays purely boolean.
    is_valid = users["mail"].str.fullmatch(
        r"[a-zA-Z][a-zA-Z0-9_.-]*@leetcode\.com", na=False
    )
    return users[is_valid]
'''输入'''
| patient_id | patient_name | conditions |
| ---------- | ------------ | ------------ |
| 1 | Daniel | YFEV COUGH |
| 2 | Alice | |
| 3 | Bob | DIAB100 MYOP |
| 4 | George | ACNE DIAB100 |
| 5 | Alain | DIAB201 |
str.match
'''代码'''
import pandas as pd
def find_patients(patients: pd.DataFrame) -> pd.DataFrame:
    """Select patients using ``Series.str.match`` (start-anchored variant).

    ``str.match`` only tests the pattern at the beginning of each string, so
    this variant returns just the patients whose ``conditions`` value *starts*
    with a ``DIAB1`` code; a value such as ``"ACNE DIAB100"`` is not selected.

    Args:
        patients: Frame with a ``conditions`` column of strings.

    Returns:
        The rows of ``patients`` whose ``conditions`` begin with a ``DIAB1`` code.
    """
    # na=False keeps the mask boolean when a conditions value is missing.
    starts_with_diab1 = patients['conditions'].str.match(r"\bDIAB1\w*\b", na=False)
    return patients[starts_with_diab1]
'''输出'''
| patient_id | patient_name | conditions |
| ---------- | ------------ | ------------ |
| 3 | Bob | DIAB100 MYOP |
str.contains
'''代码'''
import pandas as pd
def find_patients(patients: pd.DataFrame) -> pd.DataFrame:
    """Return the patients that have a condition code starting with ``DIAB1``.

    ``conditions`` is treated as a whitespace-separated list of codes. A row is
    selected when any code begins with ``DIAB1``, wherever it sits in the list.

    Args:
        patients: Frame with a ``conditions`` column of strings.

    Returns:
        The rows of ``patients`` having at least one ``DIAB1``-prefixed code.
    """
    # Anchor on start-of-string or whitespace instead of \b: a word boundary
    # also exists after punctuation, so "\bDIAB1" would wrongly accept a code
    # such as "+DIAB100". The group is non-capturing to avoid the pandas
    # "match groups" warning. na=False keeps the mask boolean for missing values.
    has_diab1 = patients['conditions'].str.contains(r"(?:^|\s)DIAB1", na=False)
    return patients[has_diab1]
'''输出'''
| patient_id | patient_name | conditions |
| ---------- | ------------ | ------------ |
| 3 | Bob | DIAB100 MYOP |
| 4 | George | ACNE DIAB100 |
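match、fullmatch和contains之间的区别在于严格性:fullmatch要求整个字符串都与正则表达式匹配;match只要求从字符串开头开始匹配(相当于re.match);contains只要求字符串中任意位置存在匹配(相当于re.search)。因此上例中的"ACNE DIAB100"能被contains选中,却不能被match选中。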
import pandas as pd
def nth_highest_salary(employee: pd.DataFrame, N: int) -> pd.DataFrame:
    """Return the N-th highest distinct salary.

    Args:
        employee: Frame with a ``salary`` column.
        N: 1-based position in the list of distinct salaries sorted from
            highest to lowest.

    Returns:
        A one-row, one-column frame named ``getNthHighestSalary(N)``. The
        value is ``None`` when there is no N-th distinct salary, including
        when ``N`` is zero or negative.
    """
    column = f'getNthHighestSalary({N})'
    distinct = employee['salary'].drop_duplicates().sort_values(ascending=False)
    # Check the bounds explicitly: a non-positive N would otherwise become a
    # negative iloc position and silently pick a salary from the low end.
    if N < 1 or N > len(distinct):
        return pd.DataFrame({column: [None]})
    return distinct.iloc[[N - 1]].to_frame(name=column)
Employee =
| id | salary |
| -- | ------ |
| 1 | 100 |
| 2 | 200 |
| 3 | 300 |
import pandas as pd
def second_highest_salary(employee: pd.DataFrame) -> pd.DataFrame:
    """Return the second highest distinct salary.

    Args:
        employee: Frame with a ``salary`` column.

    Returns:
        A one-row frame with the single column ``SecondHighestSalary``. The
        value is ``None`` when fewer than two distinct salaries exist.
    """
    distinct = employee['salary'].drop_duplicates().sort_values(ascending=False)
    # An explicit length check replaces a catch-all ``except`` so that
    # unrelated errors (e.g. a missing column) are not silently turned
    # into a null answer.
    if len(distinct) < 2:
        return pd.DataFrame({'SecondHighestSalary': [None]})
    return distinct.iloc[[1]].to_frame(name='SecondHighestSalary')
| SecondHighestSalary |
| ------------------- |
| 200 |
drop_duplicates()函数如果需要根据某几列删掉重复值时使用subset=['col1','col2']
对DataFrame使用iloc[1]取单行时返回的是Series(无论DataFrame有几列),如果需要返回DataFrame,多加一个[]如iloc[[1]]
通过字典构造DataFrame({key:[value]}) 时注意value要用列表[],单value也用列表[]。
Employee =
| id | name | salary | departmentId |
| -- | ----- | ------ | ------------ |
| 1 | Joe | 70000 | 1 |
| 2 | Jim | 90000 | 1 |
| 3 | Henry | 80000 | 2 |
| 4 | Sam | 60000 | 2 |
| 5 | Max | 90000 | 1 |
Department =
| id | name |
| -- | ----- |
| 1 | IT |
| 2 | Sales |
| Department | Employee | Salary |
| ---------- | -------- | ------ |
| IT | Jim | 90000 |
| Sales | Henry | 80000 |
| IT | Max | 90000 |
import pandas as pd
def department_highest_salary(employee: pd.DataFrame, department: pd.DataFrame) -> pd.DataFrame:
    """Return the top earner(s) of every department.

    Employees tied for the highest salary in a department are all returned.

    Args:
        employee: Frame with ``name``, ``salary`` and ``departmentId`` columns.
        department: Frame with ``id`` and ``name`` columns.

    Returns:
        A frame with ``Department``, ``Employee`` and ``Salary`` columns.
    """
    # Rename before merging so the result does not depend on the automatic
    # "_x"/"_y" suffixes pandas adds to clashing column names.
    staff = employee.rename(columns={'name': 'Employee', 'salary': 'Salary'})
    depts = department.rename(columns={'id': 'departmentId', 'name': 'Department'})
    merged = staff.merge(depts, on='departmentId')
    # Group by the department key rather than its display name, so two
    # distinct departments sharing a name are still ranked separately.
    top_salary = merged.groupby('departmentId')['Salary'].transform('max')
    return merged.loc[merged['Salary'] == top_salary, ['Department', 'Employee', 'Salary']]
在pandas中,当合并多个DataFrame时,如果重名的列存在,则会自动创建一个新的列名,以避免重复。
例如,假设我们有两个DataFrame,二者都包含列"A"和"B"。如果使用merge函数按"A"列合并(on='A'),作为连接键的"A"在结果中只保留一列,而同名的非键列"B"会被自动加上后缀:"B_x"表示第一个DataFrame中的列,"B_y"表示第二个DataFrame中的列。
以下是一个示例代码:
import pandas as pd
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [1, 2], 'B': [7, 8]})
merged_df = pd.merge(df1, df2, on='A')
print(merged_df)
输出结果:
   A  B_x  B_y
0  1    3    7
1  2    4    8
可以看到,在合并后的DataFrame中,连接键"A"只保留一列;两个DataFrame中同名的非键列"B"则被自动加上后缀,变为"B_x"(来自df1)和"B_y"(来自df2),以避免列名重复。
需要注意,merge每次只合并两个DataFrame,默认后缀只有"_x"和"_y"两种,pandas不会自动生成"A_0"、"A_1"、"A_2"之类的列名。如果要合并三个及以上的DataFrame,需要多次调用merge,并在每一步通过suffixes参数或事先rename来避免列名冲突。
如果默认后缀不够清晰,可以手动指定合并后的后缀,例如使用pd.merge(df1, df2, on='column_name', suffixes=('suffix1', 'suffix2')),这样第一个DataFrame中重名的列名后会添加"suffix1",第二个DataFrame中重名的列名后会添加"suffix2"。suffixes只接受两个元素,分别对应左表和右表。
groupby后可以通过[]来对某一列的数据进行操作,df.groupby('Department')['Salary'].transform('max'),否则对所有列进行操作。
Scores =
| id | score |
| -- | ----- |
| 1 | 3.5 |
| 2 | 3.65 |
| 3 | 4 |
| 4 | 3.85 |
| 5 | 4 |
| 6 | 3.65 |
| score | rank |
| ----- | ---- |
| 4 | 1 |
| 4 | 1 |
| 3.85 | 2 |
| 3.65 | 3 |
| 3.65 | 3 |
| 3.5 | 4 |
import pandas as pd
def order_scores(scores: pd.DataFrame) -> pd.DataFrame:
    """Rank scores from highest to lowest using dense ranking.

    Equal scores share a rank and the next distinct score gets the next
    consecutive integer (no gaps). The caller's frame is not modified.

    Args:
        scores: Frame with a ``score`` column (assumed to contain no missing
            values, since the rank is cast to an integer).

    Returns:
        A frame with ``score`` and an integer ``rank`` column, sorted by
        ``score`` in descending order.
    """
    # Series.rank returns floats (1.0, 2.0, ...); cast so ranks are integers.
    dense_rank = scores['score'].rank(method='dense', ascending=False).astype(int)
    ranked = scores[['score']].assign(rank=dense_rank)
    return ranked.sort_values(by='score', ascending=False)
id score rank
0 1 3.5 4.0
1 2 3.65 3.0
2 3 4.0 1.0
3 4 3.85 2.0
4 5 4.0 1.0
5 6 3.65 3.0
Person =
| id | email |
| -- | ---------------- |
| 1 | john@example.com |
| 2 | bob@example.com |
| 3 | john@example.com |
| id | email |
| -- | ---------------- |
| 1 | john@example.com |
| 2 | bob@example.com |
import pandas as pd
# Modify Person in place
def delete_duplicate_emails(person: pd.DataFrame) -> None:
    """Remove duplicate e-mails from ``person`` in place, keeping the smallest id.

    Args:
        person: Frame with ``id`` and ``email`` columns. It is modified in
            place: rows are sorted by ``id`` and, for each e-mail, only the
            first row (the one with the smallest ``id``) is kept.

    Returns:
        None.
    """
    # Sort first so that "keep the first occurrence" means "keep the lowest id".
    person.sort_values(by=['id'], inplace=True, ascending=True)
    person.drop_duplicates(subset=['email'], inplace=True)
Products =
| product_id | store1 | store2 | store3 |
| ---------- | ------ | ------ | ------ |
| 0 | 95 | 100 | 105 |
| 1 | 70 | null | 80 |
| product_id | store | price |
| ---------- | ------ | ----- |
| 0 | store1 | 95 |
| 1 | store1 | 70 |
| 0 | store2 | 100 |
| 0 | store3 | 105 |
| 1 | store3 | 80 |
import pandas as pd
def rearrange_products_table(products: pd.DataFrame) -> pd.DataFrame:
    """Reshape the wide per-store price table into long form.

    Every column whose name starts with ``store`` is turned into rows of
    ``(product_id, store, price)``; combinations with no price are dropped.
    Rows are grouped store by store, in the order of the store columns.

    Args:
        products: Frame with a ``product_id`` column and one price column
            per store (``store1``, ``store2``, ...).

    Returns:
        A frame with ``product_id``, ``store`` and ``price`` columns.
    """
    store_columns = [col for col in products.columns if str(col).startswith('store')]
    # melt stacks the store columns on top of each other; ignore_index=False
    # keeps each row's original index label.
    long_form = products.melt(
        id_vars=['product_id'],
        value_vars=store_columns,
        var_name='store',
        value_name='price',
        ignore_index=False,
    )
    # A missing price means the product is not sold in that store.
    return long_form.dropna(subset=['price'])
Accounts =
| account_id | income |
| ---------- | ------ |
| 3 | 108939 |
| 2 | 12747 |
| 8 | 87709 |
| 6 | 91796 |
| category | accounts_count |
| -------------- | -------------- |
| High Salary | 3 |
| Low Salary | 1 |
| Average Salary | 0 |
import pandas as pd
def count_salary_categories(accounts: pd.DataFrame) -> pd.DataFrame:
    """Count the accounts falling into each salary category.

    Categories, by ``income``:
      * ``Low Salary``: strictly below 20000
      * ``Average Salary``: from 20000 to 50000, both inclusive
      * ``High Salary``: strictly above 50000

    Args:
        accounts: Frame with an ``income`` column.

    Returns:
        A three-row frame with ``category`` and ``accounts_count`` columns,
        in the order Low, Average, High. Every category is always present,
        with a count of 0 when no account falls into it.
    """
    income = accounts['income']
    # Summing a boolean mask counts its True values.
    low_count = (income < 20000).sum()
    average_count = ((income >= 20000) & (income <= 50000)).sum()
    high_count = (income > 50000).sum()
    return pd.DataFrame({
        'category': ['Low Salary', 'Average Salary', 'High Salary'],
        'accounts_count': [low_count, average_count, high_count],
    })
通过将 income 列中的每个值与 20000 进行比较来创建一个 Boolean Series ,工资小于 20000 的为 True,其它的为 False。
接下来,我们可以使用 sum() 方法统计 True 值的个数,sum() 将 True 视为 1,将 False 视为 0。因此,count 表示该系列中 True 的数量,它对应于低工资的账号数量。
作者:力扣官方题解
链接:https://leetcode.cn/problems/count-salary-categories/
来源:力扣(LeetCode)
著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。