# Vocabulary note: genre [ˈʒɒnrə] - pronunciation mnemonic: 让
# Vocabulary note: column - pronunciation mnemonic: 靠乐母
import pandas as pd
# Demo: adding a row via reindex, and moving a column into/out of the index.
df = pd.DataFrame({'地区': ['山东', '北京'], "⼈⼝": [1.1, 1.2],
                   "省会": ["济南", "北京"]})
print(df)

# reindex returns a new frame with one extra index label (row 2 is all NaN);
# the original frame is left unchanged.
df2 = df.reindex([0, 1, 2])
print(df2)

# Assigning to the new label fills that row in, i.e. effectively appends a row.
df2.loc[2] = ['山西', 0.7, '太原']
print(df2)

# Promote the region column to the index.
df3 = df.set_index('地区')
print('*' * 30)
print(df3.index)
# With the region as the index, .loc behaves like a dictionary lookup.
print(df3.loc['山东'])

# reset_index turns the index back into an ordinary column.
df4 = df3.reset_index()
print(df4)
import pandas as pd
# Demo: appending the rows of one frame to another.
df = pd.DataFrame({'地区': ['山东', '北京'], "⼈⼝": [1.1, 1.2],
                   "省会": ["济南", "北京"]})
# Same columns as df, deliberately given in a different order.
df5 = pd.DataFrame({'地区': ['陕西'], '省会': ['西安'], "⼈⼝": [1.5]})

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Columns present in both frames are aligned by name; a column missing from
# one side would show up as a new column filled with NaN for that side.
combined = pd.concat([df, df5])
print(combined)
import pandas as pd
# Demo: fill missing revenue values with the mean of the known revenues.
df = pd.read_csv('IMDB-Movie-Data.csv')
# info() prints its own report and returns None, so it is called directly
# (wrapping it in print() would emit a stray "None"). The non-null counts
# show which columns contain missing values.
df.info()

# Rows whose revenue is present (the method is notnull).
df_not_revenue_null = df[df['Revenue (Millions)'].notnull()]
# Mean revenue over those rows.
df_not_revenue_average = df_not_revenue_null['Revenue (Millions)'].mean()
# Overwrite only the missing revenue cells with that mean.
df.loc[df['Revenue (Millions)'].isnull(), 'Revenue (Millions)'] = df_not_revenue_average

# The revenue column no longer has nulls. Note: info is a method, so the
# parentheses are required.
df.info()
import pandas as pd
# Demo: drop rows that repeat the same (Director, Year) pair.
df = pd.read_csv('IMDB-Movie-Data.csv')
# keep='first' retains the first row of every (Director, Year) group.
df2 = df.drop_duplicates(subset=['Director', 'Year'], keep='first')
# Rows for a single director, kept for manual inspection.
ret = df[df['Director'] == 'James Gunn']
print(df.shape)
# Original author's note: deduplication removes 300+ rows
# (not verifiable without the data file).
print(df2.shape)
import pandas as pd
# Demo: per-director aggregation with a different function per column.
df = pd.read_csv('IMDB-Movie-Data.csv')
# The dict maps column name -> aggregation to apply to that column.
ret = df.groupby(by=['Director']).aggregate({'Rating': 'mean',
                                             'Votes': 'sum'})
print(ret)          # aggregated result
print(ret.index)    # the index holds the director names (the group keys)
print(ret.columns)  # the two aggregated columns: Rating and Votes
import pandas as pd
# Demo: joining two frames on a shared key column.
df1 = pd.DataFrame({'地区': ['北京', '⼭东', '天津'],
                    '⼈⼝': [1.2, 1.1, 0.7]})
df2 = pd.DataFrame({'地区': ['北京', '⼭东', '陕⻄'],
                    '省会': ['北京', '济南', '⻄安']})

# join() aligns on the index, so the region column is made the index first.
df1_dist = df1.set_index('地区')
df2_dist = df2.set_index('地区')

# how: 'left', 'right', 'outer' (union of keys) or 'inner' (intersection).
df_join = df1_dist.join(df2_dist, how='left')
print(df_join)
import pandas as pd
# Demo: pivot table with directors as rows and years as columns.
df = pd.read_csv('IMDB-Movie-Data.csv')
print(df.head())  # first 5 rows

# Each value column gets its own aggregation: mean rating, total votes.
pivot = df.pivot_table(index='Director', columns='Year',
                       values=['Rating', 'Votes'],
                       aggfunc={'Rating': 'mean',
                                'Votes': 'sum'})
print(pivot)
import pandas as pd
import numpy as np
# Demo: insert a row in the middle of a frame by going through NumPy.
df = pd.DataFrame([[1, 3], [4, 1], [6, 2]],
                  columns=['a', 'b'])
print(df)

# Underlying 2-D array of the frame.
arr = df.values
print(arr)

# Insert the row [10, 20] before row position 1 (axis=0 means rows).
arr2 = np.insert(df.values, 1, [10, 20], axis=0)
print(arr2)

# Wrap the array back into a frame with the original column names.
result2 = pd.DataFrame(arr2, columns=['a', 'b'])
print(result2)
import pandas as pd
# Demo: label-based versus position-based column selection.
df = pd.read_csv('IMDB-Movie-Data.csv')
# loc selects by label: all rows, columns from 'Rank' through 'Genre'
# inclusive (the original author notes these are the first 3 columns).
print(df.loc[:, 'Rank':'Genre'])
# iloc selects by position: all rows, first 3 columns.
print(df.iloc[:, :3])
import pandas as pd
# Demo: locate the rows that have a missing revenue value.
df = pd.read_csv('IMDB-Movie-Data.csv')
# info() gives the overall null picture; it prints its own report and returns
# None, so it is called directly instead of being wrapped in print().
df.info()
# To inspect one column, use loc: rows where revenue is null, all columns.
df2 = df.loc[df['Revenue (Millions)'].isnull(), :]
print(df2)
import pandas as pd
# Demo: build a date from a year and a day-of-year number.
d = {'year': [2019, 2020, 2021], 'day': [40, 50, 60]}
df = pd.DataFrame(d)
print(df)

# Pack year and day-of-year into one integer, e.g. 2019 and 40 -> 2019040.
df['int_number'] = df['year'] * 1000 + df['day']
print(df)

# %j is the day of the year. The integers are converted to text explicitly so
# the format string is matched against a string such as "2019040".
df['date'] = pd.to_datetime(df['int_number'].astype(str), format='%Y%j')
print(df)
import pandas as pd
import numpy as np
# Demo: generating time-indexed sample frames.
# pandas.util.testing (makeTimeDataFrame / makeDateIndex) was deprecated in
# pandas 1.0 and removed in 2.0, so the frames are built directly instead.

# 10 rows of random floats in columns A-D on a business-day index, mirroring
# what makeTimeDataFrame(10) used to return.
ret = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'),
                   index=pd.date_range('2000-01-01', periods=10, freq='B'))
print(ret.shape)

# 10 hourly rows of random integers in [1, 1000).
ret2 = pd.DataFrame(np.random.randint(1, 1000, size=(10, 3)),
                    columns=['编码', '销量', '库存'],
                    index=pd.date_range('2000-01-01', periods=10, freq='h'))
print(ret2)
import pandas as pd
# Demo: normalising a column that mixes int, str and float values.
d = {'商品': ['牙刷', '药膏', '牙杯', '毛巾'],
     '销量': [100, '40', 50.0, 200]}
df = pd.DataFrame(d)
print(df)

# How many values of each Python type the sales column holds.
print(df['销量'].apply(type).value_counts())
# The Python type of each individual value in the sales column.
print(df['销量'].apply(type))

# Convert every value to int; without this the sum below raises because the
# column mixes numbers with a string.
df['销量'] = df['销量'].astype('int')
print(df['销量'].sum())
import pandas as pd
# Demo: encoding categorical text columns as integers with Series.map.
d = {
    "gender": ["male", "female", "male", "female"],
    "color": ["red", "green", "blue", "green"],
    "age": [25, 30, 15, 32],
}
df = pd.DataFrame(d)
print(df)

# map() looks each value up in the dict and writes the code to a new column.
d = {'male': 0, 'female': 1}
df['gender2'] = df['gender'].map(d)
d2 = {'red': 0, 'green': 1, 'blue': 2}
df['color2'] = df['color'].map(d2)
print(df)
import pandas as pd
# Demo: reordering columns so the first column moves to third position.
df = pd.read_csv('IMDB-Movie-Data.csv')
print(df.columns)

# New order: columns 1-2, then column 0, then everything from column 3 on.
col1 = [*df.columns[1:3], df.columns[0], *df.columns[3:]]
print(df[col1])
import pandas as pd
import numpy as np
# Demo: downsampling hourly data to daily totals.
# pd.util.testing.makeDateIndex was removed in pandas 2.0; pd.date_range
# builds the equivalent 240-step hourly index starting at 2000-01-01.
time_index = pd.date_range('2000-01-01', periods=240, freq='h')
data = np.random.randint(1, 10, size=(240, 1))
df = pd.DataFrame(data, index=time_index)

# Downsample: collapse the hourly values of column 0 into one sum per day.
daily_total = df.resample('D')[0].sum()
print(daily_total)
import pandas as pd
# Demo: cleaning a sales column that mixes numbers and currency strings.
# NOTE(review): the original line here was overwritten by a KaTeX error
# message during extraction. The last sales value, the DataFrame construction
# and the start of the replace() call are reconstructed from the surviving
# fragments - confirm against the original tutorial.
d = {"customer": ["A", "B", "C", "D"],
     "sales": [1100, "950.5", "$400", "$1250.75"]}
df = pd.DataFrame(d)

# Strip the dollar sign from string entries; non-string entries are untouched.
df1 = df['sales'].replace(r'[$]', '', regex=True)
print(df1)
# The values are still a mix of Python types at this point.
print(df1.apply(type))

# Convert everything to float and store it as a new column.
df['sales2'] = df1.astype('float')
print(df)
import pandas as pd
# Demo: lumping rare categories together under one label via replace().
d = {'carname': ['A', 'B', 'A', 'A', 'B', 'C', 'B', 'D', 'E', 'A'],
     '销量': [100, 100, 200, 300, 100, 50, 20, 30, 40, 10]}
print(d)
df = pd.DataFrame(d)
print(df)

# Occurrence count per car name, then the names seen fewer than twice.
freq = df['carname'].value_counts()
small_category = freq[freq < 2]
print(freq)
print(small_category)
print(small_category.index)  # the rare names: C, D, E

# replace() returns a new Series; df itself is not modified.
df2 = df['carname'].replace(list(small_category.index), '其他')
print(df2)
import pandas as pd
# Demo: lumping rare categories together with a mapping function.
d = {'carname': ['A', 'B', 'A', 'A', 'B', 'C', 'B', 'D', 'E', 'A'],
     '销量': [100, 100, 200, 300, 100, 50, 20, 30, 40, 10]}
df = pd.DataFrame(d)

# Names that occur fewer than twice. Computed once here rather than inside f,
# which map() calls for every element.
freq = df['carname'].value_counts()
small_category = freq[freq < 2]
rare_names = set(small_category.index)


def f(c):
    """Return "其他" when car name *c* is rare, otherwise return *c* unchanged."""
    if c in rare_names:
        return "其他"
    return c


print(df['carname'].map(f))