Python 数据分析第 5 天
pandas 的应用
创建 Series 对象
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Global matplotlib settings for every figure drawn below.
# SimHei is a CJK font; presumably selected so the Chinese labels used in the
# charts render correctly — it must be installed on the machine.
plt.rcParams['font.sans-serif'] = 'SimHei'
# Do not use the Unicode minus glyph on axes (fall back to the ASCII hyphen).
plt.rcParams['axes.unicode_minus'] = False
# IPython magic, valid only inside IPython/Jupyter: inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Build a Series from a list of values plus an explicit list of labels.
# NOTE(review): the label '三季度' occurs twice, so ser1['三季度'] yields two
# rows instead of a scalar — confirm the duplicate is intentional.
ser1 = pd.Series(
    data=[320, 180, 300, 405, 120],
    index=['一季度', '二季度', '三季度', '四季度', '三季度'],
)
ser1
# Positional access: use .iloc explicitly. ser1[0] on a label-indexed Series
# relies on the deprecated "integer key treated as position" fallback.
ser1.iloc[0]
# Label-based access.
ser1['一季度']
# Build a Series from a dict: keys become the index, values the data.
# The literal used to list '三季度' twice (180, then 160). A repeated key in a
# dict literal keeps only the last value, so the effective entry was always
# 160; the shadowed entry is dropped to make the data explicit.
ser2 = pd.Series(data={
    '一季度': 300,
    '二季度': 450,
    '三季度': 160,
    '四季度': 270,
})
ser2
# Positional slice (end exclusive), spelled with .iloc to make that explicit.
ser2.iloc[0:2]
# Label slice (both ends inclusive).
ser2['一季度':'三季度']
# Fancy indexing with a list of labels.
ser2[['一季度', '四季度']]
# Boolean indexing.
ser2[ser2 >= 200]
# Basic attributes of the Series.
ser2.values
ser2.index
ser2.hasnans
# Series.is_monotonic was removed in pandas 2.0; this is its replacement.
ser2.is_monotonic_increasing
ser2.is_unique
# Descriptive statistics; the result is itself a Series keyed by statistic.
ser2.describe()
ser2.describe()['25%']
ser2.describe()['max']
# A Series of fruit names in which several values repeat.
ser3 = pd.Series(
    data=[
        'apple', 'banana', 'apple', 'pitaya',
        'apple', 'pitaya', 'durian',
    ],
)
ser3
# Distinct values in order of first appearance; the recorded output follows.
ser3.unique()
"""
array(['apple', 'banana', 'pitaya', 'durian'], dtype=object)
"""
# Distinct values again, this time kept as a Series with their labels.
ser3.drop_duplicates()
# Number of distinct values.
ser3.nunique()
# Boolean flags marking every repeat of an earlier value.
ser3.duplicated()
# Frequency of each distinct value.
ser3.value_counts()
# A Series containing missing values.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
ser4 = pd.Series(data=[10, 20, np.nan, 30, np.nan])
ser4
# Detect missing / present values.
ser4.isnull()
ser4.notnull()
# Two ways of dropping the missing values.
ser4[ser4.notnull()]
ser4.dropna()
# Fill with a constant, then with the median of the present values.
ser4.fillna(100)
ser4.fillna(ser4.median())
# Forward / backward fill. fillna(method=...) is deprecated since pandas 2.1;
# ffill() and bfill() are the direct replacements.
ser4.ffill()
ser4.bfill()
# The integers 1 through 9.
ser5 = pd.Series(data=np.arange(start=1, stop=10))
# where(): keep values satisfying the condition, replace the rest with 99.
ser5.where(cond=ser5.lt(5), other=99)
# mask(): the inverse — replace values satisfying the condition with 100.
ser5.mask(cond=ser5.lt(5), other=100)
# 50 random integers drawn from the half-open range [30, 80).
ser7 = pd.Series(np.random.randint(low=30, high=80, size=50))
import math
# Rescale every value: square root, times ten, rounded down to an integer.
temp = ser7.apply(lambda score: math.floor(score ** 0.5 * 10))
temp
# How many values reach 60 after rescaling, and how many did before.
temp.loc[temp >= 60].count()
ser7.loc[ser7 >= 60].count()
# Values labelled by fruit name.
ser8 = pd.Series(
    [35, 96, 12, 57, 25, 89],
    ['grape', 'banana', 'pitaya', 'apple', 'peach', 'orange'],
)
# Sorted by value, largest first (returns a new Series; ser8 is untouched).
ser8.sort_values(ascending=False)
# Reorder ser8 itself by label.
ser8 = ser8.sort_index()
# Top-N / bottom-N by value.
ser8.nlargest(n=3)
ser8.nsmallest(n=2)
import heapq
# The same top-N / bottom-N selection on a plain list, via the standard library.
nums = [
    35, 12, 98, 57,
    78, 42, 87,
]
heapq.nlargest(n=3, iterable=nums)
heapq.nsmallest(n=2, iterable=nums)
# Quarterly figures, drawn as a bar chart with a value label above each bar.
ser9 = pd.Series({'一季度': 400, '二季度': 520, '三季度': 180, '四季度': 380})
ser9.plot(figsize=(6, 3), width=0.2, kind='bar', color=['r', 'y', 'b', 'g'])
plt.grid(True, alpha=0.25, axis='y', linestyle='--')
plt.xticks(rotation=0)
plt.yticks(np.arange(0, 601, 100))
# Bars sit at x = 0, 1, 2, ... so the enumerate position is the x coordinate.
# Iterating the values avoids ser9[i], which depends on the deprecated
# "integer key treated as position" lookup for a label-indexed Series.
for i, value in enumerate(ser9):
    # Place the label slightly above the top of the bar.
    plt.text(i, value + 5, value, ha='center')
plt.show()
# Same data as a ring chart: a pie whose wedges are narrowed to 0.4 width.
ser9.plot(
    kind='pie',
    figsize=(3, 3),
    autopct='%.1f%%',
    pctdistance=0.8,
    wedgeprops={'width': 0.4, 'edgecolor': 'white'},
)
# Set the y-axis label to an empty string.
plt.ylabel('')
plt.show()
# DataFrame from a 2-D array: 5 students x 3 courses, random marks in [60, 100].
scores = np.random.randint(low=60, high=101, size=(5, 3))
courses = ['语文', '数学', '英语']
ids = [1001, 1002, 1003, 1004, 1005]
df1 = pd.DataFrame(scores, index=ids, columns=courses)
df1
# DataFrame from a dict: each key is a column name, each list a column.
scores = {
    '语文': [62, 72, 93, 88, 93],
    '数学': [95, 65, 86, 66, 87],
    '英语': [66, 75, 82, 69, 82],
}
# Student ids 1001..1005 serve as the row index.
ids = list(range(1001, 1006))
df2 = pd.DataFrame(scores, index=ids)
df2
# Load a slice of a CSV file: only three columns, 'id' as the index,
# '|' as the quoting character, skipping file lines 1-20 (the header on
# line 0 is kept) and then reading at most 20 rows.
df3 = pd.read_csv(
    filepath_or_buffer='*.csv',
    usecols=['id', 'name', 'score'],
    index_col='id',
    quotechar='|',
    skiprows=np.arange(1, 21),
    nrows=20,
)
df3
# Read a CSV file that is encoded as GBK.
df4 = pd.read_csv(filepath_or_buffer='*.csv', encoding='gbk')
df4
# Read a tab-separated file.
df5 = pd.read_csv(filepath_or_buffer='*.tsv', sep='\t')
df5
# Load part of one worksheet of an Excel workbook: the header is taken from
# row 1, rows 2-101 are skipped, at most 100 rows are read, and only the
# six listed columns are kept.
df6 = pd.read_excel(
    io='data/excel/2020年销售数据.xlsx',
    sheet_name='Sheet3',
    header=1,
    skiprows=np.arange(2, 102),
    nrows=100,
    usecols=['销售日期', '销售区域', '销售渠道', '品牌', '售价', '销售数量'],
)
df6
import pymysql
# Open a MySQL connection; it is reused by the read_sql calls below.
# NOTE(review): user, password and database are '*' placeholders here, and
# no close() call is visible in this part of the file — confirm it is
# released elsewhere.
conn = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    user='*',
    password='*',
    database='*',
    charset='utf8mb4',
)
conn
# Department table, indexed by department number.
df7 = pd.read_sql(
    sql='select dno, dname, dloc from tb_dept',
    con=conn,
    index_col='dno',
)
df7
# Employee table, indexed by employee number.
df8 = pd.read_sql(
    sql='select eno, ename, job, sal, comm, dno from tb_emp',
    con=conn,
    index_col='eno',
)
df8
# Add a column. The list has 5 + 9 = 14 entries, so the assignment only
# succeeds when df8 has exactly 14 rows.
df8['married'] = 5 * ['未婚'] + 9 * ['已婚']
df8
# Append a row under the new index label 9800.
df8.loc[9800] = ['$', '$', 12000, 800, 20, '已婚']
df8
# Drop columns / rows. Both calls return new frames and leave df8 unchanged.
df8.drop(['comm', 'married'], axis='columns')
df8.drop([1359, 3233, 3088], axis='index')
# Turn the index back into an ordinary 'eno' column.
df8 = df8.reset_index()
df8
# Promote 'eno' to the index again (returns a new frame).
df8.set_index(keys='eno')
# Join employees to their department.
# df7 is loaded with index_col='dno' and has only the columns dname and dloc,
# so there is no 'deptno' column to join on and right_on='deptno' raised
# KeyError. The department number is df7's index, so the join matches
# df8's 'dno' column against that index instead.
pd.merge(left=df8, right=df7, how='inner', left_on='dno', right_index=True)
# Stack the two frames vertically; columns missing on one side become NaN.
pd.concat((df8, df7))
import os
# Read every entry of the directory as an Excel sheet (header on row 1) and
# write them all, stacked into one table, to a single workbook.
filenames = os.listdir('$')
dfs = [
    pd.read_excel(os.path.join('$', name), header=1)
    for name in filenames
]
pd.concat(
    dfs,
    ignore_index=True,
).to_excel(
    '汇总数据.xlsx',
    index=False,
)
# Row filtering with a boolean mask.
df8.loc[df8['sal'] >= 4000]
# Row filtering with a query expression.
df8.query('dno == 20 and sal >= 4000')