from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql
import seaborn as sns
from wordcloud import WordCloud
% matplotlib inline
# Register SimHei as the sans-serif font used by matplotlib for the plots below.
plt.rcParams['font.sans-serif'] = ['SimHei']

# Load the `lagou` table from the local MySQL server into a DataFrame.
# NOTE(review): the credentials are hard-coded; consider moving them to the
# environment or a config file before sharing this script.
conn = pymysql.connect(
    host="127.0.0.1",
    port=3306,
    user="root",
    password="123456",
    db="test",
    charset="utf8",
)
try:
    sqldata = pd.read_sql('SELECT * FROM lagou', conn)
finally:
    # Release the connection even when the query fails; the original code
    # left it open for the lifetime of the process.
    conn.close()

sqldata.head(2)  # preview the first two rows
sqldata.to_csv("拉勾数据分析.csv", encoding='utf_8_sig')  # export a CSV copy (UTF-8 with BOM)
sqldata.isnull().sum()  # number of missing values per column
sqldata.info()  # column dtypes and non-null counts
共 1631 条记录。经纬度(longitude、latitude)存在部分缺失,标签项(hitags)缺失严重,但这些缺失对本次分析没有影响。
# Turn the textual salary range into a numeric midpoint.
# The parsing below expects values of the form "<low>k-<high>k" (either case
# of "k"); anything else makes the int conversion raise.

# Strip the unit suffix, then split the range on "-".
spllist = sqldata['salary'].str.replace(r'[kK]', '', regex=True).str.split('-')

# Midpoint of the lower and upper bound.
ee = (spllist.str[0].astype('int') + spllist.str[1].astype('int')) / 2

# Overwrite the text column with the numeric midpoint.  The original code
# called sqldata.drop('salary', axis=1) first, but discarded its result, so
# that statement did nothing and has been removed.
sqldata['salary'] = ee

# Inspect, then remove, implausibly high salaries (midpoint above 200).
outlier_mask = sqldata['salary'] > 200
sqldata[outlier_mask]
sqldata.drop(sqldata[outlier_mask].index, inplace=True)
# Bar chart of the number of postings per city.
city_series = sqldata['city'].value_counts()

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(city_series.index, city_series)
ax.set_title("各城市招聘数量分布", fontsize=17)

# Annotate every bar with its count, slightly above the bar top.
for city, count in city_series.items():
    plt.text(
        city,
        count + 1.5,
        format(count, '.0f'),
        ha='center',
        va='bottom',
        fontsize=10,
    )

tick_positions = np.arange(len(city_series))
plt.xticks(tick_positions, city_series.index)
plt.xlabel('地区', size=12)
plt.ylabel('频数', size=12)
plt.ylim(0, 520)
plt.show()
总体薪资分布情况
# Histogram of the salary midpoints over all postings.
sqldata['salary'].hist(
    figsize=(10, 5),
    bins=30,
    edgecolor='k',
    grid=False,
)
plt.title('总体薪资分布', size=17)
plt.xlabel('薪资(千/月)', size=12)
plt.ylabel('频数', size=12)
# Fixed tick grid from 0 to 85 in steps of 5.
plt.xticks(range(0, 90, 5), size=12)
plt.yticks(size=12)
"应届生"及"经验不限"薪资分布情况
# Salary histogram for postings whose workYear mentions "应届" or "不限".
# na=False treats rows with a missing workYear as non-matching instead of
# producing a NaN mask that cannot be used for boolean indexing.
yingjie = sqldata[sqldata['workYear'].str.contains("应届|不限", na=False)]

# Plot the salary column only.  Calling .hist() on the whole DataFrame (as the
# original did) draws one subplot per numeric column, and the labels and title
# below would then be applied to just one of those subplots.
yingjie['salary'].hist(figsize=(10, 5), bins=30, edgecolor='k', grid=False)
plt.xlabel('薪资(千/月)', size=12)
plt.ylabel('频数', size=12)
plt.title('应届生及经验不限-薪资分布', size=17)
plt.xticks(range(0, 90, 5), size=12)
plt.yticks(size=12)
plt.show()
# Per-workYear mean of every numeric column.  numeric_only=True is needed on
# pandas >= 2.0, where .mean() raises on text columns instead of silently
# skipping them; older versions give the same result as before.
jobMean = sqldata.groupby('workYear').mean(numeric_only=True).reset_index()

# Number of postings per workYear (non-null 'city' values), stored in a
# column named 'count'.
jobCount = sqldata.groupby('workYear')['city'].count().reset_index()
jobCount.rename(columns={'city': 'count'}, inplace=True)
jobCount

# Order the experience levels by their mean salary, ascending.
jobMean.sort_values('salary', inplace=True)
jobMean
# Work-experience overview: pie chart of posting counts (left) and box plot of
# salary per experience level (right).

# Experience levels in the order they appear on the box plot's x axis.
year_levels = ['应届毕业生', '1年以下', '1-3年', '不限', '3-5年', '5-10年', '10年以上']

# One salary array per experience level, in the same order as year_levels.
nianxianBoxData = [
    np.array(sqldata.loc[sqldata['workYear'] == level, 'salary'])
    for level in year_levels
]

fig, (ax1, ax3) = plt.subplots(1, 2, figsize=(18, 6))

# Left: share of postings per experience level.
sizes = jobCount['count']
labels = jobCount['workYear']
ax1.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',
    shadow=False,
    pctdistance=0.8,
    explode=None,
    textprops={'fontsize': 13, 'color': 'w'},
    startangle=83,
)
ax1.set_title("工作年限频数分布", fontsize=17)
ax1.legend(loc='best')
ax1.axis('equal')

# Right: salary spread per experience level.
ax3.boxplot(
    nianxianBoxData,
    boxprops={'color': 'blue'},
    flierprops={'markerfacecolor': 'red', 'color': 'black', 'markersize': 4},
)
ax3.set_xticklabels(year_levels)
ax3.set_title("各年限的薪资分布", fontsize=17)
ax3.set_xlabel('年限', size=12)
ax3.set_ylabel('薪资(千/月)', size=12)

plt.show()
# Education overview: pie chart of posting counts (left) and box plot of
# salary per education level (right).

# Number of postings per education level (non-null 'city' values).
# NOTE: despite its name, `workyear` holds the education counts here.
workyear = sqldata.groupby('education').count()['city'].reset_index()
workyear.rename(columns={'city': 'count'}, inplace=True)
workyear
size3 = workyear['count']
labels3 = workyear['education']

# One salary array per education level, in x-axis order.
xueliBoxData = [
    np.array(sqldata[sqldata['education'] == "大专"]['salary']),
    np.array(sqldata[sqldata['education'] == "不限"]['salary']),
    np.array(sqldata[sqldata['education'] == "本科"]['salary']),
    np.array(sqldata[sqldata['education'] == "硕士"]['salary']),
    np.array(sqldata[sqldata['education'] == "博士"]['salary']),
]

fig = plt.figure(figsize=(18, 6))
ax1 = plt.subplot(121)
ax2 = plt.subplot(122)

ax1.pie(
    size3,
    labels=labels3,
    autopct='%1.1f%%',
    shadow=False,
    pctdistance=0.8,
    explode=None,
    textprops={'fontsize': 13, 'color': 'w'},
    startangle=180,
)
ax2.boxplot(
    xueliBoxData,
    boxprops={'color': 'blue'},
    flierprops={'markerfacecolor': 'red', 'color': 'black', 'markersize': 4},
)
ax2.set_xticklabels(['大专', '不限', '本科', '硕士', '博士'])
# Title fixed: this axis shows salary per education level; the original
# reused the "per experience level" title from the previous figure.
ax2.set_title("各学历的薪资分布", fontsize=17)
ax2.set_xlabel('学历', size=12)
ax2.set_ylabel('薪资(千/月)', size=12)
ax1.legend(loc='best')
ax1.set_title("学历频数分布", fontsize=17)
ax1.axis('equal')
plt.show()
# Mean salary per education level, computed separately for each experience
# level, then plotted as grouped bars (x axis: experience, bars: education).

# Experience levels, in the column order of the resulting table.
experience_levels = ['不限', '应届毕业生', '1年以下', '1-3年', '3-5年', '5-10年', '10年以上']

# One Series per experience level: index = education, value = mean salary,
# named after the level so it becomes that column after concatenation.
# Only the salary column is averaged: the original averaged every column,
# which fails on text columns in pandas >= 2.0 and would add any other
# numeric column to the chart as well.
effectdata = [
    sqldata[sqldata['workYear'] == level]
    .groupby('education')['salary']
    .mean()
    .rename(level)
    for level in experience_levels
]
effect = pd.concat(effectdata, axis=1, sort=True)

# Move the doctorate ('博士') row to the end.  reindex replaces the removed
# DataFrame.append and does not fail when no doctorate postings exist.
if '博士' in effect.index:
    other_levels = [level for level in effect.index if level != '博士']
    effect = effect.reindex(other_levels + ['博士'])
effect

# Grouped bar chart.
effect.T.plot.bar(figsize=(16, 10), width=0.7)
plt.xticks(rotation=0)
plt.xlabel('年限', size=12)
plt.ylabel('薪资(千/月)', size=12)
plt.title("相同经验下,学历对薪资的影响", fontsize=17)
plt.show()
# Salary histogram for Beijing postings that ask for a bachelor's degree and
# whose workYear mentions "应届" or "不限".
in_beijing = sqldata['city'] == '北京'
wants_bachelor = sqldata['education'] == '本科'
entry_level = sqldata['workYear'].str.contains("应届|不限")
beijingnum = sqldata.loc[in_beijing & wants_bachelor & entry_level, 'salary']

plt.figure(figsize=(10, 5))
plt.hist(beijingnum)
plt.title('北京应届生及不限经验的招聘信息平均工资统计', size=17)
plt.xlabel('薪资(千/月)', size=12)
plt.ylabel('频数', size=12)
plt.show()

# Summary statistics of the selected salaries, as a two-column table.
beijingnum.describe().reset_index()
分割文本
# Flatten the comma-separated position tags into one list of words.
positionLable = sqldata['positionLable']

# Drop rows with a missing tag field first: str.split leaves them as NaN,
# which cannot be iterated and made the flattening below raise TypeError.
sp = positionLable.dropna().str.split(',').tolist()

# Skip empty tokens (an empty field splits into ['']) so they are not counted
# as a word later on.
wordlist = [i for item in sp for i in item if i]
wordlist
def isAllZh(s):
    """Return True if *s* has at least two characters, all within U+4E00..U+9FA5.

    Strings shorter than two characters are rejected.  The original only
    rejected length 1, so the empty string fell through the loop and was
    reported as "all Chinese".

    Args:
        s: The string to check.

    Returns:
        bool: True when len(s) >= 2 and every character lies in the range
        '\\u4e00'..'\\u9fa5' (inclusive), otherwise False.
    """
    if len(s) < 2:
        return False
    return all('\u4e00' <= c <= '\u9fa5' for c in s)
# Count how often each tag occurs.  No isAllZh filter is applied, so
# `chinese_words` may also contain non-Chinese tags despite its name.
word_count = Counter(wordlist)

# (word, count) pairs, most frequent first; ties keep first-seen order.
items = word_count.most_common()
# print(items)

# Repeat each of the (up to) 50 most frequent words `count` times and print a
# small frequency table.  Slicing instead of indexing range(50) avoids an
# IndexError when there are fewer than 50 distinct words.
chinese_words = []
for word, count in items[:50]:
    chinese_words.extend([word] * count)
    print("{0:<10}{1:>5}".format(word, count))
词云分析
# Build a word cloud from the repeated top words and display it.
string = ' '.join(chinese_words)
w = WordCloud(
    collocations=False,
    width=2000,
    height=1000,
    font_path='simhei.ttf',
    background_color='white',
    stopwords=['数据分析'],
    max_words=50,
    max_font_size=400,
    min_font_size=40,
)
w.generate_from_text(string)

# Open the figure before drawing into it.  The original called plt.figure()
# after imshow, which only created an extra, empty figure.
plt.figure()
plt.imshow(w)
plt.axis('off')
plt.show()