import pandas as pd
import numpy as np
import jieba
# Load the raw job postings workbook; column 0 of the sheet becomes the index.
df = pd.read_excel(r'E:\python爬虫\前程无忧招聘信息.xlsx', index_col=0)

# De-duplicate and handle empty rows.
# Two postings count as duplicates when company name and job title both match.
df.drop_duplicates(subset=['公司名称', '岗位名称'], inplace=True)
# Inspection only: rows with a missing headcount (the result is not stored).
df[df['招聘人数'].isnull()]
# Drop rows in which every column is empty.
df.dropna(how='all', inplace=True)
# Job-title cleanup: lower-case the titles so the keyword match below is
# case-insensitive (missing titles stay missing instead of raising).
df['岗位名称'] = df['岗位名称'].str.lower()
counts = df['岗位名称'].value_counts()

# Keep only postings whose title contains at least one target keyword.
target_job = ['算法', '开发', '分析', '工程师', '数据', '运营', '运维', 'it', '仓库', '统计']
# One boolean row per keyword; keywords are matched literally (not as regex)
# and a missing title never matches.
index = [
    df['岗位名称'].str.contains(keyword, regex=False, na=False)
    for keyword in target_job
]
index = np.array(index).any(axis=0)
# Copy the selection so later column assignments modify an independent frame
# rather than a slice of ``df`` (avoids SettingWithCopyWarning).
job_info = df[index].copy()
# Category keywords in priority order: when a title contains several of them,
# the one listed first wins.
# NOTE(review): 'andrio' looks like a typo for 'android' and can never match an
# "android" title; confirm before changing, since it alters the output labels.
job_list = np.array([
    '数据分析', "数据统计", "数据专员", '数据挖掘', '算法', '大数据', '开发工程师',
    '运营', '软件工程', '前端开发', '深度学习', 'ai', '数据库', '仓库管理', '数据产品',
    '客服', 'java', '.net', 'andrio', '人工智能', 'c++', '数据管理', "测试", "运维", "数据工程师",
])


def Rename(x, job_list=job_list):
    """Collapse a raw job title into the first category keyword it contains.

    Args:
        x: Job title. Non-string values (e.g. NaN) are returned unchanged.
        job_list: Candidate keywords, checked in order. Any iterable of
            strings works (list or NumPy array).

    Returns:
        The first element of ``job_list`` that is a substring of ``x``, or
        ``x`` itself when no keyword matches.
    """
    if not isinstance(x, str):
        return x
    return next((keyword for keyword in job_list if keyword in x), x)
# Map every title onto its category keyword via Rename, then rewrite two of
# the resulting labels: "数据专员" -> "数据分析" and "数据统计" -> "股指期货数据分析".
job_info['岗位名称'] = (
    job_info['岗位名称']
    .apply(Rename)
    .apply(lambda title: title.replace("数据专员", "数据分析"))
    .apply(lambda title: title.replace("数据统计", "股指期货数据分析"))
)
# Salary cleanup: keep only rows whose salary text ends in "年" or "月" and
# has "万" or "千" as its third-from-last character (e.g. "1-1.5万/月").
index1 = job_info["岗位薪资"].str[-1].isin(["年", "月"])
index2 = job_info["岗位薪资"].str[-3].isin(["万", "千"])
# Copy the selection so the new columns below are added to an independent
# frame rather than a slice (avoids SettingWithCopyWarning).
job_info = job_info[index1 & index2].copy()

# Drop the 3-character unit suffix, split the remainder on "-" and average
# the parts; a single figure is its own average.
job_info['平均薪资'] = job_info['岗位薪资'].astype(str).apply(
    lambda x: np.mean(np.array(x[:-3].split('-'), dtype=float))
)
# Unify salary units: record each posting's unit, i.e. the last three
# characters of the salary text (e.g. "万/月").
job_info['单位'] = job_info['岗位薪资'].str[-3:]
# Inspection only: frequency of each company industry (result is not stored).
job_info['公司领域'].value_counts()
def con_unit(x):
    """Convert a posting's average salary to whole yuan per month.

    Args:
        x: Row-like mapping providing ``'单位'`` (one of "万/月", "千/月",
            "万/年", "千/年") and ``'平均薪资'`` (numeric amount in that unit).

    Returns:
        int: The monthly salary in yuan, rounded to the nearest yuan.

    Raises:
        ValueError: If the unit is not one of the supported values.
    """
    unit = x['单位']
    average = x['平均薪资']
    if unit == "万/月":
        z = average * 10000
    elif unit == "千/月":
        z = average * 1000
    elif unit == "万/年":
        z = average / 12 * 10000
    elif unit == "千/年":
        # This unit passes the upstream salary filter and previously left
        # ``z`` unbound.
        z = average / 12 * 1000
    else:
        raise ValueError(f"unsupported salary unit: {unit!r}")
    # Round instead of truncating: 1.15 * 10000 evaluates to 11499.999...
    return int(round(z))
# Convert each row's average salary with con_unit, store the result back in
# the salary column, and relabel the unit column to match.
monthly_salary = job_info.apply(con_unit, axis=1)
job_info['平均薪资'] = monthly_salary
job_info['单位'] = '元/月'
# Work location: keep only the text before the first "-".
# Missing values stay missing instead of raising.
job_info['工作地点'] = job_info['工作地点'].str.split('-').str[0]

# Company industry: keep only the text before the first "/".
job_info['公司领域'] = job_info['公司领域'].str.split('/').str[0]
# Headcount: replace "若干" with "1", trim whitespace, then drop the first and
# last characters surrounding the number. Missing values stay missing (the
# column is known to contain nulls) instead of raising.
job_info['招聘人数'] = (
    job_info['招聘人数']
    .str.replace("若干", "1", regex=False)
    .str.strip()
    .str[1:-1]
)

# Work experience: replace "无需" with "1年以下", trim whitespace, then drop the
# trailing two characters.
job_info['工作经验'] = (
    job_info['工作经验']
    .str.replace("无需", "1年以下", regex=False)
    .str.strip()
    .str[:-2]
)

# Education requirement: keep the first whitespace-separated token (an empty
# string yields a missing value rather than an IndexError).
job_info['学历需求'] = job_info['学历需求'].str.split().str[0]
# Company size: inspect the distinct raw size labels (result is not stored).
job_info['公司规模'].value_counts()
# Raw company-size label -> compact range string.
_SIZE_LABELS = {
    '少于50人': "<50",
    '50-150人': "50-150",
    '150-500人': '150-500',
    '500-1000人': '500-1000',
    '1000-5000人': '1000-5000',
    '5000-10000人': '5000-10000',
    '10000人以上': ">10000",
}


def func(x):
    """Turn a company-size label into a compact range string.

    Args:
        x: Size label such as "50-150人". Surrounding whitespace is ignored.

    Returns:
        The matching range string ("<50", "50-150", ..., ">10000"), or
        ``np.nan`` when the label is not recognised or ``x`` is not a string.
    """
    if not isinstance(x, str):
        return np.nan
    return _SIZE_LABELS.get(x.strip(), np.nan)
job_info['公司规模'] = job_info['公司规模'].apply(func)

# Company benefits: split the whitespace-separated text into a list.
# A missing value becomes an empty list rather than the bogus token ['nan'].
job_info['公司福利'] = job_info['公司福利'].apply(
    lambda x: [] if pd.isna(x) else str(x).split()
)

# Job description: keep only the text before the first "职能类别" marker.
# Missing values stay missing instead of raising.
job_info['职位信息'] = job_info['职位信息'].str.split('职能类别', n=1).str[0]
# Load the stop-word file and split it on whitespace into individual words.
with open(r"E:\C++\停用词表.txt", 'r', encoding='utf8') as f:
    stopword = f.read()
stopword = stopword.split()
# Set copy for O(1) membership tests; the filter below runs once per token of
# every posting, so scanning the list each time is needlessly slow.
stopword_set = set(stopword)

# Tokenize each description: lower-case, trim, segment with jieba, and drop
# stop words.
job_info['职位信息'] = job_info['职位信息'].apply(
    lambda x: [i for i in jieba.lcut(x.lower().strip()) if i not in stopword_set]
)
# Build a long-format table with one row per (token, industry) occurrence.
cons = job_info['公司领域'].value_counts()
industries = pd.DataFrame(cons.index, columns=['行业领域'])

frames = []
for i in industries['行业领域']:
    # Token lists of every posting in this industry (non-in-place dropna, so
    # no slice of job_info is mutated).
    word = job_info['职位信息'][job_info['公司领域'] == i].dropna()
    words = []
    for tokens in word:
        # Extend with the tokens themselves. Round-tripping through
        # str(list) escapes characters such as "\n", which made the newline
        # filter below ineffective.
        words.extend(tokens)
    if words:
        frames.append(pd.DataFrame({'分词明细': words, '行业领域': i}))

# Concatenate once; DataFrame.append no longer exists in pandas >= 2.0.
if frames:
    industry = pd.concat(frames, ignore_index=True)
else:
    industry = pd.DataFrame(columns=['分词明细', '行业领域'])

# Discard newline and empty tokens.
industry = industry[industry['分词明细'] != "\n"]
industry = industry[industry['分词明细'] != ""]

# Keep only tokens that occur at least 300 times overall. The counts are
# filtered as a Series because the column name of a value_counts() result
# differs between pandas versions.
word_counts = industry['分词明细'].value_counts()
count = word_counts.to_frame(name='分词明细')
lst = list(word_counts[word_counts >= 300].index)
industry = industry[industry['分词明细'].isin(lst)]
# Persist the results: the token/industry table and the cleaned postings.
industry.to_excel(r'E:\python爬虫\数据预处理\词云.xlsx')
job_info.to_excel(r'E:\python爬虫\数据预处理\前程无忧(已清洗).xlsx')