import pandas as pd
import numpy as np
import jieba

数据读取

# Load the scraped job-postings workbook; the first spreadsheet column
# becomes the row index.
df = pd.read_excel(
    io=r'E:\python爬虫\前程无忧招聘信息.xlsx',
    index_col=0,
)

数据去重与空值处理

# Two rows with the same company name and job title count as one posting;
# only the first occurrence is kept.
df.drop_duplicates(
    subset=['公司名称', '岗位名称'],
    inplace=True,
)
# Notebook-style inspection of rows with no head-count value; the result is
# discarded when this runs as a script.
df.loc[df['招聘人数'].isna()]
# Only rows in which every column is empty are removed here.
df.dropna(axis=0, how='all', inplace=True)

岗位名称字段处理

# Lower-case the job titles so the keyword matching below is case-insensitive.
# The .str accessor leaves missing titles as NaN, whereas calling x.lower()
# on a float NaN raises AttributeError.
df['岗位名称'] = df['岗位名称'].str.lower()
counts = df['岗位名称'].value_counts()  # title frequencies, kept for inspection

# Keep only postings whose title contains at least one target keyword.
# regex=False matches the keywords literally; na=False treats a missing title
# as "no match".
target_job = ['算法','开发','分析','工程师','数据','运营','运维','it','仓库','统计']
index = np.array(
    [df['岗位名称'].str.contains(keyword, regex=False, na=False) for keyword in target_job]
).any(axis=0)
# Take an explicit copy: later steps assign columns on job_info, and doing so
# on a filtered slice of df triggers SettingWithCopyWarning.
job_info = df[index].copy()
# Canonical job categories in priority order: a title is mapped to the FIRST
# entry that occurs in it as a substring.  All keywords are lower case.
# 'android' was previously misspelled 'andrio', which can never be a
# substring of "android", so Android postings were never normalised.
job_list = ['数据分析',"数据统计","数据专员",'数据挖掘','算法','大数据','开发工程师',
        '运营','软件工程','前端开发','深度学习','ai','数据库','仓库管理','数据产品',
        '客服','java','.net','android','人工智能','c++','数据管理',"测试","运维","数据工程师"]
job_list = np.array(job_list)


def Rename(x, job_list=job_list):
    """Collapse a free-form job title onto a canonical category.

    Args:
        x: Job title string to classify.
        job_list: Iterable of category keywords, checked in order.  Defaults
            to the module-level table; any iterable of strings is accepted.

    Returns:
        The first keyword in ``job_list`` that is a substring of ``x``, or
        ``x`` unchanged when no keyword matches.
    """
    return next((keyword for keyword in job_list if keyword in x), x)

# Map every title onto its canonical category, then merge two categories:
# "数据专员" is rewritten to "数据分析" and "数据统计" to "股指期货数据分析".
job_info['岗位名称'] = (
    job_info['岗位名称']
    .map(Rename)
    .map(lambda title: title.replace("数据专员", "数据分析"))
    .map(lambda title: title.replace("数据统计", "股指期货数据分析"))
)

岗位薪资字段处理

# Keep only salaries whose last character is the pay period (年/月) and whose
# third-from-last character is the magnitude (万/千); rows with a missing
# salary fail both tests and are dropped.
index1 = job_info["岗位薪资"].str[-1].isin(["年","月"])
index2 = job_info["岗位薪资"].str[-3].isin(["万","千"])
# Explicit copy so the column assignments below do not write to a slice of
# the previous frame (SettingWithCopyWarning).
job_info = job_info[index1 & index2].copy()
# Strip the 3-character unit suffix, split the remainder on '-' and average
# the resulting number(s).  The value is still in the row's own unit.
job_info['平均薪资'] = job_info['岗位薪资'].astype(str).apply(
    lambda x: np.array(x[:-3].split('-'), dtype=float).mean()
)

统一工资单位

# Record each row's pay unit: the last three characters of the raw salary text.
job_info['单位'] = job_info['岗位薪资'].map(lambda salary: salary[-3:])
# Inspection of the industry distribution; the result is discarded.
job_info['公司领域'].value_counts()
def con_unit(x):
    """Convert a row's average salary to whole yuan per month.

    Args:
        x: Row (or any mapping) providing '单位' (the pay unit, e.g. "万/月")
            and '平均薪资' (the average salary expressed in that unit).

    Returns:
        int: The salary in yuan per month, truncated towards zero.

    Raises:
        ValueError: If the unit is not one of the four supported units.
    """
    # unit -> (yuan per unit, number of months the pay period covers).
    # "千/年" passes the upstream salary filter, so it is handled here too;
    # previously it fell through every branch and raised UnboundLocalError.
    conversions = {
        "万/月": (10000, 1),
        "千/月": (1000, 1),
        "万/年": (10000, 12),
        "千/年": (1000, 12),
    }
    unit = x['单位']
    if unit not in conversions:
        raise ValueError(f"unsupported salary unit: {unit!r}")
    scale, months = conversions[unit]
    return int(x['平均薪资'] / months * scale)

# Convert every row's average salary to yuan per month (row-wise call of
# con_unit), then overwrite the unit column with the now-common unit.
job_info['平均薪资'] = job_info.apply(func=con_unit, axis='columns')
job_info['单位'] = '元/月'

工作地点字段处理

# Keep only the part of the work location before the first '-'.
# The .str accessor propagates missing values as NaN instead of raising
# AttributeError the way x.split(...) does on a float NaN.
job_info['工作地点'] = job_info['工作地点'].str.split('-').str[0]

公司领域字段处理

# Keep only the first industry when several are listed separated by '/'.
# The .str accessor propagates missing values as NaN instead of raising
# AttributeError the way x.split(...) does on a float NaN.
job_info['公司领域'] = job_info['公司领域'].str.split('/').str[0]

招聘人数字段处理

# "若干" is replaced by "1", surrounding whitespace is trimmed, and the first
# and last characters are dropped (presumably a one-character prefix and
# suffix around the number -- confirm against the raw data).
# The .str accessor keeps missing head-counts as NaN instead of crashing on
# x.replace(...).
job_info['招聘人数'] = (
    job_info['招聘人数']
    .str.replace("若干", "1", regex=False)
    .str.strip()
    .str[1:-1]
)

工作经验与学历要求字段处理

# Work experience: "无需" is rewritten to "1年以下", whitespace is trimmed and
# the trailing two characters are removed.
# Education: only the first whitespace-separated token is kept.
# The .str accessor leaves missing or blank cells as NaN, where the previous
# per-element lambdas raised AttributeError (NaN) or IndexError (blank string).
job_info['工作经验'] = (
    job_info['工作经验']
    .str.replace("无需", "1年以下", regex=False)
    .str.strip()
    .str[:-2]
)
job_info['学历需求'] = job_info['学历需求'].str.split().str[0]

公司规模字段处理

# Inspect the distinct company-size labels and their frequencies.  The result
# is not assigned, so this only has a visible effect in an interactive session.
job_info['公司规模'].value_counts()
# Raw company-size label -> compact range string.
_COMPANY_SIZE_LABELS = {
    '少于50人': "<50",
    '50-150人': "50-150",
    '150-500人': '150-500',
    '500-1000人': '500-1000',
    '1000-5000人': '1000-5000',
    '5000-10000人': '5000-10000',
    '10000人以上': ">10000",
}


def func(x):
    """Map a raw company-size label to a compact range string.

    Args:
        x: Company-size label, e.g. '50-150人'.

    Returns:
        The matching range string (e.g. "50-150"), or ``np.nan`` when the
        label is not one of the seven known values (including missing input).
    """
    return _COMPANY_SIZE_LABELS.get(x, np.nan)

# Replace each raw company-size label with its compact range string
# (labels that func does not recognise become NaN).
job_info['公司规模'] = job_info['公司规模'].map(func)

公司福利字段处理

# Split the whitespace-separated benefit tags into a list per posting.
# A missing cell yields an empty list; previously str(NaN).split() produced
# the bogus tag list ['nan'].
job_info['公司福利'] = job_info['公司福利'].apply(
    lambda x: [] if pd.isna(x) else str(x).split()
)

职位信息字段处理

# Keep only the text that precedes the first "职能类别" marker.
job_info['职位信息'] = job_info['职位信息'].apply(lambda x:x.split('职能类别')[0])

# Load the stop-word list (whitespace-separated entries).
with open(r"E:\C++\停用词表.txt",'r',encoding = 'utf8') as f:
    stopword = f.read()
stopword = stopword.split()
# Membership is tested once per token, so use a set for O(1) lookups instead
# of scanning the list each time.
stopword_lookup = frozenset(stopword)

# Lower-case, trim, segment with jieba and drop stop words.  (The former
# "".join(x) step was applied to a string and therefore did nothing.)
job_info['职位信息'] = job_info['职位信息'].apply(
    lambda x: [i for i in jieba.lcut(x.lower().strip()) if i not in stopword_lookup]
)
# Industries present in the cleaned data, most frequent first.
cons = job_info['公司领域'].value_counts()
industries = pd.DataFrame(cons.index,columns=['行业领域'])

# Flatten the per-posting token lists into one (token, industry) row per token.
# The tokens are taken straight from the lists.  The previous str(list)
# round-trip escaped characters such as "\n" in the repr, so the newline
# filter below could never match.  Rows are built in plain lists and turned
# into a frame once, because DataFrame.append was removed in pandas 2.0.
tokens = []
token_fields = []
for i in industries['行业领域']:
    word = job_info.loc[job_info['公司领域'] == i, '职位信息'].dropna()
    for z in word:
        tokens.extend(z)
        token_fields.extend([i] * len(z))
industry = pd.DataFrame({'分词明细': tokens, '行业领域': token_fields})

# Drop newline and empty tokens.
industry = industry[industry['分词明细'] != "\n"]
industry = industry[industry['分词明细'] != ""]
# Keep only tokens that occur at least 300 times overall.  The counts stay a
# Series: wrapping them in a DataFrame and selecting the '分词明细' column
# raises KeyError on pandas >= 2.0, where the counts column is named 'count'.
count = industry['分词明细'].value_counts()
lst = list(count[count >= 300].index)
industry = industry[industry['分词明细'].isin(lst)]

数据存储

# Persist the two results: the per-industry token table and the cleaned
# postings table.
industry.to_excel(excel_writer=r'E:\python爬虫\数据预处理\词云.xlsx')
job_info.to_excel(excel_writer=r'E:\python爬虫\数据预处理\前程无忧(已清洗).xlsx')

前程无忧岗位数据爬取+Tableau可视化分析