Project Workflow
1. Identify the target data, then use Python web-scraping techniques to collect the job requirements of data analyst positions from a recruitment website;
2. Tokenize the collected text, remove stop words, and draw a word cloud.
Part 1: Getting the Data
1.1 Import the Required Libraries
import requests                  # HTTP requests
import lxml                      # HTML parser backend used by BeautifulSoup
from bs4 import BeautifulSoup    # HTML parsing
import pandas as pd              # tabular data handling

# Pretend to be a regular browser so the site is less likely to reject the requests
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36'}
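As a variant, a requests.Session can carry this header on every request instead of passing headers=head each time. A minimal sketch (the rest of this walkthrough keeps the explicit headers=head form):

session = requests.Session()
session.headers.update(head)    # every session.get() now sends the User-Agent automatically
# response = session.get(url)   # equivalent to requests.get(url, headers=head)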
1.2 Collect the URLs of All Detail Pages
def get_info(page):
    """Walk the first `page` result pages and collect every job-detail URL."""
    url_list = []
    for i in range(page):
        page_num = i + 1
        ka = 'page' + '-' + str(page_num)
        url = 'https://www.zhipin.com/c101010100/h_101010100/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&page={0}&sort=2&ka={1}'.format(page_num, ka)
        response = requests.get(url, headers=head)
        content = BeautifulSoup(response.text, 'lxml')
        all_a = content.find_all('div', class_='info-primary')   # one block per job listing
        for j in all_a:
            url_list.append('https://www.zhipin.com' + j.find('a')['href'])
    return url_list
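The function is then called with the number of result pages to scrape; the 10 below is only an example, adjust it to however many pages the search actually returns:

urls = get_info(10)   # e.g. scrape the first 10 result pages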
1.3 Fetch the Source of Each Detail Page and Store It in a DataFrame
content_list = []
for url_detail in urls:
    response_detail = requests.get(url_detail, headers=head)
    content = response_detail.text.strip()
    content_list.append(content)
data = pd.DataFrame()
data['content'] = content_list
data['url'] = urls
data.head()   # inspect the collected data to confirm it looks right
1.4 Remove Invalid Records
# Pages that returned a CAPTCHA/verification prompt contain no job data, so drop them
drop_data = data[data.content.str.contains('为了您的账号安全,我们需要在执行操作之前验证您的身份,请输入验证码')]
data.drop(index=drop_data.index, inplace=True)
data.reset_index(drop=True, inplace=True)   # re-index so the positional loop in 1.5 works
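The verification page usually shows up when requests arrive too quickly. A sketch of a variant of the fetching loop from 1.3 that spaces out the requests to make this less likely (standard library only; the delay range is arbitrary):

import time
import random

for url_detail in urls:
    response_detail = requests.get(url_detail, headers=head)
    content_list.append(response_detail.text.strip())
    time.sleep(random.uniform(1, 3))   # pause 1-3 seconds between requests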
1.5 Extract the Required Fields from the Page Source
job_titles = []
wages = []
com_names = []
job_descs = []
exps = []
xls = []
for i in range(len(data)):
    soup_content = BeautifulSoup(data.content[i], 'lxml')
    job_title = soup_content.find('h1').text                                              # job title
    wage = soup_content.find('span', class_='badge').text.strip()                         # salary
    com_name = soup_content.find('div', class_='info-company').find('h3').find('a').text  # company name
    job_desc = soup_content.find('div', class_='text').text.strip()                       # job description
    job_req = soup_content.find('div', class_='info-primary').find('p').contents          # requirements line
    exp = job_req[2][3:]   # experience (strip the leading label characters)
    xl = job_req[4][3:]    # education (strip the leading label characters)
    job_titles.append(job_title)
    wages.append(wage)
    com_names.append(com_name)
    job_descs.append(job_desc)
    exps.append(exp)
    xls.append(xl)
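The positional indices job_req[2] and job_req[4] assume every detail page follows exactly the same layout. A defensive sketch of the two extraction lines that skips pages whose markup differs instead of crashing mid-loop:

try:
    exp = job_req[2][3:]
    xl = job_req[4][3:]
except (IndexError, TypeError):
    exp, xl = '', ''   # layout changed on this page; record empty fields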
1.6 Store the Extracted Fields in a DataFrame
data_details = pd.DataFrame()
data_details['职位'] = job_titles
data_details['工资'] = wages
data_details['公司名称'] = com_names
data_details['工作描述'] = job_descs
data_details['经验'] = exps
data_details['学历'] = xls
data_details['url'] = data['url'].values   # .values sidesteps index alignment between the two frames
1.7 Save the Final Data to a CSV File
data_details.to_csv(r'data_details.csv', sep=',', encoding='gb18030')   # gb18030 keeps the Chinese text readable on Windows
Part 2: Drawing the Word Cloud
2.1 Import the Required Libraries
import jieba                        # Chinese word segmentation
from wordcloud import WordCloud     # word-cloud generation
from wordcloud import STOPWORDS     # built-in (English) stop-word list
from PIL import Image               # load a mask image for a custom-shaped cloud
import matplotlib.pyplot as plt     # plotting
import numpy as np                  # convert the mask image to an array
2.2 Read the Saved File and Convert the Target Column to a List
data_details = pd.read_csv('data_details.csv', sep=',', encoding='gb18030')
data_jobdes = data_details.工作描述.tolist()
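If any description failed to extract, the column may contain NaN values, which jieba cannot tokenize. A defensive variant of the same line:

data_jobdes = data_details['工作描述'].dropna().astype(str).tolist()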
2.3 Preprocess the Text (Tokenization and Stop-Word Removal)
words = []
for content in data_jobdes:
    seg = jieba.lcut(content)                  # tokenize the description
    for word in seg:
        word = word.lower()                    # lowercase first so the stop-word check matches
        if word == '\n' or len(word) <= 1:     # drop newlines and single characters
            continue
        elif word in STOPWORDS:                # drop common English stop words
            continue
        else:
            words.append(word)
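Note that wordcloud's STOPWORDS only covers English. A sketch of filtering with an additional Chinese stop-word list, assuming a hypothetical file cn_stopwords.txt with one stop word per line:

with open('cn_stopwords.txt', encoding='utf-8') as f:   # hypothetical stop-word file
    cn_stopwords = set(line.strip() for line in f)
words = [w for w in words if w not in cn_stopwords]     # filter those out as well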
2.4 Compute Word Frequencies
word_clean = pd.DataFrame({'word_clean': words})   # wrap the word list so it can be grouped
words_count = word_clean.groupby(by=['word_clean'])['word_clean'].count().to_frame()
words_count.rename(columns={'word_clean': 'counts'}, inplace=True)
word_count_sort = words_count.reset_index().sort_values(by=['counts'], ascending=False)
2.5 Convert the Frequencies to a Dictionary
word_freq_dict = {}
for i in word_count_sort.values:
    word_freq_dict[i[0]] = i[1]   # word -> count
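For comparison, the standard library's collections.Counter builds the same frequency dictionary from the word list in one step:

from collections import Counter
word_freq_dict = dict(Counter(words))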
2.6 Draw the Word Cloud
pic = np.array(Image.open(r'./pic1.jpg'))    # mask image as an array; the cloud fills its shape
plt.rcParams['figure.figsize'] = (30, 15)    # canvas size

# Basic word-cloud parameters
my_cloud = WordCloud(
    font_path='C:/Windows/Fonts/simkai.ttf',   # a font that can render Chinese characters
    width=400,
    height=200,
    background_color='white',
    mask=pic)
cloud_pic = my_cloud.fit_words(word_freq_dict)

# Display the word cloud
plt.imshow(cloud_pic, interpolation='bilinear')   # bilinear interpolation smooths the rendering
plt.axis('off')
plt.savefig(r'./cat_wordcloud.jpg')
plt.show()
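As an aside, WordCloud can also write the rendered image to disk directly, without going through matplotlib and its figure size:

my_cloud.to_file('./cat_wordcloud.png')   # saves the image at the cloud's own resolution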