# Scrape job postings from Zhaopin (zhaopin.com) search results for a given
# keyword and city, then save the collected fields to a CSV file.
import random
import re
import time
from collections import namedtuple

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree


def request(keyword, i, city='输入选择城市'):
    """Fetch page i of Zhaopin search results; the default city is only a placeholder."""
    url = ('https://sou.zhaopin.com/jobs/searchresult.ashx'
           '?jl={}&kw={}&sm=0&p={}').format(city, keyword, i)
    r = requests.get(url).text
    tree = etree.HTML(r)              # lxml tree for XPath queries
    bs4 = BeautifulSoup(r, 'lxml')    # BeautifulSoup object for tag/class queries
    return tree, bs4


def content(tree, bs4):
    """Extract job fields from one search-result page."""
    position = []
    # keep only the Chinese characters of each job title
    pattern = re.compile(u"[\u4e00-\u9fa5]+")
    for mc in bs4.find_all('td', 'zwmc'):
        zw = mc.find('div').text
        z = re.findall(pattern, zw)
        position.append(" ".join(z))
    company = tree.xpath('//*[@id="newlist_list_content_table"]/table/tr[1]/td[3]/a[1]/text()')
    wage = tree.xpath('//*[@id="newlist_list_content_table"]/table/tr[1]/td[4]/text()')
    place = tree.xpath('//*[@id="newlist_list_content_table"]/table/tr[1]/td[5]/text()')
    require = []
    for zz in bs4.find_all('li', 'newlist_deatil_last'):   # short job-description lines
        require.append(zz.text)
    unregular = tree.xpath('//*[@id="newlist_list_content_table"]/table/tr[2]/td/div/div/ul/li[1]/span/text()')
    return position, company, wage, place, require, unregular


# accumulators shared across all pages
i = 1
position = []
company = []
wage = []
place = []
require = []
unregular = []
next_page = [1]   # non-empty sentinel so the loop runs at least once
keyword = input('Enter a search keyword, e.g. JAVA, 销售代表, 行政助理: ')

while len(next_page) > 0:
    try:
        tree, bs4 = request(keyword=keyword, i=i)
        # the "next page" link disappears on the last page, which empties the list and ends the loop
        next_page = tree.xpath('/html/body/div[3]/div[3]/div[2]/form/div[1]/div[1]/div[3]/ul/li[11]/a/@href')
        a, b, c, d, e, f = content(tree, bs4)
        position += a
        company += b
        wage += c
        place += d
        require += e
        unregular += f
        print('Page {} scraped'.format(i))
        i += 1
    except Exception:
        # back off for a moment and retry the same page
        sleep_time = random.randint(1, 3)
        time.sleep(sleep_time)
        print('Waiting %d s' % sleep_time)
print('Scraping finished!')

# pair the parallel lists row by row and write them to a CSV file
JobRecord = namedtuple('JobRecord', ('position', 'company', 'salary',
                                     'location', 'description', 'extra'))
results = [JobRecord(p, c, w, pl, r, u)
           for p, c, w, pl, r, u in zip(position, company, wage,
                                        place, require, unregular)]
result = pd.DataFrame(results)
result.to_csv('{}.csv'.format(keyword), encoding='utf-8')
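
# A minimal sanity check (optional sketch, not part of the scrape itself):
# read the CSV just written back in with pandas and report how many rows were collected.
check = pd.read_csv('{}.csv'.format(keyword), index_col=0)
print('{} rows saved to {}.csv'.format(len(check), keyword))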