# 爬取智联招聘(搜索含关键词职位) — scrape Zhilian Zhaopin job listings matching a keyword

import requests
from lxml import etree
from bs4 import BeautifulSoup
import pandas as pd
import re
import random
import time

def request(keyword, i, city='输入选择城市'):
    """Fetch one page of Zhilian Zhaopin search results.

    Args:
        keyword: job keyword to search for (e.g. 'JAVA').
        i: 1-based page number of the result list.
        city: city name to restrict the search to.

    Returns:
        (tree, bs4): the page parsed both as an lxml element tree and as a
        BeautifulSoup document, so callers can mix XPath and CSS-style queries.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
        error status (the caller's retry loop catches these).
    """
    url = ('https://sou.zhaopin.com/jobs/searchresult.ashx'
           '?jl={}&kw={}&sm=0&p={}'.format(city, keyword, i))
    # A browser-like User-Agent avoids trivial anti-bot filtering; a timeout
    # keeps a stalled connection from hanging the crawl forever.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()  # fail fast instead of parsing an error page
    html = resp.text
    tree = etree.HTML(html)
    bs4 = BeautifulSoup(html, 'lxml')
    return tree, bs4

def content(tree, bs4):
    """Extract the job fields from one parsed search-result page.

    Args:
        tree: lxml element tree of the page (queried via XPath).
        bs4: BeautifulSoup document of the same page (queried via find_all).

    Returns:
        Tuple of six parallel lists:
        (position, company, wage, place, require, unregular).
        NOTE(review): the XPath queries look like they return at most one
        match per page while the find_all queries return one entry per
        listing — verify the lists really stay aligned. TODO confirm.
    """
    # Compile the CJK matcher once, outside the loop, instead of per row.
    chinese = re.compile(u"[\u4e00-\u9fa5]+")
    # Job title: keep only the Chinese fragments of each <td class="zwmc"> cell.
    position = [" ".join(chinese.findall(mc.find('div').text))
                for mc in bs4.find_all('td', 'zwmc')]
    company = tree.xpath('//*[@id="newlist_list_content_table"]/table/tr[1]/td[3]/a[1]/text()')
    wage = tree.xpath('//*[@id="newlist_list_content_table"]/table/tr[1]/td[4]/text()')
    place = tree.xpath('//*[@id="newlist_list_content_table"]/table/tr[1]/td[5]/text()')
    # Free-text requirement blurbs, one per listing.
    require = [zz.text for zz in bs4.find_all('li', 'newlist_deatil_last')]
    unregular = tree.xpath('//*[@id="newlist_list_content_table"]/table/tr[2]/td/div/div/ul/li[1]/span/text()')
    return position, company, wage, place, require, unregular


# ---- crawl loop: walk result pages until there is no "next page" link ----
i = 1
position = []
company = []
wage = []
place = []
require = []
unregular = []
next_page = [1]  # non-empty sentinel so the loop body runs at least once
failures = 0
MAX_FAILURES = 5  # give up after this many consecutive errors on one page
keyword = input('请输入关键词,例如JAVA,销售代表,行政助理等:')
while len(next_page) > 0:
    try:
        tree, bs4 = request(keyword=keyword, i=i)
        # Presence of the "next page" link decides whether to keep going.
        next_page = tree.xpath('/html/body/div[3]/div[3]/div[2]/form/div[1]/div[1]/div[3]/ul/li[11]/a/@href')
        a, b, c, d, e, f = content(tree, bs4)
        position += a
        company += b
        wage += c
        place += d
        require += e
        unregular += f
        print('第{}页爬取完毕'.format(i))
        i += 1
        failures = 0  # reset the failure counter after a successful page
    except Exception:  # narrowed from bare `except:` so Ctrl-C still exits
        failures += 1
        if failures >= MAX_FAILURES:
            # A permanently failing page would otherwise retry forever,
            # because `i` is only advanced on success.
            print('第{}页重试{}次仍失败,停止爬取'.format(i, failures))
            break
        # Randomized back-off before retrying the same page.
        sleep_time = random.randint(1, 3)
        time.sleep(sleep_time)
        print('Wait%ds' % sleep_time)

print('爬取完毕!')

from collections import namedtuple

# One row per listing; the field names become the CSV column headers.
stock_namedtuple = namedtuple('数据分析', ('职位名称', '公司名称', '职位月薪',
                                      '工作地点', '具体描述', '更多内容'))
# zip() truncates to the shortest column, keeping each row's fields aligned.
rows = zip(position, company, wage, place, require, unregular)
results = [stock_namedtuple(*row) for row in rows]
result = pd.DataFrame(results)
result.to_csv('{}.csv'.format(keyword), encoding='utf-8')

# 你可能感兴趣的:(爬虫,Python)  [blog footer residue — kept as a comment so the file parses]