爬取 BOSS 直聘网上海地区 Python 相关职位的招聘信息

代码比较简单，还有不少地方可以优化！

import requests
from lxml import etree
import os

def getpage(url):
    """Download *url* and return the response body as text.

    The request is sent with a desktop-browser User-Agent header. On any
    request failure (connection error, timeout, non-2xx status) the error
    is printed and None is returned, so callers must check for None.
    """
    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
    try:
        # requests applies no timeout by default; without one a stalled
        # connection would block the whole crawl indefinitely.
        r = requests.get(url, headers=header, timeout=10)
        r.raise_for_status()
    except requests.RequestException as err:
        print(str(err))
        return None
    # Let requests guess the charset from the body instead of trusting
    # the (possibly missing) Content-Type header.
    r.encoding = r.apparent_encoding
    return r.text

def parsepage(html):
    """Extract the job postings from a listing page and save each one.

    For every ``div.job-primary`` card the job title, salary, company name
    and link are read and handed to ``savepage``. Cards that lack any of
    these fields are skipped. Nothing is done when *html* is empty or None
    (``getpage`` returns None on a failed request).
    """
    if not html:
        return
    res = etree.HTML(html)
    if res is None:
        return
    for card in res.xpath('//div[@class="job-primary"]'):
        # The leading "." keeps each query inside the current card. A bare
        # "//" searches the whole document, which made every card re-save
        # every posting on the page.
        name = card.xpath('.//div[@class="job-title"]/text()')
        money = card.xpath('.//span[@class="red"]/text()')
        company = card.xpath('.//div[@class="company-text"]/h3/a/text()')
        # NOTE(review): takes the first h3.name link in the card; presumably
        # that is the posting link - confirm against the live markup.
        url = card.xpath('.//h3[@class="name"]/a/@href')
        if not (name and money and company and url):
            continue
        savepage(name[0], money[0], company[0], 'http://www.zhipin.com' + url[0])

def main(url):
    """Fetch one listing page and save the postings found on it.

    If the download fails, the page is skipped.
    """
    html = getpage(url)
    if html is None:
        # getpage has already printed the error; skipping avoids passing
        # None into the HTML parser.
        return
    parsepage(html)

def savepage(a, b, c, d):
    """Append one record of four text fields to ``jobinfo/detail.json``.

    Each field is left-aligned and padded to 30 characters with the
    full-width space (U+3000). Fields are separated by tabs and the
    record ends with a tab followed by a newline. The ``jobinfo``
    directory is created if it does not exist yet.

    NOTE(review): despite the ``.json`` extension the output is
    tab-separated text, not JSON. The path is kept as-is for
    compatibility.
    """
    os.makedirs('jobinfo', exist_ok=True)
    tplt = "{0:{5}<30}\t{1:{5}<30}\t{2:{5}<30}\t{3:{5}<30}\t{4}"
    # Explicit UTF-8 so non-ASCII text and the U+3000 padding are written
    # the same way regardless of the platform's default locale encoding.
    with open('jobinfo/detail.json', 'a', encoding='utf-8') as f:
        f.write(tplt.format(a, b, c, d, '\n', chr(12288)))

if __name__ == '__main__':
    # Crawl listing pages 1 through 9. The unused `list = []` assignment was
    # removed because it shadowed the builtin `list`.
    # NOTE(review): the c101020100 / p100109 path segments are presumably
    # the site's city and position codes - confirm before changing them.
    for i in range(1, 10):
        url = 'https://www.zhipin.com/c101020100-p100109/?page={0}&ka=page-{0}'.format(i)
        main(url)

你可能感兴趣的:(python,爬虫,pycharm,python,爬虫)