python爬取猎聘网数据并且存入数据库

  目前我还是爬虫小白,只会通过解析网页数据来保存,还不会模拟登录、IP 代理等技巧。这里只爬取静态网页,对于动态生成的内容,我就没辙了。由于最近工作比较忙,没有进一步学习高级爬虫技巧,大伙先勉强看着,等后面学会了,再给大家分享高级技巧。代码如下:

'''
爬取猎聘网职位信息做数据分析
'''
from bs4 import BeautifulSoup
import requests
import time
import pymongo

# Shared MongoDB client. It is created lazily on the first save and reused
# afterwards, so the script does not open a new connection pool per record.
_mongoClient = None


def saveToDB(json):
    """Insert one scraped job record into the ``LiePinData.android`` collection.

    Args:
        json: dict describing a single job posting. ``insert_one`` adds an
            ``_id`` key to this dict in place.
    """
    global _mongoClient
    print("正在保存到数据库")
    if _mongoClient is None:
        _mongoClient = pymongo.MongoClient("mongodb://localhost:27017/")
    _mongoClient.LiePinData.android.insert_one(json)



# Main crawl script: walk the Liepin search result pages for the keyword
# "android", scrape every job card plus its detail page, and store one
# document per job through saveToDB().

# Seconds to wait for a single HTTP response before giving up on that page.
REQUEST_TIMEOUT = 10

# URL of result pages after the first one; the page index is appended to the
# trailing "curPage=" parameter.
nextUrlBase = "https://www.liepin.com/zhaopin/?init=-1&headckid=870b81c75324cfd6&fromSearchBtn=2&sfrom=click-pc_homepage-centre_searchbox-search_new&ckid=870b81c75324cfd6&degradeFlag=0&key=android&siTag=wxsyNkzhnKj80VCkF-zOWA~fA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_fp&d_ckId=2e3b9060a66797f918418a1a6256318f&d_curPage=0&d_pageSize=40&d_headId=2e3b9060a66797f918418a1a6256318f&curPage="
page = 0
headers={'user-agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
url="https://www.liepin.com/zhaopin/?sfrom=click-pc_homepage-centre_searchbox-search_new&d_sfrom=search_fp&key=android"


def _fetch_soup(pageUrl):
    """Download ``pageUrl`` and return it parsed, or None if the request fails."""
    try:
        # ``headers`` must be passed by keyword: the second positional
        # argument of requests.get() is ``params`` (the query string).
        resp = requests.get(pageUrl, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print("请求失败:", pageUrl, err)
        return None
    # Force the decoding so Chinese text is not garbled.
    resp.encoding = 'utf-8'
    return BeautifulSoup(resp.text, 'html.parser')


def _find_jobs(listSoup):
    """Return the job-card elements of a parsed result page ([] if it is None)."""
    if listSoup is None:
        return []
    return listSoup.find_all('div','sojob-item-main clearfix')


def _child_link(node):
    """Return the first ``<a>`` inside ``node``, or None if either is missing."""
    return node.a if node is not None else None


def _text_of(node, default):
    """Return the stripped text of ``node``, or ``default`` when it is None."""
    return node.text.strip() if node is not None else default


def _find_in(root, *findArgs):
    """Run ``root.find(*findArgs)``, tolerating a missing (None) ``root``."""
    return root.find(*findArgs) if root is not None else None


jobList = _find_jobs(_fetch_soup(url))
print("一页数量 :",len(jobList))
for pageNum in range(0,100):
    time.sleep(1)  # be polite: pause one second between result pages
    print("------page-----",pageNum,page)
    if page>0:
        # The first page was fetched before the loop; later pages come from
        # the paginated URL.
        url=nextUrlBase+str(page)
        print("开始爬取 : "+url)
        jobList = _find_jobs(_fetch_soup(url))
    page = page+1
    for job in jobList:
        titleLink = _child_link(job.find('h3'))
        jobName = _text_of(titleLink, "没有找到公司职位")
        print("职位: ",jobName)

        companyName = _text_of(_child_link(job.find('p','company-name')), "没有找到公司名称")
        print("公司名称",companyName)

        salary = _text_of(job.find('span','text-warning'), "没有找到薪水")
        print("薪水",salary)

        edu = _text_of(job.find('span','edu'), "没有找到学历")
        print("学历",edu)

        workPos = _text_of(_child_link(job.find('p','condition clearfix')), "没有找到工作地点")
        print("工作地点",workPos)

        # NOTE(review): the empty class filter is presumably meant to pick the
        # span without a class attribute (years of experience) - confirm.
        workTime = _text_of(job.find('span',''), "没有找到工作年限")
        print("工作年限",workTime)

        hrefDetail = titleLink.get('href') if titleLink is not None else None
        detailSoup = None
        if not hrefDetail:
            # A card without a title link has no detail page; keep the record
            # with placeholder texts instead of crashing the crawl.
            hrefDetail = "没有找到详情链接"
            print("详情链接:",hrefDetail)
        else:
            if hrefDetail.startswith("/a/"):
                # Site-relative link: prefix the host. Absolute URLs that
                # merely contain "/a/" must be left untouched.
                hrefDetail = "https://www.liepin.com"+hrefDetail
                print("详情链接a: "+hrefDetail)
            else:
                print("详情链接:",hrefDetail)
            detailSoup = _fetch_soup(hrefDetail)

        dutyText = _text_of(_find_in(detailSoup, 'div','content content-word'), "没有找到职位描述")
        print("职位描述",dutyText)

        companyInfoText = _text_of(_find_in(detailSoup, 'div','info-word'), "没有找到企业介绍")
        print("企业介绍",companyInfoText)

        # NOTE(review): the first key has a trailing space ("职位 "). It is
        # kept so new documents match the ones already stored - confirm
        # whether it is intentional.
        record = {"职位 ":jobName,"公司名称":companyName,"薪水":salary,"学历":edu,"工作地点":workPos,"工作年限":workTime,"详情链接":hrefDetail,"职位描述":dutyText,"企业介绍":companyInfoText}
        saveToDB(record)
        time.sleep(1)  # throttle detail-page requests as well

关注公众号 不迷路,随时随地查看分享的文章:

python爬取猎聘网数据并且存入数据库_第1张图片

你可能感兴趣的:(python)