Python 爬取拉勾网任意职位数据

 
  
 
  
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 27 15:44:14 2016
#python version:3.5.2

@author: mozzielx
"""


import urllib.parse
import urllib.request
import json
from openpyxl import Workbook


def get_content(page=None, keyword='机械', timeout=30):
    """Fetch one page of Lagou search results and return the result entries.

    POSTs the search form to the ``positionAjax.json`` endpoint, decodes the
    UTF-8 JSON reply and returns ``content -> positionResult -> result``.

    Args:
        page: Page number sent as ``pn``. When ``None`` (the default) the
            module-level ``page_num`` is used, so existing zero-argument
            callers keep working.
        keyword: Job keyword sent as ``kd``.
        timeout: Socket timeout in seconds, so a stalled connection cannot
            hang the script forever.

    Returns:
        The object stored under ``content.positionResult.result`` in the
        reply (presumably a list of per-posting dicts -- confirm against the
        live API).

    Raises:
        urllib.error.URLError: If the request fails (a timeout while reading
            may surface as ``socket.timeout`` instead).
        KeyError: If the reply does not contain the expected keys.
    """
    if page is None:
        # Backward compatibility: the script sets this global before calling.
        page = page_num

    url = 'http://www.lagou.com/jobs/positionAjax.json?px=default'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36'
    # Form fields POSTed to the site.
    values = {
        'first': 'true',
        'kd': keyword,  # job keyword to search for
        'pn': page,     # current page number
    }
    headers = {'User-Agent': user_agent}
    data = urllib.parse.urlencode(values).encode(encoding='utf-8')
    req = urllib.request.Request(url, data, headers)
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        the_page = response.read().decode("utf-8")
    data_json = json.loads(the_page)
    return data_json['content']['positionResult']['result']

# Scrape state shared by the functions in this file; everything starts empty.
page_num = ''  # page number sent as 'pn'; reassigned by the main loop

# One list per posting field. standardize_data appends to all of them in
# step, so index i across the lists describes the same posting.
positionName = []       # job title
salary = []
jobNature = []
workYear = []           # required work experience
education = []
positionAdvantage = []
city = []
district = []           # district within the city
companyFullName = []
companyLabelList = []
companySize = []
industryField = []      # company's industry field

# Normalise the data: split each posting into the per-field lists
def standardize_data(tar):
    """Append the fields of every posting in ``tar`` to the module-level lists.

    Each element of ``tar`` is indexed by the twelve field names below; a
    missing name raises ``KeyError``. Nothing is returned -- the module-level
    lists are extended in place.
    """
    # (key in the posting, list that collects it), in append order.
    field_targets = (
        ('companyLabelList', companyLabelList),
        ('companyFullName', companyFullName),
        ('positionAdvantage', positionAdvantage),
        ('city', city),
        ('salary', salary),
        ('companySize', companySize),
        ('district', district),
        ('jobNature', jobNature),
        ('workYear', workYear),
        ('education', education),
        ('positionName', positionName),
        ('industryField', industryField),
    )
    for posting in tar:
        for key, bucket in field_targets:
            bucket.append(posting[key])
        
# Save the collected data to an Excel workbook
def excel_data(filename='拉勾网——机械有关职位数据.xlsx'):
    """Write the collected posting fields to an .xlsx file.

    Row 1 holds the column headers; each following row holds one posting,
    with every value converted to its string form. ``companySize`` is
    collected by the script but is not part of the exported columns.

    Args:
        filename: Path of the workbook to write. Defaults to the name the
            script has always used.
    """
    # (header text, source list) for spreadsheet columns A..K, in order.
    columns = (
        ('positionName', positionName),
        ('salary', salary),
        ('jobNature', jobNature),
        ('workYear', workYear),
        ('education', education),
        ('positionAdvantage', positionAdvantage),
        ('city', city),
        ('district', district),
        ('companyFullName', companyFullName),
        ('companyLabelList', companyLabelList),
        ('industryField', industryField),
    )

    wb = Workbook()
    ws = wb.active
    for col, (header, _) in enumerate(columns, start=1):
        ws.cell(row=1, column=col, value=header)

    # The number of data rows follows len(city); the lists are filled in
    # lock-step, so every list has an entry at each index.
    for index in range(len(city)):
        for col, (_, values) in enumerate(columns, start=1):
            # str() instead of "%s" % value: identical text for ordinary
            # values, but does not misfire if a value happens to be a tuple.
            ws.cell(row=index + 2, column=col, value=str(values[index]))

    wb.save(filename)
    
if __name__ == '__main__':
    # Highest page number to request; change this to scrape more or fewer
    # pages. Pages are requested in order starting from 1.
    last_page = 99
    for each in range(1, last_page + 1):
        # get_content() reads the module-level page_num, so set it first.
        page_num = each
        postings = get_content()
        if not postings:
            # An empty page contributes no rows; stop instead of sending the
            # remaining requests.
            break
        standardize_data(postings)
    excel_data()
    

你可能感兴趣的:(数据爬取,爬虫,数据)