Python Crawler: Scraping Job Data from Lagou

Code Listing 1:

Every scraped record is stored in MongoDB together with a hash computed from several of its field values. To avoid inserting duplicates on repeated crawls, each record is checked against the database before insertion (i.e., we verify whether a record with the same hash already exists).
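All three listings read their search parameters from a config.conf file via ConfigParser. The file itself is not shown here, so the following is a minimal sketch: the section and option names (lagoumsg, kd, city) come from the code, while the values are illustrative only.

[lagoumsg]
; kd = search keyword (position name), city = search city -- example values
kd = python
city = 北京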

#coding=utf-8

import requests,pymongo,math,json
import sys,re,ConfigParser,random
import numpy as np
import hashlib
import time

reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2: default to UTF-8 for implicit str/unicode conversions
config = ConfigParser.ConfigParser()
config.read('config.conf')  # search parameters live in config.conf (see the sketch above)



class lagouspiders:

    def __init__(self):
        self.headers = {  # request headers
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:54.0) Gecko/20100101 Firefox/54.0',
            'Host': 'www.lagou.com',
            'Referer': 'https://www.lagou.com/jobs/list_python?px=default&city=%E5%8C%97%E4%BA%AC',
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': 'None',
            'X-Requested-With': 'XMLHttpRequest'
        }
        self.data = {  # POST form data
            'first': 'true',
            'kd': config.get('lagoumsg', 'kd'),  # search filter: position name
            # 'pn': config.get('lagoumsg', 'pn'),  # page number
            'city': config.get('lagoumsg', 'city')  # search filter: city
        }
        self.proxy_list = [  # proxy pool
            {'http': '202.117.120.242:8080'},
            {'http': '113.200.214.164:9999'},
            {'http': '27.46.5.97:9797'},
            {'http': '42.157.5.154:9999'},
            {'http': '113.118.96.46:9797'},
            {'http': '210.26.125.142:8080'},
        ]
        self.proxy = random.choice(self.proxy_list)  # pick one proxy at random

    def test_crawler(self):
        result1 = requests.post('https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0', headers=self.headers, data=self.data)  # first request, used only to size the crawl
        result_json1 = result1.json()  # parse the response as JSON
        totalCount = result_json1['content']['positionResult']['totalCount']  # total number of matching positions
        city = result_json1['content']['positionResult']['locationInfo']['city']  # city that was queried
        querypositionName = result_json1['content']['positionResult']['queryAnalysisInfo']['positionName']  # position name that was queried
        pageSize1 = result_json1['content']['pageSize']
        page = int(math.ceil(float(totalCount) / pageSize1))  # number of result pages
        distinctcount = 0
        listmin = []
        listmax = []

        # Use today's date plus the queried position name as the database name
        date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        databasename = str(date) + str(querypositionName)
        # Set up the MongoDB connection once, before the crawl loop
        client = pymongo.MongoClient('192.168.20.155', 5555)
        rent_info = client[databasename]  # database
        sheet_table = rent_info['sheet_table']  # collection

        for j in range(1, page + 1):

            result = requests.post('https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0&pn=' + str(j), headers=self.headers, data=self.data, proxies=self.proxy)
            result_json = result.json()  # parse the response as JSON
            resultinsret = result_json['content']['positionResult']['result']  # records to store in MongoDB
            resultSize = result_json['content']['positionResult']['resultSize']  # number of records on this page

            for i in range(0, resultSize):  # write each record on this page to MongoDB

                salary = result_json['content']['positionResult']['result'][i]['salary']  # salary range, e.g. '10k-20k'
                salary_num = re.findall(r"\d+", str(salary))  # extract the numbers from the range
                salary_max = salary_num[1]  # upper bound
                salary_min = salary_num[0]  # lower bound

                # Build a hash from company name + position name + salary range
                companyFullName = str(result_json['content']['positionResult']['result'][i]['companyFullName'])
                positionName = str(result_json['content']['positionResult']['result'][i]['positionName'])
                salary_max = str(salary_max)
                salary_min = str(salary_min)
                resultand = companyFullName + positionName + ' salary ' + salary_min + '-' + salary_max
                md5 = hashlib.md5()
                md5.update(resultand)
                resultandhash = md5.hexdigest()
                resultinsret[i]['resultandhash'] = str(resultandhash)  # attach the hash to the record before storing it
                # Before inserting, check whether a record with this hash is already in the database
                if sheet_table.find_one({'resultandhash': resultandhash}) is not None:
                    print resultand + ' already exists, skipping'
                else:
                    sheet_table.insert_one(resultinsret[i])  # insert only records whose hash is not yet stored
            time.sleep(6)  # throttle requests between pages

        # Analyze the stored data: compute the medians of the salary lower and upper bounds
        for resultandhash in sheet_table.distinct('resultandhash'):  # dedupe by resultandhash
            getresult = sheet_table.find_one({'resultandhash': '%s' % resultandhash})  # fetch the full record for this hash
            salary = getresult['salary']  # salary range
            salary_num = re.findall(r"\d+", str(salary))  # extract the numbers from the range
            salary_max = int(salary_num[1])  # upper bound
            salary_min = int(salary_num[0])  # lower bound
            listmin.append(salary_min)  # collect lower bounds
            listmax.append(salary_max)  # collect upper bounds
            distinctcount = distinctcount + 1
        avgsalarymin = np.median(listmin)  # median of the lower bounds
        avgsalarymax = np.median(listmax)  # median of the upper bounds
        print city + ' has ' + str(distinctcount) + ' ' + querypositionName + ' positions; approximate salary range ' + str(avgsalarymin) + '-' + str(avgsalarymax) + 'k'  # report the salary medians

if __name__ == "__main__":
    m = lagouspiders()
    m.test_crawler()
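As an alternative to checking for the hash before every insert, MongoDB can enforce the deduplication itself with a unique index on the hash field. A minimal sketch, reusing the host, port, and field names from the listing above; the database name rent_info is taken from Listings 2 and 3 below, and the record contents are illustrative:

# Sketch: let a unique index reject duplicate hashes instead of checking first
import pymongo
from pymongo.errors import DuplicateKeyError

client = pymongo.MongoClient('192.168.20.155', 5555)
sheet_table = client['rent_info']['sheet_table']
sheet_table.create_index('resultandhash', unique=True)  # inserts with a duplicate hash now fail

record = {'resultandhash': 'abc123', 'positionName': 'python'}  # illustrative record
try:
    sheet_table.insert_one(record)
except DuplicateKeyError:
    print 'record already exists, skipping'  # a record with this hash was inserted earlier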

Code Listing 2:

This version first parses the scraped response, extracts the fields worth keeping into a new dict for each record, and then inserts those records into MongoDB.
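All three listings parse the salary field with a regular expression. A quick sketch of what that does; the '10k-20k' format is an assumption based on the parsing code:

# Sketch: splitting a Lagou salary string into numeric bounds
import re

salary = '10k-20k'                  # typical value of the 'salary' field (assumed format)
nums = re.findall(r'\d+', salary)   # -> ['10', '20']
salary_min = int(nums[0])           # lower bound: 10
salary_max = int(nums[1])           # upper bound: 20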

#coding=utf-8

import requests,pymongo,math,json,time
import sys,re,ConfigParser,unittest,random
import numpy as np
import hashlib


reload(sys)
sys.setdefaultencoding('utf-8')
config = ConfigParser.ConfigParser()
config.read('config.conf')

headers = {  # request headers
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:54.0) Gecko/20100101 Firefox/54.0',
    'Host': 'www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_python?px=default&city=%E5%8C%97%E4%BA%AC',
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest'
}
data = {  # POST form data
    'first': 'true',
    'kd': config.get('lagoumsg', 'kd'),  # search filter: position name
    # 'pn': config.get('lagoumsg', 'pn'),  # page number
    'city': config.get('lagoumsg', 'city')  # search filter: city
}

class lagouspiders(unittest.TestCase):
    def test_crawler(self):
        result1 = requests.post('https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0', headers=headers, data=data)  # first request, used only to size the crawl
        result_json1 = result1.json()  # parse the response as JSON
        totalCount = result_json1['content']['positionResult']['totalCount']  # total number of matching positions
        city = result_json1['content']['positionResult']['locationInfo']['city']  # city that was queried
        querypositionName = result_json1['content']['positionResult']['queryAnalysisInfo']['positionName']  # position name that was queried (renamed so the per-record positionName below does not overwrite it)
        pageSize1 = result_json1['content']['pageSize']
        page = int(math.ceil(float(totalCount) / pageSize1))  # number of result pages
        distinctcount = 0
        totalsalary = 0
        listmin = []
        listmax = []

        proxy_list = [  # proxy pool
            {'http': '202.117.120.242:8080'},
            {'http': '113.200.214.164:9999'},
            {'http': '27.46.5.97:9797'},
            {'http': '42.157.5.154:9999'},
            {'http': '113.118.96.46:9797'},
            {'http': '210.26.125.142:8080'},
        ]
        # Set up the MongoDB connection once, before the crawl loop
        client = pymongo.MongoClient('192.168.20.155', 5555)
        rent_info = client['rent_info']  # database
        sheet_table = rent_info['sheet_table']  # collection

        for j in range(1, page + 1):
            proxy = random.choice(proxy_list)  # pick a proxy at random for this page
            print proxy  # show which proxy is in use
            result = requests.post('https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0&pn=' + str(j), headers=headers, data=data, proxies=proxy)
            # print 'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0&pn=' + str(j)  # full request URL, for debugging
            result_json = result.json()  # parse the response as JSON
            # print result_json  # dump the raw JSON result
            # line = json.dumps(result_json, ensure_ascii=False)
            # print line.encode('utf-8')  # print the decoded result
            resultSize = result_json['content']['positionResult']['resultSize']  # number of records on this page

            for i in range(0, resultSize):  # write each record on this page to MongoDB
                salary = result_json['content']['positionResult']['result'][i]['salary']  # salary range, e.g. '10k-20k'
                salary_num = re.findall(r"\d+", str(salary))  # extract the numbers from the range
                salary_max = salary_num[1]  # upper bound
                salary_min = salary_num[0]  # lower bound

                # Build a hash from position name + company name + salary range
                companyFullName = str(result_json['content']['positionResult']['result'][i]['companyFullName'])
                positionName = str(result_json['content']['positionResult']['result'][i]['positionName'])
                salary_max = str(salary_max)
                salary_min = str(salary_min)
                resultand = positionName + companyFullName + salary_min + salary_max
                md5 = hashlib.md5()
                md5.update(resultand)
                resultandhash = md5.hexdigest()

                mongomsg = {}  # build the record to store from the fields we care about
                mongomsg['positionName'] = positionName
                mongomsg['companyFullName'] = companyFullName
                mongomsg['salary_max'] = str(salary_max)
                mongomsg['salary_min'] = str(salary_min)
                mongomsg['district'] = str(result_json['content']['positionResult']['result'][i]['district'])
                mongomsg['industryField'] = str(result_json['content']['positionResult']['result'][i]['industryField'])
                mongomsg['companySize'] = str(result_json['content']['positionResult']['result'][i]['companySize'])
                mongomsg['education'] = str(result_json['content']['positionResult']['result'][i]['education'])
                mongomsg['createTime'] = str(result_json['content']['positionResult']['result'][i]['createTime'])
                mongomsg['resultandhash'] = str(resultandhash)

                sheet_table.insert_one(mongomsg)  # write the record to MongoDB
            time.sleep(15)  # throttle requests between pages

        # Alternative analysis (kept for reference): mean salary for this city/position
        # for resultandhash in sheet_table.distinct('resultandhash'):  # dedupe by resultandhash
        #     getresult = sheet_table.find_one({'resultandhash': '%s' % resultandhash})  # fetch the full record for this hash
        #     min = getresult['salary_min']  # lower bound
        #     max = getresult['salary_max']  # upper bound
        #     sum = int(min) + int(max)  # sum of this record's lower and upper bounds
        #     totalsalary = totalsalary + sum  # accumulate over all deduped records
        #     distinctcount = distinctcount + 1  # count of deduped records
        #     avgsalary = format(float(totalsalary) / distinctcount, '.2f')  # mean, 2 decimal places
        # print city + ' has ' + str(distinctcount) + ' ' + querypositionName + ' positions; average salary ' + avgsalary + 'k per month'


# Approach 1 (kept for reference): medians of the salary bounds, converting a list of strings in place
#         for resultandhash in sheet_table.distinct('resultandhash'):  # dedupe by resultandhash
#             getresult = sheet_table.find_one({'resultandhash': '%s' % resultandhash})  # fetch the full record for this hash
#             min = getresult['salary_min']  # lower bound
#             max = getresult['salary_max']  # upper bound
#             listmin.append('%s' % min)  # collect lower bounds (as unicode strings)
#             listmax.append('%s' % max)
#             distinctcount = distinctcount + 1
#         for i in range(0, len(listmin)):  # listmin holds unicode strings at this point, so each entry needs converting to int
#             listmin.append(int(listmin[0]))  # convert the head to int and append it at the tail
#             listmin.remove(listmin[0])  # drop the head; after the loop the list is back to its original length, all ints
#         avgsalarymin = np.median(listmin)
#
#         for i in range(0, len(listmax)):
#             listmax.append(int(listmax[0]))
#             listmax.remove(listmax[0])
#         avgsalarymax = np.median(listmax)
#         print city + ' has ' + str(distinctcount) + ' ' + querypositionName + ' positions; approximate salary range ' + str(avgsalarymin) + '-' + str(avgsalarymax) + 'k'

# Approach 2: medians of the salary lower and upper bounds
        for resultandhash in sheet_table.distinct('resultandhash'):  # dedupe by resultandhash
            getresult = sheet_table.find_one({'resultandhash': '%s' % resultandhash})  # fetch the full record for this hash
            salary_min = int(getresult['salary_min'])  # lower bound
            salary_max = int(getresult['salary_max'])  # upper bound
            listmin.append(salary_min)  # collect lower bounds
            listmax.append(salary_max)  # collect upper bounds
            distinctcount = distinctcount + 1
        avgsalarymin = np.median(listmin)  # median of the lower bounds
        avgsalarymax = np.median(listmax)  # median of the upper bounds
        print city + ' has ' + str(distinctcount) + ' ' + querypositionName + ' positions; approximate salary range ' + str(avgsalarymin) + '-' + str(avgsalarymax) + 'k'  # report the salary medians


if __name__ == "__main__":
    unittest.main()
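Listing 2 reports medians where Listing 3 below reports a mean; medians are less sensitive to a few outlier postings. A quick illustration with made-up figures:

# Sketch: np.median vs. np.mean on collected salary bounds (illustrative figures)
import numpy as np

listmin = [10, 8, 15, 30, 9]   # lower bounds in k/month; 30 is an outlier
print np.median(listmin)       # 10.0 -- the outlier barely moves the median
print np.mean(listmin)         # 14.4 -- the outlier drags the mean up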

Code Listing 3:

The same crawl as Listing 2, but with the insert commented out: it re-reads the records already stored in MongoDB and reports the average salary instead of the medians.

#coding=utf-8

import requests,pymongo,math,json,time
import sys,re,ConfigParser,unittest,random
import hashlib


reload(sys)
sys.setdefaultencoding('utf-8')
config = ConfigParser.ConfigParser()
config.read('config.conf')

headers = {  # request headers
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:54.0) Gecko/20100101 Firefox/54.0',
    'Host': 'www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_python?px=default&city=%E5%8C%97%E4%BA%AC',
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest'
}
data = {  # POST form data
    'first': 'true',
    'kd': config.get('lagoumsg', 'kd'),  # search filter: position name
    # 'pn': config.get('lagoumsg', 'pn'),  # page number
    'city': config.get('lagoumsg', 'city')  # search filter: city
}

class lagouspiders(unittest.TestCase):
    def test_crawler(self):
        result1 = requests.post('https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0', headers=headers, data=data)  # first request, used only to size the crawl
        result_json1 = result1.json()  # parse the response as JSON
        totalCount = result_json1['content']['positionResult']['totalCount']  # total number of matching positions
        city = result_json1['content']['positionResult']['locationInfo']['city']  # city that was queried
        querypositionName = result_json1['content']['positionResult']['queryAnalysisInfo']['positionName']  # position name that was queried
        pageSize1 = result_json1['content']['pageSize']
        page = int(math.ceil(float(totalCount) / pageSize1))  # number of result pages
        distinctcount = 0
        totalsalary = 0

        proxy_list = [  # proxy pool
            {'http': '202.117.120.242:8080'},
            {'http': '113.200.214.164:9999'},
            {'http': '27.46.5.97:9797'},
            {'http': '42.157.5.154:9999'},
            {'http': '113.118.96.46:9797'},
            {'http': '210.26.125.142:8080'},
        ]
        # Set up the MongoDB connection once, before the crawl loop
        client = pymongo.MongoClient('192.168.20.155', 5555)
        rent_info = client['rent_info']  # database
        sheet_table = rent_info['sheet_table']  # collection

        for j in range(1, page + 1):
            proxy = random.choice(proxy_list)  # pick a proxy at random for this page
            print proxy  # show which proxy is in use
            result = requests.post('https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0&pn=' + str(j), headers=headers, data=data, proxies=proxy)
            result_json = result.json()  # parse the response as JSON
            # print result_json  # dump the raw JSON result
            resultSize = result_json['content']['positionResult']['resultSize']  # number of records on this page

            for i in range(0, resultSize):
                salary = result_json['content']['positionResult']['result'][i]['salary']  # salary range, e.g. '10k-20k'
                salary_num = re.findall(r"\d+", str(salary))  # extract the numbers from the range
                salary_max = salary_num[1]  # upper bound
                salary_min = salary_num[0]  # lower bound

                # Build a hash from position name + company name + salary range
                companyFullName = str(result_json['content']['positionResult']['result'][i]['companyFullName'])
                positionName = str(result_json['content']['positionResult']['result'][i]['positionName'])
                salary_max = str(salary_max)
                salary_min = str(salary_min)
                resultand = positionName + companyFullName + salary_min + salary_max
                md5 = hashlib.md5()
                md5.update(resultand)
                resultandhash = md5.hexdigest()

                mongomsg = {}  # build the record to store from the fields we care about
                mongomsg['positionName'] = positionName
                mongomsg['companyFullName'] = companyFullName
                mongomsg['salary_max'] = str(salary_max)
                mongomsg['salary_min'] = str(salary_min)
                mongomsg['district'] = str(result_json['content']['positionResult']['result'][i]['district'])
                mongomsg['industryField'] = str(result_json['content']['positionResult']['result'][i]['industryField'])
                mongomsg['companySize'] = str(result_json['content']['positionResult']['result'][i]['companySize'])
                mongomsg['education'] = str(result_json['content']['positionResult']['result'][i]['education'])
                mongomsg['createTime'] = str(result_json['content']['positionResult']['result'][i]['createTime'])
                mongomsg['resultandhash'] = str(resultandhash)

                # sheet_table.insert_one(mongomsg)  # insert disabled: this run only analyzes data already stored
            time.sleep(9)  # throttle requests between pages

        # Analyze the stored data: average salary (midpoint of each range) for this city/position
        for resultandhash in sheet_table.distinct('resultandhash'):  # dedupe by resultandhash
            getresult = sheet_table.find_one({'resultandhash': '%s' % resultandhash})  # fetch the full record for this hash
            salary_min = int(getresult['salary_min'])  # lower bound
            salary_max = int(getresult['salary_max'])  # upper bound
            totalsalary = totalsalary + (salary_min + salary_max) / 2.0  # accumulate the midpoint of this record's range (summing min+max directly would double the result)
            distinctcount = distinctcount + 1  # count of deduped records
        avgsalary = format(float(totalsalary) / distinctcount, '.2f')  # mean of the midpoints, 2 decimal places
        print city + ' has ' + str(distinctcount) + ' ' + querypositionName + ' positions; average salary ' + avgsalary + 'k per month'  # report the result


if __name__ == "__main__":
    unittest.main()
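A quick worked example of the averaging step, with made-up figures: three postings with ranges 10-20k, 8-16k, and 15-25k have midpoints 15, 12, and 20, so the reported average is (15 + 12 + 20) / 3 = 15.67k.

# Sketch: averaging the midpoints of salary ranges (illustrative figures)
ranges = [(10, 20), (8, 16), (15, 25)]               # (min, max) in k/month
midpoints = [(lo + hi) / 2.0 for lo, hi in ranges]   # [15.0, 12.0, 20.0]
avgsalary = format(sum(midpoints) / len(midpoints), '.2f')
print avgsalary + 'k'                                # 15.67k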
