Store everything the crawler fetches in MongoDB together with a hash generated from selected field values. To avoid inserting duplicates on a later crawl, each record is validated before it is written: the script checks whether its hash already exists in the database and only inserts the record if it does not.
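The core of that dedup pattern, distilled into a minimal sketch (the field names match the full script below; sheet_table is assumed to be an already-opened pymongo collection):

#coding=utf-8
import hashlib

def record_hash(companyFullName, positionName, salary_min, salary_max):
    # the hash key is company + position + salary range, as in the full script
    resultand = companyFullName + positionName + '薪资为' + salary_min + '-' + salary_max
    return hashlib.md5(resultand).hexdigest()

def insert_if_absent(sheet_table, record):
    # skip the insert when a record with the same hash is already stored
    if sheet_table.find_one({'resultandhash': record['resultandhash']}) is not None:
        return False
    sheet_table.insert_one(record)
    return True

The complete script: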
#coding=utf-8
import requests,pymongo,math,json
import sys,re,ConfigParser,random
import numpy as np
import hashlib
import time
reload(sys)
sys.setdefaultencoding('utf-8')
config = ConfigParser.ConfigParser()
config.read('config.conf')
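# config.conf is expected to provide the search keyword and city; a hypothetical example:
#   [lagoumsg]
#   kd = python
#   city = 北京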
class lagouspiders:
    def __init__(self):
        self.headers = {  # request headers
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:54.0) Gecko/20100101 Firefox/54.0',
            'Host': 'www.lagou.com',
            'Referer': 'https://www.lagou.com/jobs/list_python?px=default&city=%E5%8C%97%E4%BA%AC',
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': 'None',
            'X-Requested-With': 'XMLHttpRequest'
        }
        self.data = {  # POST parameters
            'first': 'true',
            'kd': config.get('lagoumsg', 'kd'),  # search condition: position keyword
            # 'pn': config.get('lagoumsg', 'pn'),  # page number
            'city': config.get('lagoumsg', 'city')  # search condition: city
        }
        self.proxy_list = [  # proxy pool
            {'http': '202.117.120.242:8080'},
            {'http': '113.200.214.164:9999'},
            {'http': '27.46.5.97:9797'},
            {'http': '42.157.5.154:9999'},
            {'http': '113.118.96.46:9797'},
            {'http': '210.26.125.142:8080'},
        ]
        self.proxy = random.choice(self.proxy_list)  # pick one proxy at random
    def test_crawler(self):
        # first request: hit the Lagou positionAjax endpoint to learn the totals
        result1 = requests.post('https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0', headers=self.headers, data=self.data)
        result_json1 = result1.json()  # parse the response body as JSON
        totalCount = result_json1['content']['positionResult']['totalCount']  # total number of matching records
        city = result_json1['content']['positionResult']['locationInfo']['city']  # the city being queried
        querypositionName = result_json1['content']['positionResult']['queryAnalysisInfo']['positionName']  # the position name being queried
        pageSize1 = result_json1['content']['pageSize']
        page = int(math.ceil(float(totalCount) / pageSize1))  # number of pages
        distinctcount = 0
        listmin = []
        listmax = []
        # set up the MongoDB connection once, instead of once per record;
        # the database is named after today's date plus the queried position name
        date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        databasename = str(date) + str(querypositionName)
        client = pymongo.MongoClient('192.168.20.155', 5555)
        rent_info = client[databasename]
        sheet_table = rent_info['sheet_table']  # the collection
        for j in range(1, page + 1):
            result = requests.post('https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0&pn=' + str(j), headers=self.headers, data=self.data, proxies=self.proxy)
            result_json = result.json()  # parse the response body as JSON
            resultinsret = result_json['content']['positionResult']['result']  # the records to store in MongoDB
            resultSize = result_json['content']['positionResult']['resultSize']  # number of records on this page
            for i in range(0, resultSize):  # write every record on this page to MongoDB
                salary = resultinsret[i]['salary']  # salary range, e.g. '15k-25k'
                salary_num = re.findall(r"\d+", str(salary))  # pull the numbers out of the range
                salary_max = str(salary_num[1])  # upper bound
                salary_min = str(salary_num[0])  # lower bound
                # build the hash value from company + position + salary range
                companyFullName = str(resultinsret[i]['companyFullName'])
                positionName = str(resultinsret[i]['positionName'])
                resultand = companyFullName + positionName + '薪资为' + salary_min + '-' + salary_max
                md5 = hashlib.md5()
                md5.update(resultand)
                resultandhash = md5.hexdigest()
                resultinsret[i]['resultandhash'] = str(resultandhash)  # attach the hash to the record before storing it
                # before inserting, check whether this hash already exists in the database
                if sheet_table.find_one({'resultandhash': resultandhash}) is not None:
                    print resultand + ' is already in the database, skipping'
                else:
                    sheet_table.insert_one(resultinsret[i])  # insert only records the database does not yet have
            time.sleep(6)  # throttle between pages to avoid getting blocked
        # analyse the stored data: median of the salary lower bounds and median of the upper bounds
        for resultandhash in sheet_table.distinct('resultandhash'):  # deduplicate by resultandhash
            getresult = sheet_table.find_one({'resultandhash': '%s' % resultandhash})  # fetch the full record behind each distinct hash
            salary = getresult['salary']  # salary range
            salary_num = re.findall(r"\d+", str(salary))  # pull the numbers out of the range
            salary_max = int(salary_num[1])  # upper bound
            salary_min = int(salary_num[0])  # lower bound
            listmin.append(salary_min)  # collect the lower bounds
            listmax.append(salary_max)  # collect the upper bounds
            distinctcount = distinctcount + 1
        avgsalarymin = np.median(listmin)
        avgsalarymax = np.median(listmax)
        print city + ' has ' + str(distinctcount) + ' distinct ' + querypositionName + ' positions, salary roughly ' + str(avgsalarymin) + '-' + str(avgsalarymax) + 'k'  # print the medians of both bounds
if __name__ == "__main__":
    m = lagouspiders()
    m.test_crawler()
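An alternative to calling find_one() before every insert is to let MongoDB enforce uniqueness itself with a unique index on resultandhash; duplicate inserts then fail fast with a DuplicateKeyError. A minimal sketch, assuming the same host, port and collection layout as above:

import pymongo
from pymongo.errors import DuplicateKeyError

client = pymongo.MongoClient('192.168.20.155', 5555)
db = client['2017-09-01python']  # hypothetical database name, date + position as above
sheet_table = db['sheet_table']
sheet_table.create_index('resultandhash', unique=True)  # one-time setup

def insert_if_new(doc):
    # returns True if the document was new, False if its hash already existed
    try:
        sheet_table.insert_one(doc)
        return True
    except DuplicateKeyError:
        return False

This also closes the race window between the existence check and the insert when several crawler instances write to the same collection.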
First parse the crawled response, then pull out just the fields that need to be stored, assemble them into a new dict per record, and write that dict to MongoDB.
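The extraction step, distilled into a minimal sketch (the field list matches the full script below; record stands for one element of positionResult['result']):

def build_mongomsg(record, salary_min, salary_max, resultandhash):
    # keep only the fields worth storing and return them as a fresh dict
    fields = ['positionName', 'companyFullName', 'district', 'industryField',
              'companySize', 'education', 'createTime']
    mongomsg = dict((f, str(record[f])) for f in fields)
    mongomsg['salary_min'] = str(salary_min)
    mongomsg['salary_max'] = str(salary_max)
    mongomsg['resultandhash'] = str(resultandhash)
    return mongomsg

The complete script: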
#coding=utf-8
import requests,pymongo,math,json,time
import sys,re,ConfigParser,unittest,random
import numpy as np
import hashlib
reload(sys)
sys.setdefaultencoding('utf-8')
config = ConfigParser.ConfigParser()
config.read('config.conf')
headers = {  # request headers
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:54.0) Gecko/20100101 Firefox/54.0',
    'Host': 'www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_python?px=default&city=%E5%8C%97%E4%BA%AC',
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest'
}
data = {  # POST parameters
    'first': 'true',
    'kd': config.get('lagoumsg', 'kd'),  # search condition: position keyword
    # 'pn': config.get('lagoumsg', 'pn'),  # page number
    'city': config.get('lagoumsg', 'city')  # search condition: city
}
class lagouspiders(unittest.TestCase):
    def test_crawler(self):
        # first request: hit the Lagou positionAjax endpoint to learn the totals
        result1 = requests.post('https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0', headers=headers, data=data)
        result_json1 = result1.json()  # parse the response body as JSON
        totalCount = result_json1['content']['positionResult']['totalCount']  # total number of matching records
        city = result_json1['content']['positionResult']['locationInfo']['city']  # the city being queried
        # named querypositionName so the per-record positionName below cannot overwrite it
        querypositionName = result_json1['content']['positionResult']['queryAnalysisInfo']['positionName']
        pageSize1 = result_json1['content']['pageSize']
        page = int(math.ceil(float(totalCount) / pageSize1))  # number of pages
        distinctcount = 0
        totalsalary = 0
        listmin = []
        listmax = []
        # set up the MongoDB connection once, instead of once per record
        client = pymongo.MongoClient('192.168.20.155', 5555)
        rent_info = client['rent_info']  # the database
        sheet_table = rent_info['sheet_table']  # the collection
        proxy_list = [  # proxy pool, defined once outside the page loop
            {'http': '202.117.120.242:8080'},
            {'http': '113.200.214.164:9999'},
            {'http': '27.46.5.97:9797'},
            {'http': '42.157.5.154:9999'},
            {'http': '113.118.96.46:9797'},
            {'http': '210.26.125.142:8080'},
        ]
        for j in range(1, page + 1):
            proxy = random.choice(proxy_list)  # pick a proxy at random for this page
            print proxy  # show which proxy is in use
            result = requests.post('https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0&pn=' + str(j), headers=headers, data=data, proxies=proxy)
            # print 'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0&pn=' + str(j)  # the full request URL
            result_json = result.json()  # parse the response body as JSON
            # print result_json  # dump the raw JSON result
            # line = json.dumps(result_json, ensure_ascii=False)
            # print line.encode('utf-8')  # dump the decoded result
            resultSize = result_json['content']['positionResult']['resultSize']  # number of records on this page
            for i in range(0, resultSize):  # write every record on this page to MongoDB
                record = result_json['content']['positionResult']['result'][i]
                salary = record['salary']  # salary range, e.g. '15k-25k'
                salary_num = re.findall(r"\d+", str(salary))  # pull the numbers out of the range
                salary_max = str(salary_num[1])  # upper bound
                salary_min = str(salary_num[0])  # lower bound
                # build the hash value
                companyFullName = str(record['companyFullName'])
                positionName = str(record['positionName'])
                resultand = positionName + companyFullName + salary_min + salary_max
                md5 = hashlib.md5()
                md5.update(resultand)
                resultandhash = md5.hexdigest()
                mongomsg = {}  # a fresh dict holding only the fields worth storing
                mongomsg['positionName'] = positionName
                mongomsg['companyFullName'] = companyFullName
                mongomsg['salary_max'] = salary_max
                mongomsg['salary_min'] = salary_min
                mongomsg['district'] = str(record['district'])
                mongomsg['industryField'] = str(record['industryField'])
                mongomsg['companySize'] = str(record['companySize'])
                mongomsg['education'] = str(record['education'])
                mongomsg['createTime'] = str(record['createTime'])
                mongomsg['resultandhash'] = str(resultandhash)
                sheet_table.insert_one(mongomsg)  # write the dict to MongoDB
            time.sleep(15)  # throttle between pages to avoid getting blocked
        # analysis option (kept for reference): average salary for this position in this city;
        # each record contributes the midpoint of its range, otherwise the average is doubled
        # for resultandhash in sheet_table.distinct('resultandhash'):  # deduplicate by resultandhash
        #     getresult = sheet_table.find_one({'resultandhash': '%s' % resultandhash})  # fetch the full record behind each distinct hash
        #     salary_min = getresult['salary_min']  # lower bound
        #     salary_max = getresult['salary_max']  # upper bound
        #     midpoint = (int(salary_min) + int(salary_max)) / 2.0  # midpoint of the range
        #     totalsalary = totalsalary + midpoint  # accumulate the midpoints
        #     distinctcount = distinctcount + 1  # count of distinct records
        # avgsalary = format(float(totalsalary) / distinctcount, '.2f')  # average midpoint, two decimals
        # print city + ' has ' + str(distinctcount) + ' distinct ' + querypositionName + ' positions, average salary ' + avgsalary + 'k per month'
        # # approach 1
        # # analysis: median of the salary upper bounds and of the lower bounds
        # for resultandhash in sheet_table.distinct('resultandhash'):  # deduplicate by resultandhash
        #     getresult = sheet_table.find_one({'resultandhash': '%s' % resultandhash})  # fetch the full record behind each distinct hash
        #     min = getresult['salary_min']  # lower bound
        #     max = getresult['salary_max']  # upper bound
        #     listmin.append('%s' % min)  # collect the lower bounds (as strings)
        #     listmax.append('%s' % max)
        #     distinctcount = distinctcount + 1
        # for i in range(0, len(listmin)):  # listmin holds unicode strings such as [u'10', u'7', ...], so convert element by element
        #     listmin.append(int(listmin[0]))  # take the head, convert it to int, append it to the tail
        #     listmin.remove(listmin[0])  # drop the head; after the loop the list has its original length, all ints
        # avgsalarymin = np.median(listmin)
        #
        # for i in range(0, len(listmax)):
        #     listmax.append(int(listmax[0]))
        #     listmax.remove(listmax[0])
        # avgsalarymax = np.median(listmax)
        # print city + ' has ' + str(distinctcount) + ' distinct ' + querypositionName + ' positions, salary roughly ' + str(avgsalarymin) + '-' + str(avgsalarymax) + 'k'
        # approach 2
        # analysis: median of the salary upper bounds and of the lower bounds
        for resultandhash in sheet_table.distinct('resultandhash'):  # deduplicate by resultandhash
            getresult = sheet_table.find_one({'resultandhash': '%s' % resultandhash})  # fetch the full record behind each distinct hash
            salary_min = int(getresult['salary_min'])  # lower bound (renamed so the built-in min is not shadowed)
            salary_max = int(getresult['salary_max'])  # upper bound
            listmin.append(salary_min)  # collect the lower bounds
            listmax.append(salary_max)  # collect the upper bounds
            distinctcount = distinctcount + 1
        avgsalarymin = np.median(listmin)
        avgsalarymax = np.median(listmax)
        print city + ' has ' + str(distinctcount) + ' distinct ' + querypositionName + ' positions, salary roughly ' + str(avgsalarymin) + '-' + str(avgsalarymax) + 'k'  # print the medians of both bounds
if __name__ == "__main__":
    unittest.main()
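One caveat in all versions: re.findall(r"\d+", salary) assumes every salary string carries two numbers, as in '15k-25k'. If Lagou returns a single-ended value such as '15k以上', salary_num[1] raises an IndexError. A more defensive parse, as a sketch:

#coding=utf-8
import re

def parse_salary(salary):
    # returns (salary_min, salary_max) as ints; a single-ended range
    # such as '15k以上' uses its one number for both ends
    nums = re.findall(r"\d+", str(salary))
    if not nums:
        return None, None
    salary_min = int(nums[0])
    salary_max = int(nums[1]) if len(nums) > 1 else salary_min
    return salary_min, salary_max

The third version below reuses the same crawler but leaves the insert commented out: it only analyses data already stored in MongoDB and prints the average salary instead of the medians.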
#coding=utf-8
import requests,pymongo,math,json,time
import sys,re,ConfigParser,unittest,random
import hashlib
reload(sys)
sys.setdefaultencoding('utf-8')
config = ConfigParser.ConfigParser()
config.read('config.conf')
headers = {  # request headers
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:54.0) Gecko/20100101 Firefox/54.0',
    'Host': 'www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_python?px=default&city=%E5%8C%97%E4%BA%AC',
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest'
}
data = {  # POST parameters
    'first': 'true',
    'kd': config.get('lagoumsg', 'kd'),  # search condition: position keyword
    # 'pn': config.get('lagoumsg', 'pn'),  # page number
    'city': config.get('lagoumsg', 'city')  # search condition: city
}
class lagouspiders(unittest.TestCase):
    def test_crawler(self):
        # first request: hit the Lagou positionAjax endpoint to learn the totals
        result1 = requests.post('https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0', headers=headers, data=data)
        result_json1 = result1.json()  # parse the response body as JSON
        totalCount = result_json1['content']['positionResult']['totalCount']  # total number of matching records
        city = result_json1['content']['positionResult']['locationInfo']['city']  # the city being queried
        querypositionName = result_json1['content']['positionResult']['queryAnalysisInfo']['positionName']  # the position name being queried
        pageSize1 = result_json1['content']['pageSize']
        page = int(math.ceil(float(totalCount) / pageSize1))  # number of pages
        distinctcount = 0
        totalsalary = 0
        # set up the MongoDB connection once, instead of once per record
        client = pymongo.MongoClient('192.168.20.155', 5555)
        rent_info = client['rent_info']  # the database
        sheet_table = rent_info['sheet_table']  # the collection
        proxy_list = [  # proxy pool, defined once outside the page loop
            {'http': '202.117.120.242:8080'},
            {'http': '113.200.214.164:9999'},
            {'http': '27.46.5.97:9797'},
            {'http': '42.157.5.154:9999'},
            {'http': '113.118.96.46:9797'},
            {'http': '210.26.125.142:8080'},
        ]
        for j in range(1, page + 1):
            proxy = random.choice(proxy_list)  # pick a proxy at random for this page
            print proxy  # show which proxy is in use
            result = requests.post('https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0&pn=' + str(j), headers=headers, data=data, proxies=proxy)
            # print 'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0&pn=' + str(j)  # the full request URL
            result_json = result.json()  # parse the response body as JSON
            # print result_json  # dump the raw JSON result
            # line = json.dumps(result_json, ensure_ascii=False)
            # print line.encode('utf-8')  # dump the decoded result
            resultSize = result_json['content']['positionResult']['resultSize']  # number of records on this page
            for i in range(0, resultSize):  # build a dict for every record on this page
                record = result_json['content']['positionResult']['result'][i]
                salary = record['salary']  # salary range, e.g. '15k-25k'
                salary_num = re.findall(r"\d+", str(salary))  # pull the numbers out of the range
                salary_max = str(salary_num[1])  # upper bound
                salary_min = str(salary_num[0])  # lower bound
                companyFullName = str(record['companyFullName'])
                positionName = str(record['positionName'])
                resultand = positionName + companyFullName + salary_min + salary_max
                md5 = hashlib.md5()
                md5.update(resultand)
                resultandhash = md5.hexdigest()
                mongomsg = {}  # a fresh dict holding only the fields worth storing
                mongomsg['positionName'] = positionName
                mongomsg['companyFullName'] = companyFullName
                mongomsg['salary_max'] = salary_max
                mongomsg['salary_min'] = salary_min
                mongomsg['district'] = str(record['district'])
                mongomsg['industryField'] = str(record['industryField'])
                mongomsg['companySize'] = str(record['companySize'])
                mongomsg['education'] = str(record['education'])
                mongomsg['createTime'] = str(record['createTime'])
                mongomsg['resultandhash'] = str(resultandhash)
                # sheet_table.insert_one(mongomsg)  # insert disabled: this run only analyses data already stored
            time.sleep(9)  # throttle between pages to avoid getting blocked
        # analyse the stored data: average salary for this position in this city
        for resultandhash in sheet_table.distinct('resultandhash'):  # deduplicate by resultandhash
            getresult = sheet_table.find_one({'resultandhash': '%s' % resultandhash})  # fetch the full record behind each distinct hash
            salary_min = getresult['salary_min']  # lower bound (renamed so the built-in min is not shadowed)
            salary_max = getresult['salary_max']  # upper bound
            midpoint = (int(salary_min) + int(salary_max)) / 2.0  # take the midpoint of the range; summing min + max without halving would double the average
            totalsalary = totalsalary + midpoint  # accumulate the midpoints
            distinctcount = distinctcount + 1  # count of distinct records
        avgsalary = format(float(totalsalary) / distinctcount, '.2f')  # average midpoint, kept to two decimals
        print city + ' has ' + str(distinctcount) + ' distinct ' + querypositionName + ' positions, average salary ' + avgsalary + 'k per month'  # print the final result
if __name__ == "__main__":
    unittest.main()
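Taking each record's salary as the midpoint of its range, two postings at 10k-20k and 20k-40k average to (15 + 30) / 2 = 22.5k. A quick check of the formula:

ranges = [(10, 20), (20, 40)]  # hypothetical salary ranges, in k
total = sum((lo + hi) / 2.0 for lo, hi in ranges)
print format(total / len(ranges), '.2f')  # prints 22.50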