Target site (Zhaopin): http://sou.zhaopin.com/
Enter java as the position, pick Beijing as the city, and watch what happens to the URL:
http://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京&kw=java&sm=0&p=1
Seeing that p, a sneaky thought crosses my mind: that must be the page number.
http://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京&kw=java&sm=0&p=2
And sure enough, it is. Sweet :)
Constructing the URL
A simple for loop does the job:
for pn in range(1, 61):
    print("Current page:", pn)
    url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京&kw=java&sm=0&p=" + str(pn)
Requesting the URL
We can use the all-purpose helper below, which rarely needs any modification:
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate, br'
}
# Always send headers; it's a good habit
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30, headers=headers)
        r.encoding = "utf-8"
        # Raise an exception if the response status is not 200
        r.raise_for_status()
        return r.text
    except Exception:
        # Print the traceback
        traceback.print_exc()
        return ""
Parse the page with the BeautifulSoup library:
soup = BeautifulSoup(html, "html.parser")
infolist = soup.find_all("table", {"class": "newlist"})
for info in infolist:
    nowInfo = []
    tds = info.find_all("td")
    for td in tds:
        nowInfo.append(td.get_text(strip=True))
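Before hard-coding which cell holds which field, it's worth printing one parsed row to see the column order. A throwaway check like this (my addition, not in the original) does it:

for i, cell in enumerate(nowInfo):
    print(i, cell)  # map each td index to position, company, salary, etc.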
Finally, store the data in the database:
# Open the database connection (adjust the credentials for your setup)
db = pymysql.connect(
    host="localhost",
    user="root",
    password="123456",
    database="pythondb",
    charset="utf8"
)
sql = 'insert into zhilian_info_javascript (positionname, company, salary, city, info_type, info, persent) values (%s, %s, %s, %s, %s, %s, %s)'
# Create a cursor object with cursor()
cursor = db.cursor()
try:
    # Execute the SQL statement
    cursor.execute(sql, (positionname, company, salary, city, info_type, info, persent))
    # Commit the transaction
    db.commit()
except Exception:
    print("Something went wrong")
    traceback.print_exc()
    # Roll back on error
    db.rollback()
# Close the database connection
db.close()
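For reference, the post never shows the table schema. A one-off sketch that would create a matching table (the all-TEXT column types are my assumption, since every field is scraped as a string):

import pymysql

db = pymysql.connect(host="localhost", user="root", password="123456",
                     database="pythondb", charset="utf8")
with db.cursor() as cursor:
    # Hypothetical schema, not taken from the original post
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS zhilian_info_javascript (
            positionname TEXT, company TEXT, salary TEXT, city TEXT,
            info_type TEXT, info TEXT, persent TEXT
        )
    """)
db.close()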
I also crawled a batch of cities in one go; the jl= parameter in the URL carries the city list (the final script below covers Shanghai, Beijing, Chengdu, Suzhou, and Hangzhou).
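That long jl= blob in the script is just the city list percent-encoded. A quick sketch to reproduce it:

from urllib.parse import quote

cities = "上海+北京+成都+苏州+杭州"
print(quote(cities))
# %E4%B8%8A%E6%B5%B7%2B%E5%8C%97%E4%BA%AC%2B%E6%88%90%E9%83%BD%2B%E8%8B%8F%E5%B7%9E%2B%E6%9D%AD%E5%B7%9E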
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/1/9 11:53
# @Author : glacier
# @Site :
# @File : zhilian.py
# @Software: PyCharm Edu
import requests
from bs4 import BeautifulSoup
import traceback
import pymysql

headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate, br'
}

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30, headers=headers)
        r.encoding = "utf-8"
        r.raise_for_status()
        return r.text
    except Exception:
        traceback.print_exc()
        return ""
def getHTMLInfo(html):
    soup = BeautifulSoup(html, "html.parser")
    infolist = soup.find_all("table", {"class": "newlist"})
    for info in infolist:
        nowInfo = []
        tds = info.find_all("td")
        for td in tds:
            nowInfo.append(td.get_text(strip=True))
        # Skip header rows or rows without the expected seven cells
        if len(nowInfo) < 7:
            continue
        positionname = nowInfo[0]
        persent = nowInfo[1]
        company = nowInfo[2]
        salary = nowInfo[3]
        city = nowInfo[4]
        info_type = nowInfo[5]
        info = nowInfo[6]
        print(positionname, company, salary, city, info_type, info, persent)
        # Open the database connection (adjust the credentials for your setup)
        db = pymysql.connect(
            host="localhost",
            user="root",
            password="123456",
            database="pythondb",
            charset="utf8"
        )
        sql = 'insert into zhilian_info_javascript (positionname, company, salary, city, info_type, info, persent) values (%s, %s, %s, %s, %s, %s, %s)'
        # Create a cursor object with cursor()
        cursor = db.cursor()
        try:
            # Execute the SQL statement
            cursor.execute(sql, (positionname, company, salary, city, info_type, info, persent))
            # Commit the transaction
            db.commit()
        except Exception:
            print("Something went wrong")
            traceback.print_exc()
            # Roll back on error
            db.rollback()
        # Close the connection (one connection per row is wasteful, but fine for a demo)
        db.close()
if __name__ == '__main__':
    # key_word = "java"
    key_word = input("Enter the keyword to search for: ")
    for pn in range(1, 61):
        print("Current page:", pn)
        url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E4%B8%8A%E6%B5%B7%2b%E5%8C%97%E4%BA%AC%2b%E6%88%90%E9%83%BD%2b%E8%8B%8F%E5%B7%9E%2b%E6%9D%AD%E5%B7%9E&kw=" + key_word + "&isadv=0&" \
              "sg=a88505b61f9a438a83ff16e07c18a2cb&p=" + str(pn)
        try:
            html = getHTMLText(url)
            getHTMLInfo(html)
        except Exception:
            traceback.print_exc()
            continue
    print("Done crawling! Check the database!")
Here is the Lagou scraper as well:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/1/9 9:57
# @Author : glacier
# @Site :
# @File : lagou.py
# @Software: PyCharm Edu
import requests
from bs4 import BeautifulSoup
import traceback
import pymysql

headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate, br'
}

def getHTMLText(url):
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = "utf-8"
        return r.text
    except Exception:
        traceback.print_exc()
        print("Request failed")
        return ""  # return an empty string rather than None so the caller can keep going
def getHTMLInfo(html):
    soup = BeautifulSoup(html, "html.parser")
    Infos = soup.find_all("li", {"class": "con_list_item default_list"})
    for Info in Infos:
        labs = Info.find('div', {'class': 'li_b_r'}).get_text(strip=True)
        positionid = Info.attrs["data-positionid"]
        salary = Info.attrs["data-salary"]
        company = Info.attrs["data-company"]
        positionname = Info.attrs["data-positionname"]
        companyid = Info.attrs["data-companyid"]
        hrid = Info.attrs["data-hrid"]
        adword = Info.attrs["data-adword"]
        # print("{0:10}{1:15}{2:25}{3:25}".format(positionid, positionname, salary, company))
        # Open the database connection (adjust the credentials for your setup)
        db = pymysql.connect(
            host="localhost",
            user="root",
            password="123456",
            database="pythondb",
            charset="utf8"
        )
        sql = 'insert into lagou_info (positionid, salary, company, positionname, companyid, hrid, adword, labs) values (%s, %s, %s, %s, %s, %s, %s, %s)'
        # Create a cursor object with cursor()
        cursor = db.cursor()
        try:
            # Execute the SQL statement
            cursor.execute(sql, (positionid, salary, company, positionname, companyid, hrid, adword, labs))
            # Commit the transaction
            db.commit()
        except Exception:
            print("Something went wrong")
            traceback.print_exc()
            # Roll back on error
            db.rollback()
        # Close the database connection
        db.close()
if __name__ == '__main__':
    key_word = input("Enter the position to search for: ")
    # key_word = "Java"
    for i in range(1, 31):
        try:
            url = 'https://www.lagou.com/zhaopin/' + key_word + '/' + str(i) + '/?filterOption=3'
            print(url)
            html = getHTMLText(url)
            getHTMLInfo(html)
        except Exception:
            traceback.print_exc()
            continue
    print("Done crawling!")
Then I did a bit of analysis on the data. The average salary here is simply the mean of the salaries quoted in the job postings.
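For what it's worth, a minimal sketch of how such an average could be computed from the scraped salary strings, assuming a range format like "10001-15000" (my guess at the format, not confirmed by the post):

import re

def avg_salary(salaries):
    # Average the midpoints of salary ranges like "10001-15000";
    # anything that doesn't match (e.g. "面议", negotiable) is skipped.
    mids = []
    for s in salaries:
        m = re.match(r"(\d+)-(\d+)", s)
        if m:
            lo, hi = int(m.group(1)), int(m.group(2))
            mids.append((lo + hi) / 2)
    return sum(mids) / len(mids) if mids else 0

print(avg_salary(["10001-15000", "15001-20000", "面议"]))  # -> 15000.5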