import urllib.request
from bs4 import BeautifulSoup
import time
import pymongo
import pymysql
# Example result URL; the keyword and page number fill the {} slots of the template in main():
# https://search.51job.com/list/170200,000000,0000,00,9,99,python,2,2.html
# Build a request with a browser User-Agent so the site does not reject us
def handle_request(keyword, page, url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
    }
    url = url.format(keyword, page)
    request = urllib.request.Request(url=url, headers=headers)
    return request
# Parse the result page with bs4
def parse_content(content, db):
    soup = BeautifulSoup(content, 'lxml')
    # Skip the first .el row, which is the table header
    div_list = soup.select('#resultList > .el')[1:]
    # Parse the rows one by one
    for row in div_list:
        # Job title
        jobname = row.select('.t1 > span > a')[0]['title']
        # Company name
        company = row.select('.t2 > a')[0]['title']
        # Work location
        area = row.select('.t3')[0].string
        # Salary
        salary = row.select('.t4')[0].string
        # Publish date
        publish_time = row.select('.t5')[0].string
        items = {
            'jobname': jobname,
            'company': company,
            'area': area,
            'salary': salary,
            'publish_time': publish_time,
        }
        # fp.write(str(items) + '\n')  # alternative: save to a text file
        save_to_mysql(db, items)
        # fp.insert_one(items)  # alternative: insert into a MongoDB collection
# Option 1: save to MySQL
def connect_db():
    # MySQL offers two common storage engines, InnoDB and MyISAM
    db = pymysql.Connect(host='localhost', port=3306, user='root', password='123456', database='51job', charset='utf8')
    return db
# Option 2: save to MongoDB
def connect_mongodb():
    # Connect to the local MongoDB server
    client = pymongo.MongoClient(host='localhost', port=27017)
    return client
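# A minimal sketch of the MongoDB save path implied by the commented-out
# insert call in parse_content(); the helper name save_to_mongodb and its
# collection argument are assumptions, not part of the original code.
def save_to_mongodb(collection, items):
    # insert_one() stores the items dict as one document
    collection.insert_one(items)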
# When using MySQL you must create the database and the matching table yourself
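# A possible schema for the table save_to_mysql() expects; the column types
# are assumptions, since the original only names the columns:
#   CREATE DATABASE `51job` DEFAULT CHARACTER SET utf8;
#   USE `51job`;
#   CREATE TABLE job (
#       jobname VARCHAR(255),
#       company VARCHAR(255),
#       area VARCHAR(64),
#       salary VARCHAR(64),
#       publish_time VARCHAR(32)
#   );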
def save_to_mysql(db, items):
    # Get a cursor
    cursor = db.cursor()
    # Use a parameterized query so pymysql escapes the values safely
    sql = 'insert into job(jobname, company, area, salary, publish_time) values(%s, %s, %s, %s, %s)'
    try:
        cursor.execute(sql, (items['jobname'], items['company'], items['area'], items['salary'], items['publish_time']))
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
def main():
    keyword = input('Enter the search keyword: ')
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    url = 'https://search.51job.com/list/010000,000000,0000,00,9,99,{},2,{}.html'
    # fp = open('job.txt', 'w', encoding='utf8')  # alternative: save to a text file
    db = connect_db()
    # Alternative: save to MongoDB instead
    # client = connect_mongodb()
    # db = client.job51    # select the MongoDB database
    # fp = db.job          # select the MongoDB collection
    # Crawl the pages one at a time
    for page in range(start_page, end_page + 1):
        print('Crawling page %s ...' % page)
        request = handle_request(keyword, page, url)
        # The site serves its pages in gbk encoding
        content = urllib.request.urlopen(request).read().decode('gbk')
        parse_content(content, db)
        print('Finished page %s ...' % page)
        time.sleep(2)  # pause between pages to avoid hammering the site
    db.close()
    # fp.close()
    # client.close()
if __name__ == '__main__':
    main()