在《python爬虫之51job工作搜索》一文中,我们只是通过python函数将搜索结果简单打印;在此我们将其封装成类,打印结果并写入mysql数据库中。
因此我们首先需要设计mysql的库和表结构,在此我们只用了简单的一个表,sql语句如下:
#建库
#create database 51job;
#建表
#create table job(job_id int not null auto_increment,keyword varchar(100) not null,position varchar(100) not null,p_link varchar(100) not null,company varchar(100) not null,location varchar(50) not null,salary varchar(20),publish varchar(40) not null,primary key (job_id));
在设计表之前曾想了很多:考虑到以后随着数据的增多,简单的一个表肯定会越来越慢,因此可以将其按城市、关键字、job信息等分表,有兴趣的可以仔细考虑下。目前我们就先按照一个表来运行。
要点:
1.安装MySQLdb驱动
2.安装mysql数据库
3.安装selenium、BeautifulSoup、phantomjs等python模块,请参考之前的博文
#!/usr/bin/env python2.7
#-*- coding: utf-8 -*-
#date: 2016-09-06
#comment: 51job工作搜索并存入mysql数据库
import sys
import time
import MySQLdb
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
class Job(object):
    """One job posting, as a plain record of six fields.

    The values are stored exactly as they are passed in; no validation or
    conversion is performed.
    """

    def __init__(self, position, p_link, company, location, salary, publish):
        """Store the fields of a single posting.

        Args:
            position: Job title.
            p_link: Link to the posting.
            company: Company name.
            location: Company location.
            salary: Salary text.
            publish: Publish-time text.
        """
        self.position = position
        self.p_link = p_link
        self.company = company
        self.location = location
        self.salary = salary
        self.publish = publish

    def displayJob(self):
        """Print each field on its own line, prefixed with its label.

        A field whose value is ``None`` is printed as an empty value rather
        than raising ``TypeError`` from the string concatenation.
        """
        labelled_fields = (
            (u"职位名:", self.position),
            (u"职位链接:", self.p_link),
            (u"公司名:", self.company),
            (u"公司地点:", self.location),
            (u"薪资:", self.salary),
            (u"发布时间:", self.publish),
        )
        for label, value in labelled_fields:
            # Single-argument print(...) behaves the same as the print
            # statement under Python 2.
            print(label + (u"" if value is None else value))
class MySql(object):
    """Small wrapper holding one MySQLdb connection and one cursor."""

    def __init__(self, host, user, password, port, db, charset='utf8'):
        """Connect to the server and open a cursor.

        Args:
            host: Server host name or address.
            user: Login user.
            password: Login password.
            port: Server port.
            db: Database name to use.
            charset: Connection character set, applied right after
                connecting (default ``'utf8'``).
        """
        self.host = host
        self.user = user
        self.password = password
        self.port = port
        self.db = db
        self.charset = charset
        self.conn = MySQLdb.connect(host=self.host, user=self.user, passwd=self.password, port=self.port, db=self.db)
        self.conn.set_character_set(self.charset)
        self.cursor = self.conn.cursor()

    def queryDB(self, table_name, param):
        """Run a SELECT matching a row on all seven data columns.

        Args:
            table_name: Table to query. It is concatenated into the SQL text
                as-is, so only pass trusted identifiers.
            param: Seven values in the order keyword, position, p_link,
                company, location, salary, publish; bound as parameters.
        """
        sql = 'select * from ' + table_name + " where keyword=%s and position=%s and p_link=%s and company=%s and location=%s and salary=%s and publish=%s;"
        self.cursor.execute(sql, param)

    def fetchRow(self):
        """Return the next row of the last executed query, or None."""
        return self.cursor.fetchone()

    def rowCount(self):
        """Return the cursor's ``rowcount`` for the last executed statement."""
        # The value has to be returned explicitly; evaluating the attribute
        # alone would make this method always yield None.
        return self.cursor.rowcount

    def insertDB(self, table_name, param):
        """Insert a row unless an identical one is already stored.

        Args:
            table_name: Target table (concatenated into the SQL text as-is).
            param: Seven values in the order keyword, position, p_link,
                company, location, salary, publish.
        """
        self.queryDB(table_name, param)
        if self.fetchRow() is not None:
            # A matching row exists; skip the insert.
            return
        sql = "insert into " + table_name + "(keyword, position, p_link, company, location, salary, publish) values(%s, %s, %s, %s, %s, %s, %s);"
        self.cursor.execute(sql, param)

    def commitDB(self):
        """Commit the current transaction."""
        self.conn.commit()

    def closeDB(self):
        """Close the cursor and then the connection."""
        try:
            self.cursor.close()
        finally:
            # Release the connection even if closing the cursor fails.
            self.conn.close()
class SearchJob(object):
    """Drives a selenium browser through a keyword search on the given site.

    All methods work on the url, keyword and driver handed to the
    constructor rather than on module-level globals, so the class can be
    used from any caller.
    """

    def __init__(self, url, keys, driver):
        """Remember the search parameters and the browser to drive.

        Args:
            url: Page to open before searching.
            keys: Keyword typed into the search box.
            driver: selenium WebDriver instance used for every browser call.
        """
        self.url = url
        self.keys = keys
        self.driver = driver

    def open51job(self):
        """Open the start page, submit the keyword and move to the result tab."""
        browser = self.driver
        browser.get(self.url)
        print(u"进入...." + browser.title)
        # Default search box on the page.
        elem = browser.find_element_by_class_name("textbox1")
        elem.clear()
        elem.send_keys(self.keys)
        elem.submit()
        # Close the tab the search was submitted from.
        browser.close()
        # Switch through the remaining window handles; the last one stays
        # selected.
        for handle in browser.window_handles:
            browser.switch_to_window(handle)
        print(u"进入...." + browser.title)
        time.sleep(2)

    def searchJob(self):
        """Return the current page source parsed with BeautifulSoup (lxml)."""
        return BeautifulSoup(self.driver.page_source, "lxml")

    def nextPage(self):
        """Click the "下一页" (next page) link if the page has one.

        Returns:
            0 when no such link exists (the search is finished), otherwise
            None after the link has been clicked.
        """
        try:
            next_link = self.driver.find_element_by_link_text("下一页")
            next_link.click()
        except NoSuchElementException:
            print(u"搜索完毕")
            return 0
        return None

    def closeWeb(self):
        """Shut the browser down.

        Uses ``quit()`` so the whole driver session is ended, not only the
        current window.
        """
        self.driver.quit()
if __name__ == "__main__":
    # Python 2 only: make utf-8 the implicit str/unicode codec so the mixed
    # byte-string and unicode handling below does not raise.
    reload(sys)
    sys.setdefaultencoding('utf-8')

    # url, keys and driver are deliberately kept at module level (not moved
    # into a function) so code that looks them up as globals keeps working.
    url = "http://www.51job.com/qingdao"
    keys = raw_input("请输入搜索关键词:").decode(sys.stdin.encoding)
    print("请稍等片刻....")
    driver = webdriver.PhantomJS(executable_path="/usr/local/phantomjs/bin/phantomjs")
    searchtask = SearchJob(url, keys, driver)
    try:
        searchtask.open51job()
        # One connection for the whole run instead of connecting and
        # disconnecting once per job.
        mysql = MySql('127.0.0.1', 'root', '', 3306, '51job')
        try:
            num = 1
            while True:
                print(u"##################################第" + str(num) + u"页工作信息如下#####################")
                content = searchtask.searchJob()
                position = content.find_all("p", {"class":"t1"})
                company = content.find_all("span", {"class":"t2"})
                location = content.find_all("span", {"class":"t3"})
                salary = content.find_all("span", {"class":"t4"})
                publish = content.find_all("span", {"class":"t5"})
                # The span lists are indexed from 1 while positions are
                # iterated from their first entry.
                # NOTE(review): presumably index 0 of each span list is the
                # table header row -- confirm against the page markup.
                for i, each in enumerate(position, 1):
                    print("####第" + str(i) + "个job####")
                    if salary[i].string is None:
                        # Store a placeholder when no salary text is present.
                        salary[i].string = u"0/月"
                    job = Job(each.a.get("title"), each.a.get("href"), company[i].string, location[i].string, salary[i].string, publish[i].string)
                    param = (keys, job.position, job.p_link, job.company, job.location, job.salary, job.publish)
                    mysql.insertDB("job", param)
                    mysql.commitDB()
                    job.displayJob()
                flag = searchtask.nextPage()
                if flag == 0:
                    break
                num = num + 1
        finally:
            # Release the DB connection even when scraping fails midway.
            mysql.closeDB()
    finally:
        # Always release the browser, whatever happened above.
        searchtask.closeWeb()
代码要点:
1.sql语句插入中文时,需要使用utf8
reload(sys)
sys.setdefaultencoding('utf-8')
以及在定义类时指定utf8
def __init__(self, host, user, password, port, db, charset='utf8'):
2.job信息重复
sql插入前,需要按照关键字、职位、公司及发布时间等信息判断此条记录是否存在,当不存在时才插入
查看数据库
mysql> select * from job limit 10;
+--------+-----------+-------------------------------------------------------------------------+-----------------------------------------------------+--------------------------------------------------+------------------+----------------+---------+
| job_id | keyword | position | p_link | company | location | salary | publish |
+--------+-----------+-------------------------------------------------------------------------+-----------------------------------------------------+--------------------------------------------------+------------------+----------------+---------+
| 2 | 韩国语 | 韩国语外教(工作地点:中国山东省青岛市) | http://jobs.51job.com/qingdao-snq/76738306.html?s=0 | 青岛亚联教育咨询管理有限公司 | 青岛-市南区 | 60元/小时 | 09-06 |
| 3 | 韩国语 | 文案编辑(韩国语) | http://jobs.51job.com/qingdao-cyq/77621062.html?s=0 | 青岛倡仪医疗器有限公司 | 青岛-城阳区 | 3000-4499/月 | 09-06 |
| 4 | 韩国语 | 朝鲜语/韩国语 | http://jobs.51job.com/qingdao-lcq/78931983.html?s=0 | 青岛大星电子有限公司 | 青岛-李沧区 | 3500-5000/月 | 09-06 |
| 5 | 韩国语 | 韩国语翻译 | http://jobs.51job.com/qingdao-snq/80744014.html?s=0 | 赢联科技集团有限公司 | 青岛-市南区 | 3000-4499/月 | 09-06 |
| 6 | 韩国语 | 韩国语翻译 | http://jobs.51job.com/qingdao-jms/52580378.html?s=0 | 青岛东一胶带有限公司 | 青岛-即墨市 | 3000-5000/月 | 09-04 |
| 7 | 韩国语 | 韩国语相关 | http://jobs.51job.com/qingdao-lcq/71126401.html?s=0 | 青岛杰亚希教育咨询有限公司 | 青岛-李沧区 | 3000-4499/月 | 08-30 |
| 8 | 韩国语 | 汽车零部件项目管理 | http://jobs.51job.com/qingdao-pds/80814703.html?s=0 | 青岛东明汽车配件有限公司 | 青岛-平度市 | 6000-7999/月 | 09-06 |
| 9 | 韩国语 | 全&***韩语老师(外企模式-待遇福利升职等制度完善) | http://jobs.51job.com/qingdao/79670539.html?s=0 | 青岛锦才教育 | 青岛 | 4500-5999/月 | 09-06 |
| 10 | 韩国语 | 日本免税店/奢侈品/店员/翻译担当15-20万 | http://jobs.51job.com/qingdao/74415559.html?s=0 | 青岛知行国际经济技术合作有限公司 | 青岛 | 15-20万/年 | 09-06 |
| 11 | 韩国语 | 韩译英翻译员 | http://jobs.51job.com/qingdao/66920356.html?s=0 | 成都朗恒智讯科技有限公司 | 青岛 | 5000-10000/月 | 09-06 |
+--------+-----------+-------------------------------------------------------------------------+-----------------------------------------------------+--------------------------------------------------+------------------+----------------+---------+
10 rows in set (0.00 sec)