代码如下
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import pymongo
# MongoDB handles used by lagou(): server -> 'ceshi' database -> 'lagou_web'
# collection, which receives one document per scraped listing.
client = pymongo.MongoClient(host='localhost', port=27017)
ceshi = client['ceshi']
lagou_python = ceshi['lagou_web']

# Page that lagou() opens first.
url = 'https://www.lagou.com/'
def _clean_requirement(text, salary):
    """Return *text* without its leading *salary* string and outer whitespace.

    ``str.strip(salary)`` treats *salary* as a set of characters, so it can
    also remove matching characters from the end of the text. The salary is
    therefore removed only when it is an exact prefix.
    """
    text = text.strip()
    if salary and text.startswith(salary):
        text = text[len(salary):]
    return text.strip()


def lagou(city, job, max_pages=30):
    """Scrape job listings with Chrome and insert them into ``lagou_python``.

    Opens the module-level ``url``, closes the ``cboxClose`` popup, types
    *city* into the ``search_input`` box and *job* into the ``keyword`` box,
    then walks the result pages. Every ``li.con_list_item`` on a page becomes
    one document with the keys ``area``, ``salary``, ``company``, ``job`` and
    ``requirement``; the document is printed and then inserted.

    Args:
        city: Text typed into the ``search_input`` box.
        job: Text typed into the ``keyword`` box.
        max_pages: Number of result pages to read before stopping. Defaults
            to 30, the limit the original script hard-coded.

    Raises:
        IndexError: If a listing lacks one of the selected elements.
        Selenium and pymongo errors propagate unchanged; the browser is
        closed in every case.
    """
    # Local import keeps this block self-contained. find_element(By..., ...)
    # works on Selenium 3 and 4, whereas the find_element_by_* helpers were
    # removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        time.sleep(1)
        driver.find_element(By.ID, "cboxClose").click()
        time.sleep(1)
        driver.find_element(By.ID, "search_input").send_keys(city)
        driver.find_element(By.ID, "search_button").click()
        driver.find_element(By.ID, "keyword").send_keys(job)
        driver.find_element(By.ID, "submit").click()
        time.sleep(1)

        for page in range(1, max_pages + 1):
            print('抓取第{}页数据'.format(str(page)))
            soup = BeautifulSoup(driver.page_source, 'lxml')
            for work in soup.select('li.con_list_item'):
                salary = work.select('span.money')[0].text
                record = {
                    'area': work.select('div.position em')[0].text,
                    'salary': salary,
                    'company': work.select('div.company_name a')[0].text,
                    'job': work.select('h3')[0].text,
                    'requirement': _clean_requirement(
                        work.select('div.li_b_l')[0].text, salary
                    ),
                }
                # Print before inserting: insert_one adds an '_id' key to the dict.
                print(record)
                lagou_python.insert_one(record)

            if page == max_pages:
                break
            driver.find_element(By.CLASS_NAME, "pager_next ").click()
            # Give the next page time to render before page_source is read
            # again; otherwise the previous page may be scraped twice.
            time.sleep(1)
    finally:
        # Always release the browser process, including on errors.
        driver.quit()
# Run the scraper: '杭州' is passed as city and '前端' as job.
lagou(city='杭州', job='前端')
由于web页面只显示前30页的数据,因此设置了一个判断语句,爬到30页后停止爬取。
爬取到的数据存储在 MongoDB 数据库中,下面的代码将这些数据从 MongoDB 导出到 Excel 表格中:
import pymongo
import xlwt
# Export every document of the 'lagou_web' collection to an .xls workbook.
client = pymongo.MongoClient(host='localhost', port=27017)
ceshi = client['ceshi']
lagou_python = ceshi['lagou_web']

my_work_book = xlwt.Workbook()
my_sheet = my_work_book.add_sheet('TestSheet')

# Sheet column order; each name is both the header text and the document key.
columns = ('company', 'area', 'job', 'salary', 'requirement')

# Row 0 holds the header.
for col, title in enumerate(columns):
    my_sheet.write(0, col, title)

# Data rows start at 1, one row per document.
for num, item in enumerate(lagou_python.find(), start=1):
    for col, key in enumerate(columns):
        my_sheet.write(num, col, item[key])
    print('写入第{}行'.format(num))

my_work_book.save('fiel_web.xls')