使用 Selenium 爬取拉勾网的职位数据,先存入 MongoDB 数据库,再导出到 Excel 表格中。

爬取部分的代码如下:

from selenium import webdriver
from bs4 import BeautifulSoup
import time
import pymongo

# Connection to the local MongoDB server; scraped postings are stored in
# database 'ceshi', collection 'lagou_web'.
client = pymongo.MongoClient(host='localhost', port=27017)
ceshi = client['ceshi']
lagou_python = ceshi['lagou_web']

# Page the browser opens first, before any search is submitted.
url = 'https://www.lagou.com/'
def _parse_work_item(work):
    """Extract one job posting from a ``li.con_list_item`` element.

    Returns a dict with the keys ``area``, ``salary``, ``company``, ``job``
    and ``requirement``, or ``None`` when any of the expected child elements
    is missing from the item.
    """
    job_node = work.select_one('h3')
    salary_node = work.select_one('span.money')
    company_node = work.select_one('div.company_name a')
    area_node = work.select_one('div.position em')
    requirement_node = work.select_one('div.li_b_l')
    nodes = (job_node, salary_node, company_node, area_node, requirement_node)
    if any(node is None for node in nodes):
        return None

    salary = salary_node.text
    requirement = requirement_node.text.strip()
    # Remove the salary only when it really is a leading prefix.
    # str.strip(salary) would treat the salary as a *set of characters* and
    # could also eat unrelated characters from either end of the text.
    salary_prefix = salary.strip()
    if salary_prefix and requirement.startswith(salary_prefix):
        requirement = requirement[len(salary_prefix):]
    requirement = requirement.strip('\n')

    return {
        'area': area_node.text,
        'salary': salary,
        'company': company_node.text,
        'job': job_node.text,
        'requirement': requirement,
    }


def lagou(city, job, max_pages=30):
    """Scrape job postings from the result pages and store them in MongoDB.

    Opens ``url`` in Chrome, dismisses the element with id ``cboxClose``,
    types ``city`` into ``search_input`` and ``job`` into ``keyword``, then
    walks the result pages. Every parsed posting is printed and inserted
    into the module-level ``lagou_python`` collection.

    Args:
        city: Text typed into the ``search_input`` field.
        job: Text typed into the ``keyword`` field.
        max_pages: Maximum number of result pages to read (default 30,
            the limit the original script hard-coded).
    """
    # Imported here so the function carries its own dependency;
    # find_element(By..., ...) is available in both old and new Selenium
    # releases, unlike the find_element_by_* helpers removed in Selenium 4.
    from selenium.webdriver.common.by import By

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        time.sleep(1)
        driver.find_element(By.ID, "cboxClose").click()
        time.sleep(1)
        driver.find_element(By.ID, "search_input").send_keys(city)
        driver.find_element(By.ID, "search_button").click()
        driver.find_element(By.ID, "keyword").send_keys(job)
        driver.find_element(By.ID, "submit").click()
        time.sleep(1)

        for page in range(1, max_pages + 1):
            print('抓取第{}页数据'.format(str(page)))
            soup = BeautifulSoup(driver.page_source, 'lxml')
            for work in soup.select('li.con_list_item'):
                record = _parse_work_item(work)
                if record is None:
                    # Incomplete list item: skip it rather than abort the run.
                    continue
                print(record)
                lagou_python.insert_one(record)
            time.sleep(1)
            # Do not click "next" after the final page has been read.
            if page < max_pages:
                driver.find_element(By.CLASS_NAME, "pager_next").click()
    finally:
        # Always close the browser, even if scraping fails part-way.
        driver.quit()
# Run the scraper with city '杭州' and job keyword '前端'.
lagou(city='杭州', job='前端')

由于拉勾网的网页最多只显示前 30 页的数据,因此代码中限制了最多爬取 30 页,爬完第 30 页后即停止。

爬取到的数据先存储在 MongoDB 数据库中,下面的代码再把这些数据从 MongoDB 中读出并写入 Excel 表格:

import pymongo
import xlwt

# Re-open the collection that the scraper wrote to.
client = pymongo.MongoClient(host='localhost', port=27017)
ceshi = client['ceshi']
lagou_python = ceshi['lagou_web']

# Sheet columns in output order; each name is used both as the header text
# and as the key read from every stored document.
columns = ('company', 'area', 'job', 'salary', 'requirement')

my_work_book = xlwt.Workbook()
my_sheet = my_work_book.add_sheet('TestSheet')

# Row 0 holds the header.
for col, title in enumerate(columns):
    my_sheet.write(0, col, title)

# One sheet row per stored document, starting directly below the header.
for num, item in enumerate(lagou_python.find(), start=1):
    for col, key in enumerate(columns):
        my_sheet.write(num, col, item[key])
    print('写入第{}行'.format(num))

my_work_book.save('fiel_web.xls')

 

你可能感兴趣的:(python)