Scraping a Recruitment Site with Scrapy + MySQL + SQLAlchemy

Goal:

Scrape the basic information and full details of Python positions from Lagou's Hangzhou site and write them into a MySQL database. Lessons learned are summarized at the end.

Key points:
1. Using multiple yields in a parse callback, together with multiple pipelines
2. Using the SQLAlchemy framework
3. Scrapy's FormRequest, and dealing with anti-scraping measures

Difficulty:
The URL of a position's detail page cannot be read directly from the crawl response; it has to be constructed separately, requested again via a Request, and parsed by a callback before being handed to a pipeline for storage. In other words, each position is written to the database twice. In testing, writing everything into a single MySQL table kept losing information, so the final approach stores the data in two tables and joins them into one inside MySQL.
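In miniature, the yield pattern looks like this (a sketch only, using the names defined in the full code below):

def parse(self, response):
    item = Lagou1Item()
    # ... fill the basic fields from the JSON listing response ...
    yield item  # first yield: the basic info, picked up by Lagou1Pipeline

    for url in item['job_detail_url']:
        # second yield: request the detail page; its callback yields a second
        # item, which Lagou1Pipeline2 writes into the other table
        yield scrapy.Request(url, callback=self.parse_job_details)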

The code:

settings.py

BOT_NAME = 'lagou1'
SPIDER_MODULES = ['lagou1.spiders']
NEWSPIDER_MODULE = 'lagou1.spiders'
MYSQL_CONNECTION='mysql+mysqlconnector://XXX:XXXX@localhost:3306/pydb?charset=utf8'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False  # with True, the crawl often returns nothing
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 2
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 3
CONCURRENT_REQUESTS_PER_IP = 3
# Disable cookies (enabled by default)
COOKIES_ENABLED = True
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
   'Accept-Language': 'en',
}
DOWNLOADER_MIDDLEWARES = {
    'lagou1.middlewares.Lagou1DownloaderMiddleware': 400,
#    'lagou1.middlewares.CustomerMiddleware':200
}
MEDIA_ALLOW_REDIRECTS = True  # important: fixes images failing to download
ITEM_PIPELINES = {
    'lagou1.pipelines.Lagou1Pipeline': 300,
    'lagou1.pipelines.Lagou1Pipeline2': 310,
#    'lagou1.pipelines.DownloadFile': 320,
}
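One thing worth flagging: MYSQL_CONNECTION is defined here, but the pipelines below hardcode their own connection strings. A sketch of how a pipeline could read the setting instead, via Scrapy's standard from_crawler hook (SettingsAwarePipeline is a hypothetical name, not part of this project):

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

class SettingsAwarePipeline(object):  # hypothetical example class
    def __init__(self, connection):
        engine = create_engine(connection)
        self.Sqlsession = sessionmaker(bind=engine)()

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this with the running crawler, whose settings
        # expose the MYSQL_CONNECTION value defined in settings.py
        return cls(crawler.settings.get('MYSQL_CONNECTION'))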

items.py

import scrapy
class Lagou1Item(scrapy.Item):
    positionId=scrapy.Field()
    positionName=scrapy.Field()
    createTime=scrapy.Field()
    companyId=scrapy.Field()
    companyShortName=scrapy.Field()
    companyFullName=scrapy.Field()
    city=scrapy.Field()
    salary=scrapy.Field()
    positionLables=scrapy.Field()
    job_trigger=scrapy.Field()
    job_description=scrapy.Field()
    job_detail_url=scrapy.Field() 

lagouspider.py

# -*- coding: utf-8 -*-
import scrapy
from lagou1.items import Lagou1Item
import json
import requests
import time
from bs4 import BeautifulSoup

class LagouspiderSpider(scrapy.Spider):
    name = 'lagouspider'
    allowed_domains = ['lagou.com']
    start_urls = ['https://www.lagou.com/jobs/positionAjax.json?city=%E6%9D%AD%E5%B7%9E&needAddtionalResult=false']#'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    proxies={"http":"http://112.111.217.45:9999"}
    h1={'User-Agent': 'Opera/9.80 (iPhone; Opera Mini/7.1.32694/27.1407; U; en) Presto/2.8.119 Version/11.10',\
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='} # several different header sets for rotation
    h2={'User-Agent': 'Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17',\
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='}
    h3={'User-Agent': 'Opera/9.80 (iPhone; Opera Mini/7.1.32694/27.1407; U; en) Presto/2.8.119 Version/11.10',\
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='}
    h4={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'}
    def get_Cookies(self,proxies,headers): # helper that fetches session cookies from the list page
        url = 'https://www.lagou.com/jobs/list_python'
        session = requests.session()
        session.post(url,headers=headers,proxies=proxies)
        cookies = session.cookies
        return cookies.get_dict()
    def start_requests(self):
        for i in range(1,11):
            para = {'first': 'true', 'pn': str(i), 'kd': 'python'}
            yield scrapy.FormRequest(self.start_urls[0],headers=self.h3,formdata=para,cookies=self.get_Cookies(self.proxies,self.h1),callback=self.parse) # a plain Request (commented below) also works, though the results differed in testing
#            yield scrapy.Request(self.start_urls[0],method='POST',headers=self.h3,body=json.dumps(para),cookies=self.cookies,callback=self.parse) # same as the FormRequest above, but needs a method parameter, body instead of formdata, and JSON data
        
    def parse(self, response):
#        print(response.text)   # inspect the raw response
#        with open('e:/testlagou1.txt','w') as fp1:
#            fp1.write(response.text)
#        with open('e:/testlagou2.txt','wb') as fp2:    
#            fp2.write(response.body)
        item= Lagou1Item()
        t=json.loads(response.body_as_unicode()) # note the use of json.loads to parse the JSON body
        positionId_list=[]
        positionName_list=[]
        createTime_list=[]
        companyId_list=[]
        companyShortName_list=[]
        companyFullName_list=[]
        city_list=[]
        salary_list=[]
        positionLables_list=[]        
        job_detail_url_list=[]
        
        for results in t['content']['positionResult']['result']:
            positionId=results['positionId']
            positionName = results['positionName']
            createTime = results['createTime']
            companyShortName =results['companyShortName']
#            workYear = results['workYear']
            city = results['city']
            salary = results['salary']
            companyId=results['companyId']
            companyFullName=results['companyFullName']
            positionLables = results['positionLables']
            positionLables= ','.join(positionLables)
#            positionAdvantage = results['positionAdvantage']
            job_detail_url = 'https://www.lagou.com/jobs/' + str(positionId) + '.html'
#            companyLogo=results['companyLogo']
#            print(positionName,companyShortName,workYear,city,salary,positionLables,positionAdvantage,companyLogo) # check that the fields were scraped correctly
            positionId_list.append(positionId)
            positionName_list.append(positionName)
            createTime_list.append(createTime)
            companyId_list.append(companyId)
            companyShortName_list.append(companyShortName)
            companyFullName_list.append(companyFullName)
            positionLables_list.append(positionLables)
            city_list.append(city)
            salary_list.append(salary)
            job_detail_url_list.append(job_detail_url)
        item['positionId']=positionId_list
        item['positionName']=positionName_list
        item['createTime']=createTime_list
        item['companyShortName']=companyShortName_list
        item['companyFullName']=companyFullName_list
        item['city']= city_list
        item['companyId']=companyId_list
        item['salary']=salary_list
        item['positionLables']= positionLables_list
        item['job_detail_url']=job_detail_url_list
        yield item  # yield the basic job info for the pipeline to store
        for i in item['job_detail_url']: # request each detail-page URL; parse_job_details carries on the parsing
            yield scrapy.Request(i,headers=self.h4,cookies=self.get_Cookies(self.proxies,self.h4),callback=self.parse_job_details) # the cookies here are essential; without them anti-scraping kicks in and nothing comes back
    def parse_job_details(self,response): # extracts the detailed job description, plus the positionId used to match the records from parse()
        soup=BeautifulSoup(response.body_as_unicode(),'lxml')     
        item=Lagou1Item()
        positionId=response.url.split('/')[-1].split('.')[0].strip()
#        job_trigger=response.xpath('//*[@id="job_detail"]/dd[1]/p/text()').extract()
#        job_description =response.xpath('//*[@id="job_detail"]/dd[2]/div/text()').extract().strip()
        job_trigger = job_description = '' # defaults, so the item assignments below cannot fail if parsing does
        try: # BeautifulSoup parsing
            job_trigger=soup.select_one('#job_detail > dd.job-advantage >p').text.strip()
            job_description = soup.select_one('#job_detail > dd.job_bt > div').text.strip()
        except Exception as e:
            print(e)
# The xpath versions below were fallbacks tried while anti-scraping was still blocking
# results; once the cookie workaround solved that, they were no longer needed.
#            job_description=response.xpath('//*[@id="job_detail"]/dd[2]/div/text()').extract()
#            job_trigger=response.xpath('//*[@id="job_detail"]/dd[1]/p/text()').extract()
#            job_description=''.join(job_description)
#            job_trigger=''.join(job_trigger)
#            for i in range(3):
#                print(response.body)
#                print('I am in parse_job_details:',e)

        item['positionId']=positionId
        item['job_description'] =job_description
        item['job_trigger']=job_trigger
        return item
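A side note on efficiency: the spider above calls get_Cookies, a blocking requests round-trip, once per detail page. A sketch of start_requests fetching cookies once and reusing them (self._cookies is a new attribute; whether one cookie set stays valid for a whole crawl against Lagou's anti-scraping is untested):

    def start_requests(self):
        # one blocking cookie fetch up front, reused for every request,
        # instead of one requests.session round-trip per page
        self._cookies = self.get_Cookies(self.proxies, self.h1)
        for i in range(1, 11):
            para = {'first': 'true', 'pn': str(i), 'kd': 'python'}
            yield scrapy.FormRequest(self.start_urls[0], headers=self.h3,
                                     formdata=para, cookies=self._cookies,
                                     callback=self.parse)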

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
#from scrapy.pipelines.files import FilesPipeline
from scrapy import Request 
from scrapy.exceptions import DropItem
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column,Integer,String,DateTime
from sqlalchemy.orm import sessionmaker
import mysql.connector # note the import format

# SQLAlchemy table mappings
Base=declarative_base()
class Lagoujob(Base): # Base is the declarative parent class
    __tablename__='lagoujob' # table holding the basic job info
    id=Column(Integer,primary_key=True)
    positionId=Column(String(50),unique=True)
    positionName=Column(String(50))
    createTime=Column(DateTime)
    companyId=Column(String(50))
    companyShortName=Column(String(100))
    companyFullName=Column(String(100)) # company names run longer than expected, so use a larger length
    city=Column(String(50))
    salary=Column(String(30))
    positionLables=Column(String(100))
    job_detail_url=Column(String(100)) 
    
class Lagoudetails(Base): 
    __tablename__='lagoudetails' # table holding the detailed job info
    id=Column(Integer,primary_key=True)
    positionId=Column(String(50),unique=True)
    job_trigger=Column(String(100))
    job_description=Column(String(1000)) 
    
class Lagou1Pipeline(object):
    def __init__(self):
        connection='mysql+mysqlconnector://XXX:XXXX@localhost:3306/pydb?charset=utf8' # consider UTF8MB4
        engine=create_engine(connection,echo=True) # database connection
        DBSession=sessionmaker(bind=engine) # session factory used for the table operations
        self.Sqlsession=DBSession()
        Base.metadata.create_all(engine) # create the tables
        
    def process_item(self, item, spider):
        if 'job_trigger' not in item: # two pipelines split the work: basic-info items are handled here
            for i in range(0,len(item['positionId'])): # len(item) would be wrong here: the item only has 10 fields, so only the first 10 records would be stored
                try:
                    jobs=Lagoujob(positionId=item['positionId'][i],positionName=item['positionName'][i],createTime=item['createTime'][i],companyId=item['companyId'][i],companyShortName=item['companyShortName'][i],companyFullName=item['companyFullName'][i],city=item['city'][i],salary=item['salary'][i],positionLables=item['positionLables'][i],job_detail_url=item['job_detail_url'][i])
                    self.Sqlsession.add(jobs)
                    self.Sqlsession.commit()
                except Exception as e:
                    self.Sqlsession.rollback() # essential if later statements are to keep working after an error
                    print(e)
        return item
    
# the detail items need their own pipeline class
class Lagou1Pipeline2(object):     
    def __init__(self):
        connection='mysql+mysqlconnector://XXX:XXXX@localhost:3306/pydb?charset=utf8' # consider UTF8MB4
        engine=create_engine(connection,echo=True) # database connection
        DBSession=sessionmaker(bind=engine) # session factory used for the table operations
        self.Sqlsession2=DBSession()
        Base.metadata.create_all(engine) # create the tables
    def process_item(self, item, spider):
        if 'job_trigger' in item: # only the detail items carry job_trigger
            jobdetails=Lagoudetails(positionId=item['positionId'],job_trigger=item['job_trigger'], job_description=item['job_description'])
            try:
                self.Sqlsession2.add(jobdetails)
                self.Sqlsession2.commit()
            except Exception as e:
                print(e)
                self.Sqlsession2.rollback() # without a rollback the session is unusable after a failed commit
        return item
        
class DownloadFile(ImagesPipeline): # image-download pipeline; not used in this project
    def get_media_requests(self,item,info):
        for url,filename in zip(item['FileUrl'],item['FileName']):
            cookies={'X_HTTP_TOKEN': '42daf4b72327b2815637417751bf5e71415983ed09', 'user_trace_token': '20191224082925-308fab45-3629-4198-be34-cbb2eb78a270', 'JSESSIONID': 'ABAAABAAAGGABCB228791B35A6AD371A8A3D1C8FF1D6C88', 'SEARCH_ID': 'fd50d5fe208f4940a22a2bd69d76a576'} 
            h3={'User-Agent': 'Opera/9.80 (iPhone; Opera Mini/7.1.32694/27.1407; U; en) Presto/2.8.119 Version/11.10',\
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='}
            yield Request(url,headers=h3,cookies=cookies,meta={'name':filename})
            
    def file_path(self,request,response=None,info=None):
        file_name='E:\\scrapypro\\lagou1\\pic\\'+(request.meta['name'])+'.jpg'
        return file_name
 
    def item_completed(self, results, item, info):
        image_path = [x['path'] for ok, x in results if ok]
        if not image_path:
            raise DropItem('Item contains no images')
        item['image_paths'] = image_path
        return item
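Since positionId is declared unique in both tables, re-running the spider makes every duplicate commit fail and fall into the except branch. A sketch of an alternative process_item for Lagou1Pipeline2 that checks for an existing row first rather than relying on the constraint error (standard SQLAlchemy query, untested against this exact schema):

    def process_item(self, item, spider):
        if 'job_trigger' in item:
            # skip rows already stored, instead of letting the
            # unique constraint raise and be caught
            exists = self.Sqlsession2.query(Lagoudetails).filter_by(
                positionId=item['positionId']).first()
            if exists is None:
                self.Sqlsession2.add(Lagoudetails(
                    positionId=item['positionId'],
                    job_trigger=item['job_trigger'],
                    job_description=item['job_description']))
                try:
                    self.Sqlsession2.commit()
                except Exception as e:
                    self.Sqlsession2.rollback()
                    print(e)
        return item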
Results:

The lagoujob table:
(screenshot)

The lagoudetails table:
(screenshot)

Finally, a MySQL LEFT JOIN merges the two tables into a new one (a left join keeps the positions whose detail pages could not be scraped):

create table lagoujobinfo as
    select lagoujob.*, lagoudetails.job_trigger, lagoudetails.job_description
    from lagoujob
    left join lagoudetails on lagoujob.positionId = lagoudetails.positionId;

(screenshot: the joined lagoujobinfo table)

Lessons learned:
1. When results don't match expectations, add print statements at the relevant steps to see where things go wrong.
2. Very often the failure comes from anti-scraping measures, not from a bug in the code, so the anti-scraping problem has to be solved first.
3. Writing into a single table was tried first, but records kept going missing; to be safe, the data ended up in two tables that are joined into one afterwards.
4. At first nothing was reaching the database at all; the cause turned out to be overly conservative column lengths in the table mappings, so the scraped text was too long to fit.
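On point 4: for long free-text fields such as job_description, SQLAlchemy's Text type (which maps to MySQL TEXT) avoids guessing a length at all. A sketch of a revised mapping (LagoudetailsV2 is a hypothetical name; the class above uses String(1000)):

from sqlalchemy import Column, Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class LagoudetailsV2(Base):  # hypothetical revised mapping
    __tablename__ = 'lagoudetails'
    id = Column(Integer, primary_key=True)
    positionId = Column(String(50), unique=True)
    job_trigger = Column(String(200))
    job_description = Column(Text)  # TEXT column: no fixed length to outgrow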
