scrapy.cfg: switching Scrapy settings files to separate the production and local environments

Launching Scrapy from a Python script
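
The switch lives in scrapy.cfg: the [settings] section can hold several named entries, and Scrapy picks one according to the SCRAPY_PROJECT environment variable (falling back to "default"). A minimal sketch, where the module paths myproject.settings and myproject.settings_company are placeholders rather than the author's real modules:

# scrapy.cfg
[settings]
# local / Windows run (SCRAPY_PROJECT=default, also the fallback)
default = myproject.settings
# production / Linux run (SCRAPY_PROJECT=companyLinux)
companyLinux = myproject.settings_company

The script below sets SCRAPY_PROJECT before starting the crawl, so the matching settings module is resolved per OS.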


# -*- coding: utf-8 -*-

"""
@Softwhare:win10  Python 3.6.3 |Anaconda, Inc.| (default, Oct 15 2017, 03:27:45) [MSC v.1900 64 bit (AMD64)] on win32
@IDE--Env : PyCharm--
@Time    : 2020/7/29 15:49
@Author  : DELL--bob
@connect : [email protected]
@File    : runcmd_58tc.py
@Version :  1.0.0 
@Desc    :
@LastTime: 
"""


import os
import platform
import requests
import json
import multiprocessing as mp
from datetime import datetime, timedelta
from apscheduler.schedulers.twisted import TwistedScheduler
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.events import EVENT_JOB_EXECUTED, EVENT_JOB_ERROR
from apscheduler.triggers.cron import CronTrigger
# scheduler = TwistedScheduler()
scheduler = BlockingScheduler()

import logging
from scrapy import cmdline
from scrapy.utils.log import configure_logging
configure_logging()
print = logging.warning  ## route print() output into the log


def ding_push_message(msg):
    """Push a message to a DingTalk group robot."""
    msg = 'spider: ' + msg  ## the keyword 'spider' is required by the robot's keyword filter
    url = 'https://oapi.dingtalk.com/robot/send?access_token=xxxxx'
    # build the request headers
    headers = {"Content-Type": "application/json", "Charset": "UTF-8"}
    # build the request payload
    message = {"msgtype": "text", "text": {"content": msg},
               # atMobiles: phone numbers to @-mention; "isAtAll": True would @ everyone
               "at": {"atMobiles": ['188xxxxxxxxx'], "isAtAll": False}
               }
    ding_res = requests.post(url=url, data=json.dumps(message), headers=headers)
    print(ding_res.text)
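
# Delivery check (a sketch, assumed response shape): the robot API is expected
# to answer with JSON like {"errcode": 0, "errmsg": "ok"} on success, so a
# caller could verify the push:
#     if ding_res.json().get("errcode") != 0:
#         print(f"DingTalk push failed: {ding_res.text}")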

def apscheduler_listener(event):
    """Listener for scheduled job events."""
    if event.exception:
        # print(event.traceback)
        # print(event.exception)
        print('Job failed!')
        ding_msg = f"""Your job finished (with errors),\nspider: {(__file__)}
                        \n{event.traceback}{event.exception}
                        \n{datetime.now()}"""
        print(ding_msg)
        ding_push_message(ding_msg)
    else:
        print('Job ran normally...')
        # event.traceback and event.exception are None on success, so leave them out
        ding_msg = f"""Your job finished (no errors),\nspider: {(__file__)}
                        \n{datetime.now()}"""
        print(ding_msg)
        ding_push_message(ding_msg)


def run():
    start = datetime.now()
    print(start)
    spidernames = [
                "spiders_xq_58tc_areahref_scrapy",
                "spiders_xq_58tc_scrapy"
    ]
    spidernames = spidernames[:1]  ## testing only; comment this out for production runs
    for spidername in spidernames:
        system_current = platform.system()
        """判断当前系统,选用不同的scrapy settings配置文件,区分生产,本地环境"""
        if (system_current == 'Windows'):
            # a = os.system('export SCRAPY_PROJECT=company')
            # srcip = os.environ['SRCIP'].strip('\r')
            os.environ['SCRAPY_PROJECT'] = "default"
            logging.warning(f'##*current_scrapy.cfg,win,{os.getenv("SCRAPY_PROJECT")}')
        else:
            os.environ['SCRAPY_PROJECT'] = "companyLinux"
            logging.warning(f'##*current_scrapy.cfg,companyLinux,{os.getenv("SCRAPY_PROJECT")}')
            print(f'当前环境companyLinux,{os.getenv("SCRAPY_PROJECT")}')
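
        ## Why os.environ and not `export` (sketch of the reasoning): a line like
        ##     os.system('export SCRAPY_PROJECT=companyLinux')
        ## sets the variable in a throwaway subshell and is lost on return, while
        ## os.environ changes this process and is inherited by the `scrapy crawl`
        ## child started below.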

        ##01 cmdline: Scrapy's built-in launcher; does not support redis-based scheduling.
        ## Note: cmdline.execute() calls sys.exit() once the crawl finishes, so only
        ## the first spider in the loop actually runs this way.
        cmdline.execute(f"scrapy crawl {spidername}".split())

        #02 system shell: supports redis-based scheduling
        # os.popen(r'python  D:\\home\\baseuser\\spider\\spider\\BD\\spiders-gd-placeapi-base-site-lat-lon-bd-geohash-get-scrapy-redis\\sf_newrates\\runcmd无定时.py ', 'r')
        # os.system(r'python  -u D:\home\baseuser\spider\spider\BD\spiders-gd-placeapi-base-site-lat-lon-bd-geohash-get-scrapy-redis\sf_newrates\runcmd无定时.py')
        # os.system(f'scrapy crawl {spidername}')
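
        ##03 CrawlerProcess (a sketch, not the author's method): runs spiders
        ## in-process without the sys.exit() of cmdline.execute(); queue every
        ## spider first and call start() once, since the reactor cannot restart.
        # from scrapy.crawler import CrawlerProcess
        # from scrapy.utils.project import get_project_settings
        # process = CrawlerProcess(get_project_settings())  # honours SCRAPY_PROJECT
        # for name in spidernames:
        #     process.crawl(name)
        # process.start()  # blocks until all queued crawls finish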

    end = datetime.now()
    print(end)
    print(end - start)  # elapsed time
    print("Job finished -----")

def thread_main():
    print("process pool")
    pool = mp.Pool(processes=3)  # at most 3 tasks run at the same time
    pool.apply_async(run)
    # pool.apply(run)  # blocking variant
    pool.close()
    pool.join()
    print("All done!")



if __name__ == '__main__':

    # scheduler.add_job(run, trigger=CronTrigger.from_crontab('48 9 * * *'), misfire_grace_time=120)  ## run daily at 09:48
    # scheduler.add_listener(apscheduler_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
    # scheduler._logger = logging
    # scheduler.start()
    run()


