# -*- coding: utf-8 -*-
"""
@Software : win10 Python 3.6.3 |Anaconda, Inc.| (default, Oct 15 2017, 03:27:45) [MSC v.1900 64 bit (AMD64)] on win32
@IDE--Env : PyCharm
@Time : 2020/7/29 15:49
@Author : DELL--bob
@Contact : [email protected]
@File : runcmd_58tc.py
@Version : 1.0.0
@Desc :
@LastTime:
"""
import os
import platform
import requests
import json
import multiprocessing as mp
from datetime import datetime, timedelta
from apscheduler.schedulers.twisted import TwistedScheduler
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.events import EVENT_JOB_EXECUTED, EVENT_JOB_ERROR
from apscheduler.triggers.cron import CronTrigger
# scheduler = TwistedScheduler()
scheduler = BlockingScheduler()
import logging
from scrapy import cmdline
from scrapy.utils.log import configure_logging
configure_logging()
print = logging.warning  ## route print() through logging.warning so its output lands in the log
def ding_push_message(msg):
    """Push a message to a DingTalk group robot."""
    msg = 'spider: ' + msg  ## the keyword 'spider' must be present to pass the robot's keyword filter
    url = 'https://oapi.dingtalk.com/robot/send?access_token=xxxxx'
    # build the request headers
    headers = {"Content-Type": "application/json", "Charset": "UTF-8"}
    # build the request payload
    message = {"msgtype": "text", "text": {"content": msg},
               # atMobiles: phone numbers to @-mention; "isAtAll": True @-mentions everyone
               "at": {"atMobiles": ['188xxxxxxxxx'], "isAtAll": False}
               }
    ding_res = requests.post(url=url, data=json.dumps(message), headers=headers)
    print(ding_res.text)
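# If the DingTalk robot uses the "sign" security setting instead of a keyword,
# the webhook call must carry a millisecond timestamp and an HMAC-SHA256 signature.
# A minimal sketch, assuming a placeholder secret; the function name below is
# illustrative and not part of the original script:
import base64
import hashlib
import hmac
import time
import urllib.parse

def ding_push_message_signed(msg, secret='SECxxxxx'):
    """Sketch: push via a signed DingTalk webhook (access_token/secret are placeholders)."""
    timestamp = str(round(time.time() * 1000))  # milliseconds, as DingTalk expects
    string_to_sign = f'{timestamp}\n{secret}'
    digest = hmac.new(secret.encode('utf-8'), string_to_sign.encode('utf-8'),
                      digestmod=hashlib.sha256).digest()
    sign = urllib.parse.quote_plus(base64.b64encode(digest))
    url = ('https://oapi.dingtalk.com/robot/send?access_token=xxxxx'
           f'&timestamp={timestamp}&sign={sign}')
    message = {"msgtype": "text", "text": {"content": 'spider: ' + msg}}
    return requests.post(url=url, data=json.dumps(message),
                         headers={"Content-Type": "application/json", "Charset": "UTF-8"})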
def apscheduler_listener(event):
    """Listener for scheduler job events; pushes the outcome to DingTalk."""
    if event.exception:
        # print(event.traceback)
        # print(event.exception)
        print('Job raised an error!')
        ding_msg = f"""Your job finished (with errors),\nspider: {__file__}
\n{event.traceback}{event.exception}
\n{datetime.now()}"""
        print(ding_msg)
        ding_push_message(ding_msg)
    else:
        print('Job ran normally...')
        # on success, event.exception/event.traceback are None, so they are left out
        ding_msg = f"""Your job finished (no errors),\nspider: {__file__}
\n{datetime.now()}"""
        print(ding_msg)
        ding_push_message(ding_msg)
def run():
    start = datetime.now()
    print(start)
    spidernames = [
        "spiders_xq_58tc_areahref_scrapy",
        "spiders_xq_58tc_scrapy"
    ]
    spidernames = spidernames[:1]  ## testing only; comment this line out in production
    for spidername in spidernames:
        system_current = platform.system()
        """Pick the scrapy.cfg [settings] entry for the current OS, to separate the production (Linux) and local (Windows) environments."""
        if system_current == 'Windows':
            # a = os.system('export SCRAPY_PROJECT=company')
            # srcip = os.environ['SRCIP'].strip('\r')
            os.environ['SCRAPY_PROJECT'] = "default"
            logging.warning(f'##*current_scrapy.cfg,win,{os.getenv("SCRAPY_PROJECT")}')
        else:
            os.environ['SCRAPY_PROJECT'] = "companyLinux"
            logging.warning(f'##*current_scrapy.cfg,companyLinux,{os.getenv("SCRAPY_PROJECT")}')
            print(f'current environment: companyLinux,{os.getenv("SCRAPY_PROJECT")}')
        ## 01 cmdline: Scrapy's built-in runner; does not support redis-backed scheduling.
        ## Note: cmdline.execute() ends with sys.exit(), so only the first spider in the
        ## list actually runs and the timing code below is never reached.
        cmdline.execute(f"scrapy crawl {spidername}".split())
        ## 02 system shell: supports redis-backed scheduling
        # os.popen(r'python D:\\home\\baseuser\\spider\\spider\\BD\\spiders-gd-placeapi-base-site-lat-lon-bd-geohash-get-scrapy-redis\\sf_newrates\\runcmd无定时.py ', 'r')
        # os.system(r'python -u D:\home\baseuser\spider\spider\BD\spiders-gd-placeapi-base-site-lat-lon-bd-geohash-get-scrapy-redis\sf_newrates\runcmd无定时.py')
        # os.system(f'scrapy crawl {spidername}')
    end = datetime.now()
    print(end)
    print(end - start)  # elapsed time
    print("project finished-----")
def thread_main():
    print("thread")
    pool = mp.Pool(processes=3)  # at most 3 tasks run at the same time
    # running run() in a worker process keeps cmdline.execute()'s sys.exit()
    # from killing the parent process
    pool.apply_async(run, )
    # pool.apply(run, )
    pool.close()
    pool.join()
    print("All's done!")
if __name__ == '__main__':
    # scheduler.add_job(run, trigger=CronTrigger.from_crontab('48 9 * * *'), misfire_grace_time=120)  ## runs daily at 09:48
    # scheduler.add_listener(apscheduler_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
    # scheduler._logger = logging
    # scheduler.start()
    run()
## References:
## - How to switch settings files in a Scrapy project via scrapy.cfg
## - Setting environment variables in Python (temporary vs. permanent)
## - https://www.osgeo.cn/scrapy/topics/commands.html
## - Using shell-exported variables from Python
## - Scrapy: practical uses of the command-line tool
## - Four ways to run Linux system commands from Python
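## For reference, a minimal scrapy.cfg sketch showing how the SCRAPY_PROJECT
## environment variable selects a [settings] entry (module paths are placeholders):
##
##   [settings]
##   default = myproject.settings            # picked when SCRAPY_PROJECT=default (Windows branch)
##   companyLinux = myproject.settings_prod  # picked when SCRAPY_PROJECT=companyLinux (Linux branch)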