工具:需要执行的脚本文件(.py文件)、Linux服务器、本地主机、Xshell连接工具(前提是Linux的网络配置都已设好)
quit() 或者 Ctrl-D 退出 python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 作者:陈龙
# 时间:2016-7-22
# 名称:script_broadPublic_latestNews最新动态Key-Value脚本
# 功能:从数据库取和筛选微博微信报刊消息,并将数据写入redis
import os
import sys
import datetime,time
import cx_Oracle #引入cx_Oracle模块,Oracle接口
import redis
import json
import ConfigParser
# NLS_LANG is the Oracle client's locale/character-set setting; this value
# selects Simplified Chinese with UTF-8 so Chinese text is exchanged as UTF-8.
os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'
# Python 2 only: make utf-8 the interpreter-wide default encoding.
# reload(sys) is needed because sys.setdefaultencoding is removed from the
# sys module at interpreter start-up and only reappears after a reload.
# NOTE(review): the unicode(...) conversions in list_to_object appear to rely
# on this default when the driver returns byte strings - confirm before removing.
reload(sys)
sys.setdefaultencoding('utf-8')
def conn_redis(redis_host,redis_port,redis_db):
    """Create the module-level redis client and verify the server is reachable.

    The client is stored in the global ``redis_connect`` that the rest of the
    script uses.

    Args:
        redis_host: host name or IP address of the redis server.
        redis_port: TCP port of the redis server.
        redis_db: index of the redis database to select.

    Exits the process with status 1 (after printing a diagnostic) when the
    server cannot be reached.
    """
    global redis_connect
    try:
        redis_connect = redis.Redis(host=redis_host, port=redis_port, db=redis_db)
        # redis.Redis() only builds a connection pool; nothing is sent to the
        # server until the first command. PING forces a real round trip so a
        # wrong host/port/db is detected here instead of at the first write.
        redis_connect.ping()
    except (redis.RedisError, IOError) as err:
        # redis-py reports connection problems as redis.RedisError subclasses;
        # IOError is kept so anything the old handler caught is still caught.
        print('连接不上redis,请检查connect参数是否正确,或者模块版本是否匹配')
        print(err)
        sys.exit(1)
    print('redis 连接成功')
def connect_Oracle(name, password, address):
    """Open the module-level Oracle connection used by the script.

    The connection is stored in the global ``Oracle_connect``.

    Args:
        name: database account name.
        password: password of that account.
        address: connect string; the caller builds it as "host:port/service".

    Exits the process with status 1 (after printing a diagnostic) when the
    connection cannot be established.
    """
    global Oracle_connect
    try:
        Oracle_connect = cx_Oracle.connect(name, password, address)
    except (cx_Oracle.Error, IOError) as err:
        # cx_Oracle signals connection failures with cx_Oracle.Error
        # subclasses (e.g. DatabaseError), never IOError, so the original
        # handler could not fire. IOError is kept for backward compatibility.
        print('ES数据同步脚本连接不上数据库,请检查connect参数是否正确,或者模块版本是否匹配')
        print(err)
        sys.exit(1)
    print('Oracle连接成功')
def tuple_to_list(list):
    """Return the first element of every row as a new list.

    Args:
        list: iterable of indexable rows, e.g. the one-column tuples returned
            by a cursor's ``fetchall()``. The parameter name shadows the
            builtin but is kept so keyword callers keep working.

    Returns:
        A new list holding ``row[0]`` for each row, in the original order.
    """
    return [row[0] for row in list]
def source_type(accountcode):
    """Return the source-type prefix of an account code.

    The prefix is the text before the first underscore with surrounding
    whitespace removed, e.g. ``'8_HLJ_HLJRB_00'`` -> ``'8'``.

    Args:
        accountcode: account code string.

    Returns:
        The stripped prefix. A code without any underscore is returned whole
        (stripped); the previous ``find``-based slice used index -1 in that
        case and silently dropped the last character.
    """
    prefix, _, _ = accountcode.partition('_')
    return prefix.strip()
def list_to_object(list):
    """Serialize one database row as the JSON text that is pushed to redis.

    The source type is the prefix of the account code in ``list[1]``:

    * ``'8'``: the row is treated as a T_PRESS row; the four counters are 0
      and ``layoutNum`` comes from ``list[6]``.
    * ``'1'`` or ``'2'``: the row is treated as a T_SOCIAL row; the counters
      come from ``list[6]`` .. ``list[9]`` and ``layoutNum`` is 0.
    * anything else: an empty JSON object is produced.

    Args:
        list: one fetched row (indexable). The name shadows the builtin but
            is part of the existing interface.

    Returns:
        The JSON document as produced by ``json.dumps(..., ensure_ascii=False)``.
    """
    row = list
    kind = source_type(row[1])

    if kind == '8':
        table_name = 'T_PRESS'
    elif kind in ('1', '2'):
        table_name = 'T_SOCIAL'
    else:
        # Unknown source type: nothing is filled in.
        return json.dumps({}, ensure_ascii=False)

    # Keys are inserted in a fixed order so the serialized output does not
    # depend on which branch built the record.
    record = {}
    record['tableName'] = table_name
    record['tableId'] = row[0]
    record['accountCode'] = row[1]
    record['title'] = unicode(row[2])
    record['author'] = unicode(row[3])
    record['publishTime'] = str(row[4])
    record['url'] = row[5]
    record['content'] = ''

    if kind == '8':
        counters = (0, 0, 0, 0)
        layout = row[6]
    else:
        counters = (row[6], row[7], row[8], row[9])
        layout = 0

    counter_keys = ('commentNum', 'forwardNum', 'likeNum', 'browseNum')
    for key, value in zip(counter_keys, counters):
        record[key] = value
    record['layoutNum'] = layout
    record['sourceType'] = kind

    return json.dumps(record, ensure_ascii=False)
def date_now():
    """Return the current local date formatted as 'YYYY-MM-DD'."""
    return datetime.datetime.now().strftime('%Y-%m-%d')
#创建报刊消息类
# class press_class:
# tableName = 'T_PRESS'
# tableId = ''
# accountCode = ''
# title = ''
# author = ''
# publishTime = ''
# url = ''
# content = ''
# commentNum = 0
# forwardNum = 0
# likeNum = 0
# browseNum = 0
# layoutNum = ''
# sourceType = 8
# def __init__(self,list):
# self.tableId = list[0]
# self.accountCode = list[1]
# self.title = unicode(list[2])
# self.author = unicode(list[3])
# self.publishTime = str(list[4])
# self.url = list[5]
# # self.content = ''
# self.layoutNum = list[6]
"""
2、 读黑龙江日报头版头条,保证其位于消息第一条。
3、 微博、微信(除去8个)取当天的消息按时间排序,排完后不够39条数的就取目前的N条。
4、 剩下的39-N条消息,取其他报刊的头版头条。
5、 清除redis之前的数据,将最新40条数据写入value。
"""
def _pick_latest_rows(press_rows, social_rows, limit):
    """Merge press headlines and social rows into at most ``limit`` rows.

    Press rows (if any) come first and always contribute at least the first
    one; social rows fill the list up to ``limit``; when there are too few
    social rows the remaining slots go to further press rows.
    """
    if limit <= 0:
        return []
    if not press_rows:
        return list(social_rows[:limit])
    # One slot is always reserved for the leading headline; every slot the
    # social rows cannot fill is handed to the press rows as well.
    press_quota = max(1, limit - len(social_rows))
    chosen_press = list(press_rows[:press_quota])
    return chosen_press + list(social_rows[:limit - len(chosen_press)])


def readOracle_writeRedis(redisKey, N, publish_date=None):
    """Push up to N latest-news JSON documents onto the redis list ``redisKey``.

    Selection rules:

    1. Front-page headlines (newsOrder = '1', layoutOrder = '1') of the given
       day are read from T_PRESS; the query orders account
       '8_HLJ_HLJRB_00' first.
    2. Rows of the given day are read from T_social for the accounts
       selected by the sub-query on T_BASE_ACCOUNT, newest spiderTime first.
    3. The first press headline is written first, then social rows; if there
       are fewer than N-1 social rows the free slots are filled with further
       press headlines (placed before the social rows).
    4. Without press rows only social rows are written; without any rows a
       message is printed and nothing is written.

    Changes compared with the previous implementation:

    * Both queries use ``publish_date`` (default: today) through a bind
      variable instead of the hard-coded test date '2016-08-12'.
    * When social rows are scarce the number of press rows is
      ``min(len(press), N - len(social))``; it used to be one fewer, which
      dropped the headline entirely when only one press row existed.
    * The unused account-code query was removed.
    * The cursor and the Oracle connection are closed even if a query or a
      redis write raises.

    Args:
        redisKey: name of the redis list to append to (the caller clears it).
        N: maximum number of documents to write.
        publish_date: day to query as 'YYYY-MM-DD'; ``None`` means today.

    Side effects: uses the globals ``Oracle_connect`` and ``redis_connect``
    and closes ``Oracle_connect`` before returning.
    """
    if publish_date is None:
        publish_date = date_now()
    binds = {'publish_date': publish_date}

    # decode(...) yields '1' for the listed account and NULL otherwise; the
    # ascending sort puts NULLs last, so that account's row comes first.
    press_sql = (
        "select T_PRESS_ID,accountCode,title,author,PublishDate,url,layoutOrder "
        "from T_PRESS "
        "where to_char(publishDate,'YYYY-MM-DD') = :publish_date "
        "and newsOrder = '1' and layoutOrder = '1' "
        "order by decode(ACCOUNTCODE,'8_HLJ_HLJRB_00','1')"
    )
    social_sql = (
        "select T_Social_ID,AccountCode,Title,Author,PublishTime,Url,"
        "CommentNum,ForwardNum,LikeNum,BrowseNum "
        "from T_social "
        "where AccountCode in (select accountCode from T_BASE_ACCOUNT "
        "where type = 1 or (type = 2 and WeechatType = 0)) "
        "and PublishDate = to_date(:publish_date,'YYYY-MM-DD') "
        "order by spiderTime desc"
    )

    cur = Oracle_connect.cursor()
    try:
        # fetchmany(N) instead of fetchall(): never pull more than needed.
        cur.execute(press_sql, binds)
        press_lists = cur.fetchmany(N)
        cur.execute(social_sql, binds)
        social_lists = cur.fetchmany(N)

        rows = _pick_latest_rows(press_lists, social_lists, N)
        if not rows:
            print('no data availablem')
        # Rows are serialized while the connection is still open, in case a
        # column value needs the connection to be read.
        for row in rows:
            redis_connect.rpush(redisKey, list_to_object(row))
    finally:
        cur.close()
        Oracle_connect.close()
if __name__ == "__main__":
    # Entry point: read the connection settings, connect to redis and Oracle,
    # clear the previous list and write the newest 40 documents into it.
    cf = ConfigParser.ConfigParser()
    cf.read("/usr/local/TrinityAres-POM-scripts/conf.ini")

    # redis settings; port and db are numeric, so read them as integers.
    redis_url = cf.get("redis","url")
    redis_port = cf.getint("redis","port")
    redis_db = cf.getint("redis", "db")
    conn_redis(redis_url, redis_port, redis_db)

    # Oracle settings; the connect string has the form host:port/service.
    oracle_url = cf.get("oracle","url")
    oracle_port = cf.get("oracle","port")
    oracle_serve = cf.get("oracle", "serve")
    oracle_account = cf.get("oracle", "account")
    oracle_password = cf.get("oracle", "password")
    connect_Oracle(oracle_account,oracle_password,"{}:{}/{}".format(oracle_url,oracle_port,oracle_serve))

    redis_key = 'broadPublic_latestNews'
    # DEL on a missing key is a no-op, so no EXISTS check is needed.
    redis_connect.delete(redis_key)
    readOracle_writeRedis(redis_key,40)  # write up to 40 documents
    print("{}消息数: {}".format(redis_key,redis_connect.llen(redis_key)))

    # Release only this script's own connections. The previous code killed
    # the first entry of CLIENT LIST, which is whatever client the server
    # lists first - possibly another application's connection.
    redis_connect.connection_pool.disconnect()
# Cron mail is disabled, so the log files below are the only record of a run.
MAILTO=""
# Run every 10 minutes during hours 7-22. "2>&1" sends stderr (Python
# tracebacks) to the same log file; without it errors are silently discarded.
*/10 7-22 * * * /root/.pyenv/versions/2.7.10/bin/python /usr/local/TrinityAres-POM-scripts/script_broadPublic_latestNews.py >> /usr/local/TrinityAres-POM-scripts/log/broadPublic_latestNews.log 2>&1
*/10 7-22 * * * /root/.pyenv/versions/2.7.10/bin/python /usr/local/TrinityAres-POM-scripts/script_hotTopics_news.py >> /usr/local/TrinityAres-POM-scripts/log/hotTopics_news.log 2>&1
# Description
包含的文件和功能介绍:
1 conf.ini 配置文件。配置elasticsearch、redis、oracle的链接地址和端口等
2 script_broadPublic_latestNews.py 广电宣传最新动态Key-Value脚本
3 script_hotTopics_video.py 热播视频Key-Value脚本
4 script_hotTopics_news.py 热点新闻Key-Value脚本
5 script_hotTopics_post.py 热门帖子Key-Value脚本
6 script_hotTopics_words.py 热搜词汇Key-Value脚本
7 script_involve_ceefax_hot.py 涉我图文资讯热度Key-Value脚本
8 script_involve_ceefax_time.py 涉我图文资讯时间Key-Value脚本
9 script_involve_ceefax_type.py 涉我图文资讯类型Key-Value脚本
10 script_involve_video.py 涉我视音频Key-Value脚本
11 script_sensitive_publicOpinion_hot.py 敏感舆情热度Key-Value脚本
12 script_sensitive_publicOpinion_time.py 敏感舆情时间Key-Value脚本
13 script_infoCount_CarryDistribut.py 载体分布Key-Value脚本
14 script_infoCount_CollectionTrend.py 采集趋势Key-Value脚本
# How to use these scripts
-->将所有的文件上传到Linux服务器/usr/local/TrinityAres-POM-scripts文件下
-->在Linux服务器上安装python2.7.10 或更高的2.7.x版本(脚本使用Python 2语法,例如print语句和ConfigParser模块,不能在Python 3下运行)。并且修改相应的环境变量,查看当前python路径,确保修改成功
-->通过pip命令安装cx_Oracle包(需要先安装oracle客户端并修改oracle环境变量:oracle-instantclient11.2-basic-11.2.0.1.0-1.x86_64.rpm和oracle-instantclient11.2-devel-11.2.0.1.0-1.x86_64.rpm),安装elasticsearch包,安装redis包。并检查是否安装成功
-->检查配置文件conf.ini。各个脚本中会调用该文件,实现对elasticsearch、redis、oracle的操作。注意各个参数是否正确。在脚本中调用配置文件是绝对路径
-->在Linux下打开crontab文件进行编辑,将crontab.txt的内容拷贝到该文件中,保存退出
-->重新启动crond服务 ,并查看定时任务是否生效。在Linux服务器中通过crontab定时执行任务的方法实现对.py 脚本的定时启动
-->错误日志位于crontab.txt 中描述的文件/usr/local/TrinityAres-POM-scripts/log 下。查看各个脚本运行的结果状态
# Detailed description
2对应于界面中广电宣传-->最新动态 消息的筛选和排序(oracle、redis)
3对应于界面中热点聚焦-->热播视频 消息的筛选和排序(oracle、redis)
4对应于界面中热点聚焦-->热点新闻 消息的筛选和排序(oracle、redis)
5对应于界面中热点聚焦-->热门帖子 消息的筛选和排序(oracle、redis)
6对应于界面中热点聚焦-->热搜词汇(词云) 的筛选和赋值(oracle、redis)
7对应于界面中涉我内容-->涉我图文资讯热度 的筛选和排序(elasticsearch、redis)
8对应于界面中涉我内容-->涉我图文资讯时间 的筛选和排序(elasticsearch、redis)
9对应于界面中涉我内容-->涉我图文资讯类型 的筛选和排序(elasticsearch、redis)
10对应于界面中涉我内容-->涉我视音频 消息的筛选和排序(elasticsearch、redis)
11对应于界面中敏感舆情-->敏感舆情热度 消息的筛选和排序(elasticsearch、redis)
12对应于界面中敏感舆情-->敏感舆情时间 消息的筛选和排序(elasticsearch、redis)
13对应于界面中首页-->载体分布图 数量统计(oracle、redis)
14对应于界面中首页-->采集趋势 数量统计(oracle、redis)
# More information
AUTHOR:陈龙
TIME:2016/8/18
EMAIL:[email protected]