https://www.elastic.co/products/logstash#
logstash -e ""
input {...}
filter {...}
output {...}
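With -e, the pipeline configuration is passed directly on the command line; the empty string above makes Logstash fall back to a default stdin input and stdout output. A minimal sketch that fills in the three-section skeleton explicitly:
logstash -e 'input { stdin {} } output { stdout { codec => rubydebug } }'
Typing a line into the terminal should then print it back as a structured event.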
input {
file { path =>"/var/log/messages" type =>"syslog"}
file { path =>"/var/log/apache/access.log" type =>"apache"}
}
input {
file {
#Path of the file to watch; note that it must be an absolute path
path => "E:/software/logstash-1.5.4/logstash-1.5.4/data/test.log"
start_position => "beginning"
}
}
filter {
}
output {
stdout {}
}
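To run this pipeline, save the configuration to a file and point Logstash at it with -f; a minimal sketch, assuming the file is named file.conf (the name is illustrative):
logstash -f file.conf
Once the pipeline is running, lines appended to test.log, such as the three below, are echoed to stdout.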
hello,this is first line in test.log!
hello,my name is xingoo!
goodbye.this is last line in test.log!
input {
file {
path => "E:/software/logstash-1.5.4/logstash-1.5.4/data/*"
}
}
filter {
}
output {
stdout {}
}
input {
file {
path => ["E:/software/logstash-1.5.4/logstash-1.5.4/data/*","F:/test.txt"]
}
}
filter {
}
output {
stdout {}
}
input {
file {
#Paths of the files to watch
path => ["E:/software/logstash-1.5.4/logstash-1.5.4/data/*","F:/test.txt"]
#Files to exclude from watching
exclude => "1.log"
#Add a custom field
add_field => {"test"=>"test"}
#Add a tag
tags => "tag1"
#Delimiter that marks the end of an event
delimiter => "\n"
#How often (in seconds) to scan the directories for new files
discover_interval => 15
#How often (in seconds) to check whether watched files have been modified
stat_interval => 1
#Where to start reading a file; the default is end
start_position => "beginning"
#File in which the read positions (sincedb) are recorded
sincedb_path => "E:/software/logstash-1.5.4/logstash-1.5.4/test.txt"
#How often (in seconds) to write the read positions
sincedb_write_interval => 15
}
}
filter {
}
output {
stdout {}
}
file {
add_field => {"test"=>"test"}
path => "D:/tools/logstash/path/to/groksample.log"
start_position => "beginning"
}
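The groksample.log input above is normally paired with a grok filter that parses each raw line into structured fields; a minimal sketch, assuming the sample file contains Apache-style access log lines (an assumption, since the file's format is not shown here):
filter {
grok {
# %{COMBINEDAPACHELOG} is a built-in pattern; replace it with one matching your actual log format
match => { "message" => "%{COMBINEDAPACHELOG}" }
}
}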
Start ZooKeeper:
$zookeeper/bin/zkServer.sh start
Start Kafka:
$kafka/bin/kafka-server-start.sh $kafka/config/server.properties &
Create a topic:
$kafka/bin/kafka-topics.sh --zookeeper 127.0.0.1:2181 --create --topic hello --replication-factor 1 --partitions 1
Describe the topics:
$kafka/bin/kafka-topics.sh --zookeeper 127.0.0.1:2181 --describe
Run the console producer:
$kafka/bin/kafka-console-producer.sh --broker-list 10.0.67.101:9092 --topic hello
Run the console consumer to check whether the messages were written:
$kafka/bin/kafka-console-consumer.sh --zookeeper 127.0.0.1:2181 --from-beginning --topic hello
input{
stdin{}
}
output{
kafka{
topic_id => "hello"
bootstrap_servers => "192.168.0.4:9092,172.16.0.12:9092"
# Kafka broker addresses
batch_size => 5
codec => plain {
format => "%{message}"
charset => "UTF-8"
}
}
stdout{
codec => rubydebug
}
}
input{
kafka {
codec => "plain"
group_id => "logstash1"
auto_offset_reset => "smallest"
reset_beginning => true
topic_id => "hello"
zk_connect => "192.168.0.5:2181"
}
}
output{
stdout{
codec => rubydebug
}
}
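The two pipelines above can then be started in separate terminals; a sketch, assuming the stdin-to-Kafka configuration was saved as kafka-producer.conf and the Kafka-to-stdout configuration as kafka-consumer.conf (both file names are illustrative):
bin/logstash -f kafka-producer.conf
bin/logstash -f kafka-consumer.conf
Lines typed into the producer terminal should show up, rubydebug-formatted, in the consumer terminal.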
vi /bigdata/elasticsearch-2.3.1/config/elasticsearch.yml
#Cluster name; nodes with the same cluster name discover each other and join the same cluster
cluster.name: bigdata
#Node name; must be unique
node.name: es-1
#Where the data is stored
path.data: /data/es/data
#Where the logs are stored
path.logs: /data/es/logs
#IP address Elasticsearch binds to
network.host: 172.16.0.14
#Nodes that can take part in master election during discovery
discovery.zen.ping.unicast.hosts: ["node-4.itcast.cn", "node-5.itcast.cn", "node-6.itcast.cn"]
scp -r elasticsearch-2.3.1/ node-5.itcast.cn:$PWD
scp -r elasticsearch-2.3.1/ node-6.itcast.cn:$PWD
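After copying the installation, node.name must be changed on every machine (it has to be unique, as noted above), and network.host should point at that machine's own address; a sketch of the overrides for the second node (the IP is an assumption following the 172.16.0.x numbering used elsewhere in these notes):
#on node-5.itcast.cn
node.name: es-2
network.host: 172.16.0.15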
/bigdata/elasticsearch-2.3.1/bin/elasticsearch -d
http://172.16.0.14:9200/
{
"name" : "es-1",
"cluster_name" : "bigdata",
"version" : {
"number" : "2.3.1",
"build_hash" : "bd980929010aef404e7cb0843e61d0665269fc39",
"build_timestamp" : "2016-04-04T12:25:05Z",
"build_snapshot" : false,
"lucene_version" : "5.5.0"
},
"tagline" : "You Know, for Search"
}
kill `ps -ef | grep Elasticsearch | grep -v grep | awk '{print $2}'`
curl -XPUT 'http://172.16.0.14:9200/store/books/1' -d '{
"title": "Elasticsearch: The Definitive Guide",
"name" : {
"first" : "Zachary",
"last" : "Tong"
},
"publish_date":"2015-02-06",
"price":"49.99"
}'
#Query via the browser
http://172.16.0.14:9200/store/books/1
#Query with curl on Linux
curl -XGET 'http://172.16.0.14:9200/store/books/1'
#Add another book
curl -XPUT 'http://172.16.0.14:9200/store/books/2' -d '{
"title": "Elasticsearch Blueprints",
"name" : {
"first" : "Vineeth",
"last" : "Mohan"
},
"publish_date":"2015-06-06",
"price":"35.99"
}'
# Get a document by its ID
curl -XGET 'http://172.16.0.14:9200/store/books/1'
# View it in the browser
http://172.16.0.14:9200/store/books/1
# Retrieve only specific fields via _source
curl -XGET 'http://172.16.0.14:9200/store/books/1?_source=title'
curl -XGET 'http://172.16.0.14:9200/store/books/1?_source=title,price'
curl -XGET 'http://172.16.0.14:9200/store/books/1?_source'
curl -XPUT 'http://172.16.0.14:9200/store/books/1' -d '{
"title": "Elasticsearch: The Definitive Guide",
"name" : {
"first" : "Zachary",
"last" : "Tong"
},
"publish_date":"2016-02-06",
"price":"99.99"
}'
curl -XPOST 'http://172.16.0.14:9200/store/books/1/_update' -d '{
"doc": {
"price" : 88.88
}
}'
curl -XGET 'http://172.16.0.14:9200/store/books/1'
#Delete a document
curl -XDELETE 'http://172.16.0.14:9200/store/books/1'
# The simplest filter query
# SELECT * FROM books WHERE price = 35.99
# filtered query: find the book priced at 35.99
curl -XGET 'http://172.16.0.14:9200/store/books/_search' -d '{
"query" : {
"filtered" : {
"query" : {
"match_all" : {}
},
"filter" : {
"term" : {
"price" : 35.99
}
}
}
}
}'
curl -XGET 'http://172.16.0.14:9200/store/books/_search' -d '{
"query" : {
"filtered" : {
"filter" : {
"terms" : {
"price" : [35.99, 88.88]
}
}
}
}
}'
curl -XGET 'http://172.16.0.14:9200/store/books/_search' -d '{
"query" : {
"filtered" : {
"filter" : {
"term" : {
"publish_date" : "2015-02-06"
}
}
}
}
}'
# bool filter: combine several filter conditions
# SELECT * FROM books WHERE (price = 35.99 OR price = 99.99) AND (publish_date != "2016-02-06")
# Similarly, Elasticsearch supports and/or/not-style compound conditions
# The format is as follows:
# {
# "bool" : {
# "must" : [],
# "should" : [],
# "must_not" : []
# }
# }
#
# must: the condition must be satisfied, equivalent to AND
# should: the condition may or may not be satisfied, equivalent to OR
# must_not: the condition must not be satisfied, equivalent to NOT
curl -XGET 'http://172.16.0.14:9200/store/books/_search' -d '{
"query" : {
"filtered" : {
"filter" : {
"bool" : {
"should" : [
{ "term" : {"price" : 35.99}},
{ "term" : {"price" : 99.99}}
],
"must_not" : {
"term" : {"publish_date" : "2016-02-06"}
}
}
}
}
}
}'
curl -XGET 'http://172.16.0.14:9200/store/books/_search' -d '{
"query" : {
"filtered" : {
"filter" : {
"bool" : {
"should" : [
{ "term" : {"price" : 35.99}},
{ "bool" : {
"must" : [
{"term" : {"publish_date" : "2016-02-06"}},
{"term" : {"price" : 99.99}}
]
}}
]
}
}
}
}
}'
# range filter
# SELECT * FROM books WHERE price >= 20 AND price < 100
# gt  : greater than
# lt  : less than
# gte : greater than or equal to
# lte : less than or equal to
curl -XGET 'http://172.16.0.14:9200/store/books/_search' -d '{
"query" : {
"filtered" : {
"filter" : {
"range" : {
"price" : {
"gt" : 20.0,
"lt" : 100
}
}
}
}
}
}'
curl -XGET 'http://172.16.0.14:9200/store/books/_search' -d '{
"query": {
"filtered": {
"filter": {
"and": [
{
"term": {
"price":59.99
}
},
{
"term": {
"publish_date":"2015-02-06"
}
}
]
},
"query": {
"match_all": {}
}
}
}
}'
input {
file {
path => "/var/nginx_logs/*.log"
codec => "json"
discover_interval => 5
start_position => "beginning"
}
}
output {
elasticsearch {
index => "flow-%{+YYYY.MM.dd}"
hosts => ["172.16.0.14:9200", "172.16.0.15:9200", "172.16.0.16:9200"]
}
}
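Once events are flowing, the daily indices created by this pipeline can be checked with the same search API used earlier; a sketch (the wildcard matches all flow-* indices):
curl -XGET 'http://172.16.0.14:9200/flow-*/_search?pretty&size=1'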
input {
kafka {
type => "accesslogs"
codec => "plain"
auto_offset_reset => "smallest"
group_id => "elas1"
topic_id => "accesslogs"
zk_connect => "172.16.0.11:2181,172.16.0.12:2181,172.16.0.13:2181"
}
kafka {
type => "gamelogs"
auto_offset_reset => "smallest"
codec => "plain"
group_id => "elas2"
topic_id => "gamelogs"
zk_connect => "172.16.0.11:2181,172.16.0.12:2181,172.16.0.13:2181"
}
}
filter {
if [type] == "accesslogs" {
json {
source => "message"
remove_field => [ "message" ]
target => "access"
}
}
if [type] == "gamelogs" {
mutate {
split => { "message" => " " }
add_field => {
"event_type" => "%{message[3]}"
"current_map" => "%{message[4]}"
"current_X" => "%{message[5]}"
"current_y" => "%{message[6]}"
"user" => "%{message[7]}"
"item" => "%{message[8]}"
"item_id" => "%{message[9]}"
"current_time" => "%{message[12]}"
}
remove_field => [ "message" ]
}
}
}
output {
if [type] == "accesslogs" {
elasticsearch {
index => "accesslogs"
codec => "json"
hosts => ["172.16.0.14:9200", "172.16.0.15:9200", "172.16.0.16:9200"]
}
}
if [type] == "gamelogs" {
elasticsearch {
index => "gamelogs"
codec => plain {
charset => "UTF-16BE"
}
hosts => ["172.16.0.14:9200", "172.16.0.15:9200", "172.16.0.16:9200"]
}
}
}
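This pipeline assumes the accesslogs and gamelogs topics already exist; a sketch of creating them with the same kafka-topics.sh invocation used earlier (replication factor and partition count are illustrative):
$kafka/bin/kafka-topics.sh --zookeeper 172.16.0.11:2181 --create --topic accesslogs --replication-factor 1 --partitions 1
$kafka/bin/kafka-topics.sh --zookeeper 172.16.0.11:2181 --create --topic gamelogs --replication-factor 1 --partitions 1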
package game
/**
 * Event type codes
 * 0  admin login
 * 1  first login (registration)
 * 2  log in
 * 3  log out
 * 4  level up
 * 5  reserved
 * 6  recycle equipment for yuanbao
 * 7  exchange yuanbao for RMB
 * 8  PK
 * 9  growth quest
 * 10 claim reward
 * 11 divine protection
 * 12 purchase item
 */
object EventType {
val REGISTER = "1"
val LOGIN = "2"
val LOGOUT = "3"
val UPGRADE = "4"
}
package game
import org.apache.commons.lang3.time.FastDateFormat
/**
 * Created by root on 2016/7/10.
 *
 * A singleton object: each Executor process holds exactly one FilterUtils instance,
 * but the tasks inside an Executor run on multiple threads.
 */
object FilterUtils {
//FastDateFormat is used instead of SimpleDateFormat because SimpleDateFormat is not thread-safe
val dateFormat = FastDateFormat.getInstance("yyyy年MM月dd日,E,HH:mm:ss")
def filterByTime(fields: Array[String], startTime: Long, endTime: Long): Boolean = {
val time = fields(1)
val logTime = dateFormat.parse(time).getTime
logTime >= startTime && logTime < endTime
}
def filterByType(fields: Array[String], eventType: String) = {
val _type = fields(0)
_type == eventType
}
def filterByTypeAndTime(fields: Array[String], eventType: String, beginTime: Long, endTime: Long) = {
val _type = fields(0)
val time = fields(1)
val timeLong = dateFormat.parse(time).getTime
_type == eventType && timeLong >= beginTime && timeLong < endTime
}
def filterByTypes(fields: Array[String], eventTypes: String*): Boolean = {
val _type = fields(0)
for(et <- eventTypes){
if(_type == et)
return true
}
false
}
}
package game
import java.text.SimpleDateFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by root on 2016/7/10.
*/
object GameKPI {
def main(args: Array[String]) {
val queryTime = "2016-02-01 00:00:00"
val beginTime = TimeUtils(queryTime)
val endTime = TimeUtils.getCertainDayTime(+1)
//t1 02-02
val t1 = TimeUtils.getCertainDayTime(+1)
//t2 02-03
val t2 = TimeUtils.getCertainDayTime(+2)
val conf = new SparkConf().setAppName("GameKPI").setMaster("local")
//SparkContext, the all-important entry-point object
val sc = new SparkContext(conf)
val splitedLog: RDD[Array[String]] = sc.textFile("c://GameLog.txt").map(_.split("\\|"))
//New users
// splitedLog.filter(_(0) == "1").filter(x => {
// val time = x(1)
// //Bad: each filter call would create a new SimpleDateFormat instance, wasting resources
// val sdf = new SimpleDateFormat("yyyy年MM月dd日,E,HH:mm:ss")
// val timeLong = sdf.parse(time).getTime
// })
//Filter the log and cache the result
val filteredLogs = splitedLog.filter(fields => FilterUtils.filterByTime(fields, beginTime, endTime))
.cache()
//Daily New Users (DNU)
val dnu: RDD[Array[String]] = filteredLogs.filter(arr => FilterUtils.filterByType(arr, EventType.REGISTER))
println(dnu.count())
//Daily Active Users (DAU)
val dau = filteredLogs.filter(arr => FilterUtils.filterByTypes(arr, EventType.REGISTER, EventType.LOGIN))
.map(_(3)).distinct()
println(dau.count())
//User retention (next-day retention)
val dnuMap: RDD[(String, Int)] = dnu.map(arr =>(arr(3), 1))
val d2Login: RDD[Array[String]] = splitedLog.filter(arr => FilterUtils.filterByTypeAndTime(arr, EventType.LOGIN, t1, t2))
val d2UnameMap: RDD[(String, Int)] = d2Login.map(_(3)).distinct().map((_, 1))
// Retention ratio: let A be the number of new users in a given period; the share of A still active after some time is the retention ratio
// Day-1 retention ratio: the share of daily new users who log in again on day +1
val d1rr: RDD[(String, (Int, Int))] = dnuMap.join(d2UnameMap)
val rdda = sc.parallelize(Array("a", "b", "c"))
val rddb = sc.parallelize(Array("a", "d", "e", "f"))
val r = rdda.subtract(rddb).collect().toBuffer
println(r)
// println(d1rr.collect().toBuffer)
//
//println(d1rr.count())
//
// println("dnu" + dnu.count())
//
// println("dau" + dau.count())
sc.stop()
}
}
package game
import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark._
object ElasticSpark {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("ElasticSpark").setMaster("local")
conf.set("es.nodes", "172.16.0.14,172.16.0.15,172.16.0.16")
conf.set("es.port", "9200")
conf.set("es.index.auto.create", "true")
val sc = new SparkContext(conf)
//val query: String = "{\"query\":{\"match_all\":{}}}"
val start = 1472290002
val end = 1472290047
// val query: String =
// s"""{
// "query": {"match_all": {}},
// "filter": {
// "bool": {
// "must": {
// "range": {
// "access.time": {
// "gte": "$start",
// "lte": "$end"
// }
// }
// }
// }
// }
// }"""
val tp = 1
val query: String =
s"""{
"query": {"match_all": {}},
"filter" : {
"bool": {
"must": [
{"term" : {"access.type" : $tp}},
{
"range": {
"access.time": {
"gte": "$start",
"lte": "$end"
}
}
}
]
}
}
}"""
val rdd1 = sc.esRDD("accesslog", query)
println(rdd1.collect().toBuffer)
println(rdd1.collect().size)
}
}
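esRDD comes from the elasticsearch-hadoop Spark integration, which has to be on the classpath; a sketch of the sbt dependency, assuming Scala 2.10 and the 2.3.x line used elsewhere in these notes (the exact artifact version is an assumption):
libraryDependencies += "org.elasticsearch" % "elasticsearch-spark_2.10" % "2.3.1"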
package game
import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}
/**
* Created by root on 2016/5/24.
*/
object JedisConnectionPool{
val config = new JedisPoolConfig()
//Maximum number of connections in the pool
config.setMaxTotal(10)
//Maximum number of idle connections
config.setMaxIdle(5)
//Whether to validate the connection when it is borrowed from the pool
config.setTestOnBorrow(true)
val pool = new JedisPool(config, "172.16.0.101", 6379)
def getConnection(): Jedis = {
pool.getResource
}
def main(args: Array[String]) {
val conn = JedisConnectionPool.getConnection()
val r = conn.keys("*")
println(r)
}
}
package game
import kafka.serializer.StringDecoder
import org.apache.commons.lang3.time.FastDateFormat
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Milliseconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by root on 2016/5/24.
*/
object ScannPlugins {
def main(args: Array[String]) {
val Array(zkQuorum, group, topics, numThreads) = Array("node-1.itcast.cn:2181,node-2.itcast.cn:2181,node-3.itcast.cn:2181", "g0", "gamelog", "1")
val dateFormat = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")
val conf = new SparkConf().setAppName("ScannPlugins").setMaster("local[4]")
val sc = new SparkContext(conf)
//Batch interval for generating data batches: 10 seconds
val ssc = new StreamingContext(sc, Milliseconds(10000))
//To run this program on a cluster, set the checkpoint directory to shared storage such as HDFS
sc.setCheckpointDir("c://ck0710")
val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
val kafkaParams = Map[String, String](
"zookeeper.connect" -> zkQuorum,
"group.id" -> group,
"auto.offset.reset" -> "smallest"
)
val dstream: ReceiverInputDStream[(String, String)] = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicMap, StorageLevel.MEMORY_AND_DISK_SER)
//Extract the message payloads from Kafka
val lines = dstream.map(_._2)
//Split each line on tab characters
val splitedLines = lines.map(_.split("\t"))
val filteredLines = splitedLines.filter(f => {
val et = f(3)
val item = f(8)
et == "11" && item == "强效太阳水"
})
val userAndTime: DStream[(String, Long)] = filteredLines.map(f => (f(7), dateFormat.parse(f(12)).getTime))
//Group by key over a sliding time window
val grouedWindow: DStream[(String, Iterable[Long])] = userAndTime.groupByKeyAndWindow(Milliseconds(30000), Milliseconds(20000))
val filtered: DStream[(String, Iterable[Long])] = grouedWindow.filter(_._2.size >= 5)
val itemAvgTime = filtered.mapValues(it => {
val list = it.toList.sorted
val size = list.size
val first = list(0)
val last = list(size - 1)
val cha: Double = last - first
cha / size
})
val badUser: DStream[(String, Double)] = itemAvgTime.filter(_._2 < 10000)
badUser.foreachRDD(rdd => {
rdd.foreachPartition(it => {
// val connection = JedisConnectionPool.getConnection()
// it.foreach(t => {
// val user = t._1
// val avgTime = t._2
// val currentTime = System.currentTimeMillis()
// connection.set(user + "_" + currentTime, avgTime.toString)
// })
// connection.close()
it.foreach(t => {
println(t._1)
})
})
})
//filteredLines.print()
ssc.start()
ssc.awaitTermination()
}
}
package game
import java.text.SimpleDateFormat
import java.util.Calendar
/**
* Created by root on 2016/5/23.
*/
object TimeUtils {
val simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
val calendar = Calendar.getInstance()
def apply(time: String) = {
calendar.setTime(simpleDateFormat.parse(time))
calendar.getTimeInMillis
}
def getCertainDayTime(amount: Int): Long ={
calendar.add(Calendar.DATE, amount)
val time = calendar.getTimeInMillis
calendar.add(Calendar.DATE, -amount)
time
}
}
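Note that TimeUtils shares one SimpleDateFormat and one Calendar across all callers, which, for the same reason given in FilterUtils, is not thread-safe. A sketch of a thread-safe alternative (an illustration, not part of the original code) that uses FastDateFormat and keeps no shared mutable state:
package game
import java.util.Calendar
import org.apache.commons.lang3.time.FastDateFormat
//Hypothetical thread-safe variant of TimeUtils: FastDateFormat is immutable, and a
//fresh Calendar is created on every call instead of being shared between threads.
object SafeTimeUtils {
private val format = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")
def apply(time: String): Long = format.parse(time).getTime
def getCertainDayTime(baseTime: Long, amount: Int): Long = {
val calendar = Calendar.getInstance()
calendar.setTimeInMillis(baseTime)
calendar.add(Calendar.DATE, amount)
calendar.getTimeInMillis
}
}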