使用 Python Spark Streaming 从 Kafka 获取 Nginx 日志,秒级统计 URL PV,并写入 MySQL

mysql建库建表

>create database nginx;

>use nginx

> create table url_access (id int NOT NULL AUTO_INCREMENT primary key, timestamp varchar(256), url varchar(256), pv long);

 

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/01/09 23:30
# @Author  : xuanda
# @File    : kafka_to_sparkstreaming_to_mysql.py

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

import MySQLdb, time

def save(x):
    """Write one micro-batch of per-URL counts into ``nginx.url_access``.

    The RDD is collected, then one row is inserted per ``(url, pv)`` pair.
    Every row of the batch carries the same timestamp: the wall-clock
    second at which the batch is written.

    Args:
        x: RDD whose elements are ``(url, pv)`` pairs.

    A row that fails to insert is reported on stdout and rolled back; the
    remaining rows are still attempted. Errors raised by ``collect()`` or
    while connecting to MySQL propagate to the caller.
    """
    rows = x.collect()
    if not rows:
        # Empty micro-batch: nothing to write, so do not open a connection.
        return

    timestamp = int(round(time.time()))
    # Placeholders are filled in by the driver, which escapes the values.
    # The URL originates from access logs and must never be spliced into
    # the SQL text directly.
    sql = "insert into nginx.url_access (timestamp, url, pv) values (%s, %s, %s)"

    # NOTE(review): connection settings are hard-coded here — consider
    # moving them to configuration.
    conn = MySQLdb.connect(host="192.168.1.1", port=3306, user="root", passwd="test", charset='utf8')
    try:
        cur = conn.cursor()
        try:
            for url, pv in rows:
                # Commit per row so one bad row cannot discard the others.
                try:
                    cur.execute(sql, (timestamp, url, pv))
                    conn.commit()
                except Exception as err:
                    print('Exception: ', err)
                    conn.rollback()
        finally:
            cur.close()
    finally:
        # Always release the connection; this function runs once per batch.
        conn.close()

if __name__ == '__main__':
    # Driver entry point: consume the nginx access log topic from Kafka,
    # count occurrences of one log field per batch, and hand each batch's
    # counts to save().
    zkQuorum = '192.168.1.20:2181'
    # Topic name -> number of partitions to consume for that topic.
    topic = {'nginx-access-log': 1}
    groupid = "kafka-to-sparkstreaming"
    appName = "KafkaToSparkstreaming"
    # Batch interval in seconds.
    timecell = 1
    # Position of the counted field after splitting a log line on single
    # spaces — presumably the request URL in this nginx log format; verify
    # against the actual log_format.
    url_field = 8

    # Init Spark Streaming.
    sc = SparkContext(master="spark://192.168.1.20:7077", appName=appName)
    ssc = StreamingContext(sc, timecell)

    # Create an input stream that pulls messages from a Kafka broker.
    lines = KafkaUtils.createStream(ssc, zkQuorum, groupid, topic)

    # Each Kafka record is a (key, value) pair; the log line is the value.
    fields = lines.map(lambda record: record[1]).map(lambda line: line.split(" "))
    # Drop lines too short to contain the field; indexing them would raise
    # IndexError and fail the whole task.
    well_formed = fields.filter(lambda parts: len(parts) > url_field)
    url_add_reduce = (
        well_formed
        .map(lambda parts: (str(parts[url_field]), 1))
        .reduceByKey(lambda x, y: x + y)
    )
    url_add_reduce.foreachRDD(save)

    ssc.start()
    ssc.awaitTermination()

启动

spark-2.4.0-bin-hadoop2.7/bin/spark-submit --jars /tmp/spark-streaming-kafka-0-8-assembly_2.11-2.4.0.jar kafka_to_sparkstreaming_to_mysql.py

mysql

[图片:Python Spark Streaming 从 Kafka 获取 Nginx 日志,秒级统计 URL PV,并写入 MySQL(第 1 张图片)]

明天写个简单页面展示下统计效果

你可能感兴趣的:(大数据相关)