Big Data in Practice, Development Volume — Spark Case Study: Real-Time Log Analysis

2.6 Spark Case Study: Real-Time Log Analysis

  • 2.6.1 Interaction Flow Diagram

[Figure 1: interaction flow diagram of the real-time log analysis pipeline]

  • 2.6.2 Client-Side Log Listener (Java)
  private void handleSocket() {
      lock.lock();
      Writer writer = null;
      RandomAccessFile raf = null;

      try {
          File file = new File(filepath);
          raf = new RandomAccessFile(file, "r");
          raf.seek(pointer); // resume reading from the last saved offset

          writer = new OutputStreamWriter(socket.getOutputStream(), "UTF-8");

          String line = null;

          while ((line = raf.readLine()) != null) {
              if (Strings.isBlank(line)) {
                  continue;
              }

              // RandomAccessFile.readLine() decodes bytes as ISO-8859-1,
              // so re-encode to recover the original UTF-8 text
              line = new String(line.getBytes("ISO-8859-1"), "UTF-8");
              writer.write(line.concat("\n"));
              writer.flush();
              logger.info("Thread: {} ---- start offset: {} ---- read line:\n{}",
                      Thread.currentThread().getName(), pointer, line);

              pointer = raf.getFilePointer();
          }
          Thread.sleep(2000); // poll the file every 2 seconds
      } catch (Exception e) {
          logger.error(e.getMessage());
          e.printStackTrace();
      } finally {
          lock.unlock();
          fclose(writer, raf); // helper that closes the writer and file (not shown here)
      }
  }
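
The listener above tails the log file from a saved offset and pushes each new line over the socket. For smoke-testing the streaming job without the Java client, a minimal Python stand-in with the same tail-and-push behavior might look like this (the file path and port below are assumptions for local testing, not part of the original project):

import socket
import time

# Minimal stand-in for the Java listener: serve the tail of a log file
# over a TCP socket. FILEPATH and PORT are assumptions for testing only.
FILEPATH = '/var/log/his/his.log'
PORT = 9999

def tail_and_push():
    srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    srv.bind(('0.0.0.0', PORT))
    srv.listen(1)
    conn, _ = srv.accept()  # socketTextStream connects here as a client
    pointer = 0
    with open(FILEPATH, 'rb') as f:
        while True:
            f.seek(pointer)          # resume from the last read offset
            for line in f:
                if line.strip():
                    conn.sendall(line.rstrip(b'\n') + b'\n')
            pointer = f.tell()       # remember where we stopped
            time.sleep(2)            # poll every 2 seconds, like the Java listener

if __name__ == '__main__':
    tail_and_push()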
  • 2.6.3 Spark Streaming Real-Time Data Ingestion (Python)
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

conf = SparkConf()
conf.setAppName("HIS real-time log analysis")
conf.setMaster('yarn')  # run on YARN
conf.set('spark.executor.instances', 8)  # executor count when running on YARN
conf.set('spark.executor.memory', '1g')
conf.set('spark.executor.cores', '1')
# conf.set('spark.cores.max', '2')
# conf.set('spark.logConf', True)
conf.set('spark.streaming.blockInterval', 1000*4)  # ms; interval at which received data is chunked into blocks

sc = SparkContext(conf=conf)
sc.setLogLevel('ERROR')
sc.setCheckpointDir('hdfs://hadoop01:9000/hadoop/upload/checkpoint/')

ssc = StreamingContext(sc, 30)  # batch interval: one micro-batch every 30 seconds

lines = ssc.socketTextStream(str(ip), int(port))  # ip/port of the Java listener
# lines.pprint()
lines.foreachRDD(requestLog)
lines.foreachRDD(errorLog)
ssc.start()
ssc.awaitTermination()
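
Note that PySpark's DStream.foreachRDD accepts either a one-argument function f(rdd) or a two-argument function f(time, rdd); that is why requestLog and errorLog in 2.6.4 take (time, rdd). A quick illustration of the two-argument form:

# foreachRDD also accepts the two-argument (batch time, RDD) form:
lines.foreachRDD(lambda t, rdd: print("batch %s: %d lines" % (t, rdd.count())))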
  • 2.6.4 Spark SQL, RDD Computation, Structured Search, and Structured Storage to MongoDB (Python)
import base64
import logging
import re
import sys
import traceback
from datetime import datetime

from pyspark import StorageLevel
from pyspark.sql import Row, SparkSession


def getSparkSessionInstance(sparkConf):
    '''
    :@desc share one global SparkSession across foreachRDD calls
     .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/test.coll") \
     .config("spark.mongodb.output.uri", "mongodb://adxkj:[email protected]:27017/") \
    :param sparkConf:
    :return:
    '''
    if 'sparkSessionSingletonInstance' not in globals():
        globals()['sparkSessionSingletonInstance'] = SparkSession \
            .builder \
            .config(conf=sparkConf) \
            .getOrCreate()
    return globals()['sparkSessionSingletonInstance']


def timeFomate(x):
    '''
    :@desc normalize the timestamp fields of a split log line
    :param x: list of whitespace-split fields; x[0] is the date, x[1] the time
    :return: the list with x[0] replaced by a datetime, or None
    '''
    if not isinstance(x, list):
        return None

    # merge the date and time fields into a single timestamp field
    x.insert(0, ' '.join(x[0:2]))
    x.pop(1)
    x.pop(1)

    # strip '[', ']', quotes and commas from every field
    rx = re.compile(r'([\[\]\',])')
    x = [rx.sub(r'', x[i]) for i in range(len(x))]

    # drop the fractional seconds, then parse the timestamp
    x[0] = x[0][: x[0].find('.')]
    x[0] = datetime.strptime(x[0], '%Y-%m-%d %H:%M:%S')

    return x
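
For a concrete sense of the reshaping (the field values below are made up), timeFomate merges the first two fields, strips bracket characters, and parses the result into a datetime:

# Hypothetical split log line: date, time, level, class, thread
raw = ['2019-02-12', '10:00:01.123', 'INFO', 'com.his.LogCtrl', '[http-nio-8080-exec-1]']
out = timeFomate(raw)
print(out[0])   # datetime.datetime(2019, 2, 12, 10, 0, 1)
print(out[1:])  # ['INFO', 'com.his.LogCtrl', 'http-nio-8080-exec-1']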


def sqlMysql(sqlResult, table, url="jdbc:mysql://192.168.0.252:3306/hisLog", user='root', password=""):
    '''
    :@desc save a SQL result to MySQL over JDBC
    :param sqlResult: DataFrame to persist
    :param table: target table name
    :param url: JDBC connection URL
    :param user: database user
    :param password: database password
    :return:
    '''
    try:
        sqlResult.write \
            .mode('append') \
            .format("jdbc") \
            .option("url", url) \
            .option("dbtable", table) \
            .option("user", user) \
            .option("password", password) \
            .save()
    except Exception:
        excType, excValue, excTraceback = sys.exc_info()
        traceback.print_exception(excType, excValue, excTraceback, limit=3)
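
The jdbc format only resolves if a MySQL JDBC driver (for example mysql-connector-java) is on the driver and executor classpaths, e.g. passed at submit time via spark-submit --jars. A usage sketch with placeholder table name and credentials:

# Usage sketch (placeholder values): append a DataFrame to a MySQL table.
sqlMysql(sqlresult, "request_log", user="root", password="secret")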


def sqlMongodb(sqlResult, table):
    '''
    :@desc save a SQL result to MongoDB
    :param sqlResult: DataFrame to persist
    :param table: target collection name
    :return:
    '''
    try:
        sqlResult.write \
            .format("com.mongodb.spark.sql.DefaultSource") \
            .options(uri="mongodb://adxkj:[email protected]:27017/hislog",
                     database="hislog", collection=table, user="adxkj", password="123456") \
            .mode("append") \
            .save()
    except Exception:
        excType, excValue, excTraceback = sys.exc_info()
        traceback.print_exception(excType, excValue, excTraceback, limit=3)
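
The DefaultSource format resolves only when the MongoDB Spark connector is on the classpath (for example via spark-submit --packages org.mongodb.spark:mongo-spark-connector_2.11:2.x.x, version depending on your Spark build). As the docstring in getSparkSessionInstance hints, the output URI can also be configured once on the session so each write only names its collection; a sketch with a placeholder URI:

# A sketch: configure the connector once on the SparkSession (placeholder
# URI and credentials), so per-write options shrink to the collection name.
spark = SparkSession.builder \
    .config("spark.mongodb.output.uri", "mongodb://user:password@host:27017/hislog") \
    .getOrCreate()

def sqlMongodbBrief(sqlResult, table):
    sqlResult.write \
        .format("com.mongodb.spark.sql.DefaultSource") \
        .option("collection", table) \
        .mode("append") \
        .save()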


def decodeStr(x):
    '''
    :@desc base64-decode the request/response/stack fields
    :param x: list of fields
    :return:
    '''
    try:
        if x[9].strip() != '':
            x[9] = base64.b64decode(x[9].encode("utf-8")).decode("utf-8")
            # x[9] = x[9][:5000]  # truncate to fit MySQL column limits

        if x[11].strip() != '':
            x[11] = base64.b64decode(x[11].encode("utf-8")).decode("utf-8")
            # x[11] = x[11][:5000]  # truncate to fit MySQL column limits

        if len(x) > 12 and x[12].strip() != '':
            x[12] = base64.b64decode(x[12].encode("utf-8")).decode("utf-8")

    except Exception as e:
        print("cannot decode:", x, e)

    return x

def analyMod(x):
    '''
    :@desc map the request URI to a module by prefix
    :param x: list of fields; x[6] is the URI
    :return:
    '''
    if x[6].strip() == '':
        return None

    hasMatch = False

    for k, v in URI_MODULES.items():
        if x[6].strip().startswith('/' + k):
            hasMatch = True
            x.append(v)
            break  # append exactly one module; the Row mapping below expects one

    if not hasMatch:
        x.append('公共模块')  # default: common module

    return x
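
URI_MODULES is a global defined elsewhere in the project; a hypothetical shape, mapping URI prefixes (matched as '/<prefix>') to module names:

# Hypothetical example of the URI_MODULES mapping used above:
URI_MODULES = {
    'outpatient': '门诊模块',   # outpatient module
    'pharmacy': '药房模块',     # pharmacy module
    'register': '挂号模块',     # registration module
}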


def requestLog(time, rdd):
    '''
    :@desc analyze request logs
    :param time: batch time
    :param rdd: RDD of raw log lines for this batch
    :return:
    '''

    logging.info("+++++ handle request log: length: %d ++++++++++" % (rdd.count()))

    if rdd.isEmpty():
        return None

    logging.info("++++++++++++++++++++++ processing requestLog +++++++++++++++++++++++++++++++")

    reqrdd = rdd.map(lambda x: x.split(' ')).\
        filter(lambda x: len(x) > 12 and x[4].find('http-nio-') > 0 and x[2].strip() == 'INFO').\
        filter(lambda x: x[8].strip().upper().startswith('POST') or x[8].strip().upper().startswith('GET')).\
        map(timeFomate).\
        map(decodeStr).\
        map(analyMod).\
        filter(lambda x: x is not None)  # analyMod returns None for empty URIs

    reqrdd.cache()
    reqrdd.checkpoint()  # cache before checkpoint so the lineage is not computed twice

    sqlRdd = reqrdd.map(lambda x: Row(time=x[0], level=x[1], clz=x[2], thread=x[3], user=x[4], depart=x[5],
                          uri=x[6], method=x[7], ip=x[8], request=x[9], oplen=x[10],
                          respone=x[11], mod=x[12]))

    # persist to cut memory pressure; cache() is only StorageLevel.MEMORY_ONLY
    # reqrdd.persist(storageLevel=StorageLevel.MEMORY_AND_DISK_SER)

    if reqrdd.isEmpty():
        return None

    spark = getSparkSessionInstance(rdd.context.getConf())
    df = spark.createDataFrame(sqlRdd)
    df.createOrReplaceTempView(REQUEST_TABLE)

    # analyze after structuring
    sqlresult = spark.sql("SELECT * FROM " + REQUEST_TABLE)
    sqlresult.show()

    # save
    sqlMongodb(sqlresult, REQUEST_TABLE)
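
The SELECT * above only materializes the view; the same temp view also supports the structured searches this section's title refers to, for example a per-module request count (REQUEST_TABLE and the column names come from the Row mapping above):

# Example structured query over the registered temp view:
perModule = spark.sql(
    "SELECT mod, COUNT(*) AS cnt FROM " + REQUEST_TABLE +
    " GROUP BY mod ORDER BY cnt DESC")
perModule.show()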


def errorLog(time, rdd):
    '''
    :@desc analyze error logs
    :param time: batch time
    :param rdd: RDD of raw log lines for this batch
    :return:
    '''

    logging.info("+++++ handle error log: length: %d ++++++++++" % (rdd.count()))

    if rdd.isEmpty():
        return None

    logging.info("++++++++++++++++++++++ processing errorLog +++++++++++++++++++++++++++++++")

    errorrdd = rdd.map(lambda x: x.split(' ')). \
        filter(lambda x: len(x) > 13 and x[2].strip().upper().startswith('ERROR')). \
        map(timeFomate). \
        map(decodeStr). \
        map(analyMod). \
        filter(lambda x: x is not None). \
        map(lambda x: Row(time=x[0], level=x[1], clz=x[2], thread=x[3], user=x[4], depart=x[5],
                          uri=x[6], method=x[7], ip=x[8], request=x[9], oplen=x[10],
                          respone=x[11], stack=x[12], mod=x[13]))
    # persist to cut memory pressure
    errorrdd.persist(storageLevel=StorageLevel.MEMORY_AND_DISK_SER)

    if errorrdd.isEmpty():
        return None

    spark = getSparkSessionInstance(rdd.context.getConf())
    df = spark.createDataFrame(errorrdd)
    df.createOrReplaceTempView(ERROR_TABLE)

    # analyze after structuring
    sqlresult = spark.sql("SELECT * FROM " + ERROR_TABLE)
    sqlresult.show()

    # save
    sqlMongodb(sqlresult, ERROR_TABLE)

Note: for the complete code, contact the author @狼.

Reposted from: https://www.cnblogs.com/wolf-song/p/10369775.html
