2.6 Spark hands-on case study: real-time log analysis
- 2.6.1 Interaction flow diagram
  A Java listener on the application server tails the log file and pushes newly appended lines over a TCP socket; Spark Streaming receives the lines, Spark SQL structures and queries them, and the results are written to MongoDB.
- 2.6.2 Client-side listener (Java)
```java
// Stream newly appended lines of the log file to the connected Spark Streaming socket client.
private void handleSocket() {
    lock.lock();
    Writer writer = null;
    RandomAccessFile raf = null;
    try {
        File file = new File(filepath);
        raf = new RandomAccessFile(file, "r");
        raf.seek(pointer);                      // resume from the last read offset
        writer = new OutputStreamWriter(socket.getOutputStream(), "UTF-8");
        String line = null;
        while ((line = raf.readLine()) != null) {
            if (Strings.isBlank(line)) {
                continue;
            }
            // RandomAccessFile.readLine() decodes bytes as ISO-8859-1; re-encode as UTF-8
            line = new String(line.getBytes("ISO-8859-1"), "UTF-8");
            writer.write(line.concat("\n"));
            writer.flush();
            logger.info("thread:{}----start offset:{}----line read\n{}",
                    Thread.currentThread().getName(), pointer, line);
            pointer = raf.getFilePointer();     // remember the offset for the next round
        }
        Thread.sleep(2000);                     // wait before polling the file again
    } catch (Exception e) {
        logger.error(e.getMessage());
        e.printStackTrace();
    } finally {
        lock.unlock();
        fclose(writer, raf);
    }
}
```
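Before wiring this listener into Spark, it can help to confirm that it actually pushes log lines over the socket. Below is a minimal, throwaway TCP client sketch in Python for that purpose; the host and port are hypothetical and should be replaced with wherever the Java listener is running:

```python
import socket

HOST, PORT = '192.168.0.100', 9999   # hypothetical address of the Java listener

with socket.create_connection((HOST, PORT)) as sock:
    buf = b''
    while True:
        chunk = sock.recv(4096)
        if not chunk:                 # listener closed the connection
            break
        buf += chunk
        while b'\n' in buf:           # the listener terminates each log line with '\n'
            line, buf = buf.split(b'\n', 1)
            print(line.decode('utf-8'))
```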
- 2.6.3 Spark Streaming real-time data ingestion (Python)
```python
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

conf = SparkConf()
conf.setAppName("HIS real-time log analysis")
conf.setMaster('yarn')                               # run on YARN (use spark://host:7077 for standalone)
conf.set('spark.executor.instances', 8)              # number of executors on the YARN cluster
conf.set('spark.executor.memory', '1g')
conf.set('spark.executor.cores', '1')
# conf.set('spark.cores.max', '2')
# conf.set('spark.logConf', True)
conf.set('spark.streaming.blockInterval', 1000*4)    # 4000 ms: interval at which received data is chunked into blocks

sc = SparkContext(conf=conf)
sc.setLogLevel('ERROR')
sc.setCheckpointDir('hdfs://hadoop01:9000/hadoop/upload/checkpoint/')

ssc = StreamingContext(sc, 30)                       # batch interval: one micro-batch every 30 seconds
lines = ssc.socketTextStream(str(ip), int(port))     # connect to the Java listener from 2.6.2
# lines.pprint()
lines.foreachRDD(requestLog)                         # analyse request logs (see 2.6.4)
lines.foreachRDD(errorLog)                           # analyse error logs (see 2.6.4)

ssc.start()
ssc.awaitTermination()
```
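When developing the receiving side, it is often easier to run the same socket source in local mode and simply print each batch before submitting the full job to YARN. A minimal sketch, assuming a hypothetical listener address; `local[2]` is used because the socket receiver permanently occupies one core:

```python
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

# Hypothetical listener address; replace with the host/port of the Java client from 2.6.2.
ip, port = '192.168.0.100', 9999

conf = SparkConf().setAppName('log-stream-smoke-test').setMaster('local[2]')  # >= 2 threads: 1 receiver + 1 processor
sc = SparkContext(conf=conf)
sc.setLogLevel('ERROR')

ssc = StreamingContext(sc, 5)              # short 5-second batches for quick feedback
lines = ssc.socketTextStream(ip, int(port))
lines.pprint()                             # print the first few lines of every batch

ssc.start()
ssc.awaitTermination()
```

Once lines show up in `pprint()`, the `foreachRDD` handlers from 2.6.4 can be swapped in.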
- 2.6.4 Spark SQL, RDD computation, structured queries, and storage to MongoDB (Python)
```python
import base64
import logging
import re
import sys
import traceback
from datetime import datetime

from pyspark import StorageLevel
from pyspark.sql import Row, SparkSession

# URI_MODULES, REQUEST_TABLE and ERROR_TABLE are module-level constants defined
# elsewhere in the project (URI prefix -> business module mapping, table names).


def getSparkSessionInstance(sparkConf):
    '''
    :@desc lazily create one SparkSession that is shared by every micro-batch
        .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/test.coll") \
        .config("spark.mongodb.output.uri", "mongodb://adxkj:[email protected]:27017/") \
    :param sparkConf:
    :return:
    '''
    if 'sparkSessionSingletonInstance' not in globals():
        globals()['sparkSessionSingletonInstance'] = SparkSession \
            .builder \
            .config(conf=sparkConf) \
            .getOrCreate()
    return globals()['sparkSessionSingletonInstance']


def timeFomate(x):
    '''
    :@desc merge the date and time fields and parse them into a datetime
    :param x: list of whitespace-separated log fields
    :return:
    '''
    if not isinstance(x, list):
        return None
    # merge "date time" into one field (drops the separate time field)
    x.insert(0, ' '.join(x[0:2]))
    x.pop(1)
    x.pop(1)
    # strip '[', ']', quotes and commas from every field
    rx = re.compile(r'([\[\]\',])')
    x = [rx.sub(r'', x[i]) for i in range(len(x))]
    # string to datetime (cut off the fractional seconds first)
    x[0] = x[0][: x[0].find('.')]
    x[0] = datetime.strptime(x[0], '%Y-%m-%d %H:%M:%S')
    return x


def sqlMysql(sqlResult, table, url="jdbc:mysql://192.168.0.252:3306/hisLog", user='root', password=""):
    '''
    :@desc save a DataFrame result to MySQL over JDBC
    :param sqlResult:
    :param table:
    :param url:
    :param user:
    :param password:
    :return:
    '''
    try:
        sqlResult.write \
            .mode('append') \
            .format("jdbc") \
            .option("url", url) \
            .option("dbtable", table) \
            .option("user", user) \
            .option("password", password) \
            .save()
    except Exception:
        excType, excValue, excTraceback = sys.exc_info()
        traceback.print_exception(excType, excValue, excTraceback, limit=3)


def sqlMongodb(sqlResult, table):
    '''
    :@desc save a DataFrame result to MongoDB (mongo-spark connector)
    :param sqlResult:
    :param table:
    :return:
    '''
    try:
        sqlResult. \
            write. \
            format("com.mongodb.spark.sql.DefaultSource"). \
            options(uri="mongodb://adxkj:[email protected]:27017/hislog",
                    database="hislog", collection=table,
                    user="adxkj", password="123456"). \
            mode("append"). \
            save()
    except Exception:
        excType, excValue, excTraceback = sys.exc_info()
        traceback.print_exception(excType, excValue, excTraceback, limit=3)


def decodeStr(x):
    '''
    :@desc base64-decode the request body, response body and stack trace fields
    :param x:
    :return:
    '''
    try:
        if x[9].strip() != '':
            x[9] = base64.b64decode(x[9].encode("utf-8")).decode("utf-8")
            # x[9] = x[9][:5000]    # truncate for MySQL column limits
        if x[11].strip() != '':
            x[11] = base64.b64decode(x[11].encode("utf-8")).decode("utf-8")
            # x[11] = x[11][:5000]  # truncate for MySQL column limits
        if len(x) > 12 and x[12].strip() != '':
            x[12] = base64.b64decode(x[12].encode("utf-8")).decode("utf-8")
    except Exception as e:
        print("cannot decode:", x, e)
    return x


def analyMod(x):
    '''
    :@desc map the request URI to a business module by prefix
    :param x:
    :return:
    '''
    if x[6].strip() == '':
        return None
    hasMatch = False
    for k, v in URI_MODULES.items():
        if x[6].strip().startswith('/' + k):
            hasMatch = True
            x.append(v)
            break                   # exactly one module per record
    if not hasMatch:
        x.append('公共模块')         # default: "common module"
    return x


def requestLog(time, rdd):
    '''
    :@desc analyse the request log lines of one micro-batch
    :param time:
    :param rdd:
    :return:
    '''
    logging.info("+++++ handle request log: length:%d +++++" % (rdd.count()))
    if rdd.isEmpty():
        return None
    logging.info("++++++++++ processing requestLog ++++++++++")
    reqrdd = rdd.map(lambda x: x.split(' ')). \
        filter(lambda x: len(x) > 12 and x[4].find('http-nio-') > 0 and x[2].strip() == 'INFO'). \
        filter(lambda x: x[8].strip().upper().startswith('POST') or x[8].strip().upper().startswith('GET')). \
        map(timeFomate). \
        map(decodeStr). \
        map(analyMod)
    reqrdd.cache()
    reqrdd.checkpoint()   # cache before checkpoint so the lineage is not computed twice
    sqlRdd = reqrdd.map(lambda x: Row(time=x[0], level=x[1], clz=x[2], thread=x[3], user=x[4],
                                      depart=x[5], uri=x[6], method=x[7], ip=x[8], request=x[9],
                                      oplen=x[10], respone=x[11], mod=x[12]))
    # persist to reduce memory pressure; cache() only gives StorageLevel.MEMORY_ONLY
    # reqrdd.persist(storageLevel=StorageLevel.MEMORY_AND_DISK_SER)
    if reqrdd.isEmpty():
        return None
    spark = getSparkSessionInstance(rdd.context.getConf())
    df = spark.createDataFrame(sqlRdd)
    df.createOrReplaceTempView(REQUEST_TABLE)
    # analyse after structuring the data
    sqlresult = spark.sql("SELECT * FROM " + REQUEST_TABLE)
    sqlresult.show()
    # save
    sqlMongodb(sqlresult, REQUEST_TABLE)


def errorLog(time, rdd):
    '''
    :@desc analyse the error log lines of one micro-batch
    :param time:
    :param rdd:
    :return:
    '''
    logging.info("+++++ handle error log: length:%d +++++" % (rdd.count()))
    if rdd.isEmpty():
        return None
    logging.info("++++++++++ processing errorLog ++++++++++")
    errorrdd = rdd.map(lambda x: x.split(' ')). \
        filter(lambda x: len(x) > 13 and x[2].strip().upper().startswith('ERROR')). \
        map(timeFomate). \
        map(decodeStr). \
        map(analyMod). \
        map(lambda x: Row(time=x[0], level=x[1], clz=x[2], thread=x[3], user=x[4],
                          depart=x[5], uri=x[6], method=x[7], ip=x[8], request=x[9],
                          oplen=x[10], respone=x[11], stack=x[12], mod=x[13]))
    # persist to reduce memory pressure
    errorrdd.persist(storageLevel=StorageLevel.MEMORY_AND_DISK_SER)
    if errorrdd.isEmpty():
        return None
    spark = getSparkSessionInstance(rdd.context.getConf())
    df = spark.createDataFrame(errorrdd)
    df.createOrReplaceTempView(ERROR_TABLE)
    # analyse after structuring the data
    sqlresult = spark.sql("SELECT * FROM " + ERROR_TABLE)
    sqlresult.show()
    # save
    sqlMongodb(sqlresult, ERROR_TABLE)
```
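The functions above also reference a few module-level names that are not part of this excerpt: `URI_MODULES` (the URI-prefix-to-module mapping used by `analyMod`) and `REQUEST_TABLE`/`ERROR_TABLE` (the temp-view and MongoDB collection names). Their real values belong to the full project; a purely hypothetical sketch of their shape:

```python
# Hypothetical placeholders -- the real mapping and table names live in the full project.
URI_MODULES = {
    'outpatient': '门诊模块',   # requests under /outpatient/... belong to the outpatient module
    'pharmacy': '药房模块',     # requests under /pharmacy/... belong to the pharmacy module
}
REQUEST_TABLE = 'requestlog'   # temp view / MongoDB collection for request logs
ERROR_TABLE = 'errorlog'       # temp view / MongoDB collection for error logs
```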
Note: for the complete source code, please contact the author @狼.