#! /usr/bin/python
# -*- coding:utf-8 -*-
# import pyhdfs
# top-level project directory
# from spark.demo import demo2

from pyspark.sql import SparkSession


class PropertiesUtil:

    def __init__(self):
        # Debug marker: shows that the constructor ran.
        print("a")

    def formatPrint(self, x):
        # Split a comma-separated record into its fields; an empty record
        # yields an empty list.
        if x != "":
            return str(x).split(",")
        return []


if __name__ == "__main__":
    # __init__ is invoked automatically when the object is created.
    d = PropertiesUtil()

    spark = SparkSession.builder \
        .master("local") \
        .appName("Test") \
        .enableHiveSupport() \
        .config("spark.executor.memory", "1g") \
        .getOrCreate()
    sc = spark.sparkContext

    rdd = sc.textFile("hdfs://hadoop:9000/dev/nginx/logs/nongfu.mw/status/mergeonlinefile/2018-07-17")

    # Fetch the small files holding saved state, streaming them back to the
    # driver one record at a time.
    for record in rdd.toLocalIterator():
        fields = str(record).split(",")
        a, b = fields[0], fields[1]
        print("a: ", a, " ,b: ", b)

    # More complex processing: apply the parsing on the executors.
    rdd.foreach(d.formatPrint)
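    # --- Illustrative sketch, not part of the original script. foreach() above
    # discards formatPrint's return value on the executors, so to inspect the
    # split fields on the driver the same comma-split can be run through map()
    # with a small take(); the name "sample" and the take(10) limit are
    # assumptions made for this example.
    sample = rdd.map(lambda line: str(line).split(",")).take(10)
    for fields in sample:
        print("sample fields:", fields)

    # Release the session's resources once processing is done.
    spark.stop()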