PySpark RDD 入门

#import  pyhdfs
#顶级项目目录
#from spark.demo import  demo2

#! /usr/bin/python
# -*- coding:utf-8 -*-

import  sys
from  pyspark.sql import  SparkSession
import operator as op


class PropertiesUtil:
    """Small helper for splitting comma-separated records.

    Constructing an instance prints ``a``; that output is kept so the
    script's visible behaviour does not change.
    """

    def __init__(self):
        print("a")

    @staticmethod
    def formatPrint(x):
        """Split the string form of *x* on commas.

        This is a static method so it can be called on the class
        (``PropertiesUtil.formatPrint(row)``) or on an instance
        (``d.formatPrint(row)``). Without the decorator, the instance
        call bound the instance itself to ``x`` and failed with
        ``TypeError`` as soon as a real argument was passed.

        Args:
            x: Value to split. It is converted with ``str()`` first, so
                non-string values are accepted.

        Returns:
            The list of comma-separated fields of ``str(x)``, or the
            empty string ``""`` when *x* equals ``""``.
        """
        if x == "":
            # Empty input keeps returning an empty string (not a list)
            # so existing callers see the same value as before.
            return ""
        return str(x).split(",")



if __name__ == "__main__":
    # Instantiating the class runs __init__, which prints "a".
    d = PropertiesUtil()

    # Local Spark session with Hive support enabled.
    spark = SparkSession.builder \
        .master("local") \
        .appName("Test") \
        .enableHiveSupport() \
        .config("spark.executor.memory", "1g") \
        .getOrCreate()

    try:
        sc = spark.sparkContext

        rdd = sc.textFile("hdfs://hadoop:9000/dev/nginx/logs/nongfu.mw/status/mergeonlinefile/2018-07-17")

        a = ""
        b = ""

        # Fetch the contents of the small status file on the driver.
        # Every line is visited, so a and b end up holding the first two
        # fields of the last well-formed line.
        for line in rdd.toLocalIterator():
            fields = str(line).split(",")
            if len(fields) < 2:
                # Blank or comma-less line: skip it instead of raising
                # IndexError on fields[1].
                continue
            a, b = fields[0], fields[1]

        print('a: ',a," ,b: ",b)

        # More involved per-record processing. The function is referenced
        # through the class so it receives exactly one argument (the row).
        # NOTE(review): foreach returns nothing, so the split result is
        # discarded -- confirm whether map()/collect() was intended.
        rdd.foreach(PropertiesUtil.formatPrint)
    finally:
        # Always release the Spark session, even if the job fails.
        spark.stop()

你可能感兴趣的:(hadoop,spark,python)