PySpark Data Processing and Modeling

Import the required packages

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, when, count, countDistinct
from pyspark.sql.types import IntegerType,StringType
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
spark = SparkSession.builder \
        .config("spark.some.config.option", "some-value") \
        .config('spark.debug.maxToStringFields', '50') \
        .appName("Python Spark SQL Hive integration example") \
        .enableHiveSupport() \
        .getOrCreate()
sc = spark.sparkContext

1. Read in the data

Read the feature data (X) from the database

data = spark.sql('''select * from db_so_default_tenant.entity_clueinfo
                    where custom_username not like '%测试%' 
                 ''')
# The date-filter part of the query errors out and has not been fixed yet (see the sketch after this block)
#                           and FROM_UNIXTIME(custom_create_time,'%Y-%m-%d') between date_format(date_sub(current_date,365), '%Y-%m-01') 
#                           and date_format(date_sub(current_date, 15), '%Y-%m-%d')
## Let's have a look at the data type
data.printSchema()
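The commented-out date filter uses MySQL-style patterns like '%Y-%m-%d', but Spark's from_unixtime expects Java SimpleDateFormat patterns such as 'yyyy-MM-dd'. A sketch of an equivalent filter under that assumption (records from roughly a year ago up to 15 days ago); the variable name data_filtered is hypothetical:

# Sketch: the same date window rewritten with Spark-compatible patterns
data_filtered = spark.sql('''select * from db_so_default_tenant.entity_clueinfo
                             where custom_username not like '%测试%'
                               and from_unixtime(custom_create_time, 'yyyy-MM-dd')
                                   between date_format(date_sub(current_date(), 365), 'yyyy-MM-01')
                                   and date_format(date_sub(current_date(), 15), 'yyyy-MM-dd')
                          ''')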

# Keep only a subset of columns:

keep_var_lst=['custom_clue_id', 'custom_create_time', 'custom_post_time', 'custom_username', 'custom_sex', 'custom_mobile', 
              'custom_mobile_area', 'custom_approach_id', 'custom_channel_id', 'custom_product_id', 'custom_pattern_id','custom_media_id',
              'custom_ctype_id', 'custom_activity_id','custom_detail','custom_province_id','custom_city_id','custom_district_id',
              'custom_utm_source','custom_utm_content','custom_utm_medium', 'custom_utm_campaign', 'custom_resource', 
              'custom_dealer_id', 'custom_area_id','custom_two_area_id'
               ]

data = data.select(keep_var_lst)

Read the label (Y) from the database

# Read the data and inspect its schema

lead_feedback = spark.sql("select * from db_so_default_tenant.entity_clueinfosync")
lead_feedback.printSchema()

# Keep only a subset of columns

keep_var_lst2 = ['custom_clue_id', 'custom_verify_status', 'custom_sync_time']
lead_feedback = lead_feedback.select(keep_var_lst2)

# print((lead_feedback.count(), len(lead_feedback.columns)))
## (1577626, 3)

Join the tables to build a base table containing both X and Y

# Append the fields needed for the lead flag to the data table
df = data.join(lead_feedback, on=['custom_clue_id'], how='left')

# print((df.count(), len(df.columns)))
## (1466832, 29)
# (1560986, 29)

2. Data integration

Define the label (Y)

# Use when() from pyspark.sql.functions to derive the label

df = df.withColumn('label',when(df['custom_verify_status']==2,1).otherwise(0))
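A quick sanity check on the resulting class balance (a sketch; the counts are not from the original run):

# Sketch: inspect the label distribution
df.groupBy('label').count().show()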

Processing the date fields

# Convert Unix timestamps to dates

# Register a temporary view for SQL queries
df.createOrReplaceTempView("temp")
# newDF = spark.sql("select *, to_date('create_time', 'dim_month_id'), to_date('create_time', 'dim_day_id')  from df_sql ")
newDF = spark.sql("""select *, 
                  from_unixtime(custom_create_time, 'yyyy-MM') as dim_month_id,
                  from_unixtime(custom_create_time, 'yyyy-MM-dd') as dim_day_id,
                  from_unixtime(custom_create_time, 'yyyy-MM-dd HH:mm:ss') as create_time_new,
                  from_unixtime(custom_post_time, 'yyyy-MM-dd HH:mm:ss') as post_time_new
                  from temp """)
# Extract the relevant date fields

# Register a temporary view for SQL queries
newDF.createOrReplaceTempView("temp")
newDF = spark.sql("""select *, 
                  month(create_time_new) as create_monthofyear,
                  FLOOR((day(create_time_new)-1)/7)+1 as create_weekofmonth,
                  dayofweek(create_time_new) as create_dayofweek,
                  weekofyear(create_time_new) as create_weekofyear,
                  hour(create_time_new) as create_hourofday,
                  floor(hour(create_time_new)/2) as create_hourofday2,
                  case when hour(create_time_new) between 8  and 11 then 'a.8-11'
                       when hour(create_time_new) =12               then 'b.12'
                       when hour(create_time_new) between 13 and 17 then 'c.13-17'
                       when hour(create_time_new) between 18 and 19 then 'd.18-19'
                       when hour(create_time_new) between 20 and 23 then 'e.20-23'
                       when hour(create_time_new) =0                then 'f.0'
                       when hour(create_time_new) between 1  and 2  then 'g.1-2'
                       when hour(create_time_new) =3                then 'h.3'
                       when hour(create_time_new) between 4 and 5   then 'j.4-5'
                       when hour(create_time_new) between 6 and 7   then 'k.6-7'
                       end as create_hour_flag,
                       
                  month(post_time_new) as post_monthofyear,
                  FLOOR((day(post_time_new)-1)/7)+1 as post_weekofmonth,
                  dayofweek(post_time_new) as post_dayofweek,
                  weekofyear(post_time_new) as post_weekofyear,
                  hour(post_time_new) as post_hourofday,
                  floor(hour(post_time_new)/2) as post_hourofday2,
                  case when hour(post_time_new) between 9  and 11 then 'a.9-11'
                       when hour(post_time_new) =12 then 'b.12'
                       when hour(post_time_new) between 13 and 19 then 'c.13-19'
                       when hour(post_time_new) =20 then 'd.20'
                       when hour(post_time_new) between 21 and 23 then 'e.21-23'
                       when hour(post_time_new) between 0  and 2  then 'f.0-2'
                       when hour(post_time_new) =3  then 'g.3'
                       when hour(post_time_new) between 4  and 8  then 'h.4-8'
                    end as post_hour_flag

                  from temp""")

1. Create simple flags: return 0 if the value is null/zero/empty, 1 otherwise

df2 = newDF
# Create a simple flag: 0 if the value is null, zero, or empty; otherwise 1
def func_var_flag(var):
    if var is None or var == 0 or var == '' or var == '0':
        return 0
    else:
        return 1
    
func_var_flag_udf = udf(func_var_flag, IntegerType())
unknown_flag=['custom_username','custom_mobile_area','custom_approach_id','custom_channel_id','custom_product_id','custom_pattern_id',
              'custom_media_id','custom_ctype_id','custom_activity_id','custom_utm_source', 'custom_utm_content','custom_utm_medium',
              'custom_utm_campaign','custom_province_id', 'custom_city_id','custom_district_id','custom_dealer_id',
              'custom_area_id','custom_two_area_id','custom_resource','custom_detail'
             ]
for column in unknown_flag:
      df2=df2.withColumn(column + '_flag',  func_var_flag_udf(df2[column]))
        
        
# df2.limit(2).toPandas()
# df2.groupBy('mobile_area', 'mobile_area_flag').count().sort("count",ascending=False).show(4)

2. Create simple flags: return 'Unk' if the value is null, otherwise the value itself

Numeric columns are not handled here; a sketch for them follows below.

# Create a simple flag: return 'Unk' for null values, otherwise the value itself (string columns only)
def func_var_grp_flag(var):
    if var is None or var == '':
        return 'Unk'
    else:
        return var
    
func_var_grp_udf = udf(func_var_grp_flag, StringType())
unknown_grp_flag=['custom_sex','custom_utm_medium']
for column in unknown_grp_flag:
      df2=df2.withColumn(column + '_grp',  func_var_grp_udf(df2[column]))
        
# df2.limit(2).toPandas()
# df2.dtypes
len(df2.columns)
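For the numeric columns left untouched above, one simple option (a sketch, consistent with func_var_flag already treating 0 as "unknown") is to fill numeric nulls with 0; the column list here is hypothetical:

# Sketch: fill nulls in numeric columns with 0; numeric_na_cols is a hypothetical list
numeric_na_cols = ['custom_approach_id', 'custom_channel_id']
df2 = df2.na.fill(0, subset=numeric_na_cols)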

3. String-valued CASE WHEN recoding in SQL

# Register a temporary view for SQL queries
df2.createOrReplaceTempView("temp")
df3 = spark.sql("""select *, 
                   
                    CHAR_LENGTH(trim(custom_username)) as name_len,
                    case when CHAR_LENGTH(custom_username) = 1 then 'len=1'
                         when custom_username in ('400用户','询价客户','客户','团购用户','微聊客户','网友','报价用户','匿名用户'
                                           ,'汽车之家用户','车主','佚名',
                                           '爱卡用户','询价用户','17汽车来电客户','团购客户','匿名','意向客户') then custom_username
                         when custom_username like '%先生%' or custom_username like '%女士%' then 'x Mr/Mrs'
                         when SUBSTR(trim(custom_mobile),1,1) ="1" and CHAR_LENGTH(trim(custom_username))=11 then 'phone_num'
                         when substr(custom_username,1,1) in ('0','1','2','3','4','5','6','7','8','9') then 'numbers'
                         when CHAR_LENGTH(custom_username) > 3 then 'len>3'
                         else 'Normal'
                    end as name_flag2,
 
                    CHAR_LENGTH(trim(custom_mobile)) as mobile_len,
                    case when SUBSTR(trim(custom_mobile),1,1) =0 then 'fixed-line telephone'
                         when SUBSTR(trim(custom_mobile),1,1) =1 and CHAR_LENGTH(trim(custom_mobile)) =11 then 'mobile phone'
                         else 'No-valid'
                    end as tel_flag,
                    case when SUBSTR(trim(custom_mobile),1,1) =1 and CHAR_LENGTH(custom_mobile)=11 then SUBSTR(trim(custom_mobile),1,2)
                    end as tel_head2,
                    case when SUBSTR(trim(custom_mobile),1,1) =1 and CHAR_LENGTH(custom_mobile)=11 then SUBSTR(trim(custom_mobile),1,3)
                    end as tel_head3,
                    case when CHAR_LENGTH(custom_mobile)<>11 then 'Not-Phone'
                         when SUBSTR(trim(custom_mobile),1,3) in ('186','138','139','135','136','137','159','158','150','151',
                                                         '187','182','189','152','188','176','185','180','183','133',
                                                         '181','177','131','130','132','156','134','153','155','173',
                                                         '157','199','178','175','166','184','198','147','191','170','171'
                                                         ) then 'valid'
                         else 'No-Valid' 
                    end as tel_head3_grp,      
                     case when custom_mobile_area is null or custom_mobile_area="" then 'Unk'
                          when custom_mobile_area in ('海口市','大连市','昆明市','吉林市','江门市','西宁市','珠海市','呼和浩特市','张家口市') 
                               then 'level1'
                          when custom_mobile_area in ('金华市','赣州市','湖州市','徐州市','盐城市') then 'level2'
                          when custom_mobile_area in ('沈阳市','成都市') then 'level3'
                          when custom_mobile_area in ('杭州市','南京市','宜春市','吉安市') then 'level4' 
                          else 'Others'
                      end as mobile_area_grp,  
                      
                    case when custom_channel_id in ('73','72','10070','62','10063','61','10012','10061','65','60','10072','76',
                                            '10062','10071','63','10073','36','77') then custom_channel_id
                         else 'Others' 
                    end as channel_grp,
                    case when custom_media_id in ('4f15069347ea4') then 'level1'
                         when custom_media_id in ('4f15069348034') then 'level2'
                         when custom_media_id in ('5c7397fa8c5f3') then 'level3'
                         when custom_media_id in ('5aa8e618a1915','58107fdf18a64') then 'level4'
                         when custom_media_id in ('588176b5dc052','4f150a09d9a7d','541994c0e4126','54068f14cde9b',
                                                                '5a308c5df0537',
                                                                '54052681387a5',
                                                                '54068f14cde9h',
                                                                '5c6d2672f1f95',
                                                                '57d2a59bc8dbb',
                                                                '4f15053feac73',
                                                                '5c233d3561514',
                                                                '4f150693481c2',
                                                                '4f15069348647',
                                                                '4f150a09db456',
                                                                '4f150a09d608c') then 'level5'
                         when custom_media_id in ('0') then 'Unk'
                         else 'Others'
                    end as media_grp,
                    
                    case when custom_detail is null or custom_detail= "" then NULL
                         when custom_detail like '%询价%'   then 'Inquire'
                         when custom_detail like '%经销商%' then 'Retail'
                         when custom_detail like '%试驾%'   then 'Trial run'
                         when custom_detail like '2.0L %' or custom_detail like '2.5L %' then 'car_type'
                         when custom_detail like '%通话%'   then 'comment6'

                         when custom_detail like '%失败%'   then 'comment2'
                         when custom_detail like '%成功%'   then 'comment1'
                         when custom_detail like '%无效%'   then 'comment3'
                         when custom_detail like '%黑名单%' then 'comment4'
                         when custom_detail like '%姓名%'   then 'comment5'
                         end as comment_type,

                     case when custom_province_id in ('150000','460000','630000','530000','620000','520000','650000','24') then 'level1'
                          when custom_province_id in ('440000','610000','31','220000','640000') then 'level2'
                          when custom_province_id in ('130000','430000','370000','25','410000','210000','340000') then 'level3'
                          when custom_province_id in ('420000','350000','230000') then 'level4'
                          when custom_province_id in ('320000','450000','510000','360000','140000','330000','2') then 'level5'
                          end as custom_province_grp,
                          
                    case when custom_area_id in ('215','499') then 'South'
                         when custom_area_id in ('497')       then 'North'
                         when custom_area_id in ('500')       then 'East2'
                         when custom_area_id in ('20004')     then 'East1'
                         when custom_area_id in ('221','501') then 'North-East'
                         when custom_area_id in ('502')       then 'West'
                         end as area_grp

                  from temp """)
# df3.dtypes
len(df3.columns)
#  84

Drop unneeded columns

# Drop unneeded columns
drop_list1 = ['custom_create_time','custom_post_time', 'create_time_new','post_time_new',
              'custom_verify_status', 'custom_sync_time',
              'custom_username','custom_mobile','custom_mobile_area','custom_media_id',
              'custom_utm_source','custom_utm_content','custom_utm_medium','custom_utm_campaign','custom_detail'
            ]
df4 = df3.select([column for column in df3.columns if column not in drop_list1])

len(df4.columns)

Drop the ID fields

# List of variables to drop - only independent variables should be left in final dataset
drop_attrs = [ "custom_clue_id", "dim_month_id","dim_day_id"]
df4 = df4.select([column for column in df4.columns if column not in drop_attrs])
# df4.select('resource_flag').distinct().show()
# df4.dtypes
len(df4.columns)

Find uninformative columns with only a single distinct value and drop them

# Slow to run
# Find columns that contain only a single distinct value
one_value_flag=[]
for column in df4.columns:
    if df4.select(column).distinct().count()==1:
        one_value_flag.append(column)
one_value_flag
df4=df4.drop(*one_value_flag)
len(df4.columns)
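The loop above launches one Spark job per column; a single-pass alternative (a sketch) computes all distinct counts in one scan using countDistinct, which is already imported. Note that countDistinct ignores nulls, so an all-null column is also flagged:

# Sketch: find single-value columns in one pass
distinct_counts = df4.agg(*[countDistinct(c).alias(c) for c in df4.columns]).first()
one_value_flag = [c for c in df4.columns if distinct_counts[c] <= 1]
df4 = df4.drop(*one_value_flag)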

Convert numeric columns to strings

# Convert numerics to strings; some ID columns were read in as numeric types
df5=df4
int_to_string_list=['custom_approach_id','custom_channel_id','custom_product_id','custom_pattern_id','custom_ctype_id',
                    'custom_activity_id','custom_province_id','custom_city_id','custom_district_id',
                    'custom_dealer_id','custom_area_id','custom_two_area_id'
                    ]
for col in int_to_string_list:
    df5 = df5.withColumn(col, df5[col].cast(StringType()))
    
# Single-column test
# df5 = df4.withColumn('approach_id', df4['approach_id'].cast(StringType()))
# df5.dtypes
numeric_cols = [x[0] for x in df5.dtypes if (x[1] != 'string') and (x[0] != 'label')]
numeric_cols
# String columns ('label' is the target; it is numeric, so it is excluded above)
string_cols = [x[0] for x in df5.dtypes if (x[1] == 'string')]
string_cols

Fill missing values in string columns

# One-hot encoding fails when string columns contain nulls
for col in string_cols:
    df5 = df5.na.fill('EMPTY', subset=[col])
    df5 = df5.na.replace('', 'EMPTY', col)
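The loop can also be collapsed into two calls over all string columns at once (a sketch):

# Sketch: fill and replace across all string columns in one pass
df5 = df5.na.fill('EMPTY', subset=string_cols)
df5 = df5.na.replace('', 'EMPTY', subset=string_cols)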

Check whether each categorical column has 25 or more levels

This simplifies the pipeline step later: columns with 25 or more levels only get a StringIndexer transform, while columns with fewer than 25 levels are also one-hot encoded.

If any column has more than 25 categories, add it to the drop list or convert it to a continuous variable if possible.

# Slow to run
# Check if there are categorical vars with 25+ levels
string_more_than32=[]
string_more_than25=[]
string_less_than25=[]

for column in string_cols:
    n_levels = df5.select(column).distinct().count()  # compute the count once per column
    if n_levels >= 32:
        string_more_than32.append(column)
    if n_levels >= 25:
        string_more_than25.append(column)
    else:
        string_less_than25.append(column)
        
# df_long_factors = df5.select([when(countDistinct(column) >=25, 'T').otherwise('F').alias(column) for column in string_cols]) 
# df5.select('custom_sex').distinct().count()

string_more_than32
# df5.select('custom_district_id').distinct().count() # 166

Drop categorical variables with 32 or more levels

# Drop the columns with 32+ levels
df5 = df5.drop(*string_more_than32)
len(df5.columns)
string_more_than25
string_25_than32 = list(set(string_more_than25).difference(set(string_more_than32)))
string_25_than32
string_less_than25
string_cols = [x[0] for x in df5.dtypes if (x[1] == 'string') ]
string_cols

3. Pipeline processing

# 1. Encode the categorical data
main_stages = []
for col in string_cols:
    indexer = StringIndexer(inputCol = col, outputCol = col + 'Index', handleInvalid="keep")
    main_stages += [indexer]
    
# ?StringIndexer
# 2. OneHotEncoder for string_less_than25
for col in string_less_than25:
    encoder = OneHotEncoderEstimator(inputCols = [col + 'Index'], outputCols = [col + 'Vec'])
    main_stages += [encoder]
# 1. Numeric variables
assemblerInputs = numeric_cols
# 2. Index-encoded versions of the 25-to-32-level string columns
assemblerInputs = assemblerInputs + [col + 'Index' for col in string_25_than32]

# 3. One-hot encoded vectors for the string columns with fewer than 25 levels
assemblerInputs = assemblerInputs + [col + 'Vec' for col in string_less_than25]

# len(df5.columns)# 65 Y
# len(assemblerInputs) # 64
# 4. Assemble the steps: pass all the inputs to the VectorAssembler
# VectorAssembler merges multiple columns into a single vector column, so all features end up in one 'features' column
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')
main_stages += [assembler]
# 5. Create a Pipeline. Now that all the stages are ready, push the data through the pipeline
# This step takes quite a while
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = main_stages)
pipelineModel = pipeline.fit(df5)
df6 = pipelineModel.transform(df5)
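A quick look at one transformed row (a sketch) confirms the pipeline produced the assembled vector:

# Sketch: peek at the label and the assembled feature vector
df6.select('label', 'features').show(1, truncate=False)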

4. Model building

Split the dataset

# Create a new DataFrame with only the label and features columns
# dfi = data_features.select(['label', 'features'])
dfi = df6.select(['label', 'features'])


# Split into training and test sets
train, test = dfi.randomSplit([0.7,0.3], 100)
# train,test,validation = dfi.randomSplit([0.6,0.2,0.2],seed=2020)

# Very slow to run
# print("Training Dataset Count: " + str(train.count()))
# print("Test Dataset Count: " + str(test.count()))

# Training Dataset Count: 1249630
# Test Dataset Count: 311356
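Both models below fit on train; caching the splits (a sketch) avoids recomputing the whole pipeline lineage on every fit:

# Sketch: cache the splits so each model fit does not recompute the lineage
train.cache()
test.cache()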

Random Forest Classifier

# Model configuration
rf = RandomForestClassifier( labelCol='label', 
                             featuresCol='features', 
                             numTrees=100, 
                             maxBins=32
                            )

# Train the model
# Fit the data to the model
rfModel = rf.fit(train)


# Use transform() to make predictions on the test set
predictions = rfModel.transform(test)

# Select fields from the predictions to inspect
predictions.select('label', 'rawPrediction', 'prediction', 'probability') \
           .orderBy('probability', ascending=False) \
           .show(n=10, truncate=30) 

+-----+------------------------------+----------+------------------------------+
|label| rawPrediction|prediction| probability|
+-----+------------------------------+----------+------------------------------+
| 0|[79.15890827146472,20.84109...| 0.0|[0.7915890827146475,0.20841...|
| 0|[79.10923525773862,20.89076...| 0.0|[0.7910923525773864,0.20890...|
| 0|[78.98945518105177,21.01054...| 0.0|[0.7898945518105179,0.21010...|
| 0|[78.9282993850366,21.071700...| 0.0|[0.7892829938503662,0.21071...|
| 0|[78.91212774787148,21.08787...| 0.0|[0.7891212774787151,0.21087...|
| 0|[78.89054837885494,21.10945...| 0.0|[0.7889054837885496,0.21109...|
| 0|[78.89054837885494,21.10945...| 0.0|[0.7889054837885496,0.21109...|
| 0|[78.89054837885494,21.10945...| 0.0|[0.7889054837885496,0.21109...|
| 0|[78.89054837885494,21.10945...| 0.0|[0.7889054837885496,0.21109...|
| 0|[78.89054837885494,21.10945...| 0.0|[0.7889054837885496,0.21109...|
+-----+------------------------------+----------+------------------------------+

# Evaluate the model
evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))
# Test Area Under ROC: 0.6160155402990332
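The trained forest also exposes feature importances; indices follow the order of assemblerInputs (a sketch):

# Sketch: importances come back as a SparseVector over the assembled features
print(rfModel.featureImportances)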

Save the model

# import sys, os
# os.getcwd() 
rfModel.write().overwrite().save('Model test/rfModel') 

Load the model

from pyspark.ml.classification import RandomForestClassificationModel 
model_1 = RandomForestClassificationModel.load('Model test/rfModel') 
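The reloaded model scores new data exactly like the original (a sketch):

# Sketch: use the reloaded model for prediction directly
predictions_loaded = model_1.transform(test)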

Gradient-Boosted Tree Classifier

# Model configuration: train a GBT classifier model
gbt = GBTClassifier(maxIter=10)
# Train the model
# Fit the data to the model
gbtModel = gbt.fit(train)
# Use transform() to make predictions on the test set
predictions = gbtModel.transform(test)

# Select fields from the predictions to inspect
predictions.select( 'label', 'rawPrediction', 'prediction', 'probability').show(10)

+-----+--------------------+----------+--------------------+
|label| rawPrediction|prediction| probability|
+-----+--------------------+----------+--------------------+
| 0|[-0.0582178194283...| 1.0|[0.47092393217850...|
| 0|[-0.0667980984304...| 1.0|[0.46665053764714...|
| 0|[-0.0560469563372...| 1.0|[0.47200582803120...|
| 0|[0.04211971652931...| 0.0|[0.52104741320470...|
| 0|[0.08544882017875...| 0.0|[0.54262072878469...|
| 0|[-0.0728647167488...| 1.0|[0.46363198136231...|
| 0|[-0.0142166646760...| 1.0|[0.49289214652005...|
| 0|[0.08754857661758...| 0.0|[0.54366279043135...|
| 0|[-0.0676538770780...| 1.0|[0.46622457631215...|
| 0|[-0.0713656699888...| 1.0|[0.46437762010753...|
+-----+--------------------+----------+--------------------+

# Evaluate the model
evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

# Save the Gradient-Boosted Tree model

gbtModel.write().overwrite().save('Model test/gbtModel')
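Like the random forest, the saved GBT model can be reloaded later (a sketch):

# Sketch: reload the saved GBT model
from pyspark.ml.classification import GBTClassificationModel
gbt_loaded = GBTClassificationModel.load('Model test/gbtModel')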
