PySpark OneHotEncoder usage example

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator  # Spark 2.3/2.4 API; renamed OneHotEncoder in Spark 3.x

def encode_columns(df, col_list):
    # One StringIndexer per categorical column; "keep" sends unseen labels to an extra index
    indexers = [
        StringIndexer(inputCol=c, outputCol=f'{c}_indexed').setHandleInvalid("keep")
        for c in col_list
    ]
    # A single estimator one-hot encodes all indexed columns at once
    encoder = OneHotEncoderEstimator(
        inputCols=[indexer.getOutputCol() for indexer in indexers],
        outputCols=[f'{c}_encoded' for c in col_list])  # .setDropLast(False)

    pipeline = Pipeline(stages=indexers + [encoder])
    piped_encoder = pipeline.fit(df)
    encoded_df = piped_encoder.transform(df)

    # Build a readable name for each one-hot position, ordered by the learned index
    newColumns = []
    for f in col_list:
        colMap = encoded_df.select(f'{f}', f'{f}_indexed').distinct().rdd.collectAsMap()
        colTuple = sorted((v, f'{f}_{k}') for k, v in colMap.items())
        newColumns.append([name for _, name in colTuple])

    return piped_encoder, encoded_df, newColumns
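
A minimal usage sketch follows, assuming an existing SparkSession named spark; the DataFrame and its two categorical columns (color, size) are purely illustrative and not from the original post.

# Hypothetical demo data; column names are illustrative only
df = spark.createDataFrame(
    [("red", "S"), ("blue", "M"), ("red", "L")],
    ["color", "size"])

model, encoded_df, new_columns = encode_columns(df, ["color", "size"])
encoded_df.show(truncate=False)   # adds color_indexed/color_encoded and size_indexed/size_encoded
print(new_columns)                # e.g. [['color_red', 'color_blue'], ['size_S', 'size_M', 'size_L']]

Each element of new_columns lists the original label behind every position of the corresponding one-hot vector, which is handy when mapping model coefficients back to the source categories.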
