/**
* Dataset.scala
* Returns a new Dataset partitioned by the given partitioning expressions into
* `numPartitions`. The resulting Dataset is hash partitioned.
*
* This is the same operation as "DISTRIBUTE BY" in SQL (Hive QL).
*
* @group typedrel
* @since 2.0.0
*/
@scala.annotation.varargs
def repartition(numPartitions: Int, partitionExprs: Column*): Dataset[T] = withTypedPlan {
RepartitionByExpression(partitionExprs.map(_.expr), logicalPlan, Some(numPartitions))
}
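As a quick illustration, this overload is typically invoked like the sketch below (the people Dataset and its name column are hypothetical):

import org.apache.spark.sql.functions.col

// Hash-partition the hypothetical people Dataset into 4 partitions by the name column.
val byName = people.repartition(4, col("name"))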
The scaladoc is explicit: "The resulting Dataset is hash partitioned." Since hash partitioning is used, let's look at the source of the hash partitioner:
/**
* Partitioner.scala
* A [[org.apache.spark.Partitioner]] that implements hash-based partitioning using
* Java's `Object.hashCode`.
*
* Java arrays have hashCodes that are based on the arrays' identities rather than their contents,
* so attempting to partition an RDD[Array[_]] or RDD[(Array[_], _)] using a HashPartitioner will
* produce an unexpected or incorrect result.
*/
class HashPartitioner(partitions: Int) extends Partitioner {
  require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.")

  def numPartitions: Int = partitions

  def getPartition(key: Any): Int = key match {
    case null => 0
    case _ => Utils.nonNegativeMod(key.hashCode, numPartitions)
  }

  override def equals(other: Any): Boolean = other match {
    case h: HashPartitioner =>
      h.numPartitions == numPartitions
    case _ =>
      false
  }

  override def hashCode: Int = numPartitions
}
/**
 * Utils.scala
 * Calculates 'x' modulo 'mod', taking into consideration the sign of x,
 * i.e. if 'x' is negative, then 'x' % 'mod' is negative too,
 * so the function returns (x % mod) + mod in that case.
 */
def nonNegativeMod(x: Int, mod: Int): Int = {
  val rawMod = x % mod
  rawMod + (if (rawMod < 0) mod else 0)
}
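To make the routing concrete, here is a small sketch of what HashPartitioner.getPartition effectively computes for non-null keys (the example names are arbitrary):

// nonNegativeMod as defined in Utils.scala above.
def nonNegativeMod(x: Int, mod: Int): Int = {
  val rawMod = x % mod
  rawMod + (if (rawMod < 0) mod else 0)
}

val numPartitions = 4
// With five distinct names and only four partitions, at least two names must share a partition.
Seq("alice", "bob", "carol", "dave", "erin").foreach { name =>
  println(s"$name -> partition ${nonNegativeMod(name.hashCode, numPartitions)}")
}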
With this in mind, the earlier observation that one partition held records with different name values is no longer surprising: different name values can land in the same partition once their hashCode is taken modulo the number of partitions, as the sketch above shows. A simple workaround is to keep a HashMap inside the per-partition loop so that records with different name values are handled separately. But what if we want to control the partitioning ourselves? Code that partitions by explicit groups is more direct and easier to reason about. Fortunately, Spark exposes the Partitioner abstract class, so we can plug in a custom partitioner that implements this group-based partitioning. A simple implementation along these lines, which ensures a partition only ever holds records with the same name:
import org.apache.commons.collections.CollectionUtils;
import org.apache.spark.Partitioner;
import org.junit.Assert;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
* Created by lesly.lai on 2018/7/25.
*/
public class CuxGroupPartitioner extends Partitioner {
    private int partitions;
    /**
     * Maps each distinct key (here: a name value) to its own partition index,
     * so every key gets a dedicated partition.
     */
    private Map<Object, Integer> partitionIndexMap = new ConcurrentHashMap<>();

    public CuxGroupPartitioner(List<?> groupList) {
        Assert.assertTrue(CollectionUtils.isNotEmpty(groupList));
        int index = 0;
        for (Object key : groupList) {
            partitionIndexMap.put(key, index++);
        }
        this.partitions = partitionIndexMap.size();
    }

    @Override
    public int numPartitions() {
        return partitions;
    }

    @Override
    public int getPartition(Object key) {
        // Unknown (or null) keys fall back to partition 0, mirroring HashPartitioner's null handling.
        Integer index = partitionIndexMap.get(key);
        return index == null ? 0 : index;
    }
}
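A quick sanity check of this partitioner in isolation might look like the following sketch (the keys are hypothetical):

// Hypothetical keys: each distinct key gets its own partition index.
val partitioner = new CuxGroupPartitioner(java.util.Arrays.asList("alice", "bob", "carol"))
assert(partitioner.numPartitions == 3)
assert(partitioner.getPartition("alice") != partitioner.getPartition("bob"))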
A utility class for inspecting how records are distributed across partitions:
import org.apache.spark.sql.{Dataset, Row}
/**
 * Created by lesly.lai on 2017/12/25.
 */
class SparkRddTaskInfo {
  def getTask(dataSet: Dataset[Row]) {
    val size = dataSet.rdd.partitions.length
    println(s"==> partition size: $size")
    import scala.collection.Iterator
    val showElements = (it: Iterator[Row]) => {
      val ns = it.toSeq
      import org.apache.spark.TaskContext
      val pid = TaskContext.get.partitionId
      println(s"[partition: $pid][size: ${ns.size}] ${ns.mkString(" ")}")
    }
    dataSet.foreachPartition(showElements)
  }
}
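If only the per-partition row counts are needed rather than every record, a lighter variant (a sketch using RDD.glom) could look like this:

import org.apache.spark.sql.{Dataset, Row}

// Per-partition row counts, in partition order, without printing each record.
def partitionSizes(dataSet: Dataset[Row]): Array[Int] =
  dataSet.rdd.glom().map(_.length).collect()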
Here is how it all gets called:
import com.vip.spark.db.ConnectionInfos;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.util.List;
import java.util.stream.Collectors;
/**
* Created by lesly.lai on 2018/7/23.
*/
public class SparkSimpleTestPartition {
    public static void main(String[] args) throws InterruptedException {
        SparkSession sparkSession = SparkSession.builder().appName("Java Spark SQL basic example").getOrCreate();
        // Original dataset read from the people table
        Dataset<Row> originSet = sparkSession.read().jdbc(ConnectionInfos.TEST_MYSQL_CONNECTION_URL, "people", ConnectionInfos.getTestUserAndPasswordProperties());
        originSet.createOrReplaceTempView("people");
        // Utility class for inspecting the partition distribution
        SparkRddTaskInfo taskInfo = new SparkRddTaskInfo();
        Dataset<Row> groupSet = sparkSession.sql("select name from people group by name");
        List<Object> groupList = groupSet.javaRDD().collect().stream().map(row -> (Object) row.getAs("name")).collect(Collectors.toList());
        // Build a pair RDD keyed by name; only pair RDDs support a custom partitioner, so convert first
        JavaPairRDD<String, Row> pairRDD = originSet.javaRDD().mapToPair(row -> new Tuple2<>(row.<String>getAs("name"), row));
        // Apply the custom partitioner, then drop the key and keep only the Row
        JavaRDD<Row> javaRdd = pairRDD.partitionBy(new CuxGroupPartitioner(groupList)).map(new Function<Tuple2<String, Row>, Row>() {
            @Override
            public Row call(Tuple2<String, Row> v1) throws Exception {
                return v1._2;
            }
        });
        Dataset<Row> result = sparkSession.createDataFrame(javaRdd, originSet.schema());
        // Print the partition distribution
        taskInfo.getTask(result);
    }
}
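As a final check, each partition should now hold exactly one distinct name. A Scala sketch of that verification, assuming result is the repartitioned Dataset produced above:

// Count distinct name values per partition; with the custom partitioner each count should be 1.
val distinctNamesPerPartition = result.rdd
  .mapPartitionsWithIndex((pid, it) => Iterator((pid, it.map(_.getAs[String]("name")).toSet.size)))
  .collect()
distinctNamesPerPartition.foreach { case (pid, n) =>
  println(s"partition $pid holds $n distinct name value(s)")
}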