Operator introduction from the official docs:
Common transformation operators
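Before the larger examples below, here is a minimal sketch of a few common transformations (flatMap, map, filter, groupBy/sum) on a small in-memory DataSet; the sample strings and the object name TransformationDemo are made up for illustration:

import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment}

object TransformationDemo {
  def main(args: Array[String]): Unit = {
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._

    val lines: DataSet[String] = env.fromElements("hello world", "hello flink")

    // flatMap: split each line into words; map: pair every word with the count 1
    val pairs: DataSet[(String, Int)] = lines
      .flatMap(_.split(" "))
      .map(word => (word, 1))

    // filter: keep only words starting with "h"; groupBy(0) + sum(1): count per word
    pairs
      .filter(_._1.startsWith("h"))
      .groupBy(0)
      .sum(1)
      .print() // print() is a sink that triggers execution in the batch API
  }
}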
Example 1: Using mapPartition to save data to a database
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.38</version>
</dependency>
CREATE TABLE `user` (
`id` int(10) NOT NULL AUTO_INCREMENT,
`name` varchar(32) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8;
import java.sql.PreparedStatement
import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment}

object MapPartition2MySql {
  def main(args: Array[String]): Unit = {
    val environment: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    val sourceDataset: DataSet[String] = environment.fromElements("1 zhangsan", "2 lisi", "3 wangwu")
    sourceDataset.mapPartition(part => {
      // One JDBC connection per partition, rather than one per element as plain map() would need
      Class.forName("com.mysql.jdbc.Driver").newInstance()
      val conn = java.sql.DriverManager.getConnection("jdbc:mysql://localhost:3306/flink_db", "flink", "123456")
      // Materialize the iterator with toList so the inserts run before the connection is closed
      val results = part.map(x => {
        val statement: PreparedStatement = conn.prepareStatement("insert into user (id, name) values (?, ?)")
        statement.setInt(1, x.split(" ")(0).toInt)
        statement.setString(2, x.split(" ")(1))
        statement.execute()
      }).toList
      conn.close()
      results
    }).print()
    // print() is itself a sink that triggers the job, so no separate environment.execute() call is needed
  }
}
Example 2: Join operations
import scala.collection.mutable.ListBuffer
import org.apache.flink.api.scala.ExecutionEnvironment

object BatchDemoOuterJoinScala {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._

    val data1 = ListBuffer[Tuple2[Int, String]]()
    data1.append((1, "zs"))
    data1.append((2, "ls"))
    data1.append((3, "ww"))

    val data2 = ListBuffer[Tuple2[Int, String]]()
    data2.append((1, "beijing"))
    data2.append((2, "shanghai"))
    data2.append((4, "guangzhou"))

    val text1 = env.fromCollection(data1)
    val text2 = env.fromCollection(data2)

    // Left outer join: every element of text1 is kept; second is null when there is no match
    text1.leftOuterJoin(text2).where(0).equalTo(0).apply((first, second) => {
      if (second == null) {
        (first._1, first._2, "null")
      } else {
        (first._1, first._2, second._2)
      }
    }).print()

    println("===============================")

    // Right outer join: every element of text2 is kept; first is null when there is no match
    text1.rightOuterJoin(text2).where(0).equalTo(0).apply((first, second) => {
      if (first == null) {
        (second._1, "null", second._2)
      } else {
        (first._1, first._2, second._2)
      }
    }).print()

    println("===============================")

    // Full outer join: elements from both sides are kept; the missing side is null
    text1.fullOuterJoin(text2).where(0).equalTo(0).apply((first, second) => {
      if (first == null) {
        (second._1, "null", second._2)
      } else if (second == null) {
        (first._1, first._2, "null")
      } else {
        (first._1, first._2, second._2)
      }
    }).print()
  }
}
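For comparison with the outer joins above, a plain inner join only emits keys that exist on both sides, so no null checks are needed. A minimal sketch with the same sample data (the object name BatchDemoInnerJoinScala is just for illustration):

import org.apache.flink.api.scala.ExecutionEnvironment

object BatchDemoInnerJoinScala {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._

    val users = env.fromElements((1, "zs"), (2, "ls"), (3, "ww"))
    val cities = env.fromElements((1, "beijing"), (2, "shanghai"), (4, "guangzhou"))

    // Inner join: only ids 1 and 2 are emitted, because id 3 and id 4 have no match on the other side
    users.join(cities).where(0).equalTo(0).apply((first, second) => {
      (first._1, first._2, second._2)
    }).print()
  }
}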
Example 3: Repartitioning with rebalance

import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment}

object FlinkPartition {
  def main(args: Array[String]): Unit = {
    val environment: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    environment.setParallelism(2)
    import org.apache.flink.api.scala._
    val sourceDataSet: DataSet[String] = environment.fromElements("hello world", "spark flink", "hive sqoop")
    // After the filter most of the data can pile up in a few subtasks; rebalance() redistributes it round-robin
    val filterSet: DataSet[String] = sourceDataSet.filter(x => x.contains("hello"))
      .rebalance()
    filterSet.print()
    // print() is a sink that triggers execution, so no separate environment.execute() is needed
  }
}
Example 4: Custom partitioner

import org.apache.flink.api.common.functions.Partitioner

class MyPartitioner extends Partitioner[String] {
  override def partition(word: String, num: Int): Int = {
    println("number of partitions: " + num)
    if (word.contains("hello")) {
      println("routed to partition 0")
      0
    } else {
      println("routed to partition 1")
      1
    }
  }
}
import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment}

object FlinkCustomerPartition {
  def main(args: Array[String]): Unit = {
    val environment: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    // Set the parallelism; if it is not set, the number of CPU cores is used by default
    environment.setParallelism(2)
    import org.apache.flink.api.scala._
    // Build the source DataSet
    val sourceDataSet: DataSet[String] = environment.fromElements("hello world", "spark flink", "hello world", "hive hadoop")
    // Route every element through MyPartitioner, using the whole string as the partition key
    val result: DataSet[String] = sourceDataSet.partitionCustom(new MyPartitioner, x => x + "")
    val value: DataSet[String] = result.map(x => {
      println("key: " + x + ", thread id: " + Thread.currentThread().getId)
      x
    })
    value.print()
    // print() is a sink that triggers execution, so no separate environment.execute() is needed
  }
}