Flink的内置DataSource+自定义DataSource

内置的DataSource

1. 读取文件

package org.feng.datasource;

import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.java.io.TextInputFormat;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.FileProcessingMode;

/**
 * Created by Feng on 2019/12/5 16:59
 * CurrentProject's name is flink
 * 文件读取
 * @author Feng
 */
public class FileDataSource {
    /**
     * Demonstrates Flink's built-in file-reading data sources.
     *
     * @param args args[0] must be the path of the text file to read
     * @throws Exception if the Flink job fails to execute
     */
    public static void main(String[] args) throws Exception {
        // Fail fast with a clear message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 1) {
            throw new IllegalArgumentException("Usage: FileDataSource <filePath>");
        }
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Read a text file; args[0] is the file path.
        DataStream<String> dataStream = env.readTextFile(args[0]);

        // Read a text file with an explicit character encoding.
        DataStream<String> dataStream1 = env.readTextFile(args[0], "UTF-8");

        // Read the file through an explicit input format.
        DataStream<String> dataStream2 = env.readFile(new TextInputFormat(new Path(args[0])), args[0]);

        /*
         * inputFormat: input format of the data stream
         * filePath: path of the file to read
         * watchType: how the file is consumed
         *      FileProcessingMode.PROCESS_ONCE          read the file once
         *      FileProcessingMode.PROCESS_CONTINUOUSLY  re-read whenever the file changes
         * interval: periodic scan interval (milliseconds)
         * typeInformation: type of the elements in the resulting stream
         */
        DataStream<String> dataStream3 = env.readFile(new TextInputFormat(new Path(args[0])), args[0],
                FileProcessingMode.PROCESS_ONCE, 1, BasicTypeInfo.STRING_TYPE_INFO);

        dataStream.print();
        dataStream1.print();
        dataStream2.print();
        dataStream3.print();
        env.execute();
    }
}

2. GenerateDataSource

package org.feng.datasource;

import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.io.Serializable;
import java.util.Arrays;
import java.util.Iterator;

/**
 * Created by Feng on 2019/12/5 17:19
 * CurrentProject's name is flink
 * 构建无序
 * 基于集合构建
 * 基于元素创建
 * 自定义迭代器
 * @author Feng
 */
/**
 * Demonstrates generated data sources:
 * number sequence (unordered), collection-based, element-based,
 * and a custom iterator.
 *
 * @author Feng
 */
public class GenerateDataSource {
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Sequence 0..10 (arrival order not guaranteed), mapped to strings,
        // with an explicit return type; printed with parallelism 1.
        env.generateSequence(0, 10)
                .map(num -> "feng" + num)
                .returns(Types.STRING)
                .print()
                .setParallelism(1);

        // Source built from a collection.
        env.fromCollection(Arrays.asList(1,2,3,4,5,6,7,8))
                .returns(Types.INT)
                .print()
                .setParallelism(1);

        // Source built from individual elements.
        env.fromElements("hello", "world", "Flink")
                .returns(Types.STRING)
                .print()
                .setParallelism(1);

        // Source backed by a custom iterator of strings.
        env.fromCollection(new StringIterator(), BasicTypeInfo.STRING_TYPE_INFO)
                .returns(Types.STRING)
                .print()
                .setParallelism(1);

        env.execute();
    }

    /**
     * Produces exactly 100 strings: "fengsoshuai1" .. "fengsoshuai100".
     * Serializable because Flink ships the iterator to the task managers.
     */
    private static class StringIterator implements Iterator<String>, Serializable {

        private static final long serialVersionUID = 2746816923013094383L;
        // Number of elements produced so far.
        private int index = 0;
        @Override
        public boolean hasNext() {
            return index < 100;
        }

        @Override
        public String next() {
            // Honor the Iterator contract: signal exhaustion explicitly instead of
            // silently producing values past the advertised end.
            if (!hasNext()) {
                throw new java.util.NoSuchElementException("StringIterator is exhausted");
            }
            index ++;
            return "fengsoshuai" + index;
        }
    }
}

3. 监听端口

这里若是不知道怎么运行,请查看我的Flink入门程序的博文。

package org.feng.datasource;

import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Created by Feng on 2019/12/5 17:47
 * CurrentProject's name is flink
 * @author Feng
 */
/**
 * Reads newline-delimited text records from a TCP socket and prints them.
 *
 * @author Feng
 */
public class SocketDataSource {
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Host to connect to.
        final String host = "localhost";
        // Port the source listens on.
        final int listenPort = 12345;
        // Delimiter separating individual records in the byte stream.
        final String recordDelimiter = "\n";
        // Retry attempts: a negative value retries forever, 0 never retries.
        final int retryCount = 3;

        env.socketTextStream(host, listenPort, recordDelimiter, retryCount)
                .returns(Types.STRING)
                .print()
                .setParallelism(1);

        env.execute();
    }
}

4. 自定义数据源(addSource方法)

package org.feng.datasource;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;

import java.io.Serializable;

/**
 * Created by Feng on 2019/12/5 18:25
 * CurrentProject's name is flink
 * addSource方法的使用:自定义数据源
 * @author Feng
 */
/**
 * Demonstrates custom data sources registered via {@code addSource}.
 * Serializable because the anonymous source functions capture the
 * enclosing instance ({@code dds}) and Flink serializes them.
 *
 * @author Feng
 */
public class DefineDataSource implements Serializable {
    private static final long serialVersionUID = 6883121931605774645L;

    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DefineDataSource dds = new DefineDataSource();

        // Plain SourceFunction: always non-parallel.
        env.addSource(new SourceFunction<String>() {
            private static final long serialVersionUID = -2595016291338802241L;

            private String temp = "fengsoshaui-SourceFunction-";
            // Number of records emitted so far.
            private int count = 0;
            private volatile boolean running = true;

            @Override
            public void run(SourceContext<String> sourceContext) {
                int thousand = 1000;
                // Reads the volatile field directly, so cancel() stops the loop.
                while(running && count++ < thousand){
                    sourceContext.collect(temp + count);
                }
            }

            @Override
            public void cancel() {
                running = false;
            }
        }).print().setParallelism(1);

        // ParallelSourceFunction: may run with parallelism > 1.
        env.addSource(new ParallelSourceFunction<String>() {
            private static final long serialVersionUID = -8628017068895567640L;

            private String temp = "fengsoshaui-ParallelSourceFunction-";
            private volatile boolean running = true;

            @Override
            public void run(SourceContext<String> sourceContext) throws Exception {
                // Pass a supplier, not the boolean VALUE: copying the flag by value
                // would freeze it at call time and make cancel() a no-op.
                dds.run(sourceContext, () -> running, temp);
            }

            @Override
            public void cancel() {
                running = false;
            }
        }).print().setParallelism(1);


        // RichParallelSourceFunction: parallel source with lifecycle hooks.
        env.addSource(new RichParallelSourceFunction<String>() {
            private static final long serialVersionUID = -3591733182357219507L;
            private String temp = "fengsoshaui-RichParallelSourceFunction-";
            private volatile boolean running = true;

            @Override
            public void run(SourceContext<String> sourceContext) throws Exception {
                dds.run(sourceContext, () -> running, temp);
            }

            @Override
            public void cancel() {
                running = false;
            }
        }).print().setParallelism(1);

        env.execute();
    }

    /**
     * Emits up to 1000 records "temp1".."temp1000", stopping early once
     * {@code running} reports false.
     * <p>
     * Bug fix: the previous signature took {@code running} (and the counter)
     * by value, so a concurrent {@code cancel()} could never terminate the
     * loop; the supplier re-reads the caller's volatile flag each iteration.
     *
     * @param sourceContext sink for emitted records
     * @param running       re-evaluated each iteration; false stops emission
     * @param temp          prefix of every emitted record
     */
    private void run(SourceFunction.SourceContext<String> sourceContext,
                     java.util.function.BooleanSupplier running, String temp){
        int thousand = 1000;
        int count = 0;
        while(running.getAsBoolean() && count++ < thousand) {
            sourceContext.collect(temp + count);
        }
    }
}

你可能感兴趣的:(Flink)