package javasssss;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.hive.HiveContext;
/**
* Created by shengjk1 on 2016/8/8.
* blog address :http://blog.csdn.net/jsjsjs1789
*/
public class SparkInsertHbase {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("SparkInsertHbase");
JavaSparkContext sc = new JavaSparkContext(conf);
HiveContext hiveContext = new HiveContext(sc.sc());
DataFrame df = hiveContext.sql("select id,name from test");
**//froeachPartition foreah 会报task not to serializer。但对mysql来说两者都ok,推荐使用foreachPartition**
df.toJavaRDD().foreachPartition(new VoidFunction>() {
private static final long serialVersionUID = -3496935835002029475L;
@Override
public void call(Iterator rowIterator) throws Exception {
HTable table = new HTable(HBaseConfiguration.create(), "test");
/*
hbase 新api
Configuration config = HBaseConfiguration.create();
//若此处配置zk,则写错程序会卡死。可通过界面查看日志,解决!
//也可以不配,但需要classpath路径有hbase-site.xml文件
config.set("hbase.zookeeper.quorum", "centos2");
Connection conn= ConnectionFactory.createConnection(config);
Table table=conn.getTable(TableName.valueOf("test"));
*/
while (rowIterator.hasNext()) {
Row row = rowIterator.next();
String id = row.getString(0);
String name = row.getString(1);
Put put = new Put("f".getBytes());
put.addColumn("f".getBytes(), "id".getBytes(), id.getBytes());
put.addColumn("f".getBytes(), "name".getBytes(), name.getBytes());
table.put(put);
}
// String tableName = "test";
// Table table=conn.getTable(TableName.valueOf(tableName));
};
});
}
}
// There are three ways to resolve the "Task not serializable" error; for details see:
// http://stackoverflow.com/questions/25250774/writing-to-hbase-via-spark-task-not-serializable