I recently looked into the performance of loading data into Phoenix over JDBC. Test environment:
hadoop (2.3.0+cdh5.0.0+548)
hbase(0.96.1.1+cdh5.0.0+60)
phoenix4.0.0.0
Create the table TAB_PS_XDR2 with four columns to be indexed, and create one index table for each of them, as follows:
create table TAB_PS_XDR2 ( K VARCHAR NOT NULL PRIMARY KEY, A VARCHAR, B VARCHAR, C VARCHAR, D VARCHAR, V VARCHAR) COMPRESSION='SNAPPY',SALT_BUCKETS=12,TTL='7200',VERSIONS='5';
create index CALLING_NUMBER2 on TAB_PS_XDR2 (A);
create index CALLED_NUMBER2 on TAB_PS_XDR2 (B);
create index IMSI2 on TAB_PS_XDR2 (C);
create index IMEI2 on TAB_PS_XDR2 (D);
Test code:
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.lang.management.ManagementFactory;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.sql.Connection;
import java.sql.PreparedStatement;

/**
 * Reads a CSV file through a memory-mapped buffer and upserts each row into
 * TAB_PS_XDR2, executing the batch every `pre` rows.
 */
public static void readCVS5(int pre, String path) {
    File file = new File(path);
    long fileLength = file.length();
    // Buffer sized to the whole file, so the chunk loop below runs exactly once.
    int BUFFER_SIZE = (int) fileLength;
    Connection _Connection = null;
    PreparedStatement _PreparedStatement = null;
    String tSQL = "UPSERT INTO TAB_PS_XDR2 VALUES(?,?,?,?,?,?)";
    try {
        _Connection = HBaseUtility.getConnection();
        _Connection.setAutoCommit(false);
        _PreparedStatement = _Connection.prepareStatement(tSQL);
        long start = System.currentTimeMillis();
        String pid = ManagementFactory.getRuntimeMXBean().getName().split("@")[0];
        FileChannel fc = new RandomAccessFile(file, "r").getChannel();
        // Memory-map the file so large inputs can be read without loading them via streams.
        MappedByteBuffer inputBuffer = fc.map(FileChannel.MapMode.READ_ONLY, 0, fileLength);
        byte[] dst = new byte[BUFFER_SIZE];
        int j = 0;
        for (int offset = 0; offset < fileLength; offset += BUFFER_SIZE) {
            // Copy the next chunk out of the mapped buffer (only the valid bytes).
            int chunk = (int) Math.min(BUFFER_SIZE, fileLength - offset);
            for (int i = 0; i < chunk; i++) {
                dst[i] = inputBuffer.get(offset + i);
            }
            InputStream in = new ByteArrayInputStream(dst, 0, chunk);
            BufferedReader reader = new BufferedReader(new InputStreamReader(in), BUFFER_SIZE);
            String line = reader.readLine();
            String[] strs = null;
            while (line != null) {
                strs = parserLine2(line, ",");
                if (strs.length > 5) {
                    _PreparedStatement.setString(1, strs[0]);
                    _PreparedStatement.setString(2, strs[1]);
                    _PreparedStatement.setString(3, strs[2]);
                    _PreparedStatement.setString(4, strs[3]);
                    _PreparedStatement.setString(5, strs[4]);
                    _PreparedStatement.setString(6, strs[5]);
                    _PreparedStatement.addBatch();
                    if ((++j) % pre == 0) {
                        // Flush a batch every `pre` rows.
                        _PreparedStatement.executeBatch();
                        _PreparedStatement.clearBatch();
                        // _Connection.commit();
                        System.out.println("executeInsert::" + pid);
                    }
                } else {
                    System.out.println("Bad record at row " + j);
                }
                line = reader.readLine();
            }
        }
        // Flush and commit whatever remains in the last partial batch.
        _PreparedStatement.executeBatch();
        _PreparedStatement.clearBatch();
        _Connection.commit();
        System.out.println("executeInsert-LastFlush!!" + pid);
        fc.close();
        long totalTime = System.currentTimeMillis() - start;
        System.out.println("Rows per second: " + (long) j * 1000 / totalTime);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            _PreparedStatement.close();
            _Connection.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
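The helpers HBaseUtility.getConnection() and parserLine2() are not shown in the original post. A minimal sketch of what they might look like follows, assuming the standard Phoenix JDBC driver and a plain delimiter split; the class name, ZooKeeper quorum address, and the placement of parserLine2 are assumptions, and in the original code parserLine2 presumably sits in the same class as readCVS5:

import java.sql.Connection;
import java.sql.DriverManager;

// Hypothetical helpers assumed by readCVS5 above; neither is part of Phoenix itself.
public class HBaseUtility {

    // Opens a Phoenix JDBC connection; the ZooKeeper quorum below is a placeholder.
    public static Connection getConnection() throws Exception {
        Class.forName("org.apache.phoenix.jdbc.PhoenixDriver");
        return DriverManager.getConnection("jdbc:phoenix:zk1,zk2,zk3:2181");
    }

    // Minimal stand-in for parserLine2: split one CSV line on the given delimiter,
    // keeping trailing empty fields.
    public static String[] parserLine2(String line, String delimiter) {
        return line.split(delimiter, -1);
    }
}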
Single-process test (10,000 rows per commit):
sh cmd0.sh 10000 /home/cloudil/limq/test/a-bssap-1.cdr.csv &
The run took about 23 seconds.
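The cmd0.sh script itself is not shown; presumably it just launches the JVM with the batch size and the CSV path as arguments. A minimal entry point along those lines could look like the sketch below, assuming main and readCVS5 live in the same class (the class name ImportTest is made up here):

// Hypothetical entry point matching the two arguments passed by cmd0.sh:
// args[0] = rows per batch (e.g. 10000), args[1] = path to the CSV file.
public class ImportTest {
    public static void main(String[] args) {
        int pre = Integer.parseInt(args[0]);
        String path = args[1];
        readCVS5(pre, path);
    }
}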
Ten-process test (10,000 rows per commit):
The run took about 79 seconds. With ten files averaging about 95 MB each (roughly 950 MB in total), that works out to roughly 12 MB of data processed per second.
Committing every 10,000 to 11,000 rows turned out to be a good batch size.