CDH6 + IMPALA
流程:本地数据 -> 上传到 HDFS -> 导入 Impala
# Import configuration (read through the "import-config" prefix).
import-config:
  # Value exposed as csvTmpPath; presumably the local staging directory for uploaded CSV files - confirm.
  csv-tmp-path: D:/test/tmp
  # JDBC URL, user and password used to open the Impala connection.
  impala-url: jdbc:impala://x.x.x.x:21050/default
  impala-user: hue
  impala-password: hue
  # HDFS URI and user used to obtain the Hadoop FileSystem.
  hdfs-uri: hdfs://x.x.x.x:8020
  hdfs-user: hue
  # HDFS directory the local file is copied into before LOAD DATA.
  hdfs-tmp-path: /home/data/tmp
/**
 * ImportConfig
 *
 * <p>Holder for the settings declared under the {@code import-config} configuration prefix.
 * Accessors are generated by Lombok's {@code @Data}.
 *
 * <p>Creation Time: 2019/6/12 16:58.
 *
 * @author Hu-Weihui
 */
@Data
@Component
@ConfigurationProperties(prefix = "import-config")
public class ImportConfig {

    /**
     * Value of {@code csv-tmp-path}.
     * NOTE(review): presumably the local directory where uploaded CSV files are staged - confirm.
     */
    private String csvTmpPath;

    /** JDBC URL used to connect to Impala. */
    private String impalaUrl;

    /** User name for the Impala JDBC connection. */
    private String impalaUser;

    /** Password for the Impala JDBC connection. */
    private String impalaPassword;

    /** User name passed when obtaining the HDFS FileSystem. */
    private String hdfsUser;

    /** URI of the HDFS file system. */
    private String hdfsUri;

    /** HDFS directory that files are copied into before being loaded. */
    private String hdfsTmpPath;
}
关键点:
1. APPEND/OVERWRITE 是我自定义的枚举类,分别对应 IMPALA 的追加和覆盖
2.执行完LOAD DATA 命令后,IMPALA一定要执行REFRESH [TABLE]操作
3.很多坑爹博客会让你用HIVE执行,注意用HIVE-JDBC执行REFRESH [TABLE]会报错,无法识别
4.去CDH下载IMPALA的JDBC驱动包
/**
 * Imports a client-uploaded file into an Impala table.
 *
 * <p>The file is saved locally, copied into the configured HDFS temporary directory, loaded
 * into the table with {@code LOAD DATA INPATH}, and the table is then refreshed with
 * {@code REFRESH}.
 *
 * @param tableName     target table; a plain or backtick-quoted identifier made of letters,
 *                      digits and underscores, optionally prefixed with a database name
 * @param updateMethod  APPEND/OVERWRITE (append or overwrite); {@code null} is treated as append
 * @param multipartFile file uploaded by the client
 * @throws DataManagementException if the table name is not a valid identifier, the upload to
 *                                 HDFS fails, or the Impala statements fail
 */
@Override
public void importImpalaData(String tableName, String updateMethod, MultipartFile multipartFile) {
    // The table name is concatenated into SQL (identifiers cannot be bound as JDBC parameters),
    // so reject anything that is not a simple identifier before doing any work.
    if (tableName == null || !tableName.matches("(\\w+|`\\w+`)(\\.(\\w+|`\\w+`))?")) {
        throw new DataManagementException("表名不合法");
    }

    // 1. Save the CSV to the local machine / local server.
    File localFile = saveToLocal(multipartFile);
    String hdfsDstPath = importConfig.getHdfsTmpPath() + "/" + localFile.getName();

    // 2. Upload the local file to HDFS.
    uploadLocalFileToHdfs(localFile.getPath(), hdfsDstPath);

    // 3. Let Impala load the HDFS file into the table, then refresh the table.
    loadHdfsFileIntoImpala(hdfsDstPath, tableName, updateMethod);
}

/**
 * Copies a local file to the given HDFS destination, creating the HDFS temporary directory
 * first when it does not exist.
 */
private void uploadLocalFileToHdfs(String localFilePath, String hdfsDstPath) {
    Path srcPath = new Path(localFilePath);
    Path dstPath = new Path(hdfsDstPath);
    Path hdfsPath = new Path(importConfig.getHdfsTmpPath());
    FileSystem fileSystem = null;
    try {
        Configuration configuration = new Configuration();
        URI hdfsUri = new URI(importConfig.getHdfsUri());
        fileSystem = FileSystem.get(hdfsUri, configuration, importConfig.getHdfsUser());
        if (!fileSystem.exists(hdfsPath)) {
            fileSystem.mkdirs(hdfsPath);
        }
        fileSystem.copyFromLocalFile(srcPath, dstPath);
    } catch (URISyntaxException e) {
        log.error("the uri have some error :", e);
        throw new DataManagementException("上传到数据失败");
    } catch (IOException e) {
        log.error("con not get FileSystem :", e);
        throw new DataManagementException("上传到数据失败");
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers up the stack can still observe the interruption.
        Thread.currentThread().interrupt();
        log.error("InterruptedException :", e);
        throw new DataManagementException("上传到数据失败");
    } finally {
        // Release the FileSystem handle; a close failure must not fail a finished upload.
        closeLoggingFailure(fileSystem, " can not close FileSystem: ");
    }
}

/**
 * Runs {@code LOAD DATA INPATH} for the HDFS file against the table, followed by
 * {@code REFRESH} on the same table, over an Impala JDBC connection.
 */
private void loadHdfsFileIntoImpala(String hdfsDstPath, String tableName, String updateMethod) {
    Connection connection = null;
    Statement statement = null;
    try {
        String url = importConfig.getImpalaUrl();
        String user = importConfig.getImpalaUser();
        String password = importConfig.getImpalaPassword();
        Class.forName("com.cloudera.impala.jdbc41.Driver");
        connection = DriverManager.getConnection(url, user, password);

        // Null-safe comparison: a missing update method falls back to appending.
        boolean overwrite = updateMethod != null
                && updateMethod.equals(UpdateMethod.OVERRIDE.getCode());
        String loadSql = overwrite
                ? "LOAD DATA INPATH '" + hdfsDstPath + "' OVERWRITE INTO TABLE " + tableName
                : "LOAD DATA INPATH '" + hdfsDstPath + "' INTO TABLE " + tableName;

        statement = connection.createStatement();
        statement.execute(loadSql);

        // Refresh the Impala table after the load.
        String refreshSql = String.format("REFRESH %s", tableName);
        statement.execute(refreshSql);
    } catch (ClassNotFoundException e) {
        log.error("load impala driver class fail :", e);
        throw new DataManagementException("导入数据失败");
    } catch (SQLException e) {
        log.error("can not to load hdfs data into impala :", e);
        throw new DataManagementException("导入数据失败");
    } finally {
        closeLoggingFailure(statement, " can not close statement: ");
        closeLoggingFailure(connection, " can not close connection: ");
    }
}

/** Closes the resource if it is non-null; a failure to close is logged, never thrown. */
private void closeLoggingFailure(AutoCloseable resource, String failureMessage) {
    if (resource == null) {
        return;
    }
    try {
        resource.close();
    } catch (Exception e) {
        log.error(failureMessage, e);
    }
}
去官方网站
CDH官网:https://www.cloudera.com/
DOWNLOAD-> 下拉找到Database Drivers -> Impala JDBC Driver Downloads
下载地址:https://www.cloudera.com/downloads/connectors/impala/jdbc/2-6-12.html
<!-- Impala JDBC driver, referenced as a system-scoped jar from the local lib directory. -->
<dependency>
    <groupId>com.cloudera.impala</groupId>
    <artifactId>jdbc</artifactId>
    <version>2.6.12</version>
    <scope>system</scope>
    <systemPath>${project.basedir}/../lib/ImpalaJDBC41.jar</systemPath>
</dependency>
<plugin>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-maven-plugin</artifactId>
    <configuration>
        <fork>true</fork>
        <finalName>venus-gzzc-bi-be</finalName>
        <mainClass>com.richstonedt.ht.gzzc.Application</mainClass>
        <!-- Package system-scoped dependencies (the Impala JDBC jar) into the repackaged archive. -->
        <includeSystemScope>true</includeSystemScope>
    </configuration>
    <executions>
        <execution>
            <goals>
                <goal>repackage</goal>
            </goals>
        </execution>
    </executions>
</plugin>
// Minimal connection snippet; connection, user and password are declared outside this fragment.
// Load the Cloudera Impala JDBC driver class.
Class.forName("com.cloudera.impala.jdbc41.Driver");
// JDBC URL of the form jdbc:impala://<host>:<port>/<database>.
String url = "jdbc:impala://x.x.x.x:21050/default";
// Open the connection through the JDBC DriverManager.
connection = DriverManager.getConnection(url, user, password);
集成了kerberos并踩了很多坑
https://blog.csdn.net/HuHui_/article/details/94741104
作者:HuHui
转载:欢迎一起讨论web和大数据问题,转载请注明作者和原文链接,感谢