Notes on Using Alibaba Cloud ODPS

Download the ODPS client odpscmd_public.zip from the Alibaba Cloud website; the client runs on both Linux and Windows.

Edit the client's configuration file odps_client/conf/odps_config.ini. Mainly the first three entries need to be set:

project_name=XXXXXX
access_id=XXXXXX
access_key=XXXXXX
end_point=http://service.odps.aliyun.com/api
tunnel_endpoint=http://dt.odps.aliyun.com
log_view_host=http://logview.odps.aliyun.com
https_check=true
# confirm threshold for query input size(unit: GB)
data_size_confirm=100.0

After that, run odps_client/bin/odpscmd to enter the interactive command line.


A summary of some commands I have used:

Show all tables:

show tables;

List all partitions of a table:

ls partitions tablename;

Query the first 5 rows of a given partition of a table:

select * from tablename where partitionname='xxxxxx' limit 5;


The odps dship client used to provide these table upload/download operations as a separate tool and was later merged into the odps client; for the standalone tool, see the odps dship client documentation. What I used in my project is the integrated version, described here:

The shell snippet below downloads partition xxxxxx of table tablename into the local file localtablename, using "##" as the field delimiter (if -fd is omitted, "," is the default):

cd /home/username/odpsclient/bin
./odpscmd -e "tunnel download -fd '##' tablename/partitionname='xxxxxx' /home/username/localtablename"

Note: ODPS cannot yet download a standard .csv file directly, so if a field may itself contain ",", specify some other character as the delimiter. Otherwise, parsing the download as .csv would treat the "," inside fields as extra delimiters and break the parse!
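If you need full control over the output format, the same download can also be done through the Java SDK rather than the CLI. Below is a minimal sketch assuming the credentials and endpoints from odps_config.ini above; the table, partition, and file names are the same placeholders:

import java.io.IOException;
import java.io.PrintWriter;

import com.aliyun.odps.Odps;
import com.aliyun.odps.PartitionSpec;
import com.aliyun.odps.account.AliyunAccount;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.RecordReader;
import com.aliyun.odps.tunnel.TableTunnel;
import com.aliyun.odps.tunnel.TunnelException;

public class TunnelDownloadDemo {
    public static void main(String[] args) throws TunnelException, IOException {
        Odps odps = new Odps(new AliyunAccount("XXXXXX", "XXXXXX")); // access_id / access_key
        odps.setEndpoint("http://service.odps.aliyun.com/api");
        odps.setDefaultProject("XXXXXX"); // project_name

        TableTunnel tunnel = new TableTunnel(odps);
        tunnel.setEndpoint("http://dt.odps.aliyun.com");

        TableTunnel.DownloadSession session = tunnel.createDownloadSession(
                "XXXXXX", "tablename", new PartitionSpec("partitionname='xxxxxx'"));

        long count = session.getRecordCount();
        try (PrintWriter out = new PrintWriter("/home/username/localtablename")) {
            RecordReader reader = session.openRecordReader(0, count);
            Record record;
            while ((record = reader.read()) != null) {
                StringBuilder line = new StringBuilder();
                for (int i = 0; i < record.getColumnCount(); i++) {
                    if (i > 0) {
                        line.append("##"); // field delimiter, as with -fd '##'
                    }
                    Object v = record.get(i);
                    line.append(v == null ? "" : v.toString());
                }
                out.println(line);
            }
            reader.close();
        }
    }
}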


Java API notes:

ODPS offers a Java API; see the Alibaba Cloud website for full details. Here I record the Spark-integration API (OdpsOps) that I used in my project:

OdpsOps odpsOps = new OdpsOps(sc, ODPS_ACCESS_ID, ODPS_ACCESS_KEY, ODPS_END_POINT, ODPS_TUNNEL_URL);

(The first parameter sc is Spark's entry point, the SparkContext.)

JavaRDD<T> tRDD = odpsOps.readTableWithJava(ODPS_PROJECT_NAME, tName, partitionSpec,
        new Function2<Record, TableSchema, T>() {
            private static final long serialVersionUID = 1L;

            @Override
            public T call(Record record, TableSchema schema) throws Exception {
                return cpFromRecord(record, schema);
            }
        }, numPartitions);

(The second parameter is the table name; the third is the partition name and value, e.g. data_date='20160701'; the fifth is the number of Spark tasks to launch, i.e. the number of partitions of the resulting RDD. T is the element type each record is mapped to.)
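For reference, a minimal sketch of what the conversion helper cpFromRecord above might look like, with T = String[] purely for illustration; the helper and its return type are my own example, not part of the connector:

// Hypothetical helper: copies one ODPS Record into a String[] row.
private static String[] cpFromRecord(Record record, TableSchema schema) {
    int n = schema.getColumns().size();
    String[] row = new String[n];
    for (int i = 0; i < n; i++) {
        Object value = record.get(i); // raw value of column i
        row[i] = (value == null) ? "" : value.toString();
    }
    return row;
}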


Below is a summary of the regular (non-Spark) ODPS Java API.

ODPS parameters (from my project configuration; the placeholders and local paths are left as in my notes):

XXX
XXX
XXX
http://service.odps.aliyun.com/api
http://dt.odps.aliyun.com
/home/jianfeitech/odps_statistic/odps/dship/dship
/home/jianfeitech/odps_statistic/odps/odps-cli/bin/odps
../rep/res-db-odps-0.0.1-SNAPSHOT.jar:../rep/dmp-odps-0.0.1-SNAPSHOT.jar:../rep/log4j-1.2.17.jar:../rep/slf4j-api-1.6.6.jar:../rep/slf4j-log4j12-1.6.6.jar

The API code:

package com.jianfeitech.bd.res.db.odps.access;

import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.aliyun.odps.Column;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.Partition;
import com.aliyun.odps.PartitionSpec;
import com.aliyun.odps.Resources;
import com.aliyun.odps.Table;
import com.aliyun.odps.TableSchema;
import com.aliyun.odps.Tables;
import com.aliyun.odps.account.Account;
import com.aliyun.odps.account.AliyunAccount;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.tunnel.TableTunnel;
import com.aliyun.odps.tunnel.TableTunnel.DownloadSession;
import com.aliyun.odps.tunnel.TableTunnel.UploadSession;
import com.aliyun.odps.tunnel.TunnelException;
import com.jianfeitech.bd.common.conf.access.ResConfAccess;
import com.jianfeitech.bd.common.conf.model.db.Odps;

public abstract class BaseOdpsAccess implements OdpsAccess, Closeable {

    protected Logger logger = LoggerFactory.getLogger(BaseOdpsAccess.class);

    protected Odps odps;
    protected Account account;
    protected com.aliyun.odps.Odps conn;
    protected TableTunnel tunnel;
    protected Tables tables;
    protected UploadSession usession;
    protected DownloadSession dsession;

    public BaseOdpsAccess(Odps odps) {
        // Keep the config bean: getUploadSession() reads the project name from it.
        this.odps = odps;
        this.account = new AliyunAccount(odps.getAccessKeyId(), odps.getAccessKeySecret());
        this.conn = new com.aliyun.odps.Odps(account);
        this.conn.setEndpoint(odps.getOdpsEndpoint());
        this.conn.setDefaultProject(odps.getProjectName());
        this.tables = this.conn.tables();
        this.tunnel = new TableTunnel(conn);
        this.tunnel.setEndpoint(odps.getTunnelEndpoint());
    }


    public BaseOdpsAccess(ResConfAccess rca) {
        this(rca, "default");
    }

    public BaseOdpsAccess(ResConfAccess rca, String odpsName) {
        this(rca.getDbOdps(odpsName));
    }

    public com.aliyun.odps.Odps getConn() {
        return this.conn;
    }

    public TableTunnel getTunnel() {
        return this.tunnel;
    }


    // partitionPair example: "data_date=20161227"
    public UploadSession getUploadSession(String tableName, String partitionPair) {
        String projectName = this.odps.getProjectName();
        try {
            if (StringUtils.isEmpty(partitionPair)) {
                this.usession = this.tunnel.createUploadSession(projectName, tableName);
            } else {
                PartitionSpec partitionSpec = new PartitionSpec(partitionPair);
                this.usession = this.tunnel.createUploadSession(projectName, tableName, partitionSpec);
            }
            logger.info("upload session state: " + this.usession.getStatus().toString());
        } catch (TunnelException e) {
            logger.error("upload session creation failed.", e);
        } catch (IOException e) {
            logger.error("please check network for upload session.", e);
        }
        return this.usession;
    }

    // partitionPair example: "data_date=20161227"
    public DownloadSession getDownloadSession(String tableName, String partitionPair) {
        String projectName = this.getConn().getDefaultProject();
        try {
            if (StringUtils.isEmpty(partitionPair)) {
                this.dsession = this.tunnel.createDownloadSession(projectName, tableName);
            } else {
                PartitionSpec partitionSpec = new PartitionSpec(partitionPair);
                this.dsession = this.tunnel.createDownloadSession(projectName, tableName, partitionSpec);
            }
            logger.info("download session state: " + this.dsession.getStatus().toString());
        } catch (TunnelException e) {
            logger.error("download session creation failed.", e);
        } catch (IOException e) {
            logger.error("please check network for download session.", e);
        }
        return this.dsession;
    }


    // Returns one map per table: tableCode = physical name, tableName = comment (if any).
    public List<Map<String, String>> findAllTableName() {
        List<Map<String, String>> odpsTables = new ArrayList<>();
        for (Table table : this.tables) {
            Map<String, String> tableMap = new HashMap<>();
            tableMap.put("tableCode", table.getName());
            if (StringUtils.isNotBlank(table.getComment())) {
                tableMap.put("tableName", table.getComment());
            } else {
                tableMap.put("tableName", table.getName());
            }
            odpsTables.add(tableMap);
        }
        return odpsTables;
    }

    // Returns one map per column: columnCode = physical name, columnName = comment (if any).
    public List<Map<String, String>> findAllColumnName(String tableName) {
        List<Map<String, String>> tableColumns = new ArrayList<>();
        Table table = this.tables.get(tableName);
        TableSchema tableSchema = table.getSchema();
        List<Column> columnList = tableSchema.getColumns();
        for (Column column : columnList) {
            Map<String, String> columnMap = new HashMap<>();
            columnMap.put("columnCode", column.getName());
            if (StringUtils.isNotBlank(column.getComment())) {
                columnMap.put("columnName", column.getComment());
            } else {
                columnMap.put("columnName", column.getName());
            }
            tableColumns.add(columnMap);
        }
        return tableColumns;
    }


    public boolean findPartition(String tableName, String partitionName, String partitionValue) {
        if (null == tableName || null == partitionName) {
            return false;
        }
        Table table = this.tables.get(tableName);
        PartitionSpec ps = new PartitionSpec();
        ps.set(partitionName, partitionValue);
        try {
            return table.hasPartition(ps);
        } catch (OdpsException e) {
            logger.warn("partition lookup failed.", e);
            return false;
        }
    }


    // Collects every value of partition column pName across the table's partitions.
    public Set<String> findAllPartition(String tName, String pName) {
        Set<String> set = new TreeSet<>();
        Table table = this.tables.get(tName);
        List<Partition> partitions = table.getPartitions();
        for (Partition partition : partitions) {
            PartitionSpec partitionSpec = partition.getPartitionSpec();
            set.add(partitionSpec.get(pName));
        }
        return set;
    }

    // Creates the partition if it does not exist yet; returns false on failure.
    public boolean upsertPartition(String tableName, String partitionName, String partitionValue) {
        try {
            if (!findPartition(tableName, partitionName, partitionValue)) {
                Table table = this.tables.get(tableName);
                PartitionSpec ps = new PartitionSpec();
                ps.set(partitionName, partitionValue);
                table.createPartition(ps);
            }
            return true;
        } catch (OdpsException e) {
            logger.warn("upsert partition failed.", e);
            return false;
        }
    }


    public long getRecordCount(String tableName, String partitionPair) {
        return getDownloadSession(tableName, partitionPair).getRecordCount();
    }

    @Override
    public void close() throws IOException {
        // Nothing to release: sessions and the ODPS connection hold no local resources.
    }

    // Deserializes a Java object stored as an ODPS resource file.
    public Object unserializeFromFile(String resourceName) {
        Resources resources = this.conn.resources();
        try (InputStream in = resources.getResourceAsStream(resourceName);
             ObjectInputStream ois = new ObjectInputStream(in)) {
            return ois.readObject();
        } catch (Exception e) {
            logger.error("reading odps resource " + resourceName + " failed.", e);
            return null;
        }
    }


    // Flattens a Record into a columnName -> value map.
    public static Map<String, Object> getRecordData(Record record) {
        Map<String, Object> res = new HashMap<>();
        if (null == record) {
            return res;
        }
        Column[] columns = record.getColumns();
        Object[] values = record.toArray();
        for (int i = 0; i < columns.length; i++) {
            res.put(columns[i].getName(), values[i]);
        }
        return res;
    }
}
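A short usage sketch of the class above: it assumes BaseOdpsAccess already covers everything the OdpsAccess interface requires (so an empty anonymous subclass suffices) and takes the populated config bean as given:

// Sketch: list tables, then stream one partition through a download session.
// Imports as in BaseOdpsAccess above, plus com.aliyun.odps.data.RecordReader.
public static void dumpPartition(Odps odpsConf) throws Exception {
    BaseOdpsAccess access = new BaseOdpsAccess(odpsConf) {};
    System.out.println(access.findAllTableName());

    DownloadSession ds = access.getDownloadSession("http_standard", "data_date=20160701");
    RecordReader reader = ds.openRecordReader(0, ds.getRecordCount());
    Record record;
    while ((record = reader.read()) != null) {
        System.out.println(BaseOdpsAccess.getRecordData(record));
    }
    reader.close();
    access.close();
}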




Addendum:

ODPS SQL is close to standard SQL. For example, the following returns the 5 MAC addresses with the most records in the data_date='20160701' partition:

select mac, count(1) as c from http_standard where data_date='20160701' group by mac order by c desc limit 5;
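The same statement can also be submitted from Java through the SDK's SQLTask helper; a minimal sketch with the usual XXXXXX placeholders:

import java.util.List;

import com.aliyun.odps.Instance;
import com.aliyun.odps.Odps;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.account.AliyunAccount;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.task.SQLTask;

public class SqlTaskDemo {
    public static void main(String[] args) throws OdpsException {
        Odps odps = new Odps(new AliyunAccount("XXXXXX", "XXXXXX")); // access_id / access_key
        odps.setEndpoint("http://service.odps.aliyun.com/api");
        odps.setDefaultProject("XXXXXX"); // project_name

        // Submit the query and block until it finishes.
        Instance instance = SQLTask.run(odps,
                "select mac, count(1) as c from http_standard"
                + " where data_date='20160701' group by mac order by c desc limit 5;");
        instance.waitForSuccess();

        // Fetch the (small) result set.
        List<Record> result = SQLTask.getResult(instance);
        for (Record r : result) {
            System.out.println(r.get(0) + "\t" + r.get(1));
        }
    }
}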