需求:解决海量数据的存储,并且能够实现海量数据的秒级查询.
实际生产中,一篇文章要分成标题和正文;但是正文的量是比较大的,那么我们一般会在es中存储标题,在hbase中存储正文(hbase本身就是做海量数据的存储);这样通过es的倒排索引列表检索到关键词的文档id,然后根据文档id在hbase中查询出具体的正文。
分析,数据哪些字段需要构建索引: 文章数据(id、title、author、describe、content)
字段名称 是否需要索引 是否需要存储
Id 默认索引 默认存储
Title 需要 需要
Author 看需求 看需求
Describe 需要 存储
Content 看需求(高精度查询,是需要的 ) 看需求
Time 需要 需要
PUT /articles
{
"settings":{
"number_of_shards":3,
"number_of_replicas":1,
"analysis" : {
"analyzer" : {
"ik" : {
"tokenizer" : "ik_max_word"
}
}
}
},
"mappings":{
"article":{
"dynamic":"strict",
"_source": {
"includes": [
"id","title","from","readCounts","times"
],
"excludes": [
"content"
]
},
"properties":{
"id":{"type": "keyword", "store": true},
"title":{"type": "text","store": true,"index" : true,"analyzer": "ik_max_word"},
"from":{"type": "keyword","store": true},
"readCounts":{"type": "integer","store": true},
"content":{"type": "text","store": false,"index": false},
"times": {"type": "keyword", "index": false}
}
}
}
}
创建maven工程并导入jar包
<!-- Libraries used by the sample: Apache POI, the Elasticsearch transport client, Log4j, Gson, and the Hadoop/HBase clients. -->
<dependencies>
<!-- Parse Excel files (Apache POI, all three artifacts pinned to 3.8) -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>3.8</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.8</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.8</version>
</dependency>
<!-- Elasticsearch transport client, version 6.7.0 -->
<dependency>
<groupId>org.elasticsearch.client</groupId>
<artifactId>transport</artifactId>
<version>6.7.0</version>
</dependency>
<!-- Log4j 2 core. NOTE(review): 2.9.1 appears to predate the fixes for CVE-2021-44228 (Log4Shell); confirm and consider upgrading. -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.9.1</version>
</dependency>
<!-- Gson: JSON serialization -->
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.2</version>
</dependency>
<!-- Hadoop client libraries, version 2.7.5 -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.5</version>
</dependency>
<!-- HBase client and server artifacts, both pinned to 2.0.0 -->
<!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>2.0.0</version>
</dependency>
</dependencies>
<!-- Build settings: compile with Java 1.8 source/target and read sources as UTF-8. -->
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.0</version>
<configuration>
<!-- Language level accepted by the compiler and bytecode level it emits -->
<source>1.8</source>
<target>1.8</target>
<!-- Source file encoding -->
<encoding>UTF-8</encoding>
<!-- Disabled option. NOTE(review): the compiler plugin parameter is presumably "verbose", not "verbal"; confirm before enabling. -->
<!-- <verbal>true</verbal>-->
</configuration>
</plugin>
</plugins>
</build>
定义Article实体类
/**
 * Plain data holder for one article.
 *
 * <p>All properties are kept as strings. The field declaration order is
 * deliberately left untouched because reflection-based serializers emit
 * fields in that order.
 */
public class Article {

    /** Identifier of the article. */
    private String id;
    /** Headline of the article. */
    private String title;
    /** Source the article came from. */
    private String from;
    /** Time value of the article, kept as text. */
    private String times;
    /** Read counter, kept as text. */
    private String readCounts;
    /** Full body text of the article. */
    private String content;

    /** Creates an article with every property unset ({@code null}). */
    public Article() {
    }

    /**
     * Creates a fully populated article.
     *
     * @param id         identifier of the article
     * @param title      headline
     * @param from       source of the article
     * @param times      time value as text
     * @param readCounts read counter as text
     * @param content    full body text
     */
    public Article(String id, String title, String from, String times, String readCounts, String content) {
        this.id = id;
        this.title = title;
        this.from = from;
        this.times = times;
        this.readCounts = readCounts;
        this.content = content;
    }

    // ---- accessors ----

    /** @return the identifier */
    public String getId() {
        return this.id;
    }

    /** @return the headline */
    public String getTitle() {
        return this.title;
    }

    /** @return the source of the article */
    public String getFrom() {
        return this.from;
    }

    /** @return the time value as text */
    public String getTimes() {
        return this.times;
    }

    /** @return the read counter as text */
    public String getReadCounts() {
        return this.readCounts;
    }

    /** @return the full body text */
    public String getContent() {
        return this.content;
    }

    // ---- mutators ----

    /** @param id the identifier to set */
    public void setId(String id) {
        this.id = id;
    }

    /** @param title the headline to set */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @param from the source to set */
    public void setFrom(String from) {
        this.from = from;
    }

    /** @param times the time value to set */
    public void setTimes(String times) {
        this.times = times;
    }

    /** @param readCounts the read counter to set */
    public void setReadCounts(String readCounts) {
        this.readCounts = readCounts;
    }

    /** @param content the body text to set */
    public void setContent(String content) {
        this.content = content;
    }
}
定义excel解析工具类
/**
 * Reads article rows from the first sheet of an Excel (.xlsx) workbook and
 * converts them into {@link Article} objects.
 */
public class ExcelUtil {

    /** Workbook location used by the no-argument {@link #getExceInfo()}. */
    private static final String DEFAULT_EXCEL_PATH = "F:\\excel数据集\\baijia.xlsx";

    // 0-based column positions of the article fields inside a row.
    private static final int COL_TITLE = 0;
    private static final int COL_FROM = 1;
    private static final int COL_TIME = 2;
    private static final int COL_READ_COUNT = 3;
    private static final int COL_CONTENT = 4;

    /**
     * Reads the default workbook and prints how many articles were parsed.
     *
     * @param args unused
     * @throws IOException if the workbook cannot be read
     */
    public static void main(String[] args) throws IOException {
        List<Article> exceInfo = getExceInfo();
        System.out.println(exceInfo.size());
    }

    /**
     * Parses the workbook at the default location.
     *
     * @return one article per non-missing data row, never {@code null}
     * @throws IOException if the workbook cannot be opened or read
     */
    public static List<Article> getExceInfo() throws IOException {
        return getExceInfo(DEFAULT_EXCEL_PATH);
    }

    /**
     * Parses the workbook at the given location.
     *
     * <p>Rows are read from index 1 up to the sheet's last row number; row 0 is
     * skipped (presumably a header row; confirm against the data file). The
     * article id is the row index rendered as text.
     *
     * @param path file system path of the .xlsx workbook
     * @return one article per non-missing data row, never {@code null}
     * @throws IOException if the workbook cannot be opened or read
     */
    public static List<Article> getExceInfo(String path) throws IOException {
        List<Article> articleList = new ArrayList<Article>();
        // try-with-resources releases the stream even when parsing fails.
        // NOTE(review): with the POI 3.8 declared in the pom, XSSFWorkbook does
        // not appear to be Closeable, so only the stream is managed here.
        try (FileInputStream fileInputStream = new FileInputStream(path)) {
            XSSFWorkbook workbook = new XSSFWorkbook(fileInputStream);
            // Only the first sheet is read.
            XSSFSheet sheet = workbook.getSheetAt(0);
            int lastRowNum = sheet.getLastRowNum();
            for (int i = 1; i <= lastRowNum; i++) {
                XSSFRow row = sheet.getRow(i);
                if (row == null) {
                    // getRow yields null for rows that were never written; skip
                    // them but keep ids aligned with the row index.
                    continue;
                }
                Article article = new Article();
                article.setId(String.valueOf(i));
                article.setTitle(cellText(row, COL_TITLE));
                article.setFrom(cellText(row, COL_FROM));
                article.setTimes(cellText(row, COL_TIME));
                article.setReadCounts(cellText(row, COL_READ_COUNT));
                article.setContent(cellText(row, COL_CONTENT));
                articleList.add(article);
            }
        }
        return articleList;
    }

    /** Returns the text of a cell, or an empty string when the cell is missing. */
    private static String cellText(XSSFRow row, int columnIndex) {
        XSSFCell cell = row.getCell(columnIndex);
        return cell == null ? "" : cell.toString();
    }
}
定义main方法
/**
 * Demo entry point combining Elasticsearch and HBase: article metadata is
 * indexed in the Elasticsearch index {@code articles}, while complete rows
 * (including the body) are written to the HBase table {@code hbase_es_article}.
 * A keyword search in Elasticsearch yields document ids, which are then used as
 * HBase row keys to fetch the full article.
 */
public class AppMain {

    private static final String TABLE_NAME = "hbase_es_article";
    private static final String FAMILY_NAME = "f1";

    // Column family and qualifiers, encoded once with Bytes.toBytes so that
    // writes use the same encoding that Bytes.toString uses when reading back.
    private static final byte[] FAMILY = Bytes.toBytes(FAMILY_NAME);
    private static final byte[] TITLE = Bytes.toBytes("title");
    private static final byte[] FROM = Bytes.toBytes("from");
    private static final byte[] TIMES = Bytes.toBytes("times");
    private static final byte[] READ_COUNTS = Bytes.toBytes("readCounts");
    private static final byte[] CONTENT = Bytes.toBytes("content");

    private static final String ES_INDEX = "articles";
    private static final String ES_TYPE = "article";

    /**
     * Looks up the article with row key {@code 1216} in HBase and prints every
     * stored cell value.
     *
     * <p>The earlier steps of the walkthrough are kept below as disabled
     * snippets; enable the one you need.
     *
     * @param args unused
     * @throws IOException if HBase cannot be reached or read
     */
    public static void main(String[] args) throws IOException {
        // Step 1 (run once): parse the Excel sheet, index the articles into
        // Elasticsearch and store the full rows in HBase.
        //   List<Article> exceInfo = ExcelUtil.getExceInfo();
        //   save2Es(exceInfo, getEsClient());
        //   try (Connection connection = getConnection()) {
        //       saveToHbase(exceInfo, getTable(connection));
        //   }

        // Step 2: search Elasticsearch by keyword to obtain the matching ids.
        //   TransportClient esClient = getEsClient();
        //   List<String> ids = getByKeyWord(esClient, "机器人");
        //   esClient.close();

        // Step 3: take an id (here 1216) and fetch the article details.
        try (Connection connection = getConnection();
             Table table = getTable(connection)) {
            Result result = table.get(new Get(Bytes.toBytes("1216")));
            if (result.isEmpty()) {
                // Nothing stored under this row key; there are no cells to print.
                return;
            }
            for (Cell cell : result.rawCells()) {
                // Decode the value straight from the backing array.
                String value = Bytes.toString(
                        cell.getValueArray(), cell.getValueOffset(), cell.getValueLength());
                System.out.println(value);
                // The values could be packed into an Article and returned to a front end.
            }
        }
    }

    /**
     * Searches the article index for documents whose {@code title} contains the
     * given term and returns their document ids.
     *
     * <p>NOTE(review): a term query is not analysed, so it only matches when
     * {@code keyWord} equals a token produced by the index analyzer; a match
     * query may be more appropriate. Confirm the intended behaviour.
     *
     * @param esClient connected client; it is not closed by this method
     * @param keyWord  exact term to look for
     * @return ids of the hits in the search response, possibly empty
     */
    private static List<String> getByKeyWord(TransportClient esClient, String keyWord) {
        List<String> ids = new ArrayList<String>();
        SearchResponse searchResponse = esClient.prepareSearch(ES_INDEX).setTypes(ES_TYPE)
                .setQuery(QueryBuilders.termQuery("title", keyWord)).get();
        SearchHits hits = searchResponse.getHits();
        for (SearchHit hit : hits) {
            // The Elasticsearch document id doubles as the HBase row key.
            ids.add(hit.getId());
        }
        return ids;
    }

    /**
     * Writes the articles to HBase in one batch and closes the table.
     *
     * <p>Articles without a title are skipped. The number of articles, each
     * title and the elapsed time in whole seconds are printed to stdout.
     *
     * @param exceInfo articles to store
     * @param table    target table; always closed before this method returns
     * @throws IOException if the batch write or the close fails
     */
    private static void saveToHbase(List<Article> exceInfo, Table table) throws IOException {
        try {
            System.out.println(exceInfo.size());
            long startTime = System.currentTimeMillis();
            List<Put> putList = new ArrayList<Put>(exceInfo.size());
            for (Article article : exceInfo) {
                String articleTitle = article.getTitle();
                System.out.println(articleTitle);
                // isEmpty() compares content; the former != "" compared references.
                if (articleTitle == null || articleTitle.isEmpty()) {
                    continue;
                }
                Put put = new Put(Bytes.toBytes(article.getId()));
                addColumn(put, TITLE, articleTitle);
                addColumn(put, FROM, article.getFrom());
                addColumn(put, TIMES, article.getTimes());
                addColumn(put, READ_COUNTS, article.getReadCounts());
                addColumn(put, CONTENT, article.getContent());
                putList.add(put);
            }
            table.put(putList);
            long endTime = System.currentTimeMillis();
            System.out.println((endTime - startTime) / 1000);
        } finally {
            table.close();
        }
    }

    /** Adds one column to the put, skipping values that are {@code null}. */
    private static void addColumn(Put put, byte[] qualifier, String value) {
        if (value != null) {
            put.addColumn(FAMILY, qualifier, Bytes.toBytes(value));
        }
    }

    /**
     * Opens an HBase connection using the ZooKeeper quorum of the demo cluster.
     *
     * @return a new connection; the caller must close it
     * @throws IOException if the connection cannot be created
     */
    private static Connection getConnection() throws IOException {
        Configuration configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
        return ConnectionFactory.createConnection(configuration);
    }

    /**
     * Returns the article table, creating it with its single column family
     * when it does not exist yet.
     *
     * @param connection open HBase connection; it stays owned by the caller
     * @return the article table; the caller must close it
     * @throws IOException if the existence check, creation or lookup fails
     */
    private static Table getTable(Connection connection) throws IOException {
        TableName name = TableName.valueOf(TABLE_NAME);
        // Admin is only needed for the existence check and is released right after.
        try (Admin admin = connection.getAdmin()) {
            if (!admin.tableExists(name)) {
                HTableDescriptor hTableDescriptor = new HTableDescriptor(name);
                hTableDescriptor.addFamily(new HColumnDescriptor(FAMILY_NAME));
                admin.createTable(hTableDescriptor);
            }
        }
        return connection.getTable(name);
    }

    /**
     * Indexes all articles into Elasticsearch with one bulk request and closes
     * the client.
     *
     * <p>Each article is serialized to JSON with Gson and indexed under its own
     * id. Failed items are reported on stderr.
     *
     * @param exceInfo articles to index
     * @param client   connected client; always closed before this method returns
     */
    private static void save2Es(List<Article> exceInfo, TransportClient client) {
        try {
            if (exceInfo.isEmpty()) {
                // Nothing to index; avoid sending a bulk request with no actions.
                return;
            }
            BulkRequestBuilder bulk = client.prepareBulk();
            Gson gson = new Gson();
            for (Article article : exceInfo) {
                IndexRequestBuilder indexRequestBuilder =
                        client.prepareIndex(ES_INDEX, ES_TYPE, article.getId());
                indexRequestBuilder.setSource(gson.toJson(article), XContentType.JSON);
                bulk.add(indexRequestBuilder);
            }
            // Executing the bulk request is what actually sends the documents.
            BulkResponse bulkItemResponses = bulk.get();
            if (bulkItemResponses.hasFailures()) {
                System.err.println(bulkItemResponses.buildFailureMessage());
            }
        } finally {
            client.close();
        }
    }

    /**
     * Builds a transport client for the cluster named {@code myes}, registering
     * {@code node01} and {@code node02} on port 9300.
     *
     * @return a new client; the caller must close it
     * @throws UnknownHostException if a node host name cannot be resolved
     */
    private static TransportClient getEsClient() throws UnknownHostException {
        Settings settings = Settings.builder().put("cluster.name", "myes").build();
        TransportClient client = new PreBuiltTransportClient(settings)
                .addTransportAddress(new TransportAddress(InetAddress.getByName("node01"), 9300))
                .addTransportAddress(new TransportAddress(InetAddress.getByName("node02"), 9300));
        return client;
    }
}