本文也分别采用Node.js(使用Koa框架)和Vue.js创建搜索API和前端Web应用。
反向索引数据结构使得“football出现在哪些文档中”这类查询非常迅速。Elasticsearch使用经过内存优化的反向索引,可以实现强大且可定制的全文检索任务。
version: '3'

services:
  api: # Node.js App
    container_name: gs-api
    build: .
    ports:
      - "3000:3000" # Expose API port
      - "9229:9229" # Expose Node process debug port (disable in production)
    environment: # Set ENV vars
      - NODE_ENV=local
      - ES_HOST=elasticsearch
      - PORT=3000
    volumes: # Attach local book data directory
      - ./books:/usr/src/app/books

  frontend: # Nginx Server For Frontend App
    container_name: gs-frontend
    image: nginx
    volumes: # Serve local "public" dir
      - ./public:/usr/share/nginx/html
    ports:
      - "8080:80" # Forward site to localhost:8080

  elasticsearch: # Elasticsearch Instance
    container_name: gs-search
    image: docker.elastic.co/elasticsearch/elasticsearch:6.1.1
    volumes: # Persist ES data in separate "esdata" volume
      - esdata:/usr/share/elasticsearch/data
    environment:
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
      - discovery.type=single-node
    ports: # Expose Elasticsearch ports
      - "9300:9300"
      - "9200:9200"

volumes: # Define separate volume for Elasticsearch data
  esdata:
# Use Node v8.9.0 LTS ("carbon") as the base image for the API container
FROM node:carbon
# Keep host-installed modules, debug logs, and mounted data out of the build context
node_modules/
npm-debug.log
books/
public/
注意:不需要将node_modules拷入,因为我们后续要用npm install在容器内安装这些依赖模块。如果拷贝node_modules到容器中容易引起兼容性问题。例如在macOS上安装bcrypt包,如果将此module拷入Ubuntu容器就会引起操作系统不匹配问题。
Hello World From The Frontend Container
const Koa = require('koa')
const app = new Koa()

// Minimal middleware: respond to every request with a fixed body
app.use(async (ctx, next) => {
  ctx.body = 'Hello World From the Backend Container'
})

const port = process.env.PORT || 3000

app.listen(port, err => {
  if (err) console.error(err)
  // Original snippet was missing the closing ')' on this call — syntax error
  console.log(`App Listening on Port ${port}`)
})
{
  "name": "guttenberg-search",
  "version": "0.0.1",
  "description": "Source code for Elasticsearch tutorial using 100 classic open source books.",
  "scripts": {
    "start": "node --inspect=0.0.0.0:9229 server/app.js"
  },
  "repository": {
    "type": "git",
    "url": "git+https://github.com/triestpa/guttenberg-search.git"
  },
  "author": "[email protected]",
  "license": "MIT",
  "bugs": {
    "url": "https://github.com/triestpa/guttenberg-search/issues"
  },
  "homepage": "https://github.com/triestpa/guttenberg-search#readme",
  "dependencies": {
    "elasticsearch": "13.3.1",
    "joi": "13.0.1",
    "koa": "2.4.1",
    "koa-joi-validate": "0.5.1",
    "koa-router": "7.2.1"
  }
}
注意:不需要特意运行npm install,容器创建时候会自动安装依赖包。
注意:这一步可能会运行时间比较长,因为Docker可能需要下载基础镜像。以后执行速度会很快,因为本地已经有了基础镜像。访问localhost:8080,应该看到如下图输出“hello world”。
{ "name" : "SLTcfpI", "cluster_name" : "docker-cluster", "cluster_uuid" : "iId8e0ZeS_mgh9ALlWQ7-w", "version" : { "number" : "6.1.1", "build_hash" : "bd92e7f", "build_date" : "2017-12-17T20:23:25.338Z", "build_snapshot" : false, "lucene_version" : "7.1.0", "minimum_wire_compatibility_version" : "5.6.0", "minimum_index_compatibility_version" : "5.0.0" }, "tagline" : "You Know, for Search" }
如果所有URL输出都正常,恭喜,整个应用框架可以正常工作,下面开始进入真正有趣的部分了。
const elasticsearch = require('elasticsearch')

// Core ES variables for this project
const index = 'library'
const type = 'novel'
const port = 9200
const host = process.env.ES_HOST || 'localhost'
const client = new elasticsearch.Client({ host: { host, port } })

/**
 * Check the ES connection status, retrying until the cluster responds.
 * Fix: the original retried in a tight loop with no delay, hammering the
 * cluster (and the console) while Elasticsearch was still booting.
 */
async function checkConnection () {
  let isConnected = false
  while (!isConnected) {
    console.log('Connecting to ES')
    try {
      const health = await client.cluster.health({})
      console.log(health)
      isConnected = true
    } catch (err) {
      console.log('Connection Failed, Retrying...', err)
      // Wait one second before the next attempt instead of busy-looping
      await new Promise(resolve => setTimeout(resolve, 1000))
    }
  }
}

checkConnection()
{ cluster_name: 'docker-cluster', status: 'yellow', timed_out: false, number_of_nodes: 1, number_of_data_nodes: 1, active_primary_shards: 1, active_shards: 1, relocating_shards: 0, initializing_shards: 0, unassigned_shards: 1, delayed_unassigned_shards: 0, number_of_pending_tasks: 0, number_of_in_flight_fetch: 0, task_max_waiting_in_queue_millis: 0, active_shards_percent_as_number: 50 }
/** Drop the index if it already exists, then recreate it with the book mapping. */
async function resetIndex () {
  const indexExists = await client.indices.exists({ index })
  if (indexExists) {
    await client.indices.delete({ index })
  }
  await client.indices.create({ index })
  await putBookMapping()
}
/** Register the field mapping for book sections on the ES index. */
async function putBookMapping () {
  const properties = {
    title: { type: 'keyword' },    // exact-match only (not analyzed)
    author: { type: 'keyword' },   // exact-match only (not analyzed)
    location: { type: 'integer' }, // paragraph position within the book
    text: { type: 'text' }         // analyzed for full-text search
  }
  return client.indices.putMapping({ index, type, body: { properties } })
}
// Expose the ES client, the index/type constants, and the admin helpers to other modules
module.exports = { client, index, type, checkConnection, resetIndex }
# Download the test dataset (100 public-domain Project Gutenberg books)
wget https://cdn.patricktriest.com/data/books.zip
# Extract the archive into the local "books" directory
unar books.zip
Title: Heart of Darkness Author: Joseph Conrad Release Date: February 1995 [EBook #219] Last Updated: September 7, 2016 Language: English Character set encoding: UTF-8
const fs = require('fs')
const path = require('path')
const esConnection = require('./connection')

/** Clear ES index, parse and index all files from the books directory */
async function readAndInsertBooks () {
  try {
    // Wipe any data from a previous run before re-indexing
    await esConnection.resetIndex()

    const bookDir = './books'
    const files = fs.readdirSync(bookDir).filter(file => file.slice(-4) === '.txt')
    console.log(`Found ${files.length} Files`)

    // Parse and index one book at a time to keep memory usage bounded
    for (const file of files) {
      console.log(`Reading File - ${file}`)
      const { title, author, paragraphs } = parseBookFile(path.join(bookDir, file))
      await insertBookData(title, author, paragraphs)
    }
  } catch (err) {
    console.error(err)
  }
}

readAndInsertBooks()
/**
 * Read an individual book text file, and extract the title, author, and paragraphs.
 *
 * @param {string} filePath - Path to a Project Gutenberg plain-text book file
 * @returns {{ title: string, author: string, paragraphs: string[] }}
 * @throws {Error} If the title line or the Gutenberg START/END markers are missing
 */
function parseBookFile (filePath) {
  // Read text file
  const book = fs.readFileSync(filePath, 'utf8')

  // Find book title and author; guard the matches so a malformed file fails
  // with a clear message instead of an opaque TypeError
  const titleMatch = book.match(/^Title:\s(.+)$/m)
  if (!titleMatch) throw new Error(`No "Title:" line found in ${filePath}`)
  const title = titleMatch[1]
  const authorMatch = book.match(/^Author:\s(.+)$/m)
  const author = (!authorMatch || authorMatch[1].trim() === '') ? 'Unknown Author' : authorMatch[1]

  console.log(`Reading Book - ${title} By ${author}`)

  // Find Gutenberg metadata header and footer so only the book body is indexed
  const startOfBookMatch = book.match(/^\*{3}\s*START OF (THIS|THE) PROJECT GUTENBERG EBOOK.+\*{3}$/m)
  const endOfBookMatch = book.match(/^\*{3}\s*END OF (THIS|THE) PROJECT GUTENBERG EBOOK.+\*{3}$/m)
  if (!startOfBookMatch || !endOfBookMatch) {
    throw new Error(`Missing Project Gutenberg START/END markers in ${filePath}`)
  }
  const startOfBookIndex = startOfBookMatch.index + startOfBookMatch[0].length
  const endOfBookIndex = endOfBookMatch.index

  // Clean book text and split into array of paragraphs
  const paragraphs = book
    .slice(startOfBookIndex, endOfBookIndex) // Remove Gutenberg header and footer
    .split(/\n\s+\n/g) // Split each paragraph into its own array entry
    .map(line => line.replace(/\r\n/g, ' ').trim()) // Remove paragraph line breaks and whitespace
    .map(line => line.replace(/_/g, '')) // Gutenberg uses "_" to signify italics; remove it for cleaner text
    .filter(line => line.length > 0) // Remove empty lines (original `line.length !== ''` compared a number to a string, so it was always true)

  console.log(`Parsed ${paragraphs.length} Paragraphs\n`)
  return { title, author, paragraphs }
}
/**
 * Bulk index the book data in Elasticsearch, in batches of 500 paragraphs.
 *
 * @param {string} title - Book title (stored on every paragraph document)
 * @param {string} author - Book author
 * @param {string[]} paragraphs - Paragraph texts, indexed by position
 */
async function insertBookData (title, author, paragraphs) {
  let bulkOps = [] // Array to store bulk operations ([action, document] pairs)

  // Add an index operation for each section in the book
  for (let i = 0; i < paragraphs.length; i++) {
    // Describe action
    bulkOps.push({ index: { _index: esConnection.index, _type: esConnection.type } })
    // Add document
    bulkOps.push({ author, title, location: i, text: paragraphs[i] })

    if (i > 0 && i % 500 === 0) { // Do bulk insert in 500 paragraph batches
      await esConnection.client.bulk({ body: bulkOps })
      bulkOps = []
      console.log(`Indexed Paragraphs ${i - 499} - ${i}`)
    }
  }

  // Insert remainder of bulk ops array. Guard against an empty remainder
  // (paragraph count landing exactly on a batch boundary): Elasticsearch
  // rejects a bulk request with an empty body.
  if (bulkOps.length > 0) {
    await esConnection.client.bulk({ body: bulkOps })
    console.log(`Indexed Paragraphs ${paragraphs.length - (bulkOps.length / 2)} - ${paragraphs.length}\n\n\n`)
  }
}
批量bulk索引这些段落可以使本应用运行在低配电脑上(我只有1.7G内存),如果你有高配电脑(大于4G内容),也许不用考虑批量bulk操作。运行docker-compose up -d --build 和 docker exec gs-api "node" "server/load_data.js" 输出如下:
{ "took" : 11, "timed_out" : false, "_shards" : { "total" : 5, "successful" : 5, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : 13, "max_score" : 14.259304, "hits" : [ { "_index" : "library", "_type" : "novel", "_id" : "p_GwFWEBaZvLlaAUdQgV", "_score" : 14.259304, "_source" : { "author" : "Charles Darwin", "title" : "On the Origin of Species", "location" : 1080, "text" : "Java, plants of, 375." } }, { "_index" : "library", "_type" : "novel", "_id" : "wfKwFWEBaZvLlaAUkjfk", "_score" : 10.186235, "_source" : { "author" : "Edgar Allan Poe", "title" : "The Works of Edgar Allan Poe", "location" : 827, "text" : "After many years spent in foreign travel, I sailed in the year 18-- , from the port of Batavia, in the rich and populous island of Java, on a voyage to the Archipelago of the Sunda islands. I went as passenger--having no other inducement than a kind of nervous restlessness which haunted me as a fiend." } }, ... ] } }
const { client, index, type } = require('./connection') module.exports = { /** Query ES index for the provided term */ queryTerm (term, offset = 0) { const body = { from: offset, query: { match: { text: { query: term, operator: 'and', fuzziness: 'auto' } } }, highlight: { fields: { text: {} } } } return client.search({ index, type, body }) } }
const Koa = require('koa')
const Router = require('koa-router')
const joi = require('joi')
const validate = require('koa-joi-validate')
const search = require('./search')

const app = new Koa()
const router = new Router()

// Log each request (method, URL, elapsed milliseconds) to the console
app.use(async (ctx, next) => {
  const startedAt = Date.now()
  await next()
  const elapsed = Date.now() - startedAt
  console.log(`${ctx.method} ${ctx.url} - ${elapsed}`)
})

// Log percolated errors to the console
app.on('error', err => console.error('Server Error', err))

// Set permissive CORS header so the frontend (on another port) can call us
app.use(async (ctx, next) => {
  ctx.set('Access-Control-Allow-Origin', '*')
  return next()
})

// ADD ENDPOINTS HERE

const port = process.env.PORT || 3000

app
  .use(router.routes())
  .use(router.allowedMethods())
  .listen(port, err => {
    if (err) throw err
    console.log(`App Listening on Port ${port}`)
  })
/**
 * GET /search
 * Search for a term in the library (no input validation yet)
 */
router.get('/search', async (ctx, next) => {
  const query = ctx.request.query
  ctx.body = await search.queryTerm(query.term, query.offset)
})
{ "took": 242, "timed_out": false, "_shards": { "total": 5, "successful": 5, "skipped": 0, "failed": 0 }, "hits": { "total": 93, "max_score": 13.356944, "hits": [{ "_index": "library", "_type": "novel", "_id": "eHYHJmEBpQg9B4622421", "_score": 13.356944, "_source": { "author": "Charles Darwin", "title": "On the Origin of Species", "location": 1080, "text": "Java, plants of, 375." }, "highlight": { "text": ["Java, plants of, 375."] } }, { "_index": "library", "_type": "novel", "_id": "2HUHJmEBpQg9B462xdNg", "_score": 9.030668, "_source": { "author": "Unknown Author", "title": "The King James Bible", "location": 186, "text": "10:4 And the sons of Javan; Elishah, and Tarshish, Kittim, and Dodanim." }, "highlight": { "text": ["10:4 And the sons of Javan; Elishah, and Tarshish, Kittim, and Dodanim."] } } ... ] } }
/**
 * GET /search
 * Search for a term in the library
 * Query Params -
 * term: string under 60 characters
 * offset: positive integer
 */
const searchQuerySchema = {
  term: joi.string().max(60).required(),
  offset: joi.number().integer().min(0).default(0)
}

router.get('/search', validate({ query: searchQuerySchema }), async (ctx, next) => {
  const { term, offset } = ctx.request.query
  ctx.body = await search.queryTerm(term, offset)
})
可以用docker-compose logs -f api 查看日志。
const vm = new Vue({
  el: '#vue-instance',
  data () {
    return {
      baseUrl: 'http://localhost:3000', // API url
      searchTerm: 'Hello World', // Default search term
      searchDebounce: null, // Timeout handle for search bar debounce
      searchResults: [], // Displayed search results
      numHits: null, // Total search results found
      searchOffset: 0, // Search result pagination offset
      selectedParagraph: null, // Selected paragraph object
      bookOffset: 0, // Offset for book paragraphs being displayed
      paragraphs: [] // Paragraphs being displayed in book preview window
    }
  },
  async created () {
    // Run a search for the default term as soon as the instance exists
    this.searchResults = await this.search()
  },
  methods: {
    /** Debounce search input by 100 ms */
    onSearchInput () {
      clearTimeout(this.searchDebounce)
      this.searchDebounce = setTimeout(async () => {
        this.searchOffset = 0
        this.searchResults = await this.search()
      }, 100)
    },
    /** Call API to search for inputted term */
    async search () {
      const params = { term: this.searchTerm, offset: this.searchOffset }
      const { data } = await axios.get(`${this.baseUrl}/search`, { params })
      this.numHits = data.hits.total
      return data.hits.hits
    },
    /** Get next page of search results */
    async nextResultsPage () {
      if (this.numHits > 10) {
        this.searchOffset += 10
        // Clamp so the last page always shows a full 10 results
        if (this.searchOffset + 10 > this.numHits) {
          this.searchOffset = this.numHits - 10
        }
        this.searchResults = await this.search()
        document.documentElement.scrollTop = 0
      }
    },
    /** Get previous page of search results */
    async prevResultsPage () {
      // Step back one page, never below the first result
      this.searchOffset = Math.max(0, this.searchOffset - 10)
      this.searchResults = await this.search()
      document.documentElement.scrollTop = 0
    }
  }
})
Elastic Library {{ numHits }} HitsDisplaying Results {{ searchOffset }} - {{ searchOffset + 9 }}{{ hit._source.title }} - {{ hit._source.author }}Location {{ hit._source.location }}
/* Base typography: serif body, sans-serif for UI chrome */
body {
  font-family: 'EB Garamond', serif;
}

.mui-textfield > input,
.mui-btn,
.mui--text-subhead,
.mui-panel > .mui--text-headline {
  font-family: 'Open Sans', sans-serif;
}

.all-caps {
  text-transform: uppercase;
}

.app-container {
  padding: 16px;
}

/* Bold the <em> tags Elasticsearch emits for highlighted terms */
.search-results em {
  font-weight: bold;
}

.book-modal > button {
  width: 100%;
}

.search-results .mui-divider {
  margin: 14px 0;
}

/* Two-column result grid, collapsing to one column on narrow screens */
.search-results {
  display: flex;
  flex-direction: row;
  flex-wrap: wrap;
  justify-content: space-around;
}

.search-results > div {
  flex-basis: 45%;
  box-sizing: border-box;
  cursor: pointer;
}

@media (max-width: 600px) {
  .search-results > div {
    flex-basis: 100%;
  }
}

.paragraphs-container {
  max-width: 800px;
  margin: 0 auto;
  margin-bottom: 48px;
}

.paragraphs-container .mui--text-body1,
.paragraphs-container .mui--text-body2 {
  font-size: 1.8rem;
  line-height: 35px;
}

/* Full-screen fixed overlay for the book preview */
.book-modal {
  width: 100%;
  height: 100%;
  padding: 40px 10%;
  box-sizing: border-box;
  margin: 0 auto;
  background-color: white;
  overflow-y: scroll;
  position: fixed;
  top: 0;
  left: 0;
}

.pagination-panel {
  display: flex;
  justify-content: space-between;
}

.title-row {
  display: flex;
  justify-content: space-between;
  align-items: flex-end;
}

@media (max-width: 600px) {
  .title-row {
    flex-direction: column;
    text-align: center;
    align-items: center
  }
}

.locations-label {
  text-align: center;
  margin: 8px;
}

.modal-footer {
  position: fixed;
  bottom: 0;
  left: 0;
  width: 100%;
  display: flex;
  justify-content: space-around;
  background: white;
}
这一步不需要重新运行docker-compose up命令使修改生效。本地public目录直接挂载在Nginx服务器容器中,因此前端本地文件的改动会直接反映在容器化应用中。
如果点击任意一条搜索结果,目前还不会有任何效果,这意味着我们还有一些功能需要添加进应用中。
/** Get the specified range of paragraphs from a book */ getParagraphs (bookTitle, startLocation, endLocation) { const filter = [ { term: { title: bookTitle } }, { range: { location: { gte: startLocation, lte: endLocation } } } ] const body = { size: endLocation - startLocation, sort: { location: 'asc' }, query: { bool: { filter } } } return client.search({ index, type, body }) }
/**
 * GET /paragraphs
 * Get a range of paragraphs from the specified book
 * Query Params -
 * bookTitle: string under 256 characters
 * start: positive integer
 * end: positive integer greater than start
 */
const paragraphsQuerySchema = {
  bookTitle: joi.string().max(256).required(),
  start: joi.number().integer().min(0).default(0),
  end: joi.number().integer().greater(joi.ref('start')).default(10)
}

router.get('/paragraphs', validate({ query: paragraphsQuerySchema }), async (ctx, next) => {
  const { bookTitle, start, end } = ctx.request.query
  ctx.body = await search.getParagraphs(bookTitle, start, end)
})
/** Call the API to get current page of paragraphs */ async getParagraphs (bookTitle, offset) { try { this.bookOffset = offset const start = this.bookOffset const end = this.bookOffset + 10 const response = await axios.get(`${this.baseUrl}/paragraphs`, { params: { bookTitle, start, end } }) return response.data.hits.hits } catch (err) { console.error(err) } }, /** Get next page (next 10 paragraphs) of selected book */ async nextBookPage () { this.$refs.bookModal.scrollTop = 0 this.paragraphs = await this.getParagraphs(this.selectedParagraph._source.title, this.bookOffset + 10) }, /** Get previous page (previous 10 paragraphs) of selected book */ async prevBookPage () { this.$refs.bookModal.scrollTop = 0 this.paragraphs = await this.getParagraphs(this.selectedParagraph._source.title, this.bookOffset - 10) }, /** Display paragraphs from selected book in modal window */ async showBookModal (searchHit) { try { document.body.style.overflow = 'hidden' this.selectedParagraph = searchHit this.paragraphs = await this.getParagraphs(searchHit._source.title, searchHit._source.location - 5) } catch (err) { console.error(err) } }, /** Close the book detail modal */ closeBookModal () { document.body.style.overflow = 'auto' this.selectedParagraph = null }
{{ selectedParagraph._source.title }}{{ selectedParagraph._source.author }}
Locations {{ bookOffset - 5 }} to {{ bookOffset + 5 }}
{{ paragraph._source.text }}{{ paragraph._source.text }}
恭喜!!到这一步主体框架已经搭建完毕。以上所有代码都可以从这里获得。
Elasticsearch与主数据库同步需求,与其说是ES的弱点,不如说是架构复杂造成的;给应用添加一个专用搜索引擎是一件值得考虑的事情,但是要折衷考虑带来的问题。
作者简介:Patrick Triest是一位全栈工程师,数据爱好者,持续学习者,洁癖编程者。作者github地址为https://github.com/triestpa。
原文链接:http://dockone.io/article/3655