From 7c77cb0815ee7c9811c5f2962e8cc13e116a283c Mon Sep 17 00:00:00 2001 From: Nick Date: Sun, 24 Mar 2019 00:27:42 -0400 Subject: [PATCH] fix: postgres indexing (#791) + deactivate handler --- CHANGELOG.md | 15 +++++ package.json | 2 + server/graph/resolvers/search.js | 11 ++++ server/models/pages.js | 36 ++++++++++++ server/models/searchEngines.js | 1 + server/modules/search/algolia/engine.js | 16 +++--- server/modules/search/aws/engine.js | 16 +++--- server/modules/search/azure/engine.js | 25 +++++++-- server/modules/search/db/engine.js | 4 +- server/modules/search/elasticsearch/engine.js | 19 ++++--- server/modules/search/postgres/engine.js | 55 ++++++++++++++----- yarn.lock | 10 ++++ 12 files changed, 167 insertions(+), 43 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 49e8b700..3dd81d67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,21 @@ All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/). +## [2.0.0-beta.XX] - 2019-XX-XX +### Added +- Added Search Engine - Algolia +- Added Search Engine - Elasticsearch + +### Fixed +- Fixed error when saving navigation in admin area +- Fixed guest search failing because of missing global permissions +- Fixed PostgreSQL search engine indexing issues + +### Changed +- Improved search suggestions from sanitized content +- Search engine deactivate handler is now being called on engine switch +- Markdown editor UI improvements for insert actions (wip) + ## [2.0.0-beta.68] - 2019-03-17 ### Added - Added Search Results overlay diff --git a/package.json b/package.json index 51b43aac..2420d98d 100644 --- a/package.json +++ b/package.json @@ -64,6 +64,7 @@ "diff2html": "2.7.0", "dotize": "0.3.0", "elasticsearch": "15.4.1", + "emoji-regex": "8.0.0", "express": "4.16.4", "express-brute": "1.0.1", "file-type": "10.7.1", @@ -154,6 +155,7 @@ "simple-git": "1.107.0", "solr-node": "1.1.3", "sqlite3": "4.0.6", + "striptags": "3.1.1", "subscriptions-transport-ws": "0.9.15", "twemoji": "11.3.0", "uslug": "1.0.4", diff --git a/server/graph/resolvers/search.js b/server/graph/resolvers/search.js index fd6c5b8b..70a12bb8 100644 --- a/server/graph/resolvers/search.js +++ b/server/graph/resolvers/search.js @@ -38,7 +38,11 @@ module.exports = { SearchMutation: { async updateSearchEngines(obj, args, context) { try { + let newActiveEngine = '' for (let searchEngine of args.engines) { + if (searchEngine.isEnabled) { + newActiveEngine = searchEngine.key + } await WIKI.models.searchEngines.query().patch({ isEnabled: searchEngine.isEnabled, config: _.reduce(searchEngine.config, (result, value, key) => { @@ -47,6 +51,13 @@ module.exports = { }, {}) }).where('key', searchEngine.key) } + if (newActiveEngine !== WIKI.data.searchEngine.key) { + try { + await WIKI.data.searchEngine.deactivate() + } catch (err) { + WIKI.logger.warn('Failed to deactivate previous search engine:', err) + } + } await WIKI.models.searchEngines.initEngine({ activate: true }) return { responseResult: graphHelper.generateSuccess('Search Engines updated successfully') diff --git a/server/models/pages.js b/server/models/pages.js index fbd77391..ba7981d5 100644 --- a/server/models/pages.js +++ b/server/models/pages.js @@ -5,6 +5,8 @@ const pageHelper = require('../helpers/page') const path = require('path') const fs = require('fs-extra') const yaml = require('js-yaml') +const striptags = require('striptags') +const emojiRegex = require('emoji-regex') /* global WIKI */ @@ -14,6 +16,9 @@ const frontmatterRegex = { markdown: /^(-{3}(?:\n|\r)([\w\W]+?)(?:\n|\r)-{3})?(?:\n|\r)*([\w\W]*)*/ } +const punctuationRegex = /[!,:;/\\_+\-=()&#@<>$~%^*[\]{}"'|]+|(\.\s)|(\s\.)/ig +const htmlEntitiesRegex = /(&#[0-9]{3};)|(&#x[a-zA-Z0-9]{2};)/ig + /** * Pages model */ @@ -209,14 +214,23 @@ module.exports = class Page extends Model { userId: opts.authorId, isPrivate: opts.isPrivate }) + + // -> Render page to HTML await WIKI.models.pages.renderPage(page) + + // -> Add to Search Index + const pageContents = await WIKI.models.pages.query().findById(page.id).select('render') + page.safeContent = WIKI.models.pages.cleanHTML(pageContents.render) await WIKI.data.searchEngine.created(page) + + // -> Add to Storage if (!opts.skipStorage) { await WIKI.models.storage.pageEvent({ event: 'created', page }) } + return page } @@ -245,8 +259,16 @@ module.exports = class Page extends Model { userId: ogPage.authorId, isPrivate: ogPage.isPrivate }) + + // -> Render page to HTML await WIKI.models.pages.renderPage(page) + + // -> Update Search Index + const pageContents = await WIKI.models.pages.query().findById(page.id).select('render') + page.safeContent = WIKI.models.pages.cleanHTML(pageContents.render) await WIKI.data.searchEngine.updated(page) + + // -> Update on Storage if (!opts.skipStorage) { await WIKI.models.storage.pageEvent({ event: 'updated', @@ -275,7 +297,11 @@ module.exports = class Page extends Model { }) await WIKI.models.pages.query().delete().where('id', page.id) await WIKI.models.pages.deletePageFromCache(page) + + // -> Delete from Search Index await WIKI.data.searchEngine.deleted(page) + + // -> Delete from Storage if (!opts.skipStorage) { await WIKI.models.storage.pageEvent({ event: 'deleted', @@ -390,4 +416,14 @@ module.exports = class Page extends Model { static async deletePageFromCache(page) { return fs.remove(path.join(process.cwd(), `data/cache/${page.hash}.bin`)) } + + static cleanHTML(rawHTML = '') { + return striptags(rawHTML || '') + .replace(emojiRegex(), '') + .replace(htmlEntitiesRegex, '') + .replace(punctuationRegex, ' ') + .replace(/(\r\n|\n|\r)/gm, ' ') + .replace(/\s\s+/g, ' ') + .split(' ').filter(w => w.length > 1).join(' ').toLowerCase() + } } diff --git a/server/models/searchEngines.js b/server/models/searchEngines.js index eca88ddd..40f881d4 100644 --- a/server/models/searchEngines.js +++ b/server/models/searchEngines.js @@ -99,6 +99,7 @@ module.exports = class SearchEngine extends Model { const searchEngine = await WIKI.models.searchEngines.query().findOne('isEnabled', true) if (searchEngine) { WIKI.data.searchEngine = require(`../modules/search/${searchEngine.key}/engine`) + WIKI.data.searchEngine.key = searchEngine.key WIKI.data.searchEngine.config = searchEngine.config if (activate) { try { diff --git a/server/modules/search/algolia/engine.js b/server/modules/search/algolia/engine.js index 217d2632..3559beb6 100644 --- a/server/modules/search/algolia/engine.js +++ b/server/modules/search/algolia/engine.js @@ -1,6 +1,8 @@ const _ = require('lodash') const algoliasearch = require('algoliasearch') -const { pipeline, Transform } = require('stream') +const stream = require('stream') +const Promise = require('bluebird') +const pipeline = Promise.promisify(stream.pipeline) /* global WIKI */ @@ -77,7 +79,7 @@ module.exports = { path: page.path, title: page.title, description: page.description, - content: page.content + content: page.safeContent }) }, /** @@ -90,7 +92,7 @@ module.exports = { objectID: page.hash, title: page.title, description: page.description, - content: page.content + content: page.safeContent }) }, /** @@ -114,7 +116,7 @@ module.exports = { path: page.destinationPath, title: page.title, description: page.description, - content: page.content + content: page.safeContent }) }, /** @@ -176,7 +178,7 @@ module.exports = { path: doc.path, title: doc.title, description: doc.description, - content: doc.content + content: WIKI.models.pages.cleanHTML(doc.render) })) ) } catch (err) { @@ -187,11 +189,11 @@ module.exports = { } await pipeline( - WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'content').select().from('pages').where({ + WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'render').select().from('pages').where({ isPublished: true, isPrivate: false }).stream(), - new Transform({ + new stream.Transform({ objectMode: true, transform: async (chunk, enc, cb) => processDocument(cb, chunk), flush: async (cb) => processDocument(cb) diff --git a/server/modules/search/aws/engine.js b/server/modules/search/aws/engine.js index 599f1bae..119ba397 100644 --- a/server/modules/search/aws/engine.js +++ b/server/modules/search/aws/engine.js @@ -1,6 +1,8 @@ const _ = require('lodash') const AWS = require('aws-sdk') -const { pipeline, Transform } = require('stream') +const stream = require('stream') +const Promise = require('bluebird') +const pipeline = Promise.promisify(stream.pipeline) /* global WIKI */ @@ -197,7 +199,7 @@ module.exports = { path: page.path, title: page.title, description: page.description, - content: page.content + content: page.safeContent } } ]) @@ -220,7 +222,7 @@ module.exports = { path: page.path, title: page.title, description: page.description, - content: page.content + content: page.safeContent } } ]) @@ -268,7 +270,7 @@ module.exports = { path: page.destinationPath, title: page.title, description: page.description, - content: page.content + content: page.safeContent } } ]) @@ -335,7 +337,7 @@ module.exports = { path: doc.path, title: doc.title, description: doc.description, - content: doc.content + content: WIKI.models.pages.cleanHTML(doc.render) } }))) }).promise() @@ -347,11 +349,11 @@ module.exports = { } await pipeline( - WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'content').select().from('pages').where({ + WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'render').select().from('pages').where({ isPublished: true, isPrivate: false }).stream(), - new Transform({ + new stream.Transform({ objectMode: true, transform: async (chunk, enc, cb) => processDocument(cb, chunk), flush: async (cb) => processDocument(cb) diff --git a/server/modules/search/azure/engine.js b/server/modules/search/azure/engine.js index f242df58..8ad9e61f 100644 --- a/server/modules/search/azure/engine.js +++ b/server/modules/search/azure/engine.js @@ -1,7 +1,9 @@ const _ = require('lodash') const { SearchService, QueryType } = require('azure-search-client') const request = require('request-promise') -const { pipeline } = require('stream') +const stream = require('stream') +const Promise = require('bluebird') +const pipeline = Promise.promisify(stream.pipeline) /* global WIKI */ @@ -146,7 +148,7 @@ module.exports = { path: page.path, title: page.title, description: page.description, - content: page.content + content: page.safeContent } ]) }, @@ -163,7 +165,7 @@ module.exports = { path: page.path, title: page.title, description: page.description, - content: page.content + content: page.safeContent } ]) }, @@ -199,7 +201,7 @@ module.exports = { path: page.destinationPath, title: page.title, description: page.description, - content: page.content + content: page.safeContent } ]) }, @@ -209,10 +211,23 @@ module.exports = { async rebuild() { WIKI.logger.info(`(SEARCH/AZURE) Rebuilding Index...`) await pipeline( - WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'content').select().from('pages').where({ + WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'render').select().from('pages').where({ isPublished: true, isPrivate: false }).stream(), + new stream.Transform({ + objectMode: true, + transform: (chunk, enc, cb) => { + cb(null, { + id: chunk.id, + path: chunk.path, + locale: chunk.locale, + title: chunk.title, + description: chunk.description, + content: WIKI.models.pages.cleanHTML(chunk.render) + }) + } + }), this.client.indexes.use(this.config.indexName).createIndexingStream() ) WIKI.logger.info(`(SEARCH/AZURE) Index rebuilt successfully.`) diff --git a/server/modules/search/db/engine.js b/server/modules/search/db/engine.js index cc5a9a3d..957f5052 100644 --- a/server/modules/search/db/engine.js +++ b/server/modules/search/db/engine.js @@ -1,4 +1,4 @@ -const _ = require('lodash') +/* global WIKI */ module.exports = { activate() { @@ -32,7 +32,7 @@ module.exports = { } // TODO: Add user permissions filtering builder.andWhere(builder => { - switch(WIKI.config.db.type) { + switch (WIKI.config.db.type) { case 'postgres': builder.where('title', 'ILIKE', `%${q}%`) builder.orWhere('description', 'ILIKE', `%${q}%`) diff --git a/server/modules/search/elasticsearch/engine.js b/server/modules/search/elasticsearch/engine.js index 19fa8963..2ad87547 100644 --- a/server/modules/search/elasticsearch/engine.js +++ b/server/modules/search/elasticsearch/engine.js @@ -1,6 +1,8 @@ const _ = require('lodash') const elasticsearch = require('elasticsearch') -const { pipeline, Transform } = require('stream') +const stream = require('stream') +const Promise = require('bluebird') +const pipeline = Promise.promisify(stream.pipeline) /* global WIKI */ @@ -116,7 +118,7 @@ module.exports = { input: s, weight: 3 })), - page.content.split(' ').map(s => ({ + page.safeContent.split(' ').map(s => ({ input: s, weight: 1 })) @@ -138,7 +140,7 @@ module.exports = { path: page.path, title: page.title, description: page.description, - content: page.content + content: page.safeContent }, refresh: true }) @@ -159,7 +161,7 @@ module.exports = { path: page.path, title: page.title, description: page.description, - content: page.content + content: page.safeContent }, refresh: true }) @@ -199,7 +201,7 @@ module.exports = { path: page.destinationPath, title: page.title, description: page.description, - content: page.content + content: page.safeContent }, refresh: true }) @@ -262,13 +264,14 @@ module.exports = { _id: doc.id } }) + doc.safeContent = WIKI.models.pages.cleanHTML(doc.render) result.push({ suggest: this.buildSuggest(doc), locale: doc.locale, path: doc.path, title: doc.title, description: doc.description, - content: doc.content + content: doc.safeContent }) return result }, []), @@ -282,11 +285,11 @@ module.exports = { } await pipeline( - WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'content').select().from('pages').where({ + WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'render').select().from('pages').where({ isPublished: true, isPrivate: false }).stream(), - new Transform({ + new stream.Transform({ objectMode: true, transform: async (chunk, enc, cb) => processDocument(cb, chunk), flush: async (cb) => processDocument(cb) diff --git a/server/modules/search/postgres/engine.js b/server/modules/search/postgres/engine.js index 1e4eba59..f6bd2204 100644 --- a/server/modules/search/postgres/engine.js +++ b/server/modules/search/postgres/engine.js @@ -1,5 +1,9 @@ -const _ = require('lodash') const tsquery = require('pg-tsquery')() +const stream = require('stream') +const Promise = require('bluebird') +const pipeline = Promise.promisify(stream.pipeline) + +/* global WIKI */ module.exports = { async activate() { @@ -8,7 +12,10 @@ module.exports = { } }, async deactivate() { - // not used + WIKI.logger.info(`(SEARCH/POSTGRES) Dropping index tables...`) + await WIKI.models.knex.schema.dropTable('pagesWords') + await WIKI.models.knex.schema.dropTable('pagesVector') + WIKI.logger.info(`(SEARCH/POSTGRES) Index tables have been dropped.`) }, /** * INIT @@ -27,6 +34,7 @@ module.exports = { table.string('title') table.string('description') table.specificType('tokens', 'TSVECTOR') + table.text('content') }) } // -> Create Words Index @@ -71,7 +79,6 @@ module.exports = { WIKI.logger.warn('Search Engine Error:') WIKI.logger.warn(err) } - }, /** * CREATE @@ -80,10 +87,10 @@ module.exports = { */ async created(page) { await WIKI.models.knex.raw(` - INSERT INTO "pagesVector" (path, locale, title, description, tokens) VALUES ( - '?', '?', '?', '?', (setweight(to_tsvector('${this.config.dictLanguage}', '?'), 'A') || setweight(to_tsvector('${this.config.dictLanguage}', '?'), 'B') || setweight(to_tsvector('${this.config.dictLanguage}', '?'), 'C')) + INSERT INTO "pagesVector" (path, locale, title, description, "tokens") VALUES ( + ?, ?, ?, ?, (setweight(to_tsvector('${this.config.dictLanguage}', ?), 'A') || setweight(to_tsvector('${this.config.dictLanguage}', ?), 'B') || setweight(to_tsvector('${this.config.dictLanguage}', ?), 'C')) ) - `, [page.path, page.localeCode, page.title, page.description, page.title, page.description, page.content]) + `, [page.path, page.localeCode, page.title, page.description, page.title, page.description, page.safeContent]) }, /** * UPDATE @@ -99,7 +106,7 @@ module.exports = { setweight(to_tsvector('${this.config.dictLanguage}', ?), 'B') || setweight(to_tsvector('${this.config.dictLanguage}', ?), 'C')) WHERE path = ? AND locale = ? - `, [page.title, page.description, page.title, page.description, page.content, page.path, page.localeCode]) + `, [page.title, page.description, page.title, page.description, page.safeContent, page.path, page.localeCode]) }, /** * DELETE @@ -132,14 +139,34 @@ module.exports = { async rebuild() { WIKI.logger.info(`(SEARCH/POSTGRES) Rebuilding Index...`) await WIKI.models.knex('pagesVector').truncate() + await WIKI.models.knex('pagesWords').truncate() + + await pipeline( + WIKI.models.knex.column('path', 'localeCode', 'title', 'description', 'render').select().from('pages').where({ + isPublished: true, + isPrivate: false + }).stream(), + new stream.Transform({ + objectMode: true, + transform: async (page, enc, cb) => { + const content = WIKI.models.pages.cleanHTML(page.render) + await WIKI.models.knex.raw(` + INSERT INTO "pagesVector" (path, locale, title, description, "tokens", content) VALUES ( + ?, ?, ?, ?, (setweight(to_tsvector('${this.config.dictLanguage}', ?), 'A') || setweight(to_tsvector('${this.config.dictLanguage}', ?), 'B') || setweight(to_tsvector('${this.config.dictLanguage}', ?), 'C')), ? + ) + `, [page.path, page.localeCode, page.title, page.description, page.title, page.description, content, content]) + cb() + } + }) + ) + await WIKI.models.knex.raw(` - INSERT INTO "pagesVector" (path, locale, title, description, "tokens") - SELECT path, "localeCode" AS locale, title, description, - (setweight(to_tsvector('${this.config.dictLanguage}', title), 'A') || - setweight(to_tsvector('${this.config.dictLanguage}', description), 'B') || - setweight(to_tsvector('${this.config.dictLanguage}', content), 'C')) AS tokens - FROM "pages" - WHERE pages."isPublished" AND NOT pages."isPrivate"`) + INSERT INTO "pagesWords" (word) + SELECT word FROM ts_stat( + 'SELECT to_tsvector(''simple'', "title") || to_tsvector(''simple'', "description") || to_tsvector(''simple'', "content") FROM "pagesVector"' + ) + `) + WIKI.logger.info(`(SEARCH/POSTGRES) Index rebuilt successfully.`) } } diff --git a/yarn.lock b/yarn.lock index 4b95a63c..4d2a1635 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4454,6 +4454,11 @@ emitter-listener@^1.1.1: dependencies: shimmer "^1.2.0" +emoji-regex@8.0.0: + version "8.0.0" + resolved "https://registry.yarnpkg.com/emoji-regex/-/emoji-regex-8.0.0.tgz#e818fd69ce5ccfcb404594f842963bf53164cc37" + integrity sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A== + emojis-list@^2.0.0: version "2.1.0" resolved "https://registry.yarnpkg.com/emojis-list/-/emojis-list-2.1.0.tgz#4daa4d9db00f9819880c79fa457ae5b09a1fd389" @@ -12294,6 +12299,11 @@ strip-json-comments@^2.0.1, strip-json-comments@~2.0.1: resolved "https://registry.yarnpkg.com/strip-json-comments/-/strip-json-comments-2.0.1.tgz#3c531942e908c2697c0ec344858c286c7ca0a60a" integrity sha1-PFMZQukIwml8DsNEhYwobHygpgo= +striptags@3.1.1: + version "3.1.1" + resolved "https://registry.yarnpkg.com/striptags/-/striptags-3.1.1.tgz#c8c3e7fdd6fb4bb3a32a3b752e5b5e3e38093ebd" + integrity sha1-yMPn/db7S7OjKjt1LltePjgJPr0= + style-loader@0.23.1: version "0.23.1" resolved "https://registry.yarnpkg.com/style-loader/-/style-loader-0.23.1.tgz#cb9154606f3e771ab6c4ab637026a1049174d925"