Browse Source

fix: postgres indexing (#791) + deactivate handler

pull/795/head
Nick 5 years ago
parent
commit
7c77cb0815
12 changed files with 167 additions and 43 deletions
  1. 15
      CHANGELOG.md
  2. 2
      package.json
  3. 11
      server/graph/resolvers/search.js
  4. 36
      server/models/pages.js
  5. 1
      server/models/searchEngines.js
  6. 16
      server/modules/search/algolia/engine.js
  7. 16
      server/modules/search/aws/engine.js
  8. 25
      server/modules/search/azure/engine.js
  9. 4
      server/modules/search/db/engine.js
  10. 19
      server/modules/search/elasticsearch/engine.js
  11. 55
      server/modules/search/postgres/engine.js
  12. 10
      yarn.lock

15
CHANGELOG.md

@ -2,6 +2,21 @@
All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).
## [2.0.0-beta.XX] - 2019-XX-XX
### Added
- Added Search Engine - Algolia
- Added Search Engine - Elasticsearch
### Fixed
- Fixed error when saving navigation in admin area
- Fixed guest search failing because of missing global permissions
- Fixed PostgreSQL search engine indexing issues
### Changed
- Improved search suggestions from sanitized content
- Search engine deactivate handler is now being called on engine switch
- Markdown editor UI improvements for insert actions (wip)
## [2.0.0-beta.68] - 2019-03-17
### Added
- Added Search Results overlay

2
package.json

@ -64,6 +64,7 @@
"diff2html": "2.7.0",
"dotize": "0.3.0",
"elasticsearch": "15.4.1",
"emoji-regex": "8.0.0",
"express": "4.16.4",
"express-brute": "1.0.1",
"file-type": "10.7.1",
@ -154,6 +155,7 @@
"simple-git": "1.107.0",
"solr-node": "1.1.3",
"sqlite3": "4.0.6",
"striptags": "3.1.1",
"subscriptions-transport-ws": "0.9.15",
"twemoji": "11.3.0",
"uslug": "1.0.4",

11
server/graph/resolvers/search.js

@ -38,7 +38,11 @@ module.exports = {
SearchMutation: {
async updateSearchEngines(obj, args, context) {
try {
let newActiveEngine = ''
for (let searchEngine of args.engines) {
if (searchEngine.isEnabled) {
newActiveEngine = searchEngine.key
}
await WIKI.models.searchEngines.query().patch({
isEnabled: searchEngine.isEnabled,
config: _.reduce(searchEngine.config, (result, value, key) => {
@ -47,6 +51,13 @@ module.exports = {
}, {})
}).where('key', searchEngine.key)
}
if (newActiveEngine !== WIKI.data.searchEngine.key) {
try {
await WIKI.data.searchEngine.deactivate()
} catch (err) {
WIKI.logger.warn('Failed to deactivate previous search engine:', err)
}
}
await WIKI.models.searchEngines.initEngine({ activate: true })
return {
responseResult: graphHelper.generateSuccess('Search Engines updated successfully')

36
server/models/pages.js

@ -5,6 +5,8 @@ const pageHelper = require('../helpers/page')
const path = require('path')
const fs = require('fs-extra')
const yaml = require('js-yaml')
const striptags = require('striptags')
const emojiRegex = require('emoji-regex')
/* global WIKI */
@ -14,6 +16,9 @@ const frontmatterRegex = {
markdown: /^(-{3}(?:\n|\r)([\w\W]+?)(?:\n|\r)-{3})?(?:\n|\r)*([\w\W]*)*/
}
// Matches runs of one or more of the listed punctuation/symbol characters, or a period
// that is immediately followed or immediately preceded by a whitespace character.
// cleanHTML() replaces every match with a single space.
const punctuationRegex = /[!,:;/\\_+\-=()&#@<>$~%^*[\]{}"'|]+|(\.\s)|(\s\.)/ig
// Matches numeric HTML entities written as "&#" + exactly three decimal digits + ";"
// or "&#x" + exactly two alphanumeric characters + ";". Entities of any other length,
// and named entities, are not matched by this pattern. cleanHTML() removes every match.
const htmlEntitiesRegex = /(&#[0-9]{3};)|(&#x[a-zA-Z0-9]{2};)/ig
/**
* Pages model
*/
@ -209,14 +214,23 @@ module.exports = class Page extends Model {
userId: opts.authorId,
isPrivate: opts.isPrivate
})
// -> Render page to HTML
await WIKI.models.pages.renderPage(page)
// -> Add to Search Index
const pageContents = await WIKI.models.pages.query().findById(page.id).select('render')
page.safeContent = WIKI.models.pages.cleanHTML(pageContents.render)
await WIKI.data.searchEngine.created(page)
// -> Add to Storage
if (!opts.skipStorage) {
await WIKI.models.storage.pageEvent({
event: 'created',
page
})
}
return page
}
@ -245,8 +259,16 @@ module.exports = class Page extends Model {
userId: ogPage.authorId,
isPrivate: ogPage.isPrivate
})
// -> Render page to HTML
await WIKI.models.pages.renderPage(page)
// -> Update Search Index
const pageContents = await WIKI.models.pages.query().findById(page.id).select('render')
page.safeContent = WIKI.models.pages.cleanHTML(pageContents.render)
await WIKI.data.searchEngine.updated(page)
// -> Update on Storage
if (!opts.skipStorage) {
await WIKI.models.storage.pageEvent({
event: 'updated',
@ -275,7 +297,11 @@ module.exports = class Page extends Model {
})
await WIKI.models.pages.query().delete().where('id', page.id)
await WIKI.models.pages.deletePageFromCache(page)
// -> Delete from Search Index
await WIKI.data.searchEngine.deleted(page)
// -> Delete from Storage
if (!opts.skipStorage) {
await WIKI.models.storage.pageEvent({
event: 'deleted',
@ -390,4 +416,14 @@ module.exports = class Page extends Model {
static async deletePageFromCache(page) {
return fs.remove(path.join(process.cwd(), `data/cache/${page.hash}.bin`))
}
static cleanHTML(rawHTML = '') {
return striptags(rawHTML || '')
.replace(emojiRegex(), '')
.replace(htmlEntitiesRegex, '')
.replace(punctuationRegex, ' ')
.replace(/(\r\n|\n|\r)/gm, ' ')
.replace(/\s\s+/g, ' ')
.split(' ').filter(w => w.length > 1).join(' ').toLowerCase()
}
}

1
server/models/searchEngines.js

@ -99,6 +99,7 @@ module.exports = class SearchEngine extends Model {
const searchEngine = await WIKI.models.searchEngines.query().findOne('isEnabled', true)
if (searchEngine) {
WIKI.data.searchEngine = require(`../modules/search/${searchEngine.key}/engine`)
WIKI.data.searchEngine.key = searchEngine.key
WIKI.data.searchEngine.config = searchEngine.config
if (activate) {
try {

16
server/modules/search/algolia/engine.js

@ -1,6 +1,8 @@
const _ = require('lodash')
const algoliasearch = require('algoliasearch')
const { pipeline, Transform } = require('stream')
const stream = require('stream')
const Promise = require('bluebird')
const pipeline = Promise.promisify(stream.pipeline)
/* global WIKI */
@ -77,7 +79,7 @@ module.exports = {
path: page.path,
title: page.title,
description: page.description,
content: page.content
content: page.safeContent
})
},
/**
@ -90,7 +92,7 @@ module.exports = {
objectID: page.hash,
title: page.title,
description: page.description,
content: page.content
content: page.safeContent
})
},
/**
@ -114,7 +116,7 @@ module.exports = {
path: page.destinationPath,
title: page.title,
description: page.description,
content: page.content
content: page.safeContent
})
},
/**
@ -176,7 +178,7 @@ module.exports = {
path: doc.path,
title: doc.title,
description: doc.description,
content: doc.content
content: WIKI.models.pages.cleanHTML(doc.render)
}))
)
} catch (err) {
@ -187,11 +189,11 @@ module.exports = {
}
await pipeline(
WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'content').select().from('pages').where({
WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'render').select().from('pages').where({
isPublished: true,
isPrivate: false
}).stream(),
new Transform({
new stream.Transform({
objectMode: true,
transform: async (chunk, enc, cb) => processDocument(cb, chunk),
flush: async (cb) => processDocument(cb)

16
server/modules/search/aws/engine.js

@ -1,6 +1,8 @@
const _ = require('lodash')
const AWS = require('aws-sdk')
const { pipeline, Transform } = require('stream')
const stream = require('stream')
const Promise = require('bluebird')
const pipeline = Promise.promisify(stream.pipeline)
/* global WIKI */
@ -197,7 +199,7 @@ module.exports = {
path: page.path,
title: page.title,
description: page.description,
content: page.content
content: page.safeContent
}
}
])
@ -220,7 +222,7 @@ module.exports = {
path: page.path,
title: page.title,
description: page.description,
content: page.content
content: page.safeContent
}
}
])
@ -268,7 +270,7 @@ module.exports = {
path: page.destinationPath,
title: page.title,
description: page.description,
content: page.content
content: page.safeContent
}
}
])
@ -335,7 +337,7 @@ module.exports = {
path: doc.path,
title: doc.title,
description: doc.description,
content: doc.content
content: WIKI.models.pages.cleanHTML(doc.render)
}
})))
}).promise()
@ -347,11 +349,11 @@ module.exports = {
}
await pipeline(
WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'content').select().from('pages').where({
WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'render').select().from('pages').where({
isPublished: true,
isPrivate: false
}).stream(),
new Transform({
new stream.Transform({
objectMode: true,
transform: async (chunk, enc, cb) => processDocument(cb, chunk),
flush: async (cb) => processDocument(cb)

25
server/modules/search/azure/engine.js

@ -1,7 +1,9 @@
const _ = require('lodash')
const { SearchService, QueryType } = require('azure-search-client')
const request = require('request-promise')
const { pipeline } = require('stream')
const stream = require('stream')
const Promise = require('bluebird')
const pipeline = Promise.promisify(stream.pipeline)
/* global WIKI */
@ -146,7 +148,7 @@ module.exports = {
path: page.path,
title: page.title,
description: page.description,
content: page.content
content: page.safeContent
}
])
},
@ -163,7 +165,7 @@ module.exports = {
path: page.path,
title: page.title,
description: page.description,
content: page.content
content: page.safeContent
}
])
},
@ -199,7 +201,7 @@ module.exports = {
path: page.destinationPath,
title: page.title,
description: page.description,
content: page.content
content: page.safeContent
}
])
},
@ -209,10 +211,23 @@ module.exports = {
async rebuild() {
WIKI.logger.info(`(SEARCH/AZURE) Rebuilding Index...`)
await pipeline(
WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'content').select().from('pages').where({
WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'render').select().from('pages').where({
isPublished: true,
isPrivate: false
}).stream(),
new stream.Transform({
objectMode: true,
transform: (chunk, enc, cb) => {
cb(null, {
id: chunk.id,
path: chunk.path,
locale: chunk.locale,
title: chunk.title,
description: chunk.description,
content: WIKI.models.pages.cleanHTML(chunk.render)
})
}
}),
this.client.indexes.use(this.config.indexName).createIndexingStream()
)
WIKI.logger.info(`(SEARCH/AZURE) Index rebuilt successfully.`)

4
server/modules/search/db/engine.js

@ -1,4 +1,4 @@
const _ = require('lodash')
/* global WIKI */
module.exports = {
activate() {
@ -32,7 +32,7 @@ module.exports = {
}
// TODO: Add user permissions filtering
builder.andWhere(builder => {
switch(WIKI.config.db.type) {
switch (WIKI.config.db.type) {
case 'postgres':
builder.where('title', 'ILIKE', `%${q}%`)
builder.orWhere('description', 'ILIKE', `%${q}%`)

19
server/modules/search/elasticsearch/engine.js

@ -1,6 +1,8 @@
const _ = require('lodash')
const elasticsearch = require('elasticsearch')
const { pipeline, Transform } = require('stream')
const stream = require('stream')
const Promise = require('bluebird')
const pipeline = Promise.promisify(stream.pipeline)
/* global WIKI */
@ -116,7 +118,7 @@ module.exports = {
input: s,
weight: 3
})),
page.content.split(' ').map(s => ({
page.safeContent.split(' ').map(s => ({
input: s,
weight: 1
}))
@ -138,7 +140,7 @@ module.exports = {
path: page.path,
title: page.title,
description: page.description,
content: page.content
content: page.safeContent
},
refresh: true
})
@ -159,7 +161,7 @@ module.exports = {
path: page.path,
title: page.title,
description: page.description,
content: page.content
content: page.safeContent
},
refresh: true
})
@ -199,7 +201,7 @@ module.exports = {
path: page.destinationPath,
title: page.title,
description: page.description,
content: page.content
content: page.safeContent
},
refresh: true
})
@ -262,13 +264,14 @@ module.exports = {
_id: doc.id
}
})
doc.safeContent = WIKI.models.pages.cleanHTML(doc.render)
result.push({
suggest: this.buildSuggest(doc),
locale: doc.locale,
path: doc.path,
title: doc.title,
description: doc.description,
content: doc.content
content: doc.safeContent
})
return result
}, []),
@ -282,11 +285,11 @@ module.exports = {
}
await pipeline(
WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'content').select().from('pages').where({
WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'render').select().from('pages').where({
isPublished: true,
isPrivate: false
}).stream(),
new Transform({
new stream.Transform({
objectMode: true,
transform: async (chunk, enc, cb) => processDocument(cb, chunk),
flush: async (cb) => processDocument(cb)

55
server/modules/search/postgres/engine.js

@ -1,5 +1,9 @@
const _ = require('lodash')
const tsquery = require('pg-tsquery')()
const stream = require('stream')
const Promise = require('bluebird')
const pipeline = Promise.promisify(stream.pipeline)
/* global WIKI */
module.exports = {
async activate() {
@ -8,7 +12,10 @@ module.exports = {
}
},
async deactivate() {
// not used
WIKI.logger.info(`(SEARCH/POSTGRES) Dropping index tables...`)
await WIKI.models.knex.schema.dropTable('pagesWords')
await WIKI.models.knex.schema.dropTable('pagesVector')
WIKI.logger.info(`(SEARCH/POSTGRES) Index tables have been dropped.`)
},
/**
* INIT
@ -27,6 +34,7 @@ module.exports = {
table.string('title')
table.string('description')
table.specificType('tokens', 'TSVECTOR')
table.text('content')
})
}
// -> Create Words Index
@ -71,7 +79,6 @@ module.exports = {
WIKI.logger.warn('Search Engine Error:')
WIKI.logger.warn(err)
}
},
/**
* CREATE
@ -80,10 +87,10 @@ module.exports = {
*/
async created(page) {
await WIKI.models.knex.raw(`
INSERT INTO "pagesVector" (path, locale, title, description, tokens) VALUES (
'?', '?', '?', '?', (setweight(to_tsvector('${this.config.dictLanguage}', '?'), 'A') || setweight(to_tsvector('${this.config.dictLanguage}', '?'), 'B') || setweight(to_tsvector('${this.config.dictLanguage}', '?'), 'C'))
INSERT INTO "pagesVector" (path, locale, title, description, "tokens") VALUES (
?, ?, ?, ?, (setweight(to_tsvector('${this.config.dictLanguage}', ?), 'A') || setweight(to_tsvector('${this.config.dictLanguage}', ?), 'B') || setweight(to_tsvector('${this.config.dictLanguage}', ?), 'C'))
)
`, [page.path, page.localeCode, page.title, page.description, page.title, page.description, page.content])
`, [page.path, page.localeCode, page.title, page.description, page.title, page.description, page.safeContent])
},
/**
* UPDATE
@ -99,7 +106,7 @@ module.exports = {
setweight(to_tsvector('${this.config.dictLanguage}', ?), 'B') ||
setweight(to_tsvector('${this.config.dictLanguage}', ?), 'C'))
WHERE path = ? AND locale = ?
`, [page.title, page.description, page.title, page.description, page.content, page.path, page.localeCode])
`, [page.title, page.description, page.title, page.description, page.safeContent, page.path, page.localeCode])
},
/**
* DELETE
@ -132,14 +139,34 @@ module.exports = {
async rebuild() {
WIKI.logger.info(`(SEARCH/POSTGRES) Rebuilding Index...`)
await WIKI.models.knex('pagesVector').truncate()
await WIKI.models.knex('pagesWords').truncate()
await pipeline(
WIKI.models.knex.column('path', 'localeCode', 'title', 'description', 'render').select().from('pages').where({
isPublished: true,
isPrivate: false
}).stream(),
new stream.Transform({
objectMode: true,
transform: async (page, enc, cb) => {
const content = WIKI.models.pages.cleanHTML(page.render)
await WIKI.models.knex.raw(`
INSERT INTO "pagesVector" (path, locale, title, description, "tokens", content) VALUES (
?, ?, ?, ?, (setweight(to_tsvector('${this.config.dictLanguage}', ?), 'A') || setweight(to_tsvector('${this.config.dictLanguage}', ?), 'B') || setweight(to_tsvector('${this.config.dictLanguage}', ?), 'C')), ?
)
`, [page.path, page.localeCode, page.title, page.description, page.title, page.description, content, content])
cb()
}
})
)
await WIKI.models.knex.raw(`
INSERT INTO "pagesVector" (path, locale, title, description, "tokens")
SELECT path, "localeCode" AS locale, title, description,
(setweight(to_tsvector('${this.config.dictLanguage}', title), 'A') ||
setweight(to_tsvector('${this.config.dictLanguage}', description), 'B') ||
setweight(to_tsvector('${this.config.dictLanguage}', content), 'C')) AS tokens
FROM "pages"
WHERE pages."isPublished" AND NOT pages."isPrivate"`)
INSERT INTO "pagesWords" (word)
SELECT word FROM ts_stat(
'SELECT to_tsvector(''simple'', "title") || to_tsvector(''simple'', "description") || to_tsvector(''simple'', "content") FROM "pagesVector"'
)
`)
WIKI.logger.info(`(SEARCH/POSTGRES) Index rebuilt successfully.`)
}
}

10
yarn.lock

@ -4454,6 +4454,11 @@ emitter-listener@^1.1.1:
dependencies:
shimmer "^1.2.0"
emoji-regex@8.0.0:
version "8.0.0"
resolved "https://registry.yarnpkg.com/emoji-regex/-/emoji-regex-8.0.0.tgz#e818fd69ce5ccfcb404594f842963bf53164cc37"
integrity sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==
emojis-list@^2.0.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/emojis-list/-/emojis-list-2.1.0.tgz#4daa4d9db00f9819880c79fa457ae5b09a1fd389"
@ -12294,6 +12299,11 @@ strip-json-comments@^2.0.1, strip-json-comments@~2.0.1:
resolved "https://registry.yarnpkg.com/strip-json-comments/-/strip-json-comments-2.0.1.tgz#3c531942e908c2697c0ec344858c286c7ca0a60a"
integrity sha1-PFMZQukIwml8DsNEhYwobHygpgo=
striptags@3.1.1:
version "3.1.1"
resolved "https://registry.yarnpkg.com/striptags/-/striptags-3.1.1.tgz#c8c3e7fdd6fb4bb3a32a3b752e5b5e3e38093ebd"
integrity sha1-yMPn/db7S7OjKjt1LltePjgJPr0=
style-loader@0.23.1:
version "0.23.1"
resolved "https://registry.yarnpkg.com/style-loader/-/style-loader-0.23.1.tgz#cb9154606f3e771ab6c4ab637026a1049174d925"

Loading…
Cancel
Save