From b2797c3a11c80bd1945a0dfb7306d26bce9d00e5 Mon Sep 17 00:00:00 2001 From: Reto Brunner Date: Fri, 28 Jul 2023 06:40:19 +0200 Subject: [PATCH] sqlite: add full text search index Our search was using a linear search over the message text data. This is very inefficient for large databases and barely usable. We can add an FTS index, trading storage for speed. As it's setup, this only supports English, but then we get fancy stemming so that say "work" matches itself as well as "working". This could be reduced to just stripping funny chars, with less good search in the English case. --- server/plugins/messageStorage/sqlite.ts | 122 +++++++++++++++++++----- test/plugins/sqlite.ts | 84 +++++++++------- 2 files changed, 150 insertions(+), 56 deletions(-) diff --git a/server/plugins/messageStorage/sqlite.ts b/server/plugins/messageStorage/sqlite.ts index 713f108f..e39323b2 100644 --- a/server/plugins/messageStorage/sqlite.ts +++ b/server/plugins/messageStorage/sqlite.ts @@ -26,7 +26,7 @@ try { type Migration = {version: number; stmts: string[]}; type Rollback = {version: number; rollback_forbidden?: boolean; stmts: string[]}; -export const currentSchemaVersion = 1703322560448; // use `new Date().getTime()` +export const currentSchemaVersion = 1706445634804; // use `new Date().getTime()` // Desired schema, adapt to the newest version and add migrations to the array below const schema = [ @@ -46,6 +46,35 @@ const schema = [ "CREATE INDEX network_channel ON messages (network, channel)", "CREATE INDEX time ON messages (time)", "CREATE INDEX msg_type_idx on messages (type)", // needed for efficient storageCleaner queries + `CREATE VIEW msg_search_view (id, type, text) AS + select + id, + type, + msg ->> '$.text' + from messages + where type in ('message', 'action', 'notice')`, + `CREATE VIRTUAL TABLE msg_fts USING fts5( + text, + tokenize='porter unicode61 remove_diacritics 2', + content='msg_search_view', + content_rowid=id + )`, + `CREATE TRIGGER msg_fts_insert AFTER INSERT ON messages + WHEN new.type in ('message', 'action', 'notice') + BEGIN + INSERT INTO msg_fts(rowid, text) VALUES (new.id, new.msg ->> '$.text'); + END`, + `CREATE TRIGGER msg_fts_delete AFTER DELETE ON messages + WHEN old.type in ('message', 'action', 'notice') + BEGIN + INSERT INTO msg_fts(msg_fts, rowid, text) VALUES ('delete', old.id, old.msg ->> '$.text'); + END`, + `CREATE TRIGGER msg_fts_update AFTER UPDATE ON messages + WHEN old.type in ('message', 'action', 'notice') + BEGIN + INSERT INTO msg_fts(msg_fts, rowid, text) VALUES ('delete', old.id, old.msg ->> '$.text'); + INSERT INTO msg_fts(rowid, text) VALUES (new.id, new.msg ->> '$.text'); + END`, ]; // the migrations will be executed in an exclusive transaction as a whole @@ -83,6 +112,42 @@ export const migrations: Migration[] = [ version: 1703322560448, stmts: ["CREATE INDEX msg_type_idx on messages (type)"], }, + { + version: 1706445634804, + stmts: [ + `CREATE VIEW msg_search_view (id, type, text) AS + select + id, + type, + msg ->> '$.text' + from messages + where type in ('message', 'action', 'notice')`, + `CREATE VIRTUAL TABLE msg_fts USING fts5( + text, + tokenize='porter unicode61 remove_diacritics 2', + content='msg_search_view', + content_rowid=id + )`, + `CREATE TRIGGER msg_fts_insert AFTER INSERT ON messages + WHEN new.type in ('message', 'action', 'notice') + BEGIN + INSERT INTO msg_fts(rowid, text) VALUES (new.id, new.msg ->> '$.text'); + END`, + `CREATE TRIGGER msg_fts_delete AFTER DELETE ON messages + WHEN old.type in ('message', 'action', 'notice') + BEGIN + INSERT INTO msg_fts(msg_fts, rowid, text) VALUES ('delete', old.id, old.msg ->> '$.text'); + END`, + `CREATE TRIGGER msg_fts_update AFTER UPDATE ON messages + WHEN old.type in ('message', 'action', 'notice') + BEGIN + INSERT INTO msg_fts(msg_fts, rowid, text) VALUES ('delete', old.id, old.msg ->> '$.text'); + INSERT INTO msg_fts(rowid, text) VALUES (new.id, new.msg ->> '$.text'); + END`, + `INSERT into msg_fts (rowid, text) SELECT id, text FROM msg_search_view + WHERE type in ('message', 'action', 'notice');`, + ], + }, ]; // down migrations need to restore the state of the prior version. @@ -100,6 +165,16 @@ export const rollbacks: Rollback[] = [ version: 1703322560448, stmts: ["drop INDEX msg_type_idx"], }, + { + version: 1706445634804, + stmts: [ + "drop table msg_fts", + "drop view msg_search_view", + "drop trigger msg_fts_insert", + "drop trigger msg_fts_update", + "drop trigger msg_fts_delete", + ], + }, ]; class Deferred { @@ -465,30 +540,24 @@ class SqliteMessageStorage implements SearchableMessageStorage { ); } - // Using the '@' character to escape '%' and '_' in patterns. - const escapedSearchTerm = query.searchTerm.replace(/([%_@])/g, "@$1"); + const rows = await this.serialize_fetchall( + `SELECT msg, type, time, network, channel + FROM messages + WHERE network = ? + AND channel = ? + AND messages.id in ( + SELECT rowid FROM msg_fts WHERE msg_fts MATCH ? + ) + ORDER BY time DESC + LIMIT ? + OFFSET ?`, + query.networkUuid, + query.channelName.toLowerCase(), + fts5_escape(query.searchTerm), + 100, // limit + query.offset + ); - let select = - 'SELECT msg, type, time, network, channel FROM messages WHERE type = "message" AND json_extract(msg, "$.text") LIKE ? ESCAPE \'@\''; - const params: any[] = [`%${escapedSearchTerm}%`]; - - if (query.networkUuid) { - select += " AND network = ? "; - params.push(query.networkUuid); - } - - if (query.channelName) { - select += " AND channel = ? "; - params.push(query.channelName.toLowerCase()); - } - - const maxResults = 100; - - select += " ORDER BY time DESC LIMIT ? OFFSET ? "; - params.push(maxResults); - params.push(query.offset); - - const rows = await this.serialize_fetchall(select, ...params); return { ...query, results: parseSearchRowsToMessages(query.offset, rows).reverse(), @@ -590,6 +659,11 @@ function parseSearchRowsToMessages(id: number, rows: any[]) { return messages; } +function fts5_escape(s: string): string { + s = s.replaceAll('"', '""'); // doubled quotes escape the quote + return `"${s}"`; // this makes it a string, rather than hoping it still fits the bareword +} + export function necessaryMigrations(since: number): Migration[] { return migrations.filter((m) => m.version > since); } diff --git a/test/plugins/sqlite.ts b/test/plugins/sqlite.ts index e2af20be..ab2f796a 100644 --- a/test/plugins/sqlite.ts +++ b/test/plugins/sqlite.ts @@ -382,36 +382,47 @@ describe("SQLite Message Storage", function () { }); it("should search messages", async function () { - const originalMaxHistory = Config.values.maxHistory; - - try { - Config.values.maxHistory = 2; - - const search = await store.search({ - searchTerm: "msg", - networkUuid: "retrieval-order-test-network", - channelName: "", - offset: 0, - }); - expect(search.results).to.have.lengthOf(100); - const expectedMessages: string[] = []; - - for (let i = 100; i < 200; ++i) { - expectedMessages.push(`msg ${i}`); - } - - expect(search.results.map((i_1) => i_1.text)).to.deep.equal(expectedMessages); - } finally { - Config.values.maxHistory = originalMaxHistory; + for (let i = 0; i < 101; ++i) { + await store.index( + {uuid: "searchNet"} as any, + {name: "#channel"} as any, + new Msg({ + time: 123456789 + i, + text: `msg ${i}`, + } as any) + ); + await store.index( + {uuid: "searchNet"} as any, + {name: "#channel"} as any, + new Msg({ + time: 123456789 + i, + text: `no match ${i}`, + } as any) + ); } + + const search = await store.search({ + searchTerm: "msg", + networkUuid: "searchNet", + channelName: "#channel", + offset: 0, + }); + expect(search.results).to.have.lengthOf(100); + const expectedMessages: string[] = []; + + for (let i = 1; i < 101; ++i) { + expectedMessages.push(`msg ${i}`); + } + + expect(search.results.map((i_1) => i_1.text)).to.deep.equal(expectedMessages); }); - it("should search messages with escaped wildcards", async function () { + it("should search messages when symbols are given", async function () { async function assertResults(query: string, expected: string[]) { const search = await store.search({ searchTerm: query, networkUuid: "this-is-a-network-guid2", - channelName: "", + channelName: "#channel", offset: 0, }); expect(search.results.map((i) => i.text)).to.deep.equal(expected); @@ -422,12 +433,16 @@ describe("SQLite Message Storage", function () { try { Config.values.maxHistory = 3; + const foo_bar_baz = `foo % bar _ baz`; + const bar_baz = `bar @ " baz`; + const foo_bar_x_baz = `👻 foo bar x baz`; + await store.index( {uuid: "this-is-a-network-guid2"} as any, {name: "#channel"} as any, new Msg({ time: 123456790, - text: `foo % bar _ baz`, + text: foo_bar_baz, } as any) ); @@ -436,7 +451,7 @@ describe("SQLite Message Storage", function () { {name: "#channel"} as any, new Msg({ time: 123456791, - text: `foo bar x baz`, + text: foo_bar_x_baz, } as any) ); @@ -445,18 +460,23 @@ describe("SQLite Message Storage", function () { {name: "#channel"} as any, new Msg({ time: 123456792, - text: `bar @ baz`, + text: bar_baz, } as any) ); - await assertResults("foo", ["foo % bar _ baz", "foo bar x baz"]); - await assertResults("%", ["foo % bar _ baz"]); - await assertResults("foo % bar ", ["foo % bar _ baz"]); - await assertResults("_", ["foo % bar _ baz"]); - await assertResults("bar _ baz", ["foo % bar _ baz"]); + await assertResults("foo", [foo_bar_baz, foo_bar_x_baz]); + await assertResults("foo % bar ", [foo_bar_baz, foo_bar_x_baz]); + await assertResults("bar _ baz", [foo_bar_baz, bar_baz]); + await assertResults("👻 foo", [foo_bar_baz, foo_bar_x_baz]); + // Our tokenizer doesn't care at all about non text, this just serves as documentation + // as to what one can expect from it and to check that we can't crash the search with "funny" symbols + await assertResults("%", []); + await assertResults("_", []); await assertResults("%%", []); await assertResults("@%", []); - await assertResults("@", ["bar @ baz"]); + await assertResults("@ '", []); + await assertResults('"', []); + await assertResults('"👻', []); } finally { Config.values.maxHistory = originalMaxHistory; }