sqlite: add full text search index

Our search was using a linear search over the message text data.
This is very inefficient for large databases and barely usable.
We can add an FTS index, trading storage for speed.

As it's set up, this only supports English, but in exchange we get fancy
stemming so that, say, "work" matches both itself and "working".

This could be reduced to just stripping special characters, at the cost of
less effective search for English text.
This commit is contained in:
Reto Brunner 2023-07-28 06:40:19 +02:00
parent a8be84028c
commit b2797c3a11
2 changed files with 150 additions and 56 deletions

View file

@ -26,7 +26,7 @@ try {
type Migration = {version: number; stmts: string[]};
type Rollback = {version: number; rollback_forbidden?: boolean; stmts: string[]};
export const currentSchemaVersion = 1703322560448; // use `new Date().getTime()`
export const currentSchemaVersion = 1706445634804; // use `new Date().getTime()`
// Desired schema, adapt to the newest version and add migrations to the array below
const schema = [
@ -46,6 +46,35 @@ const schema = [
"CREATE INDEX network_channel ON messages (network, channel)",
"CREATE INDEX time ON messages (time)",
"CREATE INDEX msg_type_idx on messages (type)", // needed for efficient storageCleaner queries
`CREATE VIEW msg_search_view (id, type, text) AS
select
id,
type,
msg ->> '$.text'
from messages
where type in ('message', 'action', 'notice')`,
`CREATE VIRTUAL TABLE msg_fts USING fts5(
text,
tokenize='porter unicode61 remove_diacritics 2',
content='msg_search_view',
content_rowid=id
)`,
`CREATE TRIGGER msg_fts_insert AFTER INSERT ON messages
WHEN new.type in ('message', 'action', 'notice')
BEGIN
INSERT INTO msg_fts(rowid, text) VALUES (new.id, new.msg ->> '$.text');
END`,
`CREATE TRIGGER msg_fts_delete AFTER DELETE ON messages
WHEN old.type in ('message', 'action', 'notice')
BEGIN
INSERT INTO msg_fts(msg_fts, rowid, text) VALUES ('delete', old.id, old.msg ->> '$.text');
END`,
`CREATE TRIGGER msg_fts_update AFTER UPDATE ON messages
WHEN old.type in ('message', 'action', 'notice')
BEGIN
INSERT INTO msg_fts(msg_fts, rowid, text) VALUES ('delete', old.id, old.msg ->> '$.text');
INSERT INTO msg_fts(rowid, text) VALUES (new.id, new.msg ->> '$.text');
END`,
];
// the migrations will be executed in an exclusive transaction as a whole
@ -83,6 +112,42 @@ export const migrations: Migration[] = [
version: 1703322560448,
stmts: ["CREATE INDEX msg_type_idx on messages (type)"],
},
{
version: 1706445634804,
stmts: [
`CREATE VIEW msg_search_view (id, type, text) AS
select
id,
type,
msg ->> '$.text'
from messages
where type in ('message', 'action', 'notice')`,
`CREATE VIRTUAL TABLE msg_fts USING fts5(
text,
tokenize='porter unicode61 remove_diacritics 2',
content='msg_search_view',
content_rowid=id
)`,
`CREATE TRIGGER msg_fts_insert AFTER INSERT ON messages
WHEN new.type in ('message', 'action', 'notice')
BEGIN
INSERT INTO msg_fts(rowid, text) VALUES (new.id, new.msg ->> '$.text');
END`,
`CREATE TRIGGER msg_fts_delete AFTER DELETE ON messages
WHEN old.type in ('message', 'action', 'notice')
BEGIN
INSERT INTO msg_fts(msg_fts, rowid, text) VALUES ('delete', old.id, old.msg ->> '$.text');
END`,
`CREATE TRIGGER msg_fts_update AFTER UPDATE ON messages
WHEN old.type in ('message', 'action', 'notice')
BEGIN
INSERT INTO msg_fts(msg_fts, rowid, text) VALUES ('delete', old.id, old.msg ->> '$.text');
INSERT INTO msg_fts(rowid, text) VALUES (new.id, new.msg ->> '$.text');
END`,
`INSERT into msg_fts (rowid, text) SELECT id, text FROM msg_search_view
WHERE type in ('message', 'action', 'notice');`,
],
},
];
// down migrations need to restore the state of the prior version.
@ -100,6 +165,16 @@ export const rollbacks: Rollback[] = [
version: 1703322560448,
stmts: ["drop INDEX msg_type_idx"],
},
{
version: 1706445634804,
stmts: [
"drop table msg_fts",
"drop view msg_search_view",
"drop trigger msg_fts_insert",
"drop trigger msg_fts_update",
"drop trigger msg_fts_delete",
],
},
];
class Deferred {
@ -465,30 +540,24 @@ class SqliteMessageStorage implements SearchableMessageStorage {
);
}
// Using the '@' character to escape '%' and '_' in patterns.
const escapedSearchTerm = query.searchTerm.replace(/([%_@])/g, "@$1");
const rows = await this.serialize_fetchall(
`SELECT msg, type, time, network, channel
FROM messages
WHERE network = ?
AND channel = ?
AND messages.id in (
SELECT rowid FROM msg_fts WHERE msg_fts MATCH ?
)
ORDER BY time DESC
LIMIT ?
OFFSET ?`,
query.networkUuid,
query.channelName.toLowerCase(),
fts5_escape(query.searchTerm),
100, // limit
query.offset
);
let select =
'SELECT msg, type, time, network, channel FROM messages WHERE type = "message" AND json_extract(msg, "$.text") LIKE ? ESCAPE \'@\'';
const params: any[] = [`%${escapedSearchTerm}%`];
if (query.networkUuid) {
select += " AND network = ? ";
params.push(query.networkUuid);
}
if (query.channelName) {
select += " AND channel = ? ";
params.push(query.channelName.toLowerCase());
}
const maxResults = 100;
select += " ORDER BY time DESC LIMIT ? OFFSET ? ";
params.push(maxResults);
params.push(query.offset);
const rows = await this.serialize_fetchall(select, ...params);
return {
...query,
results: parseSearchRowsToMessages(query.offset, rows).reverse(),
@ -590,6 +659,11 @@ function parseSearchRowsToMessages(id: number, rows: any[]) {
return messages;
}
// Quote a user-supplied search term so FTS5 treats it as a single string
// literal instead of relying on it parsing as a bareword. Any embedded
// double quotes are escaped by doubling them, per SQL quoting convention.
function fts5_escape(s: string): string {
	const doubled = s.split('"').join('""');
	return '"' + doubled + '"';
}
export function necessaryMigrations(since: number): Migration[] {
return migrations.filter((m) => m.version > since);
}

View file

@ -382,36 +382,47 @@ describe("SQLite Message Storage", function () {
});
it("should search messages", async function () {
const originalMaxHistory = Config.values.maxHistory;
try {
Config.values.maxHistory = 2;
const search = await store.search({
searchTerm: "msg",
networkUuid: "retrieval-order-test-network",
channelName: "",
offset: 0,
});
expect(search.results).to.have.lengthOf(100);
const expectedMessages: string[] = [];
for (let i = 100; i < 200; ++i) {
expectedMessages.push(`msg ${i}`);
}
expect(search.results.map((i_1) => i_1.text)).to.deep.equal(expectedMessages);
} finally {
Config.values.maxHistory = originalMaxHistory;
for (let i = 0; i < 101; ++i) {
await store.index(
{uuid: "searchNet"} as any,
{name: "#channel"} as any,
new Msg({
time: 123456789 + i,
text: `msg ${i}`,
} as any)
);
await store.index(
{uuid: "searchNet"} as any,
{name: "#channel"} as any,
new Msg({
time: 123456789 + i,
text: `no match ${i}`,
} as any)
);
}
const search = await store.search({
searchTerm: "msg",
networkUuid: "searchNet",
channelName: "#channel",
offset: 0,
});
expect(search.results).to.have.lengthOf(100);
const expectedMessages: string[] = [];
for (let i = 1; i < 101; ++i) {
expectedMessages.push(`msg ${i}`);
}
expect(search.results.map((i_1) => i_1.text)).to.deep.equal(expectedMessages);
});
it("should search messages with escaped wildcards", async function () {
it("should search messages when symbols are given", async function () {
async function assertResults(query: string, expected: string[]) {
const search = await store.search({
searchTerm: query,
networkUuid: "this-is-a-network-guid2",
channelName: "",
channelName: "#channel",
offset: 0,
});
expect(search.results.map((i) => i.text)).to.deep.equal(expected);
@ -422,12 +433,16 @@ describe("SQLite Message Storage", function () {
try {
Config.values.maxHistory = 3;
const foo_bar_baz = `foo % bar _ baz`;
const bar_baz = `bar @ " baz`;
const foo_bar_x_baz = `👻 foo bar x baz`;
await store.index(
{uuid: "this-is-a-network-guid2"} as any,
{name: "#channel"} as any,
new Msg({
time: 123456790,
text: `foo % bar _ baz`,
text: foo_bar_baz,
} as any)
);
@ -436,7 +451,7 @@ describe("SQLite Message Storage", function () {
{name: "#channel"} as any,
new Msg({
time: 123456791,
text: `foo bar x baz`,
text: foo_bar_x_baz,
} as any)
);
@ -445,18 +460,23 @@ describe("SQLite Message Storage", function () {
{name: "#channel"} as any,
new Msg({
time: 123456792,
text: `bar @ baz`,
text: bar_baz,
} as any)
);
await assertResults("foo", ["foo % bar _ baz", "foo bar x baz"]);
await assertResults("%", ["foo % bar _ baz"]);
await assertResults("foo % bar ", ["foo % bar _ baz"]);
await assertResults("_", ["foo % bar _ baz"]);
await assertResults("bar _ baz", ["foo % bar _ baz"]);
await assertResults("foo", [foo_bar_baz, foo_bar_x_baz]);
await assertResults("foo % bar ", [foo_bar_baz, foo_bar_x_baz]);
await assertResults("bar _ baz", [foo_bar_baz, bar_baz]);
await assertResults("👻 foo", [foo_bar_baz, foo_bar_x_baz]);
// Our tokenizer doesn't care at all about non text, this just serves as documentation
// as to what one can expect from it and to check that we can't crash the search with "funny" symbols
await assertResults("%", []);
await assertResults("_", []);
await assertResults("%%", []);
await assertResults("@%", []);
await assertResults("@", ["bar @ baz"]);
await assertResults("@ '", []);
await assertResults('"', []);
await assertResults('"👻', []);
} finally {
Config.values.maxHistory = originalMaxHistory;
}