commit acc1f5a149
Reto 2024-04-21 15:13:33 +02:00 committed by GitHub
2 changed files with 150 additions and 56 deletions
@@ -26,7 +26,7 @@ try {
type Migration = {version: number; stmts: string[]};
type Rollback = {version: number; rollback_forbidden?: boolean; stmts: string[]};
-export const currentSchemaVersion = 1703322560448; // use `new Date().getTime()`
+export const currentSchemaVersion = 1706445634804; // use `new Date().getTime()`
// Desired schema, adapt to the newest version and add migrations to the array below
const schema = [
@@ -46,6 +46,35 @@ const schema = [
"CREATE INDEX network_channel ON messages (network, channel)",
"CREATE INDEX time ON messages (time)",
"CREATE INDEX msg_type_idx on messages (type)", // needed for efficient storageCleaner queries
+`CREATE VIEW msg_search_view (id, type, text) AS
+select
+id,
+type,
+msg ->> '$.text'
+from messages
+where type in ('message', 'action', 'notice')`,
+`CREATE VIRTUAL TABLE msg_fts USING fts5(
+text,
+tokenize='porter unicode61 remove_diacritics 2',
+content='msg_search_view',
+content_rowid=id
+)`,
+`CREATE TRIGGER msg_fts_insert AFTER INSERT ON messages
+WHEN new.type in ('message', 'action', 'notice')
+BEGIN
+INSERT INTO msg_fts(rowid, text) VALUES (new.id, new.msg ->> '$.text');
+END`,
+`CREATE TRIGGER msg_fts_delete AFTER DELETE ON messages
+WHEN old.type in ('message', 'action', 'notice')
+BEGIN
+INSERT INTO msg_fts(msg_fts, rowid, text) VALUES ('delete', old.id, old.msg ->> '$.text');
+END`,
+`CREATE TRIGGER msg_fts_update AFTER UPDATE ON messages
+WHEN old.type in ('message', 'action', 'notice')
+BEGIN
+INSERT INTO msg_fts(msg_fts, rowid, text) VALUES ('delete', old.id, old.msg ->> '$.text');
+INSERT INTO msg_fts(rowid, text) VALUES (new.id, new.msg ->> '$.text');
+END`,
];
// the migrations will be executed in an exclusive transaction as a whole
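The schema added above uses FTS5's external-content mode: msg_fts stores only the search index and reads row text back through msg_search_view, while triggers on messages keep the index in sync (the special 'delete' command is how an external-content index is told to forget a row's old tokens, since it keeps no copy of the text itself). A minimal standalone sketch of the same pattern, assuming better-sqlite3 (not necessarily this project's driver) and an SQLite build with FTS5 and the JSON ->> operator (3.38+):

    import Database from "better-sqlite3";

    const db = new Database(":memory:");

    db.exec(`
        CREATE TABLE messages (id INTEGER PRIMARY KEY, type TEXT, msg TEXT);

        CREATE VIEW msg_search_view (id, type, text) AS
            select id, type, msg ->> '$.text' from messages
            where type in ('message', 'action', 'notice');

        -- content= / content_rowid= make msg_fts an external-content index:
        -- it stores only the FTS index and reads row text back via the view.
        CREATE VIRTUAL TABLE msg_fts USING fts5(
            text,
            content='msg_search_view',
            content_rowid=id
        );

        CREATE TRIGGER msg_fts_insert AFTER INSERT ON messages
        WHEN new.type in ('message', 'action', 'notice')
        BEGIN
            INSERT INTO msg_fts(rowid, text) VALUES (new.id, new.msg ->> '$.text');
        END;
    `);

    db.prepare("INSERT INTO messages (type, msg) VALUES ('message', ?)").run(
        JSON.stringify({text: "hello full text search"})
    );

    // Queries go through MATCH; the returned rowid maps back to messages.id,
    // which is how the rewritten search() further down joins hits to rows.
    const hits = db
        .prepare("SELECT rowid FROM msg_fts WHERE msg_fts MATCH ?")
        .all('"full text"');
    console.log(hits); // [ { rowid: 1 } ]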
@@ -83,6 +112,42 @@ export const migrations: Migration[] = [
version: 1703322560448,
stmts: ["CREATE INDEX msg_type_idx on messages (type)"],
},
+{
+version: 1706445634804,
+stmts: [
+`CREATE VIEW msg_search_view (id, type, text) AS
+select
+id,
+type,
+msg ->> '$.text'
+from messages
+where type in ('message', 'action', 'notice')`,
+`CREATE VIRTUAL TABLE msg_fts USING fts5(
+text,
+tokenize='porter unicode61 remove_diacritics 2',
+content='msg_search_view',
+content_rowid=id
+)`,
+`CREATE TRIGGER msg_fts_insert AFTER INSERT ON messages
+WHEN new.type in ('message', 'action', 'notice')
+BEGIN
+INSERT INTO msg_fts(rowid, text) VALUES (new.id, new.msg ->> '$.text');
+END`,
+`CREATE TRIGGER msg_fts_delete AFTER DELETE ON messages
+WHEN old.type in ('message', 'action', 'notice')
+BEGIN
+INSERT INTO msg_fts(msg_fts, rowid, text) VALUES ('delete', old.id, old.msg ->> '$.text');
+END`,
+`CREATE TRIGGER msg_fts_update AFTER UPDATE ON messages
+WHEN old.type in ('message', 'action', 'notice')
+BEGIN
+INSERT INTO msg_fts(msg_fts, rowid, text) VALUES ('delete', old.id, old.msg ->> '$.text');
+INSERT INTO msg_fts(rowid, text) VALUES (new.id, new.msg ->> '$.text');
+END`,
+`INSERT into msg_fts (rowid, text) SELECT id, text FROM msg_search_view
+WHERE type in ('message', 'action', 'notice');`,
+],
+},
];
// down migrations need to restore the state of the prior version.
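An aside on the backfill statement at the end of the new migration: FTS5 external-content tables also accept a special 'rebuild' command that repopulates the whole index from the content table or view in one statement. Continuing the better-sqlite3 sketch above (the committed migration uses an explicit INSERT ... SELECT instead, which achieves the same result for this schema):

    // Equivalent backfill: have FTS5 rebuild the index from its content
    // option, i.e. re-read every row of msg_search_view.
    db.exec("INSERT INTO msg_fts(msg_fts) VALUES ('rebuild')");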
@@ -100,6 +165,16 @@ export const rollbacks: Rollback[] = [
version: 1703322560448,
stmts: ["drop INDEX msg_type_idx"],
},
+{
+version: 1706445634804,
+stmts: [
+"drop table msg_fts",
+"drop view msg_search_view",
+"drop trigger msg_fts_insert",
+"drop trigger msg_fts_update",
+"drop trigger msg_fts_delete",
+],
+},
];
class Deferred {
@@ -465,30 +540,24 @@ class SqliteMessageStorage implements SearchableMessageStorage {
);
}
-// Using the '@' character to escape '%' and '_' in patterns.
-const escapedSearchTerm = query.searchTerm.replace(/([%_@])/g, "@$1");
+const rows = await this.serialize_fetchall(
+`SELECT msg, type, time, network, channel
+FROM messages
+WHERE network = ?
+AND channel = ?
+AND messages.id in (
+SELECT rowid FROM msg_fts WHERE msg_fts MATCH ?
+)
+ORDER BY time DESC
+LIMIT ?
+OFFSET ?`,
+query.networkUuid,
+query.channelName.toLowerCase(),
+fts5_escape(query.searchTerm),
+100, // limit
+query.offset
+);
-let select =
-'SELECT msg, type, time, network, channel FROM messages WHERE type = "message" AND json_extract(msg, "$.text") LIKE ? ESCAPE \'@\'';
-const params: any[] = [`%${escapedSearchTerm}%`];
-if (query.networkUuid) {
-select += " AND network = ? ";
-params.push(query.networkUuid);
-}
-if (query.channelName) {
-select += " AND channel = ? ";
-params.push(query.channelName.toLowerCase());
-}
-const maxResults = 100;
-select += " ORDER BY time DESC LIMIT ? OFFSET ? ";
-params.push(maxResults);
-params.push(query.offset);
-const rows = await this.serialize_fetchall(select, ...params);
return {
...query,
results: parseSearchRowsToMessages(query.offset, rows).reverse(),
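To illustrate the rewritten query: the search term is fed through fts5_escape (defined further down) and matched via the msg_fts subquery, while network and channel are bound as exact filters and results are paged 100 at a time. A hypothetical call (the store wiring and uuid are made up for illustration):

    // Hypothetical usage; `store` stands for an initialized SqliteMessageStorage.
    const page = await store.search({
        searchTerm: 'foo "bar"', // bound to MATCH as "foo ""bar""" via fts5_escape()
        networkUuid: "example-network-uuid", // illustrative uuid
        channelName: "#Channel", // lowercased before binding
        offset: 0, // the next page would pass offset: 100
    });
    console.log(page.results.length); // at most 100 (the hardcoded LIMIT)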
@@ -590,6 +659,11 @@ function parseSearchRowsToMessages(id: number, rows: any[]) {
return messages;
}
+function fts5_escape(s: string): string {
+s = s.replaceAll('"', '""'); // doubled quotes escape the quote
+return `"${s}"`; // this makes it a string, rather than hoping it still fits the bareword
+}
export function necessaryMigrations(since: number): Migration[] {
return migrations.filter((m) => m.version > since);
}
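The fts5_escape helper above turns arbitrary user input into a single quoted FTS5 string, i.e. a phrase query, so quote characters and operator keywords can't cause MATCH syntax errors. Illustrative inputs and outputs, using the function above:

    fts5_escape("foo");      // returns '"foo"'
    fts5_escape('say "hi"'); // returns '"say ""hi"""'
    fts5_escape("a AND b");  // returns '"a AND b"' -- a three-token phrase, not the AND operator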


@@ -382,36 +382,47 @@ describe("SQLite Message Storage", function () {
});
it("should search messages", async function () {
-const originalMaxHistory = Config.values.maxHistory;
-try {
-Config.values.maxHistory = 2;
-const search = await store.search({
-searchTerm: "msg",
-networkUuid: "retrieval-order-test-network",
-channelName: "",
-offset: 0,
-});
-expect(search.results).to.have.lengthOf(100);
-const expectedMessages: string[] = [];
-for (let i = 100; i < 200; ++i) {
-expectedMessages.push(`msg ${i}`);
-}
-expect(search.results.map((i_1) => i_1.text)).to.deep.equal(expectedMessages);
-} finally {
-Config.values.maxHistory = originalMaxHistory;
+for (let i = 0; i < 101; ++i) {
+await store.index(
+{uuid: "searchNet"} as any,
+{name: "#channel"} as any,
+new Msg({
+time: 123456789 + i,
+text: `msg ${i}`,
+} as any)
+);
+await store.index(
+{uuid: "searchNet"} as any,
+{name: "#channel"} as any,
+new Msg({
+time: 123456789 + i,
+text: `no match ${i}`,
+} as any)
+);
+}
+const search = await store.search({
+searchTerm: "msg",
+networkUuid: "searchNet",
+channelName: "#channel",
+offset: 0,
+});
+expect(search.results).to.have.lengthOf(100);
+const expectedMessages: string[] = [];
+for (let i = 1; i < 101; ++i) {
+expectedMessages.push(`msg ${i}`);
+}
+expect(search.results.map((i_1) => i_1.text)).to.deep.equal(expectedMessages);
});
it("should search messages with escaped wildcards", async function () {
it("should search messages when symbols are given", async function () {
async function assertResults(query: string, expected: string[]) {
const search = await store.search({
searchTerm: query,
networkUuid: "this-is-a-network-guid2",
channelName: "",
channelName: "#channel",
offset: 0,
});
expect(search.results.map((i) => i.text)).to.deep.equal(expected);
@@ -422,12 +433,16 @@ describe("SQLite Message Storage", function () {
try {
Config.values.maxHistory = 3;
+const foo_bar_baz = `foo % bar _ baz`;
+const bar_baz = `bar @ " baz`;
+const foo_bar_x_baz = `👻 foo bar x baz`;
await store.index(
{uuid: "this-is-a-network-guid2"} as any,
{name: "#channel"} as any,
new Msg({
time: 123456790,
-text: `foo % bar _ baz`,
+text: foo_bar_baz,
} as any)
);
@@ -436,7 +451,7 @@ describe("SQLite Message Storage", function () {
{name: "#channel"} as any,
new Msg({
time: 123456791,
-text: `foo bar x baz`,
+text: foo_bar_x_baz,
} as any)
);
@@ -445,18 +460,23 @@ describe("SQLite Message Storage", function () {
{name: "#channel"} as any,
new Msg({
time: 123456792,
-text: `bar @ baz`,
+text: bar_baz,
} as any)
);
await assertResults("foo", ["foo % bar _ baz", "foo bar x baz"]);
await assertResults("%", ["foo % bar _ baz"]);
await assertResults("foo % bar ", ["foo % bar _ baz"]);
await assertResults("_", ["foo % bar _ baz"]);
await assertResults("bar _ baz", ["foo % bar _ baz"]);
await assertResults("foo", [foo_bar_baz, foo_bar_x_baz]);
await assertResults("foo % bar ", [foo_bar_baz, foo_bar_x_baz]);
await assertResults("bar _ baz", [foo_bar_baz, bar_baz]);
await assertResults("👻 foo", [foo_bar_baz, foo_bar_x_baz]);
// Our tokenizer doesn't care at all about non text, this just serves as documentation
// as to what one can expect from it and to check that we can't crash the search with "funny" symbols
await assertResults("%", []);
await assertResults("_", []);
await assertResults("%%", []);
await assertResults("@%", []);
await assertResults("@", ["bar @ baz"]);
await assertResults("@ '", []);
await assertResults('"', []);
await assertResults('"👻', []);
} finally {
Config.values.maxHistory = originalMaxHistory;
}
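The tokenizer behaviour these assertions document can be made concrete. A standalone sketch, again assuming better-sqlite3 (not this project's driver), of what 'porter unicode61 remove_diacritics 2' does to input:

    import Database from "better-sqlite3";

    const db = new Database(":memory:");
    db.exec(`CREATE VIRTUAL TABLE t USING fts5(
        text,
        tokenize='porter unicode61 remove_diacritics 2'
    )`);
    db.prepare("INSERT INTO t(text) VALUES (?)").run("Crème brûlée recipes");

    // Quote the term the way fts5_escape() does, then count matches.
    const hits = (term: string) =>
        db
            .prepare("SELECT count(*) AS n FROM t WHERE t MATCH ?")
            .get(`"${term.replaceAll('"', '""')}"`);

    console.log(hits("creme"));  // { n: 1 } -- remove_diacritics folds è to e
    console.log(hits("recipe")); // { n: 1 } -- porter stems "recipes" to match "recipe"
    console.log(hits("%"));      // { n: 0 } -- punctuation never becomes a token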