fleshing out readme and working on table scan test

This commit is contained in:
forest 2022-12-31 19:54:12 -06:00
parent 64b0bbc4bc
commit 4a23f8a26e
11 changed files with 377 additions and 3 deletions

View file

@ -8,8 +8,55 @@ The problem at hand:
Matrix-synapse stores a lot of data that it has no way of cleaning up or deleting.
Specifically, there is a table it creates in the database called `state_groups_state`. I don't understand what this table is for, however, I can recognize fairly easily that it accounts for the grand majority of the disk space bloat in matrix-synapse:
Specifically, there is a table it creates in the database called `state_groups_state`:
```
root@matrix:~# sudo -u postgres pg_dump synapse -t state_groups_state --schema-only
--
-- PostgreSQL database dump
--
...
CREATE TABLE public.state_groups_state (
state_group bigint NOT NULL,
room_id text NOT NULL,
type text NOT NULL,
state_key text NOT NULL,
event_id text NOT NULL
);
```
I don't understand what this table is for; however, I can easily see that it accounts for the vast majority of the disk space bloat of a matrix-synapse instance:
#### top 10 tables by disk space used, cyberia.club instance:
![a pie chart showing state_groups_state using 87% of the disk space](readme/state_groups_state.png)
So, I think it's safe to say that if we can cut down the size of `state_groups_state`, then we can solve our disk space issues.
I know that there are other projects dedicated to this, like https://github.com/matrix-org/rust-synapse-compress-state
However, a cursory examination of the data in `state_groups_state` led me to believe maybe there is an easier and better way.
`state_groups_state` _DOES_ have a `room_id` column on it. It's not _indexed_ by `room_id`, but we can still count the # of rows for each room and rank them:
#### top 100 rooms by number of `state_groups_state` rows, cyberia.club instance:
![a pie chart with two slices taking up about 2 thirds of the pie, and the remaining third taken up mostly by the next 8 slices](readme/top100rooms.png)
In summary, it looks like
> **about 90% of the disk space used by matrix-synapse is in `state_groups_state`, and about 90% of the rows in `state_groups_state` come from just a handful of rooms**.
So from this information we have hatched a plan:
> _Just delete those rooms from our homeserver ![4head](readme/4head.png)_
However, unfortunately the [matrix-synapse delete room API](https://matrix-org.github.io/synapse/latest/admin_api/rooms.html#version-2-new-version) does not remove anything from `state_groups_state`.
This is similar to the way that the [matrix-synapse message retention policies](https://github.com/matrix-org/synapse/blob/develop/docs/message_retention_policies.md) also do not remove anything from `state_groups_state`.
In fact, this probably helps explain why `state_groups_state` gets hundreds of millions of rows and takes up so much disk space: Nothing ever deletes from it!!
#### top 10 tables by disk space used:
![a pie chart showing state_groups_state using 87% of the disk space](state_groups_state.png)

4
config.json Normal file
View file

@ -0,0 +1,4 @@
{
"DatabaseType": "postgres",
"DatabaseConnectionString": "host=localhost port=5432 user=postgres password=dev database=synapse sslmode=disable"
}

88
db_model.go Normal file
View file

@ -0,0 +1,88 @@
package main
import (
"database/sql"
"log"
errors "git.sequentialread.com/forest/pkg-errors"
_ "github.com/lib/pq"
)
// DBModel wraps the database handle that the query methods in this file run against.
type DBModel struct {
// DB is the connection pool created by initDatabase via sql.Open.
DB *sql.DB
}
// StateGroupsStateStream is the result of starting a full scan of the
// state_groups_state table: a row-count estimate plus the channel the rows
// arrive on.
type StateGroupsStateStream struct {
// EstimatedCount is the row estimate read from pg_class.reltuples when the
// scan was started; it is an estimate, not an exact count.
EstimatedCount int
// Channel delivers one StateGroupsStateRow per scanned row and is closed by
// the producing goroutine once the scan has finished.
Channel chan StateGroupsStateRow
}
// StateGroupsStateRow holds the values of one row of the state_groups_state table.
type StateGroupsStateRow struct {
// StateGroup is the state_group column.
StateGroup int64
// Type is the type column.
Type string
// StateKey is the state_key column.
StateKey string
// RoomID is the room_id column.
RoomID string
// EventId corresponds to the event_id column.
// NOTE(review): the SELECT in DBModel.StateGroupsStateStream does not fetch
// event_id, so this field is left empty by that scan.
EventId string
}
// initDatabase opens the database described by config and confirms that it is
// actually reachable before returning a DBModel wrapping the handle.
//
// Any failure is fatal: the error is logged and the process exits.
func initDatabase(config *Config) *DBModel {
	db, openErr := sql.Open(config.DatabaseType, config.DatabaseConnectionString)
	if openErr != nil {
		log.Fatal(openErr)
	}

	// sql.Open may only validate its arguments without connecting, so force a
	// round trip to the server to surface connection problems right away.
	pingErr := db.Ping()
	if pingErr != nil {
		log.Fatalf("failed to open database connection: %+v", pingErr)
	}

	model := new(DBModel)
	model.DB = db
	return model
}
// StateGroupsStateStream starts a full scan of the state_groups_state table and
// streams its rows over a channel.
//
// It first reads the planner's row estimate (pg_class.reltuples) so callers can
// report progress without running a COUNT(*), then issues a single SELECT over
// the whole table and hands the result set to a background goroutine. That
// goroutine sends one StateGroupsStateRow per successfully scanned row on the
// returned stream's Channel and closes the Channel when iteration ends, so
// callers can simply range over it. The event_id column is not selected, so
// EventId is empty on every row.
//
// A non-nil error is returned only when the estimate query or the SELECT
// cannot be started. Failures that occur later, while rows are being streamed,
// are logged by the goroutine.
func (model *DBModel) StateGroupsStateStream() (*StateGroupsStateStream, error) {
	var estimatedCount int
	err := model.DB.QueryRow(`
		SELECT reltuples::bigint FROM pg_class WHERE oid = 'public.state_groups_state'::regclass;
	`).Scan(&estimatedCount)
	if err != nil {
		return nil, errors.Wrap(err, "could not get estimated row count of state_groups_state")
	}

	rows, err := model.DB.Query("SELECT state_group, type, state_key, room_id FROM state_groups_state")
	if err != nil {
		return nil, errors.Wrap(err, "could not select from state_groups_state")
	}

	toReturn := StateGroupsStateStream{
		EstimatedCount: estimatedCount,
		// The buffer lets the database reader run ahead of the consumer.
		Channel: make(chan StateGroupsStateRow, 10000),
	}

	go func(rows *sql.Rows, channel chan<- StateGroupsStateRow) {
		// Deferred calls run last-in-first-out: the result set is released
		// first, then the channel is closed to end the consumer's range loop.
		defer close(channel)
		defer rows.Close()

		for rows.Next() {
			var row StateGroupsStateRow
			if err := rows.Scan(&row.StateGroup, &row.Type, &row.StateKey, &row.RoomID); err != nil {
				// A row that cannot be decoded is skipped, not fatal.
				log.Printf("error scanning a state_groups_state row: %s \n", err)
				continue
			}
			channel <- row
		}

		// rows.Next returns false both at the end of the result set and when
		// iteration fails (for example on a dropped connection). Without this
		// check an aborted scan would look like a complete one.
		if err := rows.Err(); err != nil {
			log.Printf("error iterating over state_groups_state rows, results are incomplete: %s \n", err)
		}
	}(rows, toReturn.Channel)

	return &toReturn, nil
}

7
go.mod
View file

@ -1,3 +1,10 @@
module git.cyberia.club/cyberia/matrix-synapse-state-groups-state-janitor
go 1.19
require (
git.sequentialread.com/forest/config-lite v0.0.0-20220225195944-164dc71bce04
git.sequentialread.com/forest/pkg-errors v0.9.2
github.com/lib/pq v1.10.7
github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c // indirect
)

8
go.sum Normal file
View file

@ -0,0 +1,8 @@
git.sequentialread.com/forest/config-lite v0.0.0-20220225195944-164dc71bce04 h1:FmvQmRJzAgbCc/4qfECAluzd+oVBzXNJMjyLQTJ4Wq0=
git.sequentialread.com/forest/config-lite v0.0.0-20220225195944-164dc71bce04/go.mod h1:jaNfZ5BXx8OsKVZ6FuN0Lr/gIeEwbTNNHSO4RpFz6qo=
git.sequentialread.com/forest/pkg-errors v0.9.2 h1:j6pwbL6E+TmE7TD0tqRtGwuoCbCfO6ZR26Nv5nest9g=
git.sequentialread.com/forest/pkg-errors v0.9.2/go.mod h1:8TkJ/f8xLWFIAid20aoqgDZcCj9QQt+FU+rk415XO1w=
github.com/lib/pq v1.10.7 h1:p7ZhMD+KsSRozJr34udlUrhboJwWAgCg34+/ZZNvZZw=
github.com/lib/pq v1.10.7/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c h1:HelZ2kAFadG0La9d+4htN4HzQ68Bm2iM9qKMSMES6xg=
github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c/go.mod h1:JlzghshsemAMDGZLytTFY8C1JQxQPhnatWqNwUXjggo=

57
main.go Normal file
View file

@ -0,0 +1,57 @@
package main
import (
"encoding/json"
"log"
"reflect"
"time"
configlite "git.sequentialread.com/forest/config-lite"
)
// Config holds the settings that main loads through configlite.ReadConfiguration
// (from config.json; presumably also overridable through the environment or
// command line — verify against the config-lite documentation).
type Config struct {
// DatabaseType is handed to sql.Open as the driver name, e.g. "postgres".
DatabaseType string
// DatabaseConnectionString is handed to sql.Open as the data source name.
DatabaseConnectionString string
}
// main loads the configuration, streams every row of state_groups_state, counts
// the rows per room, and finally logs the per-room counts as indented JSON.
//
// While scanning it logs a progress line at most about once per second, based
// on the row estimate supplied with the stream.
func main() {
	config := Config{}
	ignoreCommandlineFlags := []string{}
	err := configlite.ReadConfiguration("config.json", "JANITOR", ignoreCommandlineFlags, reflect.ValueOf(&config))
	if err != nil {
		panic(err)
	}

	db := initDatabase(&config)

	stream, err := db.StateGroupsStateStream()
	if err != nil {
		log.Fatalf("Can't start because %+v\n", err)
	}

	lastUpdateTime := time.Now()
	updateCounter := 0
	rowCounter := 0
	rowCountByRoom := map[string]int{}

	for row := range stream.Channel {
		rowCountByRoom[row.RoomID]++
		updateCounter++
		rowCounter++

		// Only consult the clock once every 10000+ rows to keep the hot loop cheap.
		if updateCounter > 10000 {
			updateCounter = 0
			if time.Since(lastUpdateTime) > time.Second {
				// Remember when we last reported; otherwise the throttle would
				// stop working after the first second.
				lastUpdateTime = time.Now()

				// The estimate can be 0 or negative when the table has no
				// statistics yet; avoid dividing by it in that case.
				percent := 0
				if stream.EstimatedCount > 0 {
					percent = int((float64(rowCounter) / float64(stream.EstimatedCount)) * float64(100))
				}
				log.Printf("%d/%d (%d%s) ... \n", rowCounter, stream.EstimatedCount, percent, "%")
			}
		}
	}

	output, err := json.MarshalIndent(rowCountByRoom, "", " ")
	if err != nil {
		log.Fatalf("Can't display output because json.MarshalIndent returned %+v\n", err)
	}
	log.Println(string(output))
}

BIN
readme/4head.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 876 B

View file

@ -0,0 +1,163 @@
<div style="width:700px; height:700px;">
<canvas id="myChart"></canvas>
</div>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script>
// The <canvas id="myChart"> element that the Chart.js chart below is drawn on.
const ctx = document.getElementById('myChart');
/*
The following comes from the output of this query:
sudo -u postgres psql -d synapse -c "select room_id, count(*) from state_groups_state group by room_id order by count(*) DESC LIMIT 100" | tee /root/large-rooms.txt
*/
const roomSizeOutput = ` NlinI | 450657231
xPlvG | 389135018
ndyAb | 101706083
dUtxN | 37545025
NmgkA | 24989806
grOTr | 21818353
WAdLC | 16239804
sfQtQ | 14144383
qSUvX | 13388996
firtm | 10151194
DGslq | 9353566
kVEqw | 8354595
EYXdr | 7548122
tkMlP | 5122957
TUiY1 | 3726174
NQAlm | 3464720
AdWuy | 2808438
RkUcr | 2782375
AKJwK | 2734718
VwKfl | 2686841
CURkx | 2155982
VifHH | 2049300
MBCzT | 1947499
zbSMK | 1925341
eqlZc | 1685376
uZFDt | 1262008
WghOY | 1059302
iaFrB | 967741
hWNAS | 935235
fCLCt | 919655
fXSjB | 791741
uHYso | 743723
FEKtB | 723364
zcQwy | 638365
AdphE | 628321
gIhuF | 608747
CCXyQ | 580139
cWoZo | 530707
RiQky | 422555
QLScq | 379495
yTEzA | 367794
GTcBD | 357648
epvwS | 335763
FXtQc | 329364
uNxgS | 317571
NBoXO | 304640
NXmXu | 296147
qdIqF | 287192
ping-v6 | 284514
tPcnH | 269458
ndtvS | 268087
okEaS | 252121
QTpPW | 252050
FwgFX | 247442
QPhnk | 241355
rvmur | 239737
MyYoQ | 237460
GAnDi | 216717
PJwYc | 207801
zVoJs | 205413
MPQht | 204009
EhDPW | 197840
DNSWA | 183426
WdDwd | 173816
mvrdC | 162345
FeiZg | 160574
MuqXb | 148981
jrFel | 131865
MacrE | 127906
xJAJX | 125590
NqqGH | 122167
vZTkJ | 119077
OwxSP | 118159
cZrMe | 112115
SooHf | 112057
Nllau | 109477
swJuG | 101739
WBztC | 101330
lGTHN | 96961
OSwqD | 94302
GBYQw | 93689
FRcyQ | 92222
kfUkX | 90445
xnjXI | 88150
hZdVI | 87030
sHavD | 82696
dxrXz | 76413
yxxkg | 75466
APKBt | 74789
Ztpai | 72639
bcfmb | 71595
BPbab | 70127
GWmHb | 66293
OzCDR | 65822
YlhCi | 64625
lXVii | 64562
NnwbP | 62317
jNOIF | 59045
Iojqe | 56746
tbMgT | 54567`;
// Turn each "room_id | count" line of the pasted psql output into an object
// holding a shortened room label and the numeric row count.
const rooms = roomSizeOutput.split("\n").map(line => {
  const [idColumn, countColumn] = line.split("|").map(cell => cell.trim());
  return {
    // Drop the first 15 characters of the id column to shorten the label.
    roomid: idColumn.slice(15),
    numRows: Number(countColumn),
  };
});

// Largest rooms first.
rooms.sort((left, right) => right.numRows - left.numRows);
// // this was pulled from tablesize-chartjs.html row count for state_groups_state
// const totalRowCount = 1104920600;
// const otherRoomsRowCount = totalRowCount - rooms.reduce((accumulator, room) => accumulator + room.numRows, 0)
// rooms.push({
// roomid: "others",
// numRows: otherRoomsRowCount
// })
// Doughnut chart with one slice per room, sized by that room's number of
// state_groups_state rows.
new Chart(ctx, {
  type: 'doughnut',
  data: {
    labels: rooms.map(room => room.roomid),
    datasets: [{
      // The plotted values are raw row counts, so label them as such
      // (the previous 'filesize %' label did not match the data).
      label: 'state_groups_state rows',
      //data: rooms.map(room => Math.round((room.numRows/totalRowCount)*100)),
      data: rooms.map(room => room.numRows),
      borderWidth: 2
    }]
  },
  options: {
  }
});
</script>

View file

Before

Width:  |  Height:  |  Size: 36 KiB

After

Width:  |  Height:  |  Size: 36 KiB

BIN
readme/top100rooms.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB