diff --git a/cmd/doctor_convert.go b/cmd/doctor_convert.go index 2385f23e52193..af4980ef8b7f4 100644 --- a/cmd/doctor_convert.go +++ b/cmd/doctor_convert.go @@ -37,11 +37,16 @@ func runDoctorConvert(ctx *cli.Context) error { switch { case setting.Database.Type.IsMySQL(): - if err := db.ConvertUtf8ToUtf8mb4(); err != nil { - log.Fatal("Failed to convert database from utf8 to utf8mb4: %v", err) + charset, collation, err := db.GetDesiredCharsetAndCollation() + if err != nil { + log.Fatal("Failed to determine the desired database charset or collation: %v", err) return err } - fmt.Println("Converted successfully, please confirm your database's character set is now utf8mb4") + if err := db.ConvertCharsetAndCollation(charset, collation); err != nil { + log.Fatal("Failed to convert database from utf8 to %s: %v", charset, err) + return err + } + fmt.Printf("Converted successfully, please confirm your database's character set is now %s, and collation is set to %s\n", charset, collation) case setting.Database.Type.IsMSSQL(): if err := db.ConvertVarcharToNVarchar(); err != nil { log.Fatal("Failed to convert database from varchar to nvarchar: %v", err) diff --git a/cmd/web.go b/cmd/web.go index 01386251becfa..afc11ebee1839 100644 --- a/cmd/web.go +++ b/cmd/web.go @@ -15,6 +15,7 @@ import ( _ "net/http/pprof" // Used for debugging if enabled and a web server is running + "code.gitea.io/gitea/models/db" "code.gitea.io/gitea/modules/container" "code.gitea.io/gitea/modules/graceful" "code.gitea.io/gitea/modules/log" @@ -193,6 +194,10 @@ func serveInstalled(ctx *cli.Context) error { routers.InitWebInstalled(graceful.GetManager().HammerContext()) + if err := db.SanityCheck(); err != nil { + log.Warn("database sanity check warning: %s", err) + } + // We check that AppDataPath exists here (it should have been created during installation) // We can't check it in `InitWebInstalled`, because some integration tests // use cmd -> InitWebInstalled, but the AppDataPath doesn't exist during 
those tests. diff --git a/docs/content/help/faq.en-us.md b/docs/content/help/faq.en-us.md index e6350936ef816..ec3027a5e34c8 100644 --- a/docs/content/help/faq.en-us.md +++ b/docs/content/help/faq.en-us.md @@ -385,10 +385,12 @@ Unfortunately MySQL's `utf8` charset does not completely allow all possible UTF- They created a new charset and collation called `utf8mb4` that allows for emoji to be stored but tables which use the `utf8` charset, and connections which use the `utf8` charset will not use this. -Please run `gitea doctor convert`, or run `ALTER DATABASE database_name CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;` -for the database_name and run `ALTER TABLE table_name CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;` +Please run `gitea doctor convert`, or run `ALTER DATABASE database_name CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;` +for the database_name and run `ALTER TABLE table_name CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;` for each table in the database. +The most appropriate collation depends on which database you use: for MySQL it is `utf8mb4_0900_as_cs`, and for MariaDB it is `uca1400_as_cs`. Both also support `utf8mb4_bin`, which is the common ground between them. `gitea doctor convert` will choose the best one for you automatically. + ## Why are Emoji displaying only as placeholders or in monochrome Gitea requires the system or browser to have one of the supported Emoji fonts installed, which are Apple Color Emoji, Segoe UI Emoji, Segoe UI Symbol, Noto Color Emoji and Twemoji Mozilla. Generally, the operating system should already provide one of these fonts, but especially on Linux, it may be necessary to install them manually. 
diff --git a/docs/content/installation/database-preparation.en-us.md b/docs/content/installation/database-preparation.en-us.md index 5e0b94665ff41..bb0fefbe56844 100644 --- a/docs/content/installation/database-preparation.en-us.md +++ b/docs/content/installation/database-preparation.en-us.md @@ -61,13 +61,13 @@ Note: All steps below requires that the database engine of your choice is instal Replace username and password above as appropriate. -4. Create database with UTF-8 charset and collation. Make sure to use `utf8mb4` charset instead of `utf8` as the former supports all Unicode characters (including emojis) beyond _Basic Multilingual Plane_. Also, collation chosen depending on your expected content. When in doubt, use either `unicode_ci` or `general_ci`. +4. Create database with UTF-8 charset and collation. Make sure to use `utf8mb4` charset instead of `utf8` as the former supports all Unicode characters (including emojis) beyond _Basic Multilingual Plane_. Also, choose the collation depending on your expected content (such as `utf8mb4_0900_as_cs` for MySQL, `uca1400_as_cs` for MariaDB, or `utf8mb4_bin`, which works for both). When in doubt, leave it unset, and Gitea will adjust the database to use the most fitting one. ```sql - CREATE DATABASE giteadb CHARACTER SET 'utf8mb4' COLLATE 'utf8mb4_unicode_ci'; + CREATE DATABASE giteadb CHARACTER SET 'utf8mb4' COLLATE 'utf8mb4_bin'; ``` - Replace database name as appropriate. + Replace the database name and the collation as appropriate. 5. Grant all privileges on the database to database user created above. 
diff --git a/models/db/convert.go b/models/db/convert.go
index 112c8575ca2c7..0abe435cb9ee5 100644
--- a/models/db/convert.go
+++ b/models/db/convert.go
@@ -15,12 +15,12 @@ import (
 )
 
-// ConvertUtf8ToUtf8mb4 converts database and tables from utf8 to utf8mb4 if it's mysql and set ROW_FORMAT=dynamic
-func ConvertUtf8ToUtf8mb4() error {
+// ConvertCharsetAndCollation converts the database and its tables to the given charset and collation if it's mysql and set ROW_FORMAT=dynamic
+func ConvertCharsetAndCollation(charset, collation string) error {
 	if x.Dialect().URI().DBType != schemas.MYSQL {
 		return nil
 	}
 
-	_, err := x.Exec(fmt.Sprintf("ALTER DATABASE `%s` CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci", setting.Database.Name))
+	_, err := x.Exec(fmt.Sprintf("ALTER DATABASE `%s` CHARACTER SET `%s` COLLATE `%s`", setting.Database.Name, charset, collation))
 	if err != nil {
 		return err
 	}
@@ -34,7 +34,7 @@ func ConvertUtf8ToUtf8mb4() error {
 			return err
 		}
 
-		if _, err := x.Exec(fmt.Sprintf("ALTER TABLE `%s` CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;", table.Name)); err != nil {
+		if _, err := x.Exec(fmt.Sprintf("ALTER TABLE `%s` CONVERT TO CHARACTER SET `%s` COLLATE `%s`", table.Name, charset, collation)); err != nil {
 			return err
 		}
 	}
diff --git a/models/db/engine.go b/models/db/engine.go
index 182d8cd993696..dfe39b24214e4 100755
--- a/models/db/engine.go
+++ b/models/db/engine.go
@@ -150,6 +150,140 @@ func InitEngine(ctx context.Context) error {
 	return nil
 }
 
+// findCaseSensitiveCollation returns the case-sensitive collation to use by
+// default, chosen from the edition reported by x.DBVersion(): "uca1400_as_cs"
+// for MariaDB and "utf8mb4_0900_as_cs" otherwise. It returns an empty string
+// when the dialect is not MySQL.
+//
+// NOTE(review): these collations presumably need MariaDB 10.10+ / MySQL 8.0+;
+// confirm whether older servers must fall back to utf8mb4_bin.
+func findCaseSensitiveCollation() (string, error) {
+	if x.Dialect().URI().DBType != schemas.MYSQL {
+		return "", nil
+	}
+
+	v, err := x.DBVersion()
+	if err != nil {
+		// Propagate the failure instead of reporting an empty collation:
+		// strings.HasSuffix(s, "") is always true, so SanityCheck would
+		// silently accept any collation.
+		return "", err
+	}
+
+	if v.Edition == "MariaDB" {
+		return "uca1400_as_cs", nil
+	}
+	return "utf8mb4_0900_as_cs", nil
+}
+
+// GetDesiredCharsetAndCollation returns the charset and collation that the
+// database and its tables are expected to use. setting.Database.DefaultCharset
+// and setting.Database.DefaultCollation take precedence when set; otherwise
+// the charset is "utf8mb4" and the collation comes from
+// findCaseSensitiveCollation. Both values are empty when the dialect is not
+// MySQL.
+func GetDesiredCharsetAndCollation() (string, string, error) {
+	if x.Dialect().URI().DBType != schemas.MYSQL {
+		return "", "", nil
+	}
+
+	charset := setting.Database.DefaultCharset
+	if charset == "" {
+		charset = "utf8mb4"
+	}
+
+	collation := setting.Database.DefaultCollation
+	if collation == "" {
+		var err error
+		if collation, err = findCaseSensitiveCollation(); err != nil {
+			return "", "", err
+		}
+	}
+
+	return charset, collation, nil
+}
+
+// querySingleString runs a query that selects one column and returns that
+// column's value. An empty result is reported as an error so that callers
+// never index into a slice that was left unpopulated.
+func querySingleString(query string, args ...any) (string, error) {
+	var values []string
+	has, err := x.SQL(query, args...).Get(&values)
+	if err != nil {
+		return "", err
+	}
+	if !has || len(values) == 0 {
+		return "", fmt.Errorf("no result for query %q (arguments: %v)", query, args)
+	}
+	return values[0], nil
+}
+
+// SanityCheck verifies, for MySQL only, that the database and every table
+// reported by x.DBMetas() use the charset and collation returned by
+// GetDesiredCharsetAndCollation. It returns an error describing the first
+// mismatch (or the first failed lookup) and nil otherwise.
+func SanityCheck() error {
+	// We do not have any sanity checks for engines other than MySQL
+	if !setting.Database.Type.IsMySQL() {
+		return nil
+	}
+
+	expectedCharset, expectedCollation, err := GetDesiredCharsetAndCollation()
+	if err != nil {
+		return err
+	}
+
+	// check that the database collation is set to a case sensitive one.
+	collation, err := querySingleString("SELECT default_collation_name FROM information_schema.schemata WHERE schema_name = ?",
+		setting.Database.Name)
+	if err != nil {
+		return err
+	}
+	// For mariadb, when we set the collation to uca1400_as_cs, that is
+	// translated to utf8mb4_uca1400_as_cs, hence the suffix check.
+	if !strings.HasSuffix(collation, expectedCollation) {
+		return fmt.Errorf(`database collation ("%s") is not %s. Consider running "gitea doctor convert"`, collation, expectedCollation)
+	}
+
+	// check the database character set
+	charset, err := querySingleString("SELECT default_character_set_name FROM information_schema.schemata WHERE schema_name = ?",
+		setting.Database.Name)
+	if err != nil {
+		return err
+	}
+	if charset != expectedCharset {
+		return fmt.Errorf(`database charset ("%s") is not %s. Consider running "gitea doctor convert"`, charset, expectedCharset)
+	}
+
+	// check table collations and character sets
+	tables, err := x.DBMetas()
+	if err != nil {
+		return err
+	}
+	for _, table := range tables {
+		tableCharset, err := querySingleString("SELECT CCSA.character_set_name FROM information_schema.tables T, information_schema.collation_character_set_applicability CCSA WHERE CCSA.collation_name = T.table_collation AND T.table_schema = ? AND T.table_name = ?",
+			setting.Database.Name, table.Name)
+		if err != nil {
+			return err
+		}
+		if tableCharset != expectedCharset {
+			return fmt.Errorf(`table charset for '%s' (%s) is not %s. Consider running "gitea doctor convert"`, table.Name, tableCharset, expectedCharset)
+		}
+
+		tableCollation, err := querySingleString("SELECT CCSA.collation_name FROM information_schema.tables T, information_schema.collation_character_set_applicability CCSA WHERE CCSA.collation_name = T.table_collation AND T.table_schema = ? AND T.table_name = ?",
+			setting.Database.Name, table.Name)
+		if err != nil {
+			return err
+		}
+		if !strings.HasSuffix(tableCollation, expectedCollation) {
+			return fmt.Errorf(`table collation for '%s' (%s) is not %s. Consider running "gitea doctor convert"`, table.Name, tableCollation, expectedCollation)
+		}
+	}
+
+	// if all is well, return without an error
+	return nil
+}
+
 // SetDefaultEngine sets the default engine for db
 func SetDefaultEngine(ctx context.Context, eng *xorm.Engine) {
 	x = eng
@@ -185,6 +319,29 @@ func InitEngineWithMigration(ctx context.Context, migrateFunc func(*xorm.Engine)
 		return err
 	}
 
+	// If we're using MySQL, and there are no tables, set the database charset
+	// and collation to the desired ones. This will help cases where the
+	// database is created automatically, and with the wrong settings (such as
+	// when using the official mysql/mariadb container images).
+	if x.Dialect().URI().DBType == schemas.MYSQL {
+		tables, err := x.DBMetas()
+		if err != nil {
+			return err
+		}
+
+		if len(tables) == 0 {
+			charset, collation, err := GetDesiredCharsetAndCollation()
+			if err != nil {
+				return err
+			}
+
+			_, err = x.Exec(fmt.Sprintf("ALTER DATABASE `%s` DEFAULT CHARACTER SET `%s` COLLATE `%s`", setting.Database.Name, charset, collation))
+			if err != nil {
+				return err
+			}
+		}
+	}
+
 	// We have to run migrateFunc here in case the user is re-running installation on a previously created DB.
// If we do not then table schemas will be changed and there will be conflicts when the migrations run properly. // diff --git a/models/git/branch.go b/models/git/branch.go index ffd1d7ed164a0..9b3a4dc99be32 100644 --- a/models/git/branch.go +++ b/models/git/branch.go @@ -103,7 +103,7 @@ func (err ErrBranchesEqual) Unwrap() error { type Branch struct { ID int64 RepoID int64 `xorm:"UNIQUE(s)"` - Name string `xorm:"UNIQUE(s) NOT NULL"` // git's ref-name is case-sensitive internally, however, in some databases (mssql, mysql, by default), it's case-insensitive at the moment + Name string `xorm:"UNIQUE(s) NOT NULL"` CommitID string CommitMessage string `xorm:"TEXT"` // it only stores the message summary (the first line) PusherID int64 diff --git a/modules/setting/database.go b/modules/setting/database.go index b68f250f787ef..bd401f4e071fb 100644 --- a/modules/setting/database.go +++ b/modules/setting/database.go @@ -34,7 +34,6 @@ var ( SSLMode string Path string LogSQL bool - MysqlCharset string Timeout int // seconds SQLiteJournalMode string DBConnectRetries int @@ -44,6 +43,8 @@ var ( ConnMaxLifetime time.Duration IterateBufferSize int AutoMigration bool + DefaultCharset string + DefaultCollation string }{ Timeout: 500, IterateBufferSize: 50, @@ -67,7 +68,6 @@ func loadDBSetting(rootCfg ConfigProvider) { } Database.Schema = sec.Key("SCHEMA").String() Database.SSLMode = sec.Key("SSL_MODE").MustString("disable") - Database.MysqlCharset = sec.Key("MYSQL_CHARSET").MustString("utf8mb4") // do not document it, end users won't need it. 
Database.Path = sec.Key("PATH").MustString(filepath.Join(AppDataPath, "gitea.db")) Database.Timeout = sec.Key("SQLITE_TIMEOUT").MustInt(500) @@ -86,6 +86,9 @@ func loadDBSetting(rootCfg ConfigProvider) { Database.DBConnectRetries = sec.Key("DB_RETRIES").MustInt(10) Database.DBConnectBackoff = sec.Key("DB_RETRY_BACKOFF").MustDuration(3 * time.Second) Database.AutoMigration = sec.Key("AUTO_MIGRATION").MustBool(true) + + Database.DefaultCharset = sec.Key("DEFAULT_CHARSET").String() + Database.DefaultCollation = sec.Key("DEFAULT_COLLATION").String() } // DBConnStr returns database connection string @@ -105,8 +108,8 @@ func DBConnStr() (string, error) { if tls == "disable" { // allow (Postgres-inspired) default value to work in MySQL tls = "false" } - connStr = fmt.Sprintf("%s:%s@%s(%s)/%s%scharset=%s&parseTime=true&tls=%s", - Database.User, Database.Passwd, connType, Database.Host, Database.Name, paramSep, Database.MysqlCharset, tls) + connStr = fmt.Sprintf("%s:%s@%s(%s)/%s%sparseTime=true&tls=%s", + Database.User, Database.Passwd, connType, Database.Host, Database.Name, paramSep, tls) case "postgres": connStr = getPostgreSQLConnectionString(Database.Host, Database.User, Database.Passwd, Database.Name, Database.SSLMode) case "mssql": diff --git a/tests/integration/collate_test.go b/tests/integration/collate_test.go new file mode 100644 index 0000000000000..1295f2e4b7001 --- /dev/null +++ b/tests/integration/collate_test.go @@ -0,0 +1,138 @@ +// Copyright 2023 The Gitea Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT
+
+package integration
+
+import (
+	"fmt"
+	"net/http"
+	"testing"
+
+	auth_model "code.gitea.io/gitea/models/auth"
+	"code.gitea.io/gitea/models/db"
+	repo_model "code.gitea.io/gitea/models/repo"
+	"code.gitea.io/gitea/models/unittest"
+	user_model "code.gitea.io/gitea/models/user"
+	"code.gitea.io/gitea/modules/setting"
+	api "code.gitea.io/gitea/modules/structs"
+	"code.gitea.io/gitea/tests"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestMySQLCollate(t *testing.T) {
+	// This test is only for MySQL; skip it on any other engine.
+	if !setting.Database.Type.IsMySQL() {
+		t.Skip()
+	}
+
+	defer tests.PrepareTestEnv(t)()
+
+	// Helpers shared by the subtests below
+	loadProps := func() (*repo_model.Repository, *user_model.User, string) {
+		repo := unittest.AssertExistsAndLoadBean(t, &repo_model.Repository{ID: 2})
+		owner := unittest.AssertExistsAndLoadBean(t, &user_model.User{ID: repo.OwnerID})
+		session := loginUser(t, owner.Name)
+		token := getTokenForLoggedInUser(t, session, auth_model.AccessTokenScopeWriteIssue)
+
+		return repo, owner, token
+	}
+
+	breakCollation := func() {
+		err := db.ConvertCharsetAndCollation("utf8mb4", "utf8mb4_general_ci")
+		assert.NoError(t, err)
+	}
+	fixCollation := func() {
+		charset, collation, err := db.GetDesiredCharsetAndCollation()
+		assert.NoError(t, err)
+		err = db.ConvertCharsetAndCollation(charset, collation)
+		assert.NoError(t, err)
+	}
+
+	t.Run("Collation fixing", func(t *testing.T) {
+		defer tests.PrintCurrentTest(t)()
+
+		// Ensure that the database uses the wrong (case-insensitive utf8mb4_general_ci) collation
+		breakCollation()
+
+		// With the wrong collation, sanity checking fails
+		err := db.SanityCheck()
+		assert.Error(t, err)
+
+		// Convert the database to the desired charset and collation
+		fixCollation()
+
+		// Sanity checking works after the collation update
+		err = db.SanityCheck()
+		assert.NoError(t, err)
+	})
+
+	t.Run("Case sensitive issue search by label", func(t *testing.T) {
+		defer tests.PrintCurrentTest(t)()
+
+		assert.NoError(t, unittest.LoadFixtures())
+
+		// Helpers that drive the API to create labels and issues and to search issues
+		createLabel := func(name string) int64 {
+			repo, owner, token := loadProps()
+			urlStr := fmt.Sprintf("/api/v1/repos/%s/%s/labels", owner.Name, repo.Name)
+
+			// CreateLabel
+			req := NewRequestWithJSON(t, "POST", urlStr, &api.CreateLabelOption{
+				Name:        name,
+				Color:       "abcdef",
+				Description: "test label",
+			}).AddTokenAuth(token)
+			resp := MakeRequest(t, req, http.StatusCreated)
+			apiLabel := new(api.Label)
+			DecodeJSON(t, resp, &apiLabel)
+			return apiLabel.ID
+		}
+
+		createIssue := func(title string, labelID int64) {
+			repo, owner, token := loadProps()
+			urlStr := fmt.Sprintf("/api/v1/repos/%s/%s/issues", owner.Name, repo.Name)
+
+			// CreateIssue
+			req := NewRequestWithJSON(t, "POST", urlStr, &api.CreateIssueOption{
+				Title:  title,
+				Labels: []int64{labelID},
+			}).AddTokenAuth(token)
+			MakeRequest(t, req, http.StatusCreated)
+		}
+
+		searchIssues := func(label string) []*api.Issue {
+			_, _, token := loadProps()
+			var apiIssues []*api.Issue
+
+			urlStr := fmt.Sprintf("/api/v1/repos/issues/search?labels=%s", label)
+			req := NewRequest(t, "GET", urlStr).AddTokenAuth(token)
+			resp := MakeRequest(t, req, http.StatusOK)
+
+			DecodeJSON(t, resp, &apiIssues)
+			return apiIssues
+		}
+
+		// Ensure that the database uses the wrong (case-insensitive utf8mb4_general_ci) collation
+		breakCollation()
+
+		// Create two labels that differ in case only
+		labelID1 := createLabel("case-sens")
+		labelID2 := createLabel("Case-Sens")
+
+		// Create two issues, one with each of the labels above
+		createIssue("case-sens 1", labelID1)
+		createIssue("case-sens 2", labelID2)
+
+		// Search for 'case-sens', and expect two results (`case-sens` and `Case-Sens`)
+		issues := searchIssues("case-sens")
+		assert.Len(t, issues, 2)
+
+		// Update the collation
+		fixCollation()
+
+		// Search for 'case-sens', and expect only one result now.
+		issues = searchIssues("case-sens")
+		assert.Len(t, issues, 1)
+	})
+}