From ad6d7d733bbf3f2f12f3774a5fb22d9914586b24 Mon Sep 17 00:00:00 2001
From: Peter Sanchez
Date: Wed, 26 Feb 2025 19:03:49 -0600
Subject: [PATCH] Add utf8 sanitizer to address edge case import errors.

Fixes: https://todo.code.netlandish.com/~netlandish/links/96
Signed-off-by: Peter Sanchez
---
 core/import.go |  5 +++--
 helpers.go     | 21 +++++++++++++++++++++
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/core/import.go b/core/import.go
index 1671f47..ae8e14e 100644
--- a/core/import.go
+++ b/core/import.go
@@ -114,6 +114,7 @@ func (p pinBoardObj) GetTags() []string {
 func trimTags(tags []string) []string {
 	var ret []string
 	for _, t := range tags {
+		t = links.SanitizeUTF8(t)
 		if len(t) > 50 {
 			t = t[:50]
 		}
@@ -300,9 +301,9 @@ func processOrgLinks(obj importObj, baseURLMap map[string]int,
 		title = title[:146] + "..."
 	}
 	return &models.OrgLink{
-		Title:       title,
+		Title:       links.SanitizeUTF8(title),
 		URL:         obj.GetURL(),
-		Description: obj.GetDescription(),
+		Description: links.SanitizeUTF8(obj.GetDescription()),
 		BaseURLID:   sql.NullInt64{Valid: true, Int64: int64(baseID)},
 		OrgID:       org.ID,
 		UserID:      int(user.ID),
diff --git a/helpers.go b/helpers.go
index 36a6eb7..125698a 100644
--- a/helpers.go
+++ b/helpers.go
@@ -23,6 +23,7 @@ import (
 	"strconv"
 	"strings"
 	"time"
+	"unicode/utf8"
 
 	"git.sr.ht/~emersion/gqlclient"
 	"github.com/99designs/gqlgen/graphql"
@@ -1159,3 +1160,23 @@ func IPForContext(ctx context.Context) string {
 	}
 	return ip
 }
+
+// SanitizeUTF8 replaces each invalid UTF-8 byte in input with a space
+func SanitizeUTF8(input string) string {
+	if utf8.ValidString(input) {
+		return input
+	}
+
+	var b strings.Builder
+	for i := 0; i < len(input); {
+		r, size := utf8.DecodeRuneInString(input[i:])
+		if r == utf8.RuneError && size == 1 {
+			// Replace invalid bytes with a space
+			b.WriteString(" ")
+		} else {
+			b.WriteRune(r)
+		}
+		i += size
+	}
+	return b.String()
+}
-- 
2.45.3