// Source path: forgejo/modules/charset/escape.go

// Copyright 2021 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package charset

import (
	"bytes"
	"fmt"
	"io"
	"strings"
	"unicode"
	"unicode/utf8"

	"golang.org/x/text/unicode/bidi"
)

// EscapeStatus collects the findings of the unicode escaper. Every field is a
// flag that the escaper raises when it meets the corresponding condition.
type EscapeStatus struct {
	// Escaped is set when at least one code point was replaced in the output.
	Escaped bool
	// HasError is set when escaping stopped because of a read or write failure.
	HasError bool
	// HasBadRunes is set when bytes that do not form a valid rune were found.
	HasBadRunes bool
	// HasControls is set when a code point of Unicode category C was escaped.
	HasControls bool
	// HasSpaces is set when a space other than '\n', '\r', '\t' or ' ' was escaped.
	HasSpaces bool
	// HasMarks is set when a code point of Unicode category M was escaped.
	HasMarks bool
	// HasBIDI is set when a bidirectional control character was escaped.
	HasBIDI bool
	// BadBIDI is set when a line contains a bidirectional control character
	// together with left-to-right characters but no right-to-left characters.
	BadBIDI bool
	// HasRTLScript is set when a right-to-left character was seen.
	HasRTLScript bool
	// HasLTRScript is set when a left-to-right character was seen.
	HasLTRScript bool
}

// Or merges two EscapeStatus values flag by flag: a flag is set in the result
// when it is set in the receiver, in other, or in both. Neither operand is
// modified.
func (status EscapeStatus) Or(other EscapeStatus) EscapeStatus {
	return EscapeStatus{
		Escaped:      status.Escaped || other.Escaped,
		HasError:     status.HasError || other.HasError,
		HasBadRunes:  status.HasBadRunes || other.HasBadRunes,
		HasControls:  status.HasControls || other.HasControls,
		HasSpaces:    status.HasSpaces || other.HasSpaces,
		HasMarks:     status.HasMarks || other.HasMarks,
		HasBIDI:      status.HasBIDI || other.HasBIDI,
		BadBIDI:      status.BadBIDI || other.BadBIDI,
		HasRTLScript: status.HasRTLScript || other.HasRTLScript,
		HasLTRScript: status.HasLTRScript || other.HasLTRScript,
	}
}

// EscapeControlString runs the unicode escaper over text and returns the
// findings as an EscapeStatus together with the escaped string.
func EscapeControlString(text string) (EscapeStatus, string) {
	var out strings.Builder
	// The returned error is dropped on purpose: every failure path of
	// EscapeControlReader also raises HasError on the returned status.
	status, _ := EscapeControlReader(strings.NewReader(text), &out)
	return status, out.String()
}

// EscapeControlBytes runs the unicode escaper over the provided []byte and
// returns the findings as an EscapeStatus together with the escaped []byte.
func EscapeControlBytes(text []byte) (EscapeStatus, []byte) {
	var out bytes.Buffer
	// The returned error is dropped on purpose: every failure path of
	// EscapeControlReader also raises HasError on the returned status.
	status, _ := EscapeControlReader(bytes.NewReader(text), &out)
	return status, out.Bytes()
}

// EscapeControlReader copies text to output, replacing suspicious code points
// with HTML spans, and returns the findings as an EscapeStatus and an error.
//
// The following are replaced (and recorded in the returned status):
//   - bytes that are not valid UTF-8 (HasBadRunes), written via writeBroken;
//   - spaces other than '\n', '\r', '\t' and ' ' (HasSpaces);
//   - bidirectional control characters (HasBIDI);
//   - other code points of Unicode category C (HasControls), except a byte
//     order mark that is the very first rune of the input;
//   - code points of Unicode category M (HasMarks).
//
// Everything else is copied through unchanged. BadBIDI is raised for every
// line that contains a bidirectional control character and left-to-right
// characters but no right-to-left characters.
//
// The returned error is nil when the reader ended with io.EOF. Any other read
// error, or any write error, is returned and HasError is set on the status.
func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) {
	buf := make([]byte, 4096)
	readStart := 0 // bytes of an incomplete rune carried over from the previous read
	runeCount := 0 // runes consumed so far; only used to recognise a leading BOM

	lineHasBIDI := false
	lineHasRTLScript := false
	lineHasLTRScript := false

	// The read error is kept apart from err so that a successful write can
	// never overwrite it: it alone decides when the loop stops and whether
	// the input ended cleanly.
	var readErr error
	for readErr == nil {
		var n int
		n, readErr = text.Read(buf[readStart:])
		bs := buf[:readStart+n]
		readStart = 0
		// Once the reader reported EOF or failed, no further bytes will arrive
		// to complete a partial rune, so everything left must be handled now.
		final := readErr != nil

		writePos := 0 // bs[:writePos] has already been written or replaced
		i := 0
		for i < len(bs) {
			rest := bs[i:]
			complete := utf8.FullRune(rest)
			if !complete && !final {
				// A valid but unfinished multi-byte prefix: wait for more data.
				break
			}

			r, size := utf8.DecodeRune(rest)
			runeCount++
			// Only (RuneError, 1) signals an invalid encoding; a correctly
			// encoded U+FFFD decodes with size 3 and is an ordinary rune.
			broken := r == utf8.RuneError && size == 1
			if !complete {
				// Truncated sequence at the very end of the input: report all
				// of its bytes as a single broken code point.
				size = len(rest)
			}

			escape := false
			switch {
			case broken:
				escaped.HasBadRunes = true
			case r == '\n':
				if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {
					escaped.BadBIDI = true
				}
				lineHasBIDI = false
				lineHasRTLScript = false
				lineHasLTRScript = false
			case runeCount == 1 && r == 0xFEFF: // UTF BOM
				// the first BOM is safe
			case r == '\r' || r == '\t' || r == ' ':
				// These are acceptable control characters and space characters
			case unicode.IsSpace(r):
				escaped.HasSpaces = true
				escape = true
			case unicode.Is(unicode.Bidi_Control, r):
				escaped.HasBIDI = true
				lineHasBIDI = true
				escape = true
			case unicode.Is(unicode.C, r):
				escaped.HasControls = true
				escape = true
			case unicode.Is(unicode.M, r):
				escaped.HasMarks = true
				escape = true
			default:
				p, _ := bidi.Lookup(rest[:size])
				c := p.Class()
				if c == bidi.R || c == bidi.AL {
					lineHasRTLScript = true
					escaped.HasRTLScript = true
				} else if c == bidi.L {
					lineHasLTRScript = true
					escaped.HasLTRScript = true
				}
			}

			if broken || escape {
				escaped.Escaped = true
				// Flush the untouched bytes that precede the replaced rune.
				if writePos < i {
					if _, err = output.Write(bs[writePos:i]); err != nil {
						escaped.HasError = true
						return escaped, err
					}
				}
				if broken {
					err = writeBroken(output, rest[:size])
				} else {
					err = writeEscaped(output, r)
				}
				if err != nil {
					escaped.HasError = true
					return escaped, err
				}
				writePos = i + size
			}
			i += size
		}

		// Write everything that was scanned but not yet written.
		if writePos < i {
			if _, err = output.Write(bs[writePos:i]); err != nil {
				escaped.HasError = true
				return escaped, err
			}
		}
		// Move an unfinished rune (at most 3 bytes) to the front of the buffer
		// so that the next read appends to it.
		if i < len(bs) {
			readStart = copy(buf, bs[i:])
		}
	}

	if readErr != io.EOF {
		escaped.HasError = true
		return escaped, readErr
	}
	// The last line may not be terminated by '\n'; evaluate it as well.
	if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {
		escaped.BadBIDI = true
	}
	return escaped, nil
}

// writeBroken writes bs to output as an HTML span of class "broken-code-point"
// whose text is the upper-case hexadecimal form of the bytes between
// HTML-escaped angle brackets. It returns the error of the write, if any.
func writeBroken(output io.Writer, bs []byte) (err error) {
	const brokenSpan = `<span class="broken-code-point">&lt;%X&gt;</span>`
	if _, err = fmt.Fprintf(output, brokenSpan, bs); err != nil {
		return err
	}
	return nil
}

// writeEscaped writes r to output wrapped in an HTML span of class
// "escaped-code-point". The data-escaped attribute carries the code point as
// "[U+XXXX]" (upper-case hex, at least four digits) and the inner "char" span
// carries the rune itself. It returns the error of the write, if any.
func writeEscaped(output io.Writer, r rune) (err error) {
	const escapedSpan = `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`
	if _, err = fmt.Fprintf(output, escapedSpan, r, r); err != nil {
		return err
	}
	return nil
}
Add warning for BIDI characters in page renders and in diffs (#17562) Fix #17514 Given the comments I've adjusted this somewhat. The numbers of characters detected are increased and include things like the use of U+300 to make à instead of à and non-breaking spaces. There is a button which can be used to escape the content to show it. Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: Gwyneth Morgan <gwymor@tilde.club> Co-authored-by: silverwind <me@silverwind.io> Co-authored-by: wxiaoguang <wxiaoguang@gmail.com> 2022-01-06 20:18:52 -05:00			`// Copyright 2021 The Gitea Authors. All rights reserved.`
			`// Use of this source code is governed by a MIT-style`
			`// license that can be found in the LICENSE file.`

			`package charset`

			`import (`
			`"bytes"`
			`"fmt"`
			`"io"`
			`"strings"`
			`"unicode"`
			`"unicode/utf8"`

			`"golang.org/x/text/unicode/bidi"`
			`)`

			`// EscapeStatus represents the findings of the unicode escaper`
			`type EscapeStatus struct {`
			`Escaped bool`
			`HasError bool`
			`HasBadRunes bool`
			`HasControls bool`
			`HasSpaces bool`
			`HasMarks bool`
			`HasBIDI bool`
			`BadBIDI bool`
			`HasRTLScript bool`
			`HasLTRScript bool`
			`}`

			`// Or combines two EscapeStatus structs into one representing the conjunction of the two`
			`func (status EscapeStatus) Or(other EscapeStatus) EscapeStatus {`
			`st := status`
			`st.Escaped = st.Escaped \|\| other.Escaped`
			`st.HasError = st.HasError \|\| other.HasError`
			`st.HasBadRunes = st.HasBadRunes \|\| other.HasBadRunes`
			`st.HasControls = st.HasControls \|\| other.HasControls`
			`st.HasSpaces = st.HasSpaces \|\| other.HasSpaces`
			`st.HasMarks = st.HasMarks \|\| other.HasMarks`
			`st.HasBIDI = st.HasBIDI \|\| other.HasBIDI`
			`st.BadBIDI = st.BadBIDI \|\| other.BadBIDI`
			`st.HasRTLScript = st.HasRTLScript \|\| other.HasRTLScript`
			`st.HasLTRScript = st.HasLTRScript \|\| other.HasLTRScript`
			`return st`
			`}`

			`// EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an EscapeStatus and the escaped string`
			`func EscapeControlString(text string) (EscapeStatus, string) {`
			`sb := &strings.Builder{}`
			`escaped, _ := EscapeControlReader(strings.NewReader(text), sb)`
			`return escaped, sb.String()`
			`}`

			`// EscapeControlBytes escapes the unicode control sequences a provided []byte and returns the findings as an EscapeStatus and the escaped []byte`
			`func EscapeControlBytes(text []byte) (EscapeStatus, []byte) {`
			`buf := &bytes.Buffer{}`
			`escaped, _ := EscapeControlReader(bytes.NewReader(text), buf)`
			`return escaped, buf.Bytes()`
			`}`

			`// EscapeControlReader escapes the unicode control sequences a provided Reader writing the escaped output to the output and returns the findings as an EscapeStatus and an error`
			`func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) {`
			`buf := make([]byte, 4096)`
			`readStart := 0`
Don't treat BOM escape sequence as hidden character. (#18909) * Don't treat BOM escape sequence as hidden character. - BOM sequence is a common non-harmfull escape sequence, it shouldn't be shown as hidden character. - Follows GitHub's behavior. - Resolves #18837 Co-authored-by: wxiaoguang <wxiaoguang@gmail.com> 2022-02-26 11:48:23 -05:00			`runeCount := 0`
Add warning for BIDI characters in page renders and in diffs (#17562) Fix #17514 Given the comments I've adjusted this somewhat. The numbers of characters detected are increased and include things like the use of U+300 to make à instead of à and non-breaking spaces. There is a button which can be used to escape the content to show it. Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: Gwyneth Morgan <gwymor@tilde.club> Co-authored-by: silverwind <me@silverwind.io> Co-authored-by: wxiaoguang <wxiaoguang@gmail.com> 2022-01-06 20:18:52 -05:00			`var n int`
			`var writePos int`

			`lineHasBIDI := false`
			`lineHasRTLScript := false`
			`lineHasLTRScript := false`

			`readingloop:`
			`for err == nil {`
			`n, err = text.Read(buf[readStart:])`
			`bs := buf[:n+readStart]`
Fix panic in EscapeReader (#18820) There is a potential panic due to a mistaken resetting of the length parameter when multibyte characters go over a read boundary. Signed-off-by: Andrew Thornton <art27@cantab.net> 2022-02-19 10:25:31 -05:00			`n = len(bs)`
Add warning for BIDI characters in page renders and in diffs (#17562) Fix #17514 Given the comments I've adjusted this somewhat. The numbers of characters detected are increased and include things like the use of U+300 to make à instead of à and non-breaking spaces. There is a button which can be used to escape the content to show it. Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: Gwyneth Morgan <gwymor@tilde.club> Co-authored-by: silverwind <me@silverwind.io> Co-authored-by: wxiaoguang <wxiaoguang@gmail.com> 2022-01-06 20:18:52 -05:00			`i := 0`

			`for i < len(bs) {`
			`r, size := utf8.DecodeRune(bs[i:])`
Don't treat BOM escape sequence as hidden character. (#18909) * Don't treat BOM escape sequence as hidden character. - BOM sequence is a common non-harmfull escape sequence, it shouldn't be shown as hidden character. - Follows GitHub's behavior. - Resolves #18837 Co-authored-by: wxiaoguang <wxiaoguang@gmail.com> 2022-02-26 11:48:23 -05:00			`runeCount++`

Add warning for BIDI characters in page renders and in diffs (#17562) Fix #17514 Given the comments I've adjusted this somewhat. The numbers of characters detected are increased and include things like the use of U+300 to make à instead of à and non-breaking spaces. There is a button which can be used to escape the content to show it. Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: Gwyneth Morgan <gwymor@tilde.club> Co-authored-by: silverwind <me@silverwind.io> Co-authored-by: wxiaoguang <wxiaoguang@gmail.com> 2022-01-06 20:18:52 -05:00			`// Now handle the codepoints`
			`switch {`
			`case r == utf8.RuneError:`
			`if writePos < i {`
			`if _, err = output.Write(bs[writePos:i]); err != nil {`
			`escaped.HasError = true`
			`return`
			`}`
			`writePos = i`
			`}`
			`// runes can be at most 4 bytes - so...`
			`if len(bs)-i <= 3 {`
			`// if not request more data`
			`copy(buf, bs[i:])`
			`readStart = n - i`
			`writePos = 0`
			`continue readingloop`
			`}`
			`// this is a real broken rune`
			`escaped.HasBadRunes = true`
			`escaped.Escaped = true`
			`if err = writeBroken(output, bs[i:i+size]); err != nil {`
			`escaped.HasError = true`
			`return`
			`}`
			`writePos += size`
			`case r == '\n':`
			`if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {`
			`escaped.BadBIDI = true`
			`}`
			`lineHasBIDI = false`
			`lineHasRTLScript = false`
			`lineHasLTRScript = false`

Don't treat BOM escape sequence as hidden character. (#18909) * Don't treat BOM escape sequence as hidden character. - BOM sequence is a common non-harmfull escape sequence, it shouldn't be shown as hidden character. - Follows GitHub's behavior. - Resolves #18837 Co-authored-by: wxiaoguang <wxiaoguang@gmail.com> 2022-02-26 11:48:23 -05:00			`case runeCount == 1 && r == 0xFEFF: // UTF BOM`
			`// the first BOM is safe`
Add warning for BIDI characters in page renders and in diffs (#17562) Fix #17514 Given the comments I've adjusted this somewhat. The numbers of characters detected are increased and include things like the use of U+300 to make à instead of à and non-breaking spaces. There is a button which can be used to escape the content to show it. Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: Gwyneth Morgan <gwymor@tilde.club> Co-authored-by: silverwind <me@silverwind.io> Co-authored-by: wxiaoguang <wxiaoguang@gmail.com> 2022-01-06 20:18:52 -05:00			`case r == '\r' \|\| r == '\t' \|\| r == ' ':`
			`// These are acceptable control characters and space characters`
			`case unicode.IsSpace(r):`
			`escaped.HasSpaces = true`
			`escaped.Escaped = true`
			`if writePos < i {`
			`if _, err = output.Write(bs[writePos:i]); err != nil {`
			`escaped.HasError = true`
			`return`
			`}`
			`}`
			`if err = writeEscaped(output, r); err != nil {`
			`escaped.HasError = true`
			`return`
			`}`
			`writePos = i + size`
			`case unicode.Is(unicode.Bidi_Control, r):`
			`escaped.Escaped = true`
			`escaped.HasBIDI = true`
			`if writePos < i {`
			`if _, err = output.Write(bs[writePos:i]); err != nil {`
			`escaped.HasError = true`
			`return`
			`}`
			`}`
			`lineHasBIDI = true`
			`if err = writeEscaped(output, r); err != nil {`
			`escaped.HasError = true`
			`return`
			`}`
			`writePos = i + size`
			`case unicode.Is(unicode.C, r):`
			`escaped.Escaped = true`
			`escaped.HasControls = true`
			`if writePos < i {`
			`if _, err = output.Write(bs[writePos:i]); err != nil {`
			`escaped.HasError = true`
			`return`
			`}`
			`}`
			`if err = writeEscaped(output, r); err != nil {`
			`escaped.HasError = true`
			`return`
			`}`
			`writePos = i + size`
			`case unicode.Is(unicode.M, r):`
			`escaped.Escaped = true`
			`escaped.HasMarks = true`
			`if writePos < i {`
			`if _, err = output.Write(bs[writePos:i]); err != nil {`
			`escaped.HasError = true`
			`return`
			`}`
			`}`
			`if err = writeEscaped(output, r); err != nil {`
			`escaped.HasError = true`
			`return`
			`}`
			`writePos = i + size`
			`default:`
			`p, _ := bidi.Lookup(bs[i : i+size])`
			`c := p.Class()`
			`if c == bidi.R \|\| c == bidi.AL {`
			`lineHasRTLScript = true`
			`escaped.HasRTLScript = true`
			`} else if c == bidi.L {`
			`lineHasLTRScript = true`
			`escaped.HasLTRScript = true`
			`}`
			`}`
			`i += size`
			`}`
			`if n > 0 {`
			`// we read something...`
			`// write everything unwritten`
			`if writePos < i {`
			`if _, err = output.Write(bs[writePos:i]); err != nil {`
			`escaped.HasError = true`
			`return`
			`}`
			`}`

			`// reset the starting positions for the next read`
			`readStart = 0`
			`writePos = 0`
			`}`
			`}`
			`if readStart > 0 {`
			`// this means that there is an incomplete or broken rune at 0-readStart and we read nothing on the last go round`
			`escaped.Escaped = true`
			`escaped.HasBadRunes = true`
			`if err = writeBroken(output, buf[:readStart]); err != nil {`
			`escaped.HasError = true`
			`return`
			`}`
			`}`
			`if err == io.EOF {`
			`if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {`
			`escaped.BadBIDI = true`
			`}`
			`err = nil`
			`return`
			`}`
			`escaped.HasError = true`
			`return`
			`}`

			`func writeBroken(output io.Writer, bs []byte) (err error) {`
			_, err = fmt.Fprintf(output, `<span class="broken-code-point"><%X></span>`, bs)
			`return`
			`}`

			`func writeEscaped(output io.Writer, r rune) (err error) {`
			_, err = fmt.Fprintf(output, `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`, r, r)
			`return`
			`}`