// Copyright 2017 The Bazel Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package syntax // Starlark quoted string utilities. import ( "fmt" "strconv" "strings" "unicode" "unicode/utf8" ) // unesc maps single-letter chars following \ to their actual values. var unesc = [256]byte{ 'a': '\a', 'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', 'v': '\v', '\\': '\\', '\'': '\'', '"': '"', } // esc maps escape-worthy bytes to the char that should follow \. var esc = [256]byte{ '\a': 'a', '\b': 'b', '\f': 'f', '\n': 'n', '\r': 'r', '\t': 't', '\v': 'v', '\\': '\\', '\'': '\'', '"': '"', } // unquote unquotes the quoted string, returning the actual // string value, whether the original was triple-quoted, // whether it was a byte string, and an error describing invalid input. func unquote(quoted string) (s string, triple, isByte bool, err error) { // Check for raw prefix: means don't interpret the inner \. raw := false if strings.HasPrefix(quoted, "r") { raw = true quoted = quoted[1:] } // Check for bytes prefix. if strings.HasPrefix(quoted, "b") { isByte = true quoted = quoted[1:] } if len(quoted) < 2 { err = fmt.Errorf("string literal too short") return } if quoted[0] != '"' && quoted[0] != '\'' || quoted[0] != quoted[len(quoted)-1] { err = fmt.Errorf("string literal has invalid quotes") return } // Check for triple quoted string. quote := quoted[0] if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] { triple = true quoted = quoted[3 : len(quoted)-3] } else { quoted = quoted[1 : len(quoted)-1] } // Now quoted is the quoted data, but no quotes. // If we're in raw mode or there are no escapes or // carriage returns, we're done. var unquoteChars string if raw { unquoteChars = "\r" } else { unquoteChars = "\\\r" } if !strings.ContainsAny(quoted, unquoteChars) { s = quoted return } // Otherwise process quoted string. // Each iteration processes one escape sequence along with the // plain text leading up to it. buf := new(strings.Builder) for { // Remove prefix before escape sequence. i := strings.IndexAny(quoted, unquoteChars) if i < 0 { i = len(quoted) } buf.WriteString(quoted[:i]) quoted = quoted[i:] if len(quoted) == 0 { break } // Process carriage return. if quoted[0] == '\r' { buf.WriteByte('\n') if len(quoted) > 1 && quoted[1] == '\n' { quoted = quoted[2:] } else { quoted = quoted[1:] } continue } // Process escape sequence. if len(quoted) == 1 { err = fmt.Errorf(`truncated escape sequence \`) return } switch quoted[1] { default: // In Starlark, like Go, a backslash must escape something. // (Python still treats unnecessary backslashes literally, // but since 3.6 has emitted a deprecation warning.) err = fmt.Errorf("invalid escape sequence \\%c", quoted[1]) return case '\n': // Ignore the escape and the line break. quoted = quoted[2:] case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"': // One-char escape. // Escapes are allowed for both kinds of quotation // mark, not just the kind in use. buf.WriteByte(unesc[quoted[1]]) quoted = quoted[2:] case '0', '1', '2', '3', '4', '5', '6', '7': // Octal escape, up to 3 digits, \OOO. n := int(quoted[1] - '0') quoted = quoted[2:] for i := 1; i < 3; i++ { if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] { break } n = n*8 + int(quoted[0]-'0') quoted = quoted[1:] } if !isByte && n > 127 { err = fmt.Errorf(`non-ASCII octal escape \%o (use \u%04X for the UTF-8 encoding of U+%04X)`, n, n, n) return } if n >= 256 { // NOTE: Python silently discards the high bit, // so that '\541' == '\141' == 'a'. // Let's see if we can avoid doing that in BUILD files. err = fmt.Errorf(`invalid escape sequence \%03o`, n) return } buf.WriteByte(byte(n)) case 'x': // Hexadecimal escape, exactly 2 digits, \xXX. [0-127] if len(quoted) < 4 { err = fmt.Errorf(`truncated escape sequence %s`, quoted) return } n, err1 := strconv.ParseUint(quoted[2:4], 16, 0) if err1 != nil { err = fmt.Errorf(`invalid escape sequence %s`, quoted[:4]) return } if !isByte && n > 127 { err = fmt.Errorf(`non-ASCII hex escape %s (use \u%04X for the UTF-8 encoding of U+%04X)`, quoted[:4], n, n) return } buf.WriteByte(byte(n)) quoted = quoted[4:] case 'u', 'U': // Unicode code point, 4 (\uXXXX) or 8 (\UXXXXXXXX) hex digits. sz := 6 if quoted[1] == 'U' { sz = 10 } if len(quoted) < sz { err = fmt.Errorf(`truncated escape sequence %s`, quoted) return } n, err1 := strconv.ParseUint(quoted[2:sz], 16, 0) if err1 != nil { err = fmt.Errorf(`invalid escape sequence %s`, quoted[:sz]) return } if n > unicode.MaxRune { err = fmt.Errorf(`code point out of range: %s (max \U%08x)`, quoted[:sz], n) return } // As in Go, surrogates are disallowed. if 0xD800 <= n && n < 0xE000 { err = fmt.Errorf(`invalid Unicode code point U+%04X`, n) return } buf.WriteRune(rune(n)) quoted = quoted[sz:] } } s = buf.String() return } // indexByte returns the index of the first instance of b in s, or else -1. func indexByte(s string, b byte) int { for i := 0; i < len(s); i++ { if s[i] == b { return i } } return -1 } // Quote returns a Starlark literal that denotes s. // If b, it returns a bytes literal. func Quote(s string, b bool) string { const hex = "0123456789abcdef" var runeTmp [utf8.UTFMax]byte buf := make([]byte, 0, 3*len(s)/2) if b { buf = append(buf, 'b') } buf = append(buf, '"') for width := 0; len(s) > 0; s = s[width:] { r := rune(s[0]) width = 1 if r >= utf8.RuneSelf { r, width = utf8.DecodeRuneInString(s) } if width == 1 && r == utf8.RuneError { // String (!b) literals accept \xXX escapes only for ASCII, // but we must use them here to represent invalid bytes. // The result is not a legal literal. buf = append(buf, `\x`...) buf = append(buf, hex[s[0]>>4]) buf = append(buf, hex[s[0]&0xF]) continue } if r == '"' || r == '\\' { // always backslashed buf = append(buf, '\\') buf = append(buf, byte(r)) continue } if strconv.IsPrint(r) { n := utf8.EncodeRune(runeTmp[:], r) buf = append(buf, runeTmp[:n]...) continue } switch r { case '\a': buf = append(buf, `\a`...) case '\b': buf = append(buf, `\b`...) case '\f': buf = append(buf, `\f`...) case '\n': buf = append(buf, `\n`...) case '\r': buf = append(buf, `\r`...) case '\t': buf = append(buf, `\t`...) case '\v': buf = append(buf, `\v`...) default: switch { case r < ' ' || r == 0x7f: buf = append(buf, `\x`...) buf = append(buf, hex[byte(r)>>4]) buf = append(buf, hex[byte(r)&0xF]) case r > utf8.MaxRune: r = 0xFFFD fallthrough case r < 0x10000: buf = append(buf, `\u`...) for s := 12; s >= 0; s -= 4 { buf = append(buf, hex[r>>uint(s)&0xF]) } default: buf = append(buf, `\U`...) for s := 28; s >= 0; s -= 4 { buf = append(buf, hex[r>>uint(s)&0xF]) } } } } buf = append(buf, '"') return string(buf) }