denoland-deno/cli/js/decode_utf8.ts

// Copyright 2018-2019 the Deno authors. All rights reserved. MIT license.
// The following code is based off:
// https://github.com/inexorabletash/text-encoding
//
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// At runtime `.apply` is happy to receive a typed array as its argument list,
// but the built-in typings only describe plain arrays. This extra overload
// teaches the strict checker about the `Uint16Array` form so that the decoder
// can hand its output buffer straight to a function taking numeric arguments.
declare global {
  interface CallableFunction extends Function {
    apply<TThis, TResult>(
      this: (this: TThis, ...args: number[]) => TResult,
      thisArg: TThis,
      args: Uint16Array
    ): TResult;
  }
}

/** DFA state meaning "a complete code point has just been decoded". */
const UTF8_ACCEPT = 0;

/** DFA state meaning "the bytes seen so far cannot form valid UTF-8". */
const UTF8_REJECT = 12;

/**
 * Character class (0-11) for every possible byte value. The class both
 * selects the column of the transition table and, for a lead byte, tells how
 * many high bits must be masked off to obtain its payload bits.
 *
 * Built once at module load instead of once per decoded byte.
 */
// prettier-ignore
const UTF8_BYTE_CLASS = new Uint8Array([
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8
]);

/**
 * State transition table, indexed by `state + byteClass`. States are
 * multiples of 12 so that the sum addresses one flat array.
 *
 * Built once at module load instead of once per decoded byte.
 */
// prettier-ignore
const UTF8_TRANSITIONS = new Uint8Array([
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12
]);

/**
 * Decodes UTF-8 bytes into a JavaScript (UTF-16) string using a table-driven
 * state machine.
 *
 * @param input Bytes to decode.
 * @param fatal When `true`, ill-formed input raises a `TypeError`; when
 *   `false`, U+FFFD (the replacement character) is emitted instead.
 * @param ignoreBOM Follows the `TextDecoder.ignoreBOM` convention: when
 *   `false`, a leading UTF-8 byte order mark (EF BB BF) is stripped from the
 *   output; when `true`, it is decoded like any other character and therefore
 *   appears in the result as U+FEFF.
 * @returns The decoded string.
 * @throws {TypeError} If `fatal` is `true` and the input contains an invalid
 *   byte or ends in the middle of a multi-byte sequence.
 */
export function decodeUtf8(
  input: Uint8Array,
  fatal: boolean,
  ignoreBOM: boolean
): string {
  let outString = "";

  // Code units are staged in a fixed-size buffer and converted in bulk, which
  // avoids one string concatenation per character.
  const outBufferLength = Math.min(1024, input.length);
  const outBuffer = new Uint16Array(outBufferLength);
  let outIndex = 0;

  let state = UTF8_ACCEPT;
  let codepoint = 0;
  let byteClass: number;

  // Skip the BOM only when the caller did NOT ask for it to be ignored (i.e.
  // kept as data); this mirrors the TextDecoder `ignoreBOM` semantics.
  let i =
    !ignoreBOM && input[0] === 0xef && input[1] === 0xbb && input[2] === 0xbf
      ? 3
      : 0;

  for (; i < input.length; ++i) {
    // Encoding error handling: either the previous byte drove the machine
    // into the reject state, or a multi-byte sequence is being interrupted by
    // a byte that is not a continuation byte (10xxxxxx).
    if (
      state === UTF8_REJECT ||
      (state !== UTF8_ACCEPT && (input[i] & 0xc0) !== 0x80)
    ) {
      if (fatal)
        throw new TypeError(
          `Decoder error. Invalid byte in sequence at position ${i} in data.`
        );
      outBuffer[outIndex++] = 0xfffd; // Replacement character
      if (outIndex === outBufferLength) {
        outString += String.fromCharCode.apply(null, outBuffer);
        outIndex = 0;
      }
      // Restart the machine so the current byte is decoded as a fresh start.
      state = UTF8_ACCEPT;
    }

    byteClass = UTF8_BYTE_CLASS[input[i]];
    // A continuation byte contributes its low six bits; a lead byte
    // contributes whatever remains after masking off its length prefix.
    codepoint =
      state !== UTF8_ACCEPT
        ? (input[i] & 0x3f) | (codepoint << 6)
        : (0xff >> byteClass) & input[i];
    state = UTF8_TRANSITIONS[state + byteClass];

    // Still inside a sequence (or rejected): nothing to emit yet.
    if (state !== UTF8_ACCEPT) continue;

    // Add codepoint to buffer (as charcodes for utf-16), and flush buffer to
    // string if needed. The flush check follows every single write because a
    // surrogate pair may straddle the end of the buffer.
    if (codepoint > 0xffff) {
      // High surrogate: 0xd7c0 === 0xd800 - (0x10000 >> 10).
      outBuffer[outIndex++] = 0xd7c0 + (codepoint >> 10);
      if (outIndex === outBufferLength) {
        outString += String.fromCharCode.apply(null, outBuffer);
        outIndex = 0;
      }
      // Low surrogate carries the bottom ten bits.
      outBuffer[outIndex++] = 0xdc00 | (codepoint & 0x3ff);
      if (outIndex === outBufferLength) {
        outString += String.fromCharCode.apply(null, outBuffer);
        outIndex = 0;
      }
    } else {
      outBuffer[outIndex++] = codepoint;
      if (outIndex === outBufferLength) {
        outString += String.fromCharCode.apply(null, outBuffer);
        outIndex = 0;
      }
    }
  }

  // Add a replacement character if we ended in the middle of a sequence or
  // encountered an invalid code at the end. The buffer always has room here
  // because it is flushed as soon as it becomes full.
  if (state !== UTF8_ACCEPT) {
    if (fatal) throw new TypeError(`Decoder error. Unexpected end of data.`);
    outBuffer[outIndex++] = 0xfffd; // Replacement character
  }

  // Final flush of buffer
  outString += String.fromCharCode.apply(null, outBuffer.subarray(0, outIndex));

  return outString;
}
Use a more performant utf8 decoder algorithm. (#3204) Fixes #3163 Co-authored-by: Kitson Kelly <me@kitsonkelly.com> Co-authored-by: Qwerasd <qwerasd205@users.noreply.github.com> 2019-10-26 10:51:53 -04:00			`// Copyright 2018-2019 the Deno authors. All rights reserved. MIT license.`
			`// The following code is based off:`
			`// https://github.com/inexorabletash/text-encoding`
			`//`
			`// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>`
			`//`
			`// Permission is hereby granted, free of charge, to any person obtaining a copy`
			`// of this software and associated documentation files (the "Software"), to deal`
			`// in the Software without restriction, including without limitation the rights`
			`// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell`
			`// copies of the Software, and to permit persons to whom the Software is`
			`// furnished to do so, subject to the following conditions:`
			`//`
			`// The above copyright notice and this permission notice shall be included in`
			`// all copies or substantial portions of the Software.`
			`//`
			`// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR`
			`// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,`
			`// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE`
			`// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER`
			`// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,`
			`// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE`
			`// SOFTWARE.`

			// `.apply` can actually take a typed array, though the type system doesn't
			`// really support it, so we have to "hack" it a bit to get past some of the`
			`// strict type checks.`
			`declare global {`
			`interface CallableFunction extends Function {`
			`apply<T, R>(`
			`this: (this: T, ...args: number[]) => R,`
			`thisArg: T,`
			`args: Uint16Array`
			`): R;`
			`}`
			`}`

			`export function decodeUtf8(`
			`input: Uint8Array,`
			`fatal: boolean,`
			`ignoreBOM: boolean`
			`): string {`
			`let outString = "";`

			`// Prepare a buffer so that we don't have to do a lot of string concats, which`
			`// are very slow.`
			`const outBufferLength: number = Math.min(1024, input.length);`
			`const outBuffer = new Uint16Array(outBufferLength);`
			`let outIndex = 0;`

			`let state = 0;`
			`let codepoint = 0;`
			`let type: number;`

			`let i =`
			`ignoreBOM && input[0] === 0xef && input[1] === 0xbb && input[2] === 0xbf`
			`? 3`
			`: 0;`

			`for (; i < input.length; ++i) {`
			`// Encoding error handling`
			`if (state === 12 \|\| (state !== 0 && (input[i] & 0xc0) !== 0x80)) {`
			`if (fatal)`
			`throw new TypeError(`
			`Decoder error. Invalid byte in sequence at position ${i} in data.`
			`);`
			`outBuffer[outIndex++] = 0xfffd; // Replacement character`
			`if (outIndex === outBufferLength) {`
			`outString += String.fromCharCode.apply(null, outBuffer);`
			`outIndex = 0;`
			`}`
			`state = 0;`
			`}`

			`// prettier-ignore`
			`type = [`
			`0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,`
			`0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,`
			`0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,`
			`0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,`
			`7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,`
			`8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,`
			`10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8`
			`][input[i]];`
			`codepoint =`
			`state !== 0`
			`? (input[i] & 0x3f) \| (codepoint << 6)`
			`: (0xff >> type) & input[i];`
			`// prettier-ignore`
			`state = [`
			`0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,`
			`12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,`
			`12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,`
			`12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,`
			`12,36,12,12,12,12,12,12,12,12,12,12`
			`][state + type];`

			`if (state !== 0) continue;`

			`// Add codepoint to buffer (as charcodes for utf-16), and flush buffer to`
			`// string if needed.`
			`if (codepoint > 0xffff) {`
			`outBuffer[outIndex++] = 0xd7c0 + (codepoint >> 10);`
			`if (outIndex === outBufferLength) {`
			`outString += String.fromCharCode.apply(null, outBuffer);`
			`outIndex = 0;`
			`}`
			`outBuffer[outIndex++] = 0xdc00 \| (codepoint & 0x3ff);`
			`if (outIndex === outBufferLength) {`
			`outString += String.fromCharCode.apply(null, outBuffer);`
			`outIndex = 0;`
			`}`
			`} else {`
			`outBuffer[outIndex++] = codepoint;`
			`if (outIndex === outBufferLength) {`
			`outString += String.fromCharCode.apply(null, outBuffer);`
			`outIndex = 0;`
			`}`
			`}`
			`}`

			`// Add a replacement character if we ended in the middle of a sequence or`
			`// encountered an invalid code at the end.`
			`if (state !== 0) {`
			if (fatal) throw new TypeError(`Decoder error. Unexpected end of data.`);
			`outBuffer[outIndex++] = 0xfffd; // Replacement character`
			`}`

			`// Final flush of buffer`
			`outString += String.fromCharCode.apply(null, outBuffer.subarray(0, outIndex));`

			`return outString;`
			`}`