mirror of
https://github.com/denoland/deno.git
synced 2024-10-30 09:08:00 -04:00
133 lines
4.9 KiB
TypeScript
133 lines
4.9 KiB
TypeScript
// This module is based on Bjoern Hoehrmann's DFA UTF-8 decoder.
|
|
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
|
|
//
|
|
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in
|
|
// all copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
// SOFTWARE.
|
|
|
|
// `.apply` can actually take a typed array, though the type system doesn't
// really support it, so we have to "hack" it a bit to get past some of the
// strict type checks.
declare global {
  interface CallableFunction extends Function {
    /**
     * Extra `apply` overload that accepts a `Uint16Array` as the argument
     * list for any function taking a variadic list of numbers (for example
     * `String.fromCharCode`).
     *
     * This is a type-level declaration only; it emits no runtime code.
     *
     * @param thisArg Value bound to `this` for the call.
     * @param args Typed array whose elements are passed as the arguments.
     * @returns Whatever the called function returns.
     */
    apply<T, R>(
      this: (this: T, ...args: number[]) => R,
      thisArg: T,
      args: Uint16Array
    ): R;
  }
}
|
|
|
|
/**
 * Character class for every possible byte value (index 0-255) used by the
 * UTF-8 decoding state machine. Built once at module load instead of once per
 * decoded byte.
 */
// prettier-ignore
const UTF8_BYTE_CLASSES = new Uint8Array([
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8
]);

/**
 * State transition table of the decoder, indexed by
 * `currentState + byteClass`. States are multiples of 12 so that adding a
 * byte class (0-11) selects the right column.
 */
// prettier-ignore
const UTF8_TRANSITIONS = new Uint8Array([
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12
]);

/** State in which a complete code point has just been decoded. */
const UTF8_ACCEPT = 0;
/** State entered after a byte that cannot continue the current sequence. */
const UTF8_REJECT = 12;

/**
 * Decodes UTF-8 bytes into a JavaScript (UTF-16) string using a table-driven
 * state machine.
 *
 * @param input Bytes to decode.
 * @param fatal When `true`, malformed or truncated input throws; when `false`,
 *   each detected error is replaced with U+FFFD.
 * @param ignoreBOM When `true` and `input` starts with `EF BB BF`, those three
 *   bytes are skipped and produce no output. When `false` they are decoded
 *   like any other bytes (yielding U+FEFF).
 *   NOTE(review): this looks inverted relative to the WHATWG `TextDecoder`
 *   `ignoreBOM` option (where `true` keeps the BOM) - confirm against the
 *   caller before relying on it.
 * @returns The decoded string.
 * @throws {TypeError} If `fatal` is `true` and the input contains an invalid
 *   byte sequence or ends in the middle of a sequence.
 */
export function decodeUtf8(
  input: Uint8Array,
  fatal: boolean,
  ignoreBOM: boolean
): string {
  let outString = "";

  // Prepare a buffer so that we don't have to do a lot of string concats, which
  // are very slow.
  const outBufferLength = Math.min(1024, input.length);
  const outBuffer = new Uint16Array(outBufferLength);
  let outIndex = 0;

  // Appends one UTF-16 code unit and flushes the buffer to the output string
  // as soon as it is full, so `outIndex` is always a valid write position.
  const emit = (codeUnit: number): void => {
    outBuffer[outIndex++] = codeUnit;
    if (outIndex === outBufferLength) {
      outString += String.fromCharCode.apply(null, outBuffer);
      outIndex = 0;
    }
  };

  let state = UTF8_ACCEPT;
  let codepoint = 0;

  let i =
    ignoreBOM && input[0] === 0xef && input[1] === 0xbb && input[2] === 0xbf
      ? 3
      : 0;

  for (; i < input.length; ++i) {
    const byte = input[i];

    // Encoding error handling: either the previous byte was rejected, or we
    // are mid-sequence and this byte is not a continuation byte (10xxxxxx).
    if (
      state === UTF8_REJECT ||
      (state !== UTF8_ACCEPT && (byte & 0xc0) !== 0x80)
    ) {
      if (fatal)
        throw new TypeError(
          `Decoder error. Invalid byte in sequence at position ${i} in data.`
        );
      emit(0xfffd); // Replacement character
      // Restart the state machine so the current byte is decoded as the
      // beginning of a new sequence.
      state = UTF8_ACCEPT;
    }

    const type = UTF8_BYTE_CLASSES[byte];
    codepoint =
      state !== UTF8_ACCEPT
        ? // Continuation byte: shift in its low six payload bits.
          (byte & 0x3f) | (codepoint << 6)
        : // Leading byte: mask off the length-marker bits.
          (0xff >> type) & byte;
    state = UTF8_TRANSITIONS[state + type];

    // Sequence not finished yet (or rejected; handled on the next iteration
    // or after the loop).
    if (state !== UTF8_ACCEPT) continue;

    // Add codepoint to buffer (as charcodes for utf-16), and flush buffer to
    // string if needed.
    if (codepoint > 0xffff) {
      // Outside the BMP: encode as a surrogate pair. 0xd7c0 is
      // 0xd800 - (0x10000 >> 10).
      emit(0xd7c0 + (codepoint >> 10));
      emit(0xdc00 | (codepoint & 0x3ff));
    } else {
      emit(codepoint);
    }
  }

  // Add a replacement character if we ended in the middle of a sequence or
  // encountered an invalid code at the end.
  if (state !== UTF8_ACCEPT) {
    if (fatal) throw new TypeError(`Decoder error. Unexpected end of data.`);
    emit(0xfffd); // Replacement character
  }

  // Final flush of buffer
  outString += String.fromCharCode.apply(null, outBuffer.subarray(0, outIndex));

  return outString;
}
|