1
0
Fork 0
mirror of https://github.com/denoland/deno.git synced 2025-01-15 10:35:19 -05:00
denoland-deno/ext/node/polyfills/string_decoder.ts

421 lines
12 KiB
TypeScript

// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
// Copyright Joyent, Inc. and other Node contributors.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to permit
// persons to whom the Software is furnished to do so, subject to the
// following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
// NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
// USE OR OTHER DEALINGS IN THE SOFTWARE.
// Logic and comments translated pretty much one-to-one from node's impl
// (https://github.com/nodejs/node/blob/ba06c5c509956dc413f91b755c1c93798bb700d4/src/string_decoder.cc)
import { Buffer, constants } from "node:buffer";
import { normalizeEncoding as castEncoding } from "ext:deno_node/_utils.ts";
import {
ERR_INVALID_ARG_TYPE,
ERR_INVALID_THIS,
ERR_UNKNOWN_ENCODING,
NodeError,
} from "ext:deno_node/internal/errors.ts";
import { core, primordials } from "ext:core/mod.js";
const {
ArrayBufferIsView,
ObjectDefineProperties,
Symbol,
MathMin,
DataViewPrototypeGetBuffer,
ObjectPrototypeIsPrototypeOf,
String,
TypedArrayPrototypeGetBuffer,
StringPrototypeToLowerCase,
} = primordials;
const { isTypedArray } = core;
const { MAX_STRING_LENGTH } = constants;
// Escape hatch used to cast a plain `string` encoding name to
// `BufferEncoding`, which doesn't seem nameable from here.
// Only used at call sites in this file that forward an encoding name.
// deno-lint-ignore no-explicit-any
type Any = any;
/**
 * Resolves the encoding name given to the `StringDecoder` constructor.
 *
 * The name is first run through `castEncoding` (a missing `enc` is forwarded
 * as `null`). If that yields nothing, only the legacy `"raw"` spelling
 * (compared case-insensitively) is tolerated.
 *
 * @param enc Encoding name supplied by the caller, if any.
 * @returns The name produced by `castEncoding`, or `enc` itself for `"raw"`.
 * @throws {ERR_UNKNOWN_ENCODING} If `castEncoding` yields nothing and `enc`
 * is not the string `"raw"`.
 */
function normalizeEncoding(enc?: string): string {
  const encoding = castEncoding(enc ?? null);
  if (encoding) {
    return String(encoding);
  }
  if (typeof enc !== "string" || StringPrototypeToLowerCase(enc) !== "raw") {
    throw new ERR_UNKNOWN_ENCODING(
      enc as Any,
    );
  }
  // `encoding` is undefined here, so stringifying it would report the
  // literal name "undefined". Hand back the caller's own name instead.
  return enc;
}
/**
 * Tells whether `buf` is a `Buffer` instance, i.e. `Buffer.prototype` is on
 * its prototype chain, as opposed to some other view that merely satisfies
 * the `Buffer` parameter type.
 *
 * The result is meant to be used as a truthy/falsy flag: it is `false` when
 * the prototype check fails, otherwise the value of `buf.BYTES_PER_ELEMENT`.
 */
function isBufferType(buf: Buffer) {
  if (!ObjectPrototypeIsPrototypeOf(Buffer.prototype, buf)) {
    return false;
  }
  return buf.BYTES_PER_ELEMENT;
}
/**
 * Validates the chunk passed to the decoder and returns it as a `Buffer`.
 *
 * `Buffer` instances are returned unchanged. Any other `TypedArray` or
 * `DataView` is wrapped in a `Buffer` that shares the same memory and covers
 * exactly the bytes the view exposes.
 *
 * @param buf The chunk to decode.
 * @returns A `Buffer` over the bytes of `buf`.
 * @throws {ERR_INVALID_ARG_TYPE} If `buf` is not an `ArrayBufferView`.
 */
function normalizeBuffer(buf: Buffer) {
  if (!ArrayBufferIsView(buf)) {
    throw new ERR_INVALID_ARG_TYPE(
      "buf",
      ["Buffer", "TypedArray", "DataView"],
      buf,
    );
  }
  if (isBufferType(buf)) {
    return buf;
  }
  const backing = isTypedArray(buf)
    ? TypedArrayPrototypeGetBuffer(buf)
    : DataViewPrototypeGetBuffer(buf);
  // A view may cover only a slice of its backing ArrayBuffer (e.g. the result
  // of `subarray()`), so the offset and length must be carried over;
  // otherwise bytes outside the view would be decoded as well.
  // deno-lint-ignore prefer-primordials
  return Buffer.from(backing, buf.byteOffset, buf.byteLength);
}
/**
 * Converts the `[start, end)` range of `buf` to a string using `encoding`.
 *
 * @param buf Source bytes.
 * @param encoding Encoding name forwarded to `buf.toString`.
 * @param start First byte to decode; treated as 0 for the length check when omitted.
 * @param end One past the last byte to decode; treated as `buf.length` for the
 * length check when omitted.
 * @returns The decoded string.
 * @throws {NodeError} `ERR_STRING_TOO_LONG` if the byte range is longer than
 * `MAX_STRING_LENGTH`.
 */
function bufferToString(
  buf: Buffer,
  encoding?: string,
  start?: number,
  end?: number,
): string {
  const upper = end ?? buf.length;
  const lower = start ?? 0;
  const byteCount = upper - lower;
  if (byteCount > MAX_STRING_LENGTH) {
    throw new NodeError("ERR_STRING_TOO_LONG", "string exceeds maximum length");
  }
  // The caller's original (possibly undefined) bounds are forwarded untouched.
  // deno-lint-ignore prefer-primordials
  return buf.toString(encoding as Any, start, end);
}
// the heart of the logic, decodes a buffer, storing
// incomplete characters in a buffer if applicable
/**
 * Decodes one chunk and returns the resulting string.
 *
 * For the Utf8, Utf16 and Base64 encodings the decoder is stateful:
 * - `this.lastChar` holds the bytes of a character (or base64 group) that
 *   was cut off at the end of a previous chunk,
 * - `this[kBufferedBytes]` is how many bytes are currently stored there,
 * - `this[kMissingBytes]` is how many more bytes are needed to complete it.
 *
 * For every other encoding the whole chunk is converted directly.
 *
 * @param buf The chunk to decode (already normalized to a `Buffer`).
 * @returns The text decoded from the pending bytes plus this chunk, minus any
 * incomplete trailing unit, which is stashed for the next call.
 */
function decode(this: StringDecoder, buf: Buffer) {
const enc = this.enc;
// [bufIdx, bufEnd) is the part of `buf` that gets decoded directly; bytes
// before bufIdx are consumed to finish a pending char, bytes from bufEnd on
// are stashed in `lastChar` for the next call.
let bufIdx = 0;
let bufEnd = buf.length;
// `prepend` is the char completed from a previous chunk, `rest` is the text
// decoded from the body of this chunk.
let prepend = "";
let rest = "";
if (
enc === Encoding.Utf8 || enc === Encoding.Utf16 || enc === Encoding.Base64
) {
// check if we need to finish an incomplete char from the last chunk
// written. If we do, we copy the bytes into our `lastChar` buffer
// and prepend the completed char to the result of decoding the rest of the buffer
if (this[kMissingBytes] > 0) {
if (enc === Encoding.Utf8) {
// Edge case for incomplete character at a chunk boundary
// (see https://github.com/nodejs/node/blob/73025c4dec042e344eeea7912ed39f7b7c4a3991/src/string_decoder.cc#L74)
// Scan at most `kMissingBytes` leading bytes of the new chunk (bufIdx is
// still 0 here, so `buf[i]` indexes from the start of the chunk).
for (
let i = 0;
i < buf.length - bufIdx && i < this[kMissingBytes];
i++
) {
if ((buf[i] & 0xC0) !== 0x80) {
// We expected a continuation byte, but got something else.
// Stop trying to decode the incomplete char, and assume
// the byte we got starts a new char.
// Keep the `i` continuation bytes seen so far with the pending char;
// with kMissingBytes now 0 the block below emits it as-is.
this[kMissingBytes] = 0;
buf.copy(this.lastChar, this[kBufferedBytes], bufIdx, bufIdx + i);
this[kBufferedBytes] += i;
bufIdx += i;
break;
}
}
}
// Move as many of the still-missing bytes as this chunk can supply into
// `lastChar` and update both counters accordingly.
const bytesToCopy = MathMin(buf.length - bufIdx, this[kMissingBytes]);
buf.copy(
this.lastChar,
this[kBufferedBytes],
bufIdx,
bufIdx + bytesToCopy,
);
bufIdx += bytesToCopy;
this[kBufferedBytes] += bytesToCopy;
this[kMissingBytes] -= bytesToCopy;
if (this[kMissingBytes] === 0) {
// we have all the bytes, complete the char
prepend = bufferToString(
this.lastChar,
this.encoding,
0,
this[kBufferedBytes],
);
// reset the char buffer
this[kBufferedBytes] = 0;
}
}
if (buf.length - bufIdx === 0) {
// we advanced the bufIdx, so we may have completed the
// incomplete char
// Nothing is left in the chunk: the result is just the completed char, if
// any. Moving it into `rest` makes the return below yield it unchanged.
rest = prepend.length > 0 ? prepend : "";
prepend = "";
} else {
// no characters left to finish
// check if the end of the buffer has an incomplete
// character, if so we write it into our `lastChar` buffer and
// truncate buf
if (enc === Encoding.Utf8 && (buf[buf.length - 1] & 0x80)) {
// The chunk ends on a non-ASCII byte: walk backwards, counting bytes in
// kBufferedBytes, until the lead byte of that character is found.
for (let i = buf.length - 1;; i--) {
this[kBufferedBytes] += 1;
if ((buf[i] & 0xC0) === 0x80) {
// Doesn't start a character (i.e. it's a trailing byte)
if (this[kBufferedBytes] >= 4 || i === 0) {
// invalid utf8, we'll just pass it to the underlying decoder
this[kBufferedBytes] = 0;
break;
}
} else {
// First byte of a UTF-8 char, check
// to see how long it should be
// kMissingBytes temporarily holds the TOTAL length of the character; it
// is turned into the remaining count a few lines below.
if ((buf[i] & 0xE0) === 0xC0) {
this[kMissingBytes] = 2;
} else if ((buf[i] & 0xF0) === 0xE0) {
this[kMissingBytes] = 3;
} else if ((buf[i] & 0xF8) === 0xF0) {
this[kMissingBytes] = 4;
} else {
// invalid
this[kBufferedBytes] = 0;
break;
}
if (this[kBufferedBytes] >= this[kMissingBytes]) {
// We have enough trailing bytes to complete
// the char
this[kMissingBytes] = 0;
this[kBufferedBytes] = 0;
}
// total length - bytes already present = bytes still needed
// (0 - 0 when the char turned out to be complete).
this[kMissingBytes] -= this[kBufferedBytes];
break;
}
}
} else if (enc === Encoding.Utf16) {
if ((buf.length - bufIdx) % 2 === 1) {
// Have half of a code unit
this[kBufferedBytes] = 1;
this[kMissingBytes] = 1;
} else if ((buf[buf.length - 1] & 0xFC) === 0xD8) {
// 2 bytes out of a 4 byte UTF-16 char
// (the final byte lies in 0xD8..0xDB, so the last code unit needs a
// second code unit to follow it)
this[kBufferedBytes] = 2;
this[kMissingBytes] = 2;
}
} else if (enc === Encoding.Base64) {
// Only whole groups of 3 bytes are decoded now; the remainder is held
// back until the group is complete.
this[kBufferedBytes] = (buf.length - bufIdx) % 3;
if (this[kBufferedBytes] > 0) {
this[kMissingBytes] = 3 - this[kBufferedBytes];
}
}
if (this[kBufferedBytes] > 0) {
// Copy the bytes that make up the incomplete char
// from the end of the buffer into our `lastChar` buffer
buf.copy(
this.lastChar,
0,
buf.length - this[kBufferedBytes],
);
// Exclude the stashed tail from the range decoded below.
bufEnd -= this[kBufferedBytes];
}
rest = bufferToString(buf, this.encoding, bufIdx, bufEnd);
}
if (prepend.length === 0) {
return rest;
} else {
return prepend + rest;
}
} else {
// Remaining encodings keep no state between chunks here: convert the
// whole chunk in one go.
return bufferToString(buf, this.encoding, bufIdx, bufEnd);
}
}
/**
 * Emits whatever is still stored in `this.lastChar` as a string and clears
 * the pending-byte counters.
 *
 * For Utf16, an odd number of pending bytes means the last byte is half a
 * code unit; that byte is dropped before decoding.
 *
 * @returns The decoded pending bytes, or `""` when nothing is pending.
 */
function flush(this: StringDecoder) {
  const hasDanglingByte = this.enc === Encoding.Utf16 &&
    this[kBufferedBytes] % 2 === 1;
  if (hasDanglingByte) {
    // ignore trailing byte if it isn't a complete code unit (2 bytes)
    this[kBufferedBytes] -= 1;
    this[kMissingBytes] -= 1;
  }

  const pending = this[kBufferedBytes];
  if (pending === 0) {
    return "";
  }

  const ret = bufferToString(this.lastChar, this.encoding, 0, pending);
  this[kBufferedBytes] = 0;
  this[kMissingBytes] = 0;
  return ret;
}
/**
 * Internal tag for the encoding a decoder instance works with.
 * The numeric values are spelled out explicitly; they match the implicit
 * auto-numbering of the members in this order.
 */
enum Encoding {
  Utf8 = 0,
  Base64 = 1,
  Utf16 = 2,
  Ascii = 3,
  Latin1 = 4,
  Hex = 5,
}
// Symbol-keyed state slots on a decoder instance.
// kBufferedBytes: number of bytes currently stored in `lastChar`.
const kBufferedBytes = Symbol("bufferedBytes");
// kMissingBytes: number of bytes still needed to complete the stored char.
const kMissingBytes = Symbol("missingBytes");
/** Shape of a decoder instance as seen by the functions in this file. */
type StringDecoder = {
  // --- configuration ---
  /** Normalized encoding name. */
  encoding: string;
  /** Internal tag corresponding to `encoding`. */
  enc: Encoding;

  // --- pending-character state ---
  /** Scratch buffer for a character cut off at a chunk boundary. */
  lastChar: Buffer;
  /** Bytes currently stored in `lastChar`. */
  [kBufferedBytes]: number;
  /** Bytes still needed to complete the character in `lastChar`. */
  [kMissingBytes]: number;
  /** Accessor exposing `[kMissingBytes]`. */
  lastNeed: number;
  /** Accessor exposing `[kBufferedBytes] + [kMissingBytes]`. */
  lastTotal: number;

  // --- operations ---
  write: (buf: Buffer) => string;
  end: (buf: Buffer) => string;
  text: (buf: Buffer, idx: number) => string;
  decode: (buf: Buffer) => string;
  flush: () => string;
};
/*
 * StringDecoder turns a sequence of byte chunks into a sequence of JS
 * strings while making sure multi-byte characters that straddle a chunk
 * boundary are not broken apart.
 *
 * The constructor resolves the encoding, allocates a scratch buffer sized
 * for the largest unit that may need to be carried between chunks (4 bytes
 * for utf8/utf16le, 3 for base64, none otherwise) and resets the state.
 */
export function StringDecoder(this: Partial<StringDecoder>, encoding?: string) {
  const resolved = normalizeEncoding(encoding);

  // Anything not matched below is treated as Utf8 with no scratch space.
  let kind: Encoding = Encoding.Utf8;
  let scratchSize = 0;
  if (resolved === "utf8") {
    kind = Encoding.Utf8;
    scratchSize = 4;
  } else if (resolved === "base64") {
    kind = Encoding.Base64;
    scratchSize = 3;
  } else if (resolved === "utf16le") {
    kind = Encoding.Utf16;
    scratchSize = 4;
  } else if (resolved === "hex") {
    kind = Encoding.Hex;
  } else if (resolved === "latin1") {
    kind = Encoding.Latin1;
  } else if (resolved === "ascii") {
    kind = Encoding.Ascii;
  }

  this.encoding = resolved;
  this.lastChar = Buffer.allocUnsafe(scratchSize);
  this.enc = kind;
  this[kBufferedBytes] = 0;
  this[kMissingBytes] = 0;
  this.flush = flush;
  this.decode = decode;
}
/**
 * Decodes `buf` and returns the resulting string, holding back any
 * incomplete multi-byte character at the end of the Buffer, TypedArray or
 * DataView. A string argument is returned as-is.
 *
 * Throws ERR_INVALID_ARG_TYPE (via `normalizeBuffer`) for a non-view
 * argument, and ERR_INVALID_THIS when the receiver carries no decoder state.
 */
StringDecoder.prototype.write = function write(buf: Buffer): string {
  if (typeof buf !== "string") {
    // Argument validation happens before the receiver check.
    const chunk = normalizeBuffer(buf);
    if (this[kBufferedBytes] === undefined) {
      throw new ERR_INVALID_THIS("StringDecoder");
    }
    return this.decode(chunk);
  }
  return buf;
};
/**
 * Optionally decodes a final chunk, then appends whatever input is still
 * held in the internal buffer. Afterwards the decoder can be reused for new
 * input.
 */
StringDecoder.prototype.end = function end(buf: Buffer): string {
  const decoded = buf === undefined ? "" : this.write(buf);
  if (this[kBufferedBytes] > 0) {
    return decoded + this.flush();
  }
  return decoded;
};
// Undocumented-but-reachable API carried over from node's old
// implementation (node's tests assert on it, so it has to exist).
// Discards any pending state and decodes `buf` starting at `offset`.
StringDecoder.prototype.text = function text(
  buf: Buffer,
  offset: number,
): string {
  // Drop pending state before touching `buf`.
  this[kMissingBytes] = 0;
  this[kBufferedBytes] = 0;
  const tail = buf.subarray(offset);
  return this.write(tail);
};
// Read-only accessors mirroring the internal counters under the property
// names `lastNeed` and `lastTotal`.
ObjectDefineProperties(StringDecoder.prototype, {
  // Bytes still required to finish the pending character.
  lastNeed: {
    get(this: StringDecoder): number {
      const missing = this[kMissingBytes];
      return missing;
    },
    enumerable: true,
    configurable: true,
  },
  // Full size of the pending character: bytes held plus bytes still needed.
  lastTotal: {
    get(this: StringDecoder): number {
      const held = this[kBufferedBytes];
      const missing = this[kMissingBytes];
      return held + missing;
    },
    enumerable: true,
    configurable: true,
  },
});
export default { StringDecoder };