| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559 |
- 'use strict';
- const utils = require('./utils.cjs');
- const NO_VALUE = Symbol('empty');
- const STACK_OBJECT = 1;
- const STACK_ARRAY = 2;
- const MODE_JSON = 0;
- const MODE_JSONL = 1;
- const MODE_JSONL_AUTO = 2;
- const decoder = new TextDecoder();
- function adjustPosition(error, jsonParseOffset) {
- if (error.name === 'SyntaxError' && jsonParseOffset) {
- error.message = error.message.replace(/at position (\d+)/, (_, pos) =>
- 'at position ' + (Number(pos) + jsonParseOffset)
- );
- }
- return error;
- }
- function append(array, elements) {
- // Note: Avoid using array.push(...elements) since it may lead to
- // "RangeError: Maximum call stack size exceeded" for long arrays
- const initialLength = array.length;
- array.length += elements.length;
- for (let i = 0; i < elements.length; i++) {
- array[initialLength + i] = elements[i];
- }
- }
- function resolveParseMode(mode) {
- switch (mode) {
- case 'json':
- return MODE_JSON;
- case 'jsonl':
- return MODE_JSONL;
- case 'auto':
- return MODE_JSONL_AUTO;
- default:
- throw new TypeError('Invalid options: `mode` should be "json", "jsonl", or "auto"');
- }
- }
- function parseChunkedOptions(value) {
- const options = typeof value === 'function'
- ? { reviver: value }
- : value || {};
- return {
- mode: resolveParseMode(options.mode ?? 'json'),
- reviver: options.reviver ?? null,
- onRootValue: options.onRootValue ?? null,
- onChunk: options.onChunk ?? null
- };
- }
- function applyReviver(value, reviver) {
- return walk({ '': value }, '', value);
- function walk(holder, key, value) {
- if (value && typeof value === 'object') {
- for (const childKey of Object.keys(value)) {
- const childValue = value[childKey];
- const newValue = walk(value, childKey, childValue);
- if (newValue === undefined) {
- delete value[childKey];
- } else if (newValue !== childValue) {
- value[childKey] = newValue;
- }
- }
- }
- return reviver.call(holder, key, value);
- }
- }
- async function parseChunked(chunkEmitter, optionsOrReviver) {
- const { mode, reviver, onRootValue, onChunk } = parseChunkedOptions(optionsOrReviver);
- const iterable = typeof chunkEmitter === 'function'
- ? chunkEmitter()
- : chunkEmitter;
- if (utils.isIterable(iterable)) {
- const parser = createChunkParser(mode, reviver, onRootValue, onChunk);
- try {
- for await (const chunk of iterable) {
- if (typeof chunk !== 'string' && !ArrayBuffer.isView(chunk)) {
- throw new TypeError('Invalid chunk: Expected string, TypedArray or Buffer');
- }
- parser.push(chunk);
- }
- return parser.finish();
- } catch (e) {
- throw adjustPosition(e, parser.jsonParseOffset);
- }
- }
- throw new TypeError(
- 'Invalid chunk emitter: Expected an Iterable, AsyncIterable, generator, ' +
- 'async generator, or a function returning an Iterable or AsyncIterable'
- );
- }
- function createChunkParser(parseMode, reviver, onRootValue, onChunk) {
- let rootValues = parseMode === MODE_JSONL ? [] : null;
- let rootValuesCount = 0;
- let currentRootValue = NO_VALUE;
- let currentRootValueCursor = null;
- let consumedChunkLength = 0;
- let parsedChunkLength = 0;
- let prevArray = null;
- let prevArraySlices = [];
- let stack = new Array(100);
- let lastFlushDepth = 0;
- let flushDepth = 0;
- let stateString = false;
- let stateStringEscape = false;
- let seenNonWhiteSpace = false;
- let allowNewRootValue = true;
- let pendingByteSeq = null;
- let pendingChunk = null;
- let jsonParseOffset = 0;
- const state = Object.freeze({
- get mode() {
- return parseMode === MODE_JSONL ? 'jsonl' : 'json';
- },
- get rootValuesCount() {
- return rootValuesCount;
- },
- get consumed() {
- return consumedChunkLength;
- },
- get parsed() {
- return parsedChunkLength;
- }
- });
- return {
- push,
- finish,
- state,
- get jsonParseOffset() {
- return jsonParseOffset;
- }
- };
- function startRootValue(fragment) {
- // Extra non-whitespace after complete root value should fail to parse
- if (!allowNewRootValue) {
- jsonParseOffset -= 2;
- JSON.parse('[]' + fragment);
- }
- // In "auto" mode, switch to JSONL when a second root value is starting after a newline
- if (currentRootValue !== NO_VALUE && parseMode === MODE_JSONL_AUTO) {
- parseMode = MODE_JSONL;
- rootValues = [currentRootValue];
- }
- // Block parsing of an additional root value until a newline is encountered
- allowNewRootValue = false;
- // Parse fragment as a new root value
- currentRootValue = JSON.parse(fragment);
- }
- function finishRootValue() {
- rootValuesCount++;
- if (typeof reviver === 'function') {
- currentRootValue = applyReviver(currentRootValue, reviver);
- }
- if (typeof onRootValue === 'function') {
- onRootValue(currentRootValue, state);
- } else if (parseMode === MODE_JSONL) {
- rootValues.push(currentRootValue);
- }
- }
- function mergeArraySlices() {
- if (prevArray === null) {
- return;
- }
- if (prevArraySlices.length !== 0) {
- const newArray = prevArraySlices.length === 1
- ? prevArray.concat(prevArraySlices[0])
- : prevArray.concat(...prevArraySlices);
- if (currentRootValueCursor.prev !== null) {
- currentRootValueCursor.prev.value[currentRootValueCursor.key] = newArray;
- } else {
- currentRootValue = newArray;
- }
- currentRootValueCursor.value = newArray;
- prevArraySlices = [];
- }
- prevArray = null;
- }
- function parseAndAppend(fragment, wrap) {
- // Append new entries or elements
- if (stack[lastFlushDepth - 1] === STACK_OBJECT) {
- if (wrap) {
- jsonParseOffset--;
- fragment = '{' + fragment + '}';
- }
- Object.assign(currentRootValueCursor.value, JSON.parse(fragment));
- } else {
- if (wrap) {
- jsonParseOffset--;
- fragment = '[' + fragment + ']';
- }
- if (prevArray === currentRootValueCursor.value) {
- prevArraySlices.push(JSON.parse(fragment));
- } else {
- append(currentRootValueCursor.value, JSON.parse(fragment));
- prevArray = currentRootValueCursor.value;
- }
- }
- }
- function prepareAddition(fragment) {
- const { value } = currentRootValueCursor;
- const expectComma = Array.isArray(value)
- ? value.length !== 0
- : Object.keys(value).length !== 0;
- if (expectComma) {
- // Skip a comma at the beginning of fragment, otherwise it would
- // fail to parse
- if (fragment[0] === ',') {
- jsonParseOffset++;
- return fragment.slice(1);
- }
- // When value (an object or array) is not empty and a fragment
- // doesn't start with a comma, a single valid fragment starting
- // is a closing bracket. If it's not, a prefix is adding to fail
- // parsing. Otherwise, the sequence of chunks can be successfully
- // parsed, although it should not, e.g. ["[{}", "{}]"]
- if (fragment[0] !== '}' && fragment[0] !== ']') {
- jsonParseOffset -= 3;
- return '[[]' + fragment;
- }
- }
- return fragment;
- }
- function flush(chunk, start, end) {
- let fragment = chunk.slice(start, end);
- // Save position correction for an error in JSON.parse() if any
- jsonParseOffset = consumedChunkLength + start;
- parsedChunkLength += end - start;
- // Prepend pending chunk if any
- if (pendingChunk !== null) {
- fragment = pendingChunk + fragment;
- jsonParseOffset -= pendingChunk.length;
- parsedChunkLength += pendingChunk.length;
- pendingChunk = null;
- }
- if (flushDepth === lastFlushDepth) {
- // Depth didn't change, so it's a continuation of the current value or entire value if it's a root one
- if (lastFlushDepth === 0) {
- startRootValue(fragment);
- } else {
- parseAndAppend(prepareAddition(fragment), true);
- }
- } else if (flushDepth > lastFlushDepth) {
- // Add missed closing brackets/parentheses
- for (let i = flushDepth - 1; i >= lastFlushDepth; i--) {
- fragment += stack[i] === STACK_OBJECT ? '}' : ']';
- }
- if (lastFlushDepth === 0) {
- startRootValue(fragment);
- currentRootValueCursor = {
- value: currentRootValue,
- key: null,
- prev: null
- };
- } else {
- parseAndAppend(prepareAddition(fragment), true);
- mergeArraySlices();
- }
- // Move down to the depths to the last object/array, which is current now
- for (let i = lastFlushDepth || 1; i < flushDepth; i++) {
- let { value } = currentRootValueCursor;
- let key = null;
- if (stack[i - 1] === STACK_OBJECT) {
- // Find last entry
- // eslint-disable-next-line curly
- for (key in value);
- value = value[key];
- } else {
- // Last element
- key = value.length - 1;
- value = value[key];
- }
- currentRootValueCursor = {
- value,
- key,
- prev: currentRootValueCursor
- };
- }
- } else /* flushDepth < lastFlushDepth */ {
- fragment = prepareAddition(fragment);
- // Add missed opening brackets/parentheses
- for (let i = lastFlushDepth - 1; i >= flushDepth; i--) {
- jsonParseOffset--;
- fragment = (stack[i] === STACK_OBJECT ? '{' : '[') + fragment;
- }
- parseAndAppend(fragment, false);
- mergeArraySlices();
- for (let i = lastFlushDepth - 1; i >= flushDepth; i--) {
- currentRootValueCursor = currentRootValueCursor.prev;
- }
- }
- if (flushDepth === 0) {
- finishRootValue();
- }
- lastFlushDepth = flushDepth;
- seenNonWhiteSpace = false;
- }
- function ensureChunkString(chunk) {
- if (typeof chunk !== 'string') {
- // Suppose chunk is Buffer or Uint8Array
- // Prepend uncompleted byte sequence if any
- if (pendingByteSeq !== null) {
- const origRawChunk = chunk;
- chunk = new Uint8Array(pendingByteSeq.length + origRawChunk.length);
- chunk.set(pendingByteSeq);
- chunk.set(origRawChunk, pendingByteSeq.length);
- pendingByteSeq = null;
- }
- // In case Buffer/Uint8Array, an input is encoded in UTF8
- // Seek for parts of uncompleted UTF8 symbol on the ending
- // This makes sense only if we expect more chunks and last char is not multi-bytes
- if (chunk[chunk.length - 1] > 127) {
- for (let seqLength = 0; seqLength < chunk.length; seqLength++) {
- const byte = chunk[chunk.length - 1 - seqLength];
- // 10xxxxxx - 2nd, 3rd or 4th byte
- // 110xxxxx – first byte of 2-byte sequence
- // 1110xxxx - first byte of 3-byte sequence
- // 11110xxx - first byte of 4-byte sequence
- if (byte >> 6 === 3) {
- seqLength++;
- // If the sequence is really incomplete, then preserve it
- // for the future chunk and cut off it from the current chunk
- if ((seqLength !== 4 && byte >> 3 === 0b11110) ||
- (seqLength !== 3 && byte >> 4 === 0b1110) ||
- (seqLength !== 2 && byte >> 5 === 0b110)) {
- pendingByteSeq = chunk.slice(chunk.length - seqLength); // use slice to avoid tying chunk
- chunk = chunk.subarray(0, -seqLength); // use subarray to avoid buffer copy
- }
- break;
- }
- }
- }
- // Convert chunk to a string, since single decode per chunk
- // is much effective than decode multiple small substrings
- chunk = decoder.decode(chunk);
- }
- return chunk;
- }
- function push(chunk) {
- chunk = ensureChunkString(chunk);
- const chunkLength = chunk.length;
- const prevParsedChunkLength = parsedChunkLength;
- let lastFlushPoint = 0;
- let flushPoint = 0;
- // Main scan loop
- scan: for (let i = 0; i < chunkLength; i++) {
- if (stateString) {
- for (; i < chunkLength; i++) {
- if (stateStringEscape) {
- stateStringEscape = false;
- } else {
- switch (chunk.charCodeAt(i)) {
- case 0x22: /* " */
- stateString = false;
- continue scan;
- case 0x5C: /* \ */
- stateStringEscape = true;
- }
- }
- }
- break;
- }
- switch (chunk.charCodeAt(i)) {
- case 0x22: /* " */
- stateString = true;
- stateStringEscape = false;
- seenNonWhiteSpace = true;
- break;
- case 0x2C: /* , */
- flushPoint = i;
- break;
- case 0x7B: /* { */
- // Open an object
- flushPoint = i + 1;
- stack[flushDepth++] = STACK_OBJECT;
- seenNonWhiteSpace = true;
- break;
- case 0x5B: /* [ */
- // Open an array
- flushPoint = i + 1;
- stack[flushDepth++] = STACK_ARRAY;
- seenNonWhiteSpace = true;
- break;
- case 0x5D: /* ] */
- case 0x7D: /* } */
- // Close an object or array
- flushPoint = i + 1;
- if (flushDepth === 0) {
- // Unmatched closing bracket/brace at top level, should fail to parse
- break scan;
- }
- flushDepth--;
- // Flush on depth decrease related to last flush, otherwise wait for more chunks to flush together
- if (flushDepth < lastFlushDepth) {
- flush(chunk, lastFlushPoint, flushPoint);
- lastFlushPoint = flushPoint;
- }
- break;
- case 0x09: /* \t */
- case 0x0A: /* \n */
- case 0x0D: /* \r */
- case 0x20: /* space */
- if (flushDepth === 0) {
- if (seenNonWhiteSpace) {
- flushPoint = i;
- flush(chunk, lastFlushPoint, flushPoint);
- lastFlushPoint = flushPoint;
- }
- if (parseMode !== MODE_JSON &&
- allowNewRootValue === false &&
- (chunk.charCodeAt(i) === 0x0A || chunk.charCodeAt(i) === 0x0D)
- ) {
- allowNewRootValue = true;
- }
- if (flushPoint === i) {
- parsedChunkLength++;
- }
- }
- // Move points forward when they point to current position and it's a whitespace
- if (lastFlushPoint === i) {
- lastFlushPoint++;
- }
- if (flushPoint === i) {
- flushPoint++;
- }
- break;
- default:
- seenNonWhiteSpace = true;
- }
- }
- if (flushPoint > lastFlushPoint) {
- flush(chunk, lastFlushPoint, flushPoint);
- }
- // Produce pendingChunk if something left
- if (flushPoint < chunkLength) {
- if (pendingChunk !== null) {
- // When there is already a pending chunk then no flush happened,
- // appending entire chunk to pending one
- pendingChunk += chunk;
- } else {
- // Create a pending chunk, it will start with non-whitespace since
- // flushPoint was moved forward away from whitespaces on scan
- pendingChunk = chunk.slice(flushPoint, chunkLength);
- }
- }
- consumedChunkLength += chunkLength;
- if (typeof onChunk === 'function') {
- onChunk(parsedChunkLength - prevParsedChunkLength, chunk, pendingChunk, state);
- }
- }
- function finish() {
- if (pendingChunk !== null || (currentRootValue === NO_VALUE && parseMode !== MODE_JSONL)) {
- // Force the `flushDepth < lastFlushDepth` branch in flush() to prepend missed
- // opening brackets/parentheses and produce a natural JSON.parse() EOF error
- flushDepth = 0;
- flush('', 0, 0);
- }
- if (typeof onChunk === 'function') {
- parsedChunkLength = consumedChunkLength;
- onChunk(0, null, null, state);
- }
- if (typeof onRootValue === 'function') {
- return rootValuesCount;
- }
- return rootValues !== null ? rootValues : currentRootValue;
- }
- }
- exports.parseChunked = parseChunked;
|