BREAKING(std/encoding/csv): improve the definition of ParseOptions (#7714)

2024-12-23 15:49:44 -05:00 · 2020-09-28 03:20:46 +09:00 · 2020-09-28 03:20:46 +09:00 · 94dcef714d
commit 94dcef714d
parent 5db72dcaf3
3 changed files with 101 additions and 72 deletions
--- a/std/encoding/README.md
+++ b/std/encoding/README.md
@ -37,25 +37,29 @@ writeVarbig(w: Deno.Writer, x: bigint, o: VarbigOptions = {}): Promise<number>
 Parse the CSV from the `reader` with the options provided and return
 `string[][]`.

-#### `parse(input: string | BufReader, opt: ParseOptions = { header: false }): Promise<unknown[]>`:
+#### `parse(input: string | BufReader, opt: ParseOptions = { skipFirstRow: false }): Promise<unknown[]>`:

 Parse the CSV string/buffer with the options provided. The result of this
 function is as follows:

- If you don't provide both `opt.header` and `opt.parse`, it returns
-  `string[][]`.
- If you provide `opt.header` but not `opt.parse`, it returns `object[]`.
+- If you provide none of `opt.skipFirstRow`, `opt.parse`, and `opt.columns`, it
+  returns `string[][]`.
+- If you provide `opt.skipFirstRow` or `opt.columns` but not `opt.parse`, it
+  returns `object[]`.
 - If you provide `opt.parse`, it returns an array where each element is the
  value returned from `opt.parse`.

 ##### `ParseOptions`

- **`header: boolean | string[] | HeaderOptions[];`**: If a boolean is provided,
-  the first line will be used as Header definitions. If `string[]` or
-  `HeaderOptions[]` those names will be used for header definition.
+- **`skipFirstRow?: boolean;`**: If you provide `skipFirstRow: true` and
+  `columns`, the first line will be skipped. If you provide `skipFirstRow: true`
+  but not `columns`, the first line will be skipped and used as header
+  definitions.
+- **`columns?: string[] | ColumnOptions[];`**: If you provide `string[]` or
+  `ColumnOptions[]`, those names will be used for header definition.
 - **`parse?: (input: unknown) => unknown;`**: Parse function for the row, which
  will be executed after parsing of all columns. Therefore if you don't provide
-  header and parse function with headers, input will be `string[]`.
+  `skipFirstRow` or `columns`, the input of `parse` will be `string[]`.

 ##### `HeaderOptions`

--- a/std/encoding/csv.ts
+++ b/std/encoding/csv.ts
@ -52,7 +52,7 @@ export class ParseError extends Error {
 }

 /**
- * @property comma - Character which separates values. Default: ','
+ * @property separator - Character which separates values. Default: ','
 * @property comment - Character to start a comment. Default: '#'
 * @property trimLeadingSpace - Flag to trim the leading space of the value.
 *           Default: 'false'
@ -62,7 +62,7 @@ export class ParseError extends Error {
 *           If == 0, first row is used as referral for the number of fields.
 */
 export interface ReadOptions {
-  comma?: string;
+  separator?: string;
  comment?: string;
  trimLeadingSpace?: boolean;
  lazyQuotes?: boolean;
@ -70,16 +70,16 @@ export interface ReadOptions {
 }

 function chkOptions(opt: ReadOptions): void {
-  if (!opt.comma) {
-    opt.comma = ",";
+  if (!opt.separator) {
+    opt.separator = ",";
  }
  if (!opt.trimLeadingSpace) {
    opt.trimLeadingSpace = false;
  }
  if (
-    INVALID_RUNE.includes(opt.comma) ||
+    INVALID_RUNE.includes(opt.separator) ||
    (typeof opt.comment === "string" && INVALID_RUNE.includes(opt.comment)) ||
-    opt.comma === opt.comment
+    opt.separator === opt.comment
  ) {
    throw new Error(ERR_INVALID_DELIM);
  }
@ -88,7 +88,7 @@ function chkOptions(opt: ReadOptions): void {
 async function readRecord(
  startLine: number,
  reader: BufReader,
-  opt: ReadOptions = { comma: ",", trimLeadingSpace: false },
+  opt: ReadOptions = { separator: ",", trimLeadingSpace: false },
 ): Promise<string[] | null> {
  const tp = new TextProtoReader(reader);
  let line = await readLine(tp);
@ -103,13 +103,13 @@ async function readRecord(
    return [];
  }

-  assert(opt.comma != null);
+  assert(opt.separator != null);

  let fullLine = line;
  let quoteError: ParseError | null = null;
  const quote = '"';
  const quoteLen = quote.length;
-  const commaLen = opt.comma.length;
+  const separatorLen = opt.separator.length;
  let recordBuffer = "";
  const fieldIndexes = [] as number[];
  parseField:
@ -120,7 +120,7 @@ async function readRecord(

    if (line.length === 0 || !line.startsWith(quote)) {
      // Non-quoted string field
-      const i = line.indexOf(opt.comma);
+      const i = line.indexOf(opt.separator);
      let field = line;
      if (i >= 0) {
        field = field.substring(0, i);
@ -144,7 +144,7 @@ async function readRecord(
      recordBuffer += field;
      fieldIndexes.push(recordBuffer.length);
      if (i >= 0) {
-        line = line.substring(i + commaLen);
+        line = line.substring(i + separatorLen);
        continue parseField;
      }
      break parseField;
@ -161,9 +161,9 @@ async function readRecord(
            // `""` sequence (append quote).
            recordBuffer += quote;
            line = line.substring(quoteLen);
-          } else if (line.startsWith(opt.comma)) {
+          } else if (line.startsWith(opt.separator)) {
            // `","` sequence (end of field).
-            line = line.substring(commaLen);
+            line = line.substring(separatorLen);
            fieldIndexes.push(recordBuffer.length);
            continue parseField;
          } else if (0 === line.length) {
@ -281,7 +281,7 @@ async function readLine(tp: TextProtoReader): Promise<string | null> {
 export async function readMatrix(
  reader: BufReader,
  opt: ReadOptions = {
-    comma: ",",
+    separator: ",",
    trimLeadingSpace: false,
    lazyQuotes: false,
  },
@ -324,13 +324,13 @@ export async function readMatrix(
 /**
 * Parse the CSV string/buffer with the options provided.
 *
- * HeaderOptions provides the column definition
+ * ColumnOptions provides the column definition
 * and the parse function for each entry of the
 * column.
 */
-export interface HeaderOptions {
+export interface ColumnOptions {
  /**
-   * Name of the header to be used as property
+   * Name of the column to be used as property
   */
  name: string;
  /**
@ -343,14 +343,20 @@ export interface HeaderOptions {

 export interface ParseOptions extends ReadOptions {
  /**
-   * If a boolean is provided, the first line will be used as Header definitions.
-   * If `string[]` or `HeaderOptions[]` those names will be used for header definition.
+   * If you provide `skipFirstRow: true` and `columns`, the first line will be skipped.
+   * If you provide `skipFirstRow: true` but not `columns`, the first line will be skipped and used as header definitions.
   */
-  header: boolean | string[] | HeaderOptions[];
+  skipFirstRow?: boolean;
+
+  /**
+   * If you provide `string[]` or `ColumnOptions[]`, those names will be used for header definition.
+   */
+  columns?: string[] | ColumnOptions[];
+
  /** Parse function for rows.
   * Example:
   *     const r = await parseFile('a,b,c\ne,f,g\n', {
-   *      header: ["this", "is", "sparta"],
+   *      columns: ["this", "is", "sparta"],
   *       parse: (e: Record<string, unknown>) => {
   *         return { super: e.this, street: e.is, fighter: e.sparta };
   *       }
@ -370,14 +376,14 @@ export interface ParseOptions extends ReadOptions {
 * for columns and rows.
 * @param input Input to parse. Can be a string or BufReader.
 * @param opt options of the parser.
- * @returns If you don't provide both `opt.header` and `opt.parse`, it returns `string[][]`.
- *   If you provide `opt.header` but not `opt.parse`, it returns `object[]`.
+ * @returns If you provide none of `opt.skipFirstRow`, `opt.parse`, and `opt.columns`, it returns `string[][]`.
+ *   If you provide `opt.skipFirstRow` or `opt.columns` but not `opt.parse`, it returns `object[]`.
 *   If you provide `opt.parse`, it returns an array where each element is the value returned from `opt.parse`.
 */
 export async function parse(
  input: string | BufReader,
  opt: ParseOptions = {
-    header: false,
+    skipFirstRow: false,
  },
 ): Promise<unknown[]> {
  let r: string[][];
@ -386,27 +392,15 @@ export async function parse(
  } else {
    r = await readMatrix(new BufReader(new StringReader(input)), opt);
  }
-  if (opt.header) {
-    let headers: HeaderOptions[] = [];
+  if (opt.skipFirstRow || opt.columns) {
+    let headers: ColumnOptions[] = [];
    let i = 0;
-    if (Array.isArray(opt.header)) {
-      if (typeof opt.header[0] !== "string") {
-        headers = opt.header as HeaderOptions[];
-      } else {
-        const h = opt.header as string[];
-        headers = h.map(
-          (e): HeaderOptions => {
-            return {
-              name: e,
-            };
-          },
-        );
-      }
-    } else {
+
+    if (opt.skipFirstRow) {
      const head = r.shift();
      assert(head != null);
      headers = head.map(
-        (e): HeaderOptions => {
+        (e): ColumnOptions => {
          return {
            name: e,
          };
@ -414,6 +408,21 @@ export async function parse(
      );
      i++;
    }
+
+    if (opt.columns) {
+      if (typeof opt.columns[0] !== "string") {
+        headers = opt.columns as ColumnOptions[];
+      } else {
+        const h = opt.columns as string[];
+        headers = h.map(
+          (e): ColumnOptions => {
+            return {
+              name: e,
+            };
+          },
+        );
+      }
+    }
    return r.map((e): unknown => {
      if (e.length !== headers.length) {
        throw `Error number of fields line:${i}`;
--- a/std/encoding/csv_test.ts
+++ b/std/encoding/csv_test.ts
@ -17,6 +17,7 @@ import {
 import { StringReader } from "../io/readers.ts";
 import { BufReader } from "../io/bufio.ts";

+// Test cases for `readMatrix()`
 const testCases = [
  {
    Name: "Simple",
@ -60,7 +61,7 @@ zzz,yyy,xxx`,
    Name: "Semicolon",
    Input: "a;b;c\n",
    Output: [["a", "b", "c"]],
-    Comma: ";",
+    Separator: ";",
  },
  {
    Name: "MultiLine",
@ -334,14 +335,14 @@ x,,,
    Input: "a£b,c£ \td,e\n€ comment\n",
    Output: [["a", "b,c", "d,e"]],
    TrimLeadingSpace: true,
-    Comma: "£",
+    Separator: "£",
    Comment: "€",
  },
  {
    Name: "NonASCIICommaAndCommentWithQuotes",
    Input: 'a€"  b,"€ c\nλ comment\n',
    Output: [["a", "  b,", " c"]],
-    Comma: "€",
+    Separator: "€",
    Comment: "λ",
  },
  {
@ -350,7 +351,7 @@ x,,,
    Name: "NonASCIICommaConfusion",
    Input: '"abθcd"λefθgh',
    Output: [["abθcd", "efθgh"]],
-    Comma: "λ",
+    Separator: "λ",
    Comment: "€",
  },
  {
@ -415,17 +416,17 @@ x,,,
  },
  {
    Name: "BadComma1",
-    Comma: "\n",
+    Separator: "\n",
    Error: new Error(ERR_INVALID_DELIM),
  },
  {
    Name: "BadComma2",
-    Comma: "\r",
+    Separator: "\r",
    Error: new Error(ERR_INVALID_DELIM),
  },
  {
    Name: "BadComma3",
-    Comma: '"',
+    Separator: '"',
    Error: new Error(ERR_INVALID_DELIM),
  },
  {
@ -440,7 +441,7 @@ x,,,
  },
  {
    Name: "BadCommaComment",
-    Comma: "X",
+    Separator: "X",
    Comment: "X",
    Error: new Error(ERR_INVALID_DELIM),
  },
@ -449,13 +450,13 @@ for (const t of testCases) {
  Deno.test({
    name: `[CSV] ${t.Name}`,
    async fn(): Promise<void> {
-      let comma = ",";
+      let separator = ",";
      let comment: string | undefined;
      let fieldsPerRec: number | undefined;
      let trim = false;
      let lazyquote = false;
-      if (t.Comma) {
-        comma = t.Comma;
+      if (t.Separator) {
+        separator = t.Separator;
      }
      if (t.Comment) {
        comment = t.Comment;
@ -475,7 +476,7 @@ for (const t of testCases) {
          await readMatrix(
            new BufReader(new StringReader(t.Input ?? "")),
            {
-              comma: comma,
+              separator,
              comment: comment,
              trimLeadingSpace: trim,
              fieldsPerRecord: fieldsPerRec,
@ -489,7 +490,7 @@ for (const t of testCases) {
        actual = await readMatrix(
          new BufReader(new StringReader(t.Input ?? "")),
          {
-            comma: comma,
+            separator,
            comment: comment,
            trimLeadingSpace: trim,
            fieldsPerRecord: fieldsPerRec,
@ -507,19 +508,19 @@ const parseTestCases = [
  {
    name: "simple",
    in: "a,b,c",
-    header: false,
+    skipFirstRow: false,
    result: [["a", "b", "c"]],
  },
  {
    name: "simple Bufreader",
    in: new BufReader(new StringReader("a,b,c")),
-    header: false,
+    skipFirstRow: false,
    result: [["a", "b", "c"]],
  },
  {
    name: "multiline",
    in: "a,b,c\ne,f,g\n",
-    header: false,
+    skipFirstRow: false,
    result: [
      ["a", "b", "c"],
      ["e", "f", "g"],
@ -528,13 +529,13 @@ const parseTestCases = [
  {
    name: "header mapping boolean",
    in: "a,b,c\ne,f,g\n",
-    header: true,
+    skipFirstRow: true,
    result: [{ a: "e", b: "f", c: "g" }],
  },
  {
    name: "header mapping array",
    in: "a,b,c\ne,f,g\n",
-    header: ["this", "is", "sparta"],
+    columns: ["this", "is", "sparta"],
    result: [
      { this: "a", is: "b", sparta: "c" },
      { this: "e", is: "f", sparta: "g" },
@ -543,7 +544,7 @@ const parseTestCases = [
  {
    name: "header mapping object",
    in: "a,b,c\ne,f,g\n",
-    header: [{ name: "this" }, { name: "is" }, { name: "sparta" }],
+    columns: [{ name: "this" }, { name: "is" }, { name: "sparta" }],
    result: [
      { this: "a", is: "b", sparta: "c" },
      { this: "e", is: "f", sparta: "g" },
@ -552,7 +553,7 @@ const parseTestCases = [
  {
    name: "header mapping parse entry",
    in: "a,b,c\ne,f,g\n",
-    header: [
+    columns: [
      {
        name: "this",
        parse: (e: string): string => {
@ -583,7 +584,7 @@ const parseTestCases = [
    parse: (e: string[]): unknown => {
      return { super: e[0], street: e[1], fighter: e[2] };
    },
-    header: false,
+    skipFirstRow: false,
    result: [
      { super: "a", street: "b", fighter: "c" },
      { super: "e", street: "f", fighter: "g" },
@ -592,7 +593,7 @@ const parseTestCases = [
  {
    name: "header mapping object parseline",
    in: "a,b,c\ne,f,g\n",
-    header: [{ name: "this" }, { name: "is" }, { name: "sparta" }],
+    columns: [{ name: "this" }, { name: "is" }, { name: "sparta" }],
    parse: (e: Record<string, unknown>): unknown => {
      return { super: e.this, street: e.is, fighter: e.sparta };
    },
@ -601,6 +602,20 @@ const parseTestCases = [
      { super: "e", street: "f", fighter: "g" },
    ],
  },
+  {
+    name: "provides both opts.skipFirstRow and opts.columns",
+    in: "a,b,1\nc,d,2\ne,f,3",
+    skipFirstRow: true,
+    columns: [
+      { name: "foo" },
+      { name: "bar" },
+      { name: "baz", parse: (e: string) => Number(e) },
+    ],
+    result: [
+      { foo: "c", bar: "d", baz: 2 },
+      { foo: "e", bar: "f", baz: 3 },
+    ],
+  },
 ];

 for (const testCase of parseTestCases) {
@ -608,7 +623,8 @@ for (const testCase of parseTestCases) {
    name: `[CSV] Parse ${testCase.name}`,
    async fn(): Promise<void> {
      const r = await parse(testCase.in, {
-        header: testCase.header,
+        skipFirstRow: testCase.skipFirstRow,
+        columns: testCase.columns,
        parse: testCase.parse as (input: unknown) => unknown,
      });
      assertEquals(r, testCase.result);