encoding: add csv parse (denoland/deno_std#458)

Original: 167f529898
2024-11-22 15:06:54 -05:00 · 2019-05-30 15:50:29 +02:00 · 2019-05-30 15:50:29 +02:00 · 2487c45547
commit 2487c45547
parent a0ce25e606
4 changed files with 349 additions and 24 deletions
--- a/README.md
+++ b/README.md
@ -24,6 +24,7 @@ Here are the dedicated documentations of modules:

 - [colors](colors/README.md)
 - [datetime](datetime/README.md)
+- [encoding](encoding/README.md)
 - [examples](examples/README.md)
 - [flags](flags/README.md)
 - [fs](fs/README.md)
@ -33,7 +34,6 @@ Here are the dedicated documentations of modules:
 - [prettier](prettier/README.md)
 - [strings](strings/README.md)
 - [testing](testing/README.md)
- [toml](encoding/toml/README.md)
 - [ws](ws/README.md)

 ## Contributing
--- a/encoding/README.md
+++ b/encoding/README.md
@ -1,11 +1,112 @@
-# TOML
+# Encoding
+
+## CSV
+
+- **`readAll(reader: BufReader, opt: ParseOptions = { comma: ",", trimLeadingSpace: false, lazyQuotes: false } ): Promise<[string[][], BufState]>`**:
+  Read the whole buffer and output the structured CSV data.
+- **`parse(csvString: string, opt: ParseOption): Promise<unknown[]>`**:
+  See [Parse](#parse).
+
+### Parse
+
+Parse the CSV string with the options provided.
+
+#### Options
+
+##### ParseOption
+
+- **`header: boolean | string[] | HeaderOption[];`**: If `true` is provided,
+  the first line will be used as header definitions. If a `string[]` or
+  `HeaderOption[]` is provided,
+  those names will be used as the header definitions.
+- **`parse?: (input: unknown) => unknown;`**: Parse function for the row, which
+  will be executed after all columns have been parsed. If no header is provided,
+  the input will be `string[]`; otherwise it is the object built from the headers.
+
+##### HeaderOption
+
+- **`name: string;`**: Name of the header to be used as property.
+- **`parse?: (input: string) => unknown;`**: Parse function for the column.
+  This is executed on each entry of the column. This can be combined with the
+  parse function of the rows.
+
+#### Usage
+
+```ts
+// input:
+// a,b,c
+// e,f,g
+
+const r = await parseFile(filepath, {
+  header: false
+});
+// output:
+// [["a", "b", "c"], ["e", "f", "g"]]
+
+const r = await parseFile(filepath, {
+  header: true
+});
+// output:
+// [{ a: "e", b: "f", c: "g" }]
+
+const r = await parseFile(filepath, {
+  header: ["this", "is", "sparta"]
+});
+// output:
+// [
+//   { this: "a", is: "b", sparta: "c" },
+//   { this: "e", is: "f", sparta: "g" }
+// ]
+
+const r = await parseFile(filepath, {
+  header: [
+    {
+      name: "this",
+      parse: (e: string): string => {
+        return `b${e}$$`;
+      }
+    },
+    {
+      name: "is",
+      parse: (e: string): number => {
+        return e.length;
+      }
+    },
+    {
+      name: "sparta",
+      parse: (e: string): unknown => {
+        return { bim: `boom-${e}` };
+      }
+    }
+  ]
+});
+// output:
+// [
+//    { this: "ba$$", is: 1, sparta: { bim: `boom-c` } },
+//    { this: "be$$", is: 1, sparta: { bim: `boom-g` } }
+// ]
+
+const r = await parseFile(filepath, {
+  header: ["this", "is", "sparta"],
+  parse: (e: Record<string, unknown>) => {
+    return { super: e.this, street: e.is, fighter: e.sparta };
+  }
+});
+// output:
+// [
+//   { super: "a", street: "b", fighter: "c" },
+//   { super: "e", street: "f", fighter: "g" }
+// ]
+```
+
+## TOML

 This module parse TOML files. It follows as much as possible the
 [TOML specs](https://github.com/toml-lang/toml). Be sure to read the supported
 types as not every specs is supported at the moment and the handling in
 TypeScript side is a bit different.

-## Supported types and handling
+### Supported types and handling

 - :heavy_check_mark: [Keys](https://github.com/toml-lang/toml#string)
 - :exclamation: [String](https://github.com/toml-lang/toml#string)
@ -27,39 +128,39 @@ TypeScript side is a bit different.

 :exclamation: _Supported with warnings see [Warning](#Warning)._

-### :warning: Warning
+#### :warning: Warning

-#### String
+##### String

 - Regex : Due to the spec, there is no flag to detect regex properly
  in a TOML declaration. So the regex is stored as string.

-#### Integer
+##### Integer

 For **Binary** / **Octal** / **Hexadecimal** numbers,
 they are stored as string to be not interpreted as Decimal.

-#### Local Time
+##### Local Time

 Because local time does not exist in JavaScript, the local time is stored as a string.

-#### Inline Table
+##### Inline Table

 Inline tables are supported. See below:

 ```toml
 animal = { type = { name = "pug" } }
-# Output
+## Output
 animal = { type.name = "pug" }
-# Output { animal : { type : { name : "pug" } }
+## Output { animal : { type : { name : "pug" } }
 animal.as.leaders = "tosin"
-# Output { animal: { as: { leaders: "tosin" } } }
+## Output { animal: { as: { leaders: "tosin" } } }
 "tosin.abasi" = "guitarist"
-# Output
+## Output
 "tosin.abasi" : "guitarist"
 ```

-#### Array of Tables
+##### Array of Tables

 At the moment only simple declarations like below are supported:

@ -89,9 +190,9 @@ will output:
 }
 ```

-## Usage
+### Usage

-### Parse
+#### Parse

 ```ts
 import { parse } from "./parser.ts";
@ -103,7 +204,7 @@ const tomlString = 'foo.bar = "Deno"';
 const tomlObject22 = parse(tomlString);
 ```

-### Stringify
+#### Stringify

 ```ts
 import { stringify } from "./parser.ts";
--- a/encoding/csv.ts
+++ b/encoding/csv.ts
@ -4,6 +4,7 @@

 import { BufReader, EOF } from "../io/bufio.ts";
 import { TextProtoReader } from "../textproto/mod.ts";
+import { StringReader } from "../io/readers.ts";

 const INVALID_RUNE = ["\r", "\n", '"'];

@ -17,28 +18,39 @@ export class ParseError extends Error {
  }
 }

+/**
+ * @property comma - Character which separates values. Default: ','
+ * @property comment - Character to start a comment. Default: '#'
+ * @property trimLeadingSpace - Flag to trim the leading space of the value. Default: 'false'
+ * @property lazyQuotes - Allow unquoted quote in a quoted field or non double
+ *  quoted quotes in quoted field. Default: 'false'
+ * @property fieldsPerRecord - Enables the check of fields for each row. If == 0,
+ * the first row is used as the reference for the number of fields.
+ */
 export interface ParseOptions {
-  comma: string;
+  comma?: string;
  comment?: string;
-  trimLeadingSpace: boolean;
+  trimLeadingSpace?: boolean;
  lazyQuotes?: boolean;
  fieldsPerRecord?: number;
 }

 function chkOptions(opt: ParseOptions): void {
+  if (!opt.comma) opt.comma = ",";
+  if (!opt.trimLeadingSpace) opt.trimLeadingSpace = false;
  if (
-    INVALID_RUNE.includes(opt.comma) ||
-    (opt.comment && INVALID_RUNE.includes(opt.comment)) ||
+    INVALID_RUNE.includes(opt.comma!) ||
+    INVALID_RUNE.includes(opt.comment!) ||
    opt.comma === opt.comment
  ) {
    throw new Error("Invalid Delimiter");
  }
 }

-export async function read(
+async function read(
  Startline: number,
  reader: BufReader,
-  opt: ParseOptions = { comma: ",", comment: "#", trimLeadingSpace: false }
+  opt: ParseOptions = { comma: ",", trimLeadingSpace: false }
 ): Promise<string[] | EOF> {
  const tp = new TextProtoReader(reader);
  let line: string;
@ -68,7 +80,7 @@ export async function read(
    return [];
  }

-  result = line.split(opt.comma);
+  result = line.split(opt.comma!);

  let quoteError = false;
  result = result.map(
@ -138,3 +150,105 @@ export async function readAll(
  }
  return result;
 }
+
+/**
+ * HeaderOption provides the column definition
+ * and the parse function for each entry of the
+ * column.
+ */
+export interface HeaderOption {
+  name: string;
+  parse?: (input: string) => unknown;
+}
+
+export interface ExtendedParseOptions extends ParseOptions {
+  header: boolean | string[] | HeaderOption[];
+  parse?: (input: unknown) => unknown;
+}
+
+/**
+ * Csv parse helper to manipulate data.
+ * Provides an auto/custom mapper for columns and parse function
+ * for columns and rows.
+ * @param input Input to parse. Can be a string or BufReader.
+ * @param opt options of the parser.
+ * @param [opt.header=false] HeaderOptions
+ * @param [opt.parse=null] Parse function for rows.
+ * Example:
+ *     const r = await parse('a,b,c\ne,f,g\n', {
+ *      header: ["this", "is", "sparta"],
+ *       parse: (e: Record<string, unknown>) => {
+ *         return { super: e.this, street: e.is, fighter: e.sparta };
+ *       }
+ *     });
+ * // output
+ * [
+ *   { super: "a", street: "b", fighter: "c" },
+ *   { super: "e", street: "f", fighter: "g" }
+ * ]
+ */
+export async function parse(
+  input: string | BufReader,
+  opt: ExtendedParseOptions = {
+    header: false
+  }
+): Promise<unknown[]> {
+  let r: string[][];
+  if (input instanceof BufReader) {
+    r = await readAll(input, opt);
+  } else {
+    r = await readAll(new BufReader(new StringReader(input)), opt);
+  }
+  if (opt.header) {
+    let headers: HeaderOption[] = [];
+    let i = 0;
+    if (Array.isArray(opt.header)) {
+      if (typeof opt.header[0] !== "string") {
+        headers = opt.header as HeaderOption[];
+      } else {
+        const h = opt.header as string[];
+        headers = h.map(
+          (e): HeaderOption => {
+            return {
+              name: e
+            };
+          }
+        );
+      }
+    } else {
+      headers = r.shift()!.map(
+        (e): HeaderOption => {
+          return {
+            name: e
+          };
+        }
+      );
+      i++;
+    }
+    return r.map(
+      (e): unknown => {
+        if (e.length !== headers.length) {
+          throw `Error number of fields line:${i}`;
+        }
+        i++;
+        let out: Record<string, unknown> = {};
+        for (let j = 0; j < e.length; j++) {
+          const h = headers[j];
+          if (h.parse) {
+            out[h.name] = h.parse(e[j]);
+          } else {
+            out[h.name] = e[j];
+          }
+        }
+        if (opt.parse) {
+          return opt.parse(out);
+        }
+        return out;
+      }
+    );
+  }
+  if (opt.parse) {
+    return r.map((e: string[]): unknown => opt.parse!(e));
+  }
+  return r;
+}
--- a/encoding/csv_test.ts
+++ b/encoding/csv_test.ts
@ -2,7 +2,7 @@
 // https://github.com/golang/go/blob/2cc15b1/src/encoding/csv/reader_test.go
 import { test, runIfMain } from "../testing/mod.ts";
 import { assertEquals, assert } from "../testing/asserts.ts";
-import { readAll } from "./csv.ts";
+import { readAll, parse } from "./csv.ts";
 import { StringReader } from "../io/readers.ts";
 import { BufReader } from "../io/bufio.ts";

@ -468,4 +468,114 @@ for (const t of testCases) {
  });
 }

+const parseTestCases = [
+  {
+    name: "simple",
+    in: "a,b,c",
+    header: false,
+    result: [["a", "b", "c"]]
+  },
+  {
+    name: "simple Bufreader",
+    in: new BufReader(new StringReader("a,b,c")),
+    header: false,
+    result: [["a", "b", "c"]]
+  },
+  {
+    name: "multiline",
+    in: "a,b,c\ne,f,g\n",
+    header: false,
+    result: [["a", "b", "c"], ["e", "f", "g"]]
+  },
+  {
+    name: "header mapping boolean",
+    in: "a,b,c\ne,f,g\n",
+    header: true,
+    result: [{ a: "e", b: "f", c: "g" }]
+  },
+  {
+    name: "header mapping array",
+    in: "a,b,c\ne,f,g\n",
+    header: ["this", "is", "sparta"],
+    result: [
+      { this: "a", is: "b", sparta: "c" },
+      { this: "e", is: "f", sparta: "g" }
+    ]
+  },
+  {
+    name: "header mapping object",
+    in: "a,b,c\ne,f,g\n",
+    header: [{ name: "this" }, { name: "is" }, { name: "sparta" }],
+    result: [
+      { this: "a", is: "b", sparta: "c" },
+      { this: "e", is: "f", sparta: "g" }
+    ]
+  },
+  {
+    name: "header mapping parse entry",
+    in: "a,b,c\ne,f,g\n",
+    header: [
+      {
+        name: "this",
+        parse: (e: string): string => {
+          return `b${e}$$`;
+        }
+      },
+      {
+        name: "is",
+        parse: (e: string): number => {
+          return e.length;
+        }
+      },
+      {
+        name: "sparta",
+        parse: (e: string): unknown => {
+          return { bim: `boom-${e}` };
+        }
+      }
+    ],
+    result: [
+      { this: "ba$$", is: 1, sparta: { bim: `boom-c` } },
+      { this: "be$$", is: 1, sparta: { bim: `boom-g` } }
+    ]
+  },
+  {
+    name: "multiline parse",
+    in: "a,b,c\ne,f,g\n",
+    parse: (e: string[]): unknown => {
+      return { super: e[0], street: e[1], fighter: e[2] };
+    },
+    header: false,
+    result: [
+      { super: "a", street: "b", fighter: "c" },
+      { super: "e", street: "f", fighter: "g" }
+    ]
+  },
+  {
+    name: "header mapping object parseline",
+    in: "a,b,c\ne,f,g\n",
+    header: [{ name: "this" }, { name: "is" }, { name: "sparta" }],
+    parse: (e: Record<string, unknown>): unknown => {
+      return { super: e.this, street: e.is, fighter: e.sparta };
+    },
+    result: [
+      { super: "a", street: "b", fighter: "c" },
+      { super: "e", street: "f", fighter: "g" }
+    ]
+  }
+];
+
+for (const testCase of parseTestCases) {
+  test({
+    name: `[CSV] Parse ${testCase.name}`,
+    async fn(): Promise<void> {
+      const r = await parse(testCase.in, {
+        header: testCase.header,
+        parse: testCase.parse as (input: unknown) => unknown
+      });
+      assertEquals(r, testCase.result);
+    }
+  });
+}
+
 runIfMain(import.meta);