feat(std/path): Align globToRegExp() with bash glob expansion (#7209)

- feat: Support escaping glob characters - feat: Support more character classes - feat: Match characters literally on segment parse failure - fix: Match nothing for empty globs - fix: Contain any glob syntax to its path segment - perf: Remove extraneous separators from generated regex - doc: Add detailed JSDoc - chore: Remove old copyright headers
2024-12-23 15:49:44 -05:00 · 2020-10-01 10:37:03 +01:00 · 2020-10-01 10:37:03 +01:00 · 326ccb1095
commit 326ccb1095
parent b689e60b60
3 changed files with 367 additions and 240 deletions
--- a/std/fs/expand_glob.ts
+++ b/std/fs/expand_glob.ts
@ -60,11 +60,13 @@ function comparePath(a: WalkEntry, b: WalkEntry): number {
  return 0;
 }
-/**
+/** Expand the glob string from the specified `root` directory and yield each
 * Expand the glob string from the specified `root` directory and yield each
 * result as a `WalkEntry` object.
 *
- * Examples:
+ * See [`globToRegExp()`](../path/glob.ts#globToRegExp) for details on supported
 * syntax.
 *
 * Example:
 *
 *      for await (const file of expandGlob("**\/*.ts")) {
 *        console.log(file);
@ -168,10 +170,9 @@ export async function* expandGlob(
  yield* currentMatches;
 }
-/**
+/** Synchronous version of `expandGlob()`.
 * Synchronous version of `expandGlob()`.
 *
- * Examples:
+ * Example:
 *
 *      for (const file of expandGlobSync("**\/*.ts")) {
 *        console.log(file);
--- a/std/path/glob.ts
+++ b/std/path/glob.ts
@ -1,5 +1,3 @@
 // globToRegExp() is originally ported from globrex@0.1.2.
 // Copyright 2018 Terkel Gjervig Nielsen. All rights reserved. MIT license.
 // Copyright 2018-2020 the Deno authors. All rights reserved. MIT license.
 // This module is browser compatible.
@ -22,232 +20,296 @@ export interface GlobOptions {
 export type GlobToRegExpOptions = GlobOptions;
-/** Convert a glob string to a regular expressions.
+// deno-fmt-ignore
 const regExpEscapeChars = ["!", "$", "(", ")", "*", "+", ".", "=", "?", "[", "\\", "^", "{", "|"];
 const rangeEscapeChars = ["-", "\\", "]"];
 /** Convert a glob string to a regular expression.
 *
- *      // Looking for all the `ts` files:
+ * Tries to match bash glob expansion as closely as possible.
 *      walkSync(".", {
 *        match: [globToRegExp("*.ts")]
 *      });
 *
- *      Looking for all the `.json` files in any subfolder:
+ * Basic glob syntax:
- *      walkSync(".", {
+ * - `*` - Matches everything without leaving the path segment.
- *        match: [globToRegExp(join("a", "**", "*.json"), {
+ * - `{foo,bar}` - Matches `foo` or `bar`.
- *          extended: true,
+ * - `[abcd]` - Matches `a`, `b`, `c` or `d`.
- *          globstar: true
+ * - `[a-d]` - Matches `a`, `b`, `c` or `d`.
- *        })]
+ * - `[!abcd]` - Matches any single character besides `a`, `b`, `c` or `d`.
- *      }); */
+ * - `[[:<class>:]]` - Matches any character belonging to `<class>`.
 *     - `[[:alnum:]]` - Matches any digit or letter.
 *     - `[[:digit:]abc]` - Matches any digit, `a`, `b` or `c`.
 *     - See https://facelessuser.github.io/wcmatch/glob/#posix-character-classes
 *       for a complete list of supported character classes.
 * - `\` - Escapes the next character for an `os` other than `"windows"`.
 * - \` - Escapes the next character for `os` set to `"windows"`.
 * - `/` - Path separator.
 * - `\` - Additional path separator only for `os` set to `"windows"`.
 *
 * Extended syntax:
 * - Requires `{ extended: true }`.
 * - `?(foo|bar)` - Matches 0 or 1 instance of `{foo,bar}`.
 * - `@(foo|bar)` - Matches 1 instance of `{foo,bar}`. They behave the same.
 * - `*(foo|bar)` - Matches _n_ instances of `{foo,bar}`.
 * - `+(foo|bar)` - Matches _n > 0_ instances of `{foo,bar}`.
 * - `!(foo|bar)` - Matches anything other than `{foo,bar}`.
 * - See https://www.linuxjournal.com/content/bash-extended-globbing.
 *
 * Globstar syntax:
 * - Requires `{ globstar: true }`.
 * - `**` - Matches any number of any path segments.
 *     - Must comprise its entire path segment in the provided glob.
 * - See https://www.linuxjournal.com/content/globstar-new-bash-globbing-option.
 *
 * Note the following properties:
 * - The generated `RegExp` is anchored at both start and end.
 * - Repeating and trailing separators are tolerated. Trailing separators in the
 *   provided glob have no meaning and are discarded.
 * - Absolute globs will only match absolute paths, etc.
 * - Empty globs will match nothing.
 * - Any special glob syntax must be contained to one path segment. For example,
 *   `?(foo|bar/baz)` is invalid. The separator will take precedence and the
 *   first segment ends with an unclosed group.
 * - If a path segment ends with unclosed groups or a dangling escape prefix, a
 *   parse error has occurred. Every character for that segment is taken
 *   literally in this event.
 *
 * Limitations:
 * - A negative group like `!(foo|bar)` will wrongly be converted to a negative
 *   look-ahead followed by a wildcard. This means that `!(foo).js` will wrongly
 *   fail to match `foobar.js`, even though `foobar` is not `foo`. Effectively,
 *   `!(foo|bar)` is treated like `!(@(foo|bar)*)`. This works correctly if the
 *   group is not nested and occurs at the end of the segment. */
 export function globToRegExp(
  glob: string,
  { extended = true, globstar: globstarOption = true, os = NATIVE_OS }:
    GlobToRegExpOptions = {},
 ): RegExp {
-  const sep = os == "windows" ? `(?:\\\\|\\/)+` : `\\/+`;
+  if (glob == "") {
-  const sepMaybe = os == "windows" ? `(?:\\\\|\\/)*` : `\\/*`;
+    return /(?!)/;
  }
  const sep = os == "windows" ? "(?:\\\\|/)+" : "/+";
  const sepMaybe = os == "windows" ? "(?:\\\\|/)*" : "/*";
  const seps = os == "windows" ? ["\\", "/"] : ["/"];
  const sepRaw = os == "windows" ? `\\` : `/`;
  const globstar = os == "windows"
-    ? `(?:[^\\\\/]*(?:\\\\|\\/|$)+)*`
+    ? "(?:[^\\\\/]*(?:\\\\|/|$)+)*"
-    : `(?:[^/]*(?:\\/|$)+)*`;
+    : "(?:[^/]*(?:/|$)+)*";
-  const wildcard = os == "windows" ? `[^\\\\/]*` : `[^/]*`;
+  const wildcard = os == "windows" ? "[^\\\\/]*" : "[^/]*";
-
+  const escapePrefix = os == "windows" ? "`" : "\\";
  // Keep track of scope for extended syntaxes.
  const extStack = [];
  // If we are doing extended matching, this boolean is true when we are inside
  // a group (eg {*.html,*.js}), and false otherwise.
  let inGroup = false;
  let inRange = false;
  let regExpString = "";
  // Remove trailing separators.
  let newLength = glob.length;
-  for (; newLength > 0 && seps.includes(glob[newLength - 1]); newLength--);
+  for (; newLength > 1 && seps.includes(glob[newLength - 1]); newLength--);
  glob = glob.slice(0, newLength);
-  let c, n;
+  let regExpString = "";
  for (let i = 0; i < glob.length; i++) {
    c = glob[i];
    n = glob[i + 1];
-    if (seps.includes(c)) {
+  // Terminates correctly. Trust that `j` is incremented every iteration.
-      regExpString += sep;
+  for (let j = 0; j < glob.length;) {
-      while (seps.includes(glob[i + 1])) i++;
+    let segment = "";
    const groupStack = [];
    let inRange = false;
    let inEscape = false;
    let endsWithSep = false;
    let i = j;
    // Terminates with `i` at the non-inclusive end of the current segment.
    for (; i < glob.length && !seps.includes(glob[i]); i++) {
      if (inEscape) {
        inEscape = false;
        const escapeChars = inRange ? rangeEscapeChars : regExpEscapeChars;
        segment += escapeChars.includes(glob[i]) ? `\\${glob[i]}` : glob[i];
        continue;
      }
-    if (c == "[") {
+      if (glob[i] == escapePrefix) {
-      if (inRange && n == ":") {
+        inEscape = true;
        i++; // skip [
        let value = "";
        while (glob[++i] !== ":") value += glob[i];
        if (value == "alnum") regExpString += "\\w\\d";
        else if (value == "space") regExpString += "\\s";
        else if (value == "digit") regExpString += "\\d";
        i++; // skip last ]
        continue;
      }
      if (glob[i] == "[") {
        if (!inRange) {
          inRange = true;
-      regExpString += c;
+          segment += "[";
          if (glob[i + 1] == "!") {
            i++;
            segment += "^";
          } else if (glob[i + 1] == "^") {
            i++;
            segment += "\\^";
          }
          continue;
        } else if (glob[i + 1] == ":") {
          let k = i + 1;
          let value = "";
          while (glob[k + 1] != null && glob[k + 1] != ":") {
            value += glob[k + 1];
            k++;
          }
          if (glob[k + 1] == ":" && glob[k + 2] == "]") {
            i = k + 2;
            if (value == "alnum") segment += "\\dA-Za-z";
            else if (value == "alpha") segment += "A-Za-z";
            else if (value == "ascii") segment += "\x00-\x7F";
            else if (value == "blank") segment += "\t ";
            else if (value == "cntrl") segment += "\x00-\x1F\x7F";
            else if (value == "digit") segment += "\\d";
            else if (value == "graph") segment += "\x21-\x7E";
            else if (value == "lower") segment += "a-z";
            else if (value == "print") segment += "\x20-\x7E";
            else if (value == "punct") {
              segment += "!\"#$%&'()*+,\\-./:;<=>?@[\\\\\\]^_‘{|}~";
            } else if (value == "space") segment += "\\s\v";
            else if (value == "upper") segment += "A-Z";
            else if (value == "word") segment += "\\w";
            else if (value == "xdigit") segment += "\\dA-Fa-f";
            continue;
          }
        }
      }
-    if (c == "]") {
+      if (glob[i] == "]" && inRange) {
        inRange = false;
-      regExpString += c;
+        segment += "]";
        continue;
      }
    if (c == "!") {
      if (inRange) {
-        if (glob[i - 1] == "[") {
+        if (glob[i] == "\\") {
-          regExpString += "^";
+          segment += `\\\\`;
        } else {
          segment += glob[i];
        }
        continue;
      }
-      } else if (extended) {
+
-        if (n == "(") {
+      if (
-          extStack.push(c);
+        glob[i] == ")" && groupStack.length > 0 &&
-          regExpString += "(?!";
+        groupStack[groupStack.length - 1] != "BRACE"
      ) {
        segment += ")";
        const type = groupStack.pop()!;
        if (type == "!") {
          segment += wildcard;
        } else if (type != "@") {
          segment += type;
        }
        continue;
      }
      if (
        glob[i] == "|" && groupStack.length > 0 &&
        groupStack[groupStack.length - 1] != "BRACE"
      ) {
        segment += "|";
        continue;
      }
      if (glob[i] == "+" && extended && glob[i + 1] == "(") {
        i++;
        groupStack.push("+");
        segment += "(?:";
        continue;
      }
-        regExpString += `\\${c}`;
+
      if (glob[i] == "@" && extended && glob[i + 1] == "(") {
        i++;
        groupStack.push("@");
        segment += "(?:";
        continue;
      }
      if (glob[i] == "?") {
        if (extended && glob[i + 1] == "(") {
          i++;
          groupStack.push("?");
          segment += "(?:";
        } else {
-        regExpString += `\\${c}`;
+          segment += ".";
        continue;
        }
    }
    if (inRange) {
      if (c == "\\" || c == "^" && glob[i - 1] == "[") regExpString += `\\${c}`;
      else regExpString += c;
        continue;
      }
-    if (["\\", "$", "^", ".", "="].includes(c)) {
+      if (glob[i] == "!" && extended && glob[i + 1] == "(") {
-      regExpString += `\\${c}`;
+        i++;
        groupStack.push("!");
        segment += "(?!";
        continue;
      }
-    if (c == "(") {
+      if (glob[i] == "{") {
-      if (extStack.length) {
+        groupStack.push("BRACE");
-        regExpString += `${c}?:`;
+        segment += "(?:";
        continue;
      }
      regExpString += `\\${c}`;
        continue;
      }
-    if (c == ")") {
+      if (glob[i] == "}" && groupStack[groupStack.length - 1] == "BRACE") {
-      if (extStack.length) {
+        groupStack.pop();
-        regExpString += c;
+        segment += ")";
-        const type = extStack.pop()!;
+        continue;
-        if (type == "@") {
+      }
-          regExpString += "{1}";
+
-        } else if (type == "!") {
+      if (glob[i] == "," && groupStack[groupStack.length - 1] == "BRACE") {
-          regExpString += wildcard;
+        segment += "|";
        continue;
      }
      if (glob[i] == "*") {
        if (extended && glob[i + 1] == "(") {
          i++;
          groupStack.push("*");
          segment += "(?:";
        } else {
          regExpString += type;
        }
        continue;
      }
      regExpString += `\\${c}`;
      continue;
    }
    if (c == "|") {
      if (extStack.length) {
        regExpString += c;
        continue;
      }
      regExpString += `\\${c}`;
      continue;
    }
    if (c == "+") {
      if (n == "(" && extended) {
        extStack.push(c);
        continue;
      }
      regExpString += `\\${c}`;
      continue;
    }
    if (c == "@" && extended) {
      if (n == "(") {
        extStack.push(c);
        continue;
      }
    }
    if (c == "?") {
      if (extended) {
        if (n == "(") {
          extStack.push(c);
        }
        continue;
      } else {
        regExpString += ".";
        continue;
      }
    }
    if (c == "{") {
      inGroup = true;
      regExpString += "(?:";
      continue;
    }
    if (c == "}") {
      inGroup = false;
      regExpString += ")";
      continue;
    }
    if (c == ",") {
      if (inGroup) {
        regExpString += "|";
        continue;
      }
      regExpString += `\\${c}`;
      continue;
    }
    if (c == "*") {
      if (n == "(" && extended) {
        extStack.push(c);
        continue;
      }
      // Move over all consecutive "*"'s.
      // Also store the previous and next characters
          const prevChar = glob[i - 1];
-      let starCount = 1;
+          let numStars = 1;
          while (glob[i + 1] == "*") {
        starCount++;
            i++;
            numStars++;
          }
          const nextChar = glob[i + 1];
-      const isGlobstar = globstarOption && starCount > 1 &&
+          if (
-        // from the start of the segment
+            globstarOption && numStars == 2 &&
-        [sepRaw, "/", undefined].includes(prevChar) &&
+            [...seps, undefined].includes(prevChar) &&
-        // to the end of the segment
+            [...seps, undefined].includes(nextChar)
-        [sepRaw, "/", undefined].includes(nextChar);
+          ) {
-      if (isGlobstar) {
+            segment += globstar;
-        // it's a globstar, so match zero or more path segments
+            endsWithSep = true;
        regExpString += globstar;
        while (seps.includes(glob[i + 1])) i++;
          } else {
-        // it's not a globstar, so only match one path segment
+            segment += wildcard;
-        regExpString += wildcard;
+          }
        }
        continue;
      }
-    regExpString += c;
+      segment += regExpEscapeChars.includes(glob[i]) ? `\\${glob[i]}` : glob[i];
    }
-  regExpString = `^${regExpString}${regExpString != "" ? sepMaybe : ""}$`;
+    // Check for unclosed groups or a dangling escape prefix.
    if (groupStack.length > 0 || inRange || inEscape) {
      // Parse failure. Take all characters from this segment literally.
      segment = "";
      for (const c of glob.slice(j, i)) {
        segment += regExpEscapeChars.includes(c) ? `\\${c}` : c;
        endsWithSep = false;
      }
    }
    regExpString += segment;
    if (!endsWithSep) {
      regExpString += i < glob.length ? sep : sepMaybe;
      endsWithSep = true;
    }
    // Terminates with `i` at the start of the next segment.
    while (seps.includes(glob[i])) i++;
    // Check that the next value of `j` is indeed higher than the current value.
    if (!(i > j)) {
      throw new Error("Assertion failure: i > j (potential infinite loop)");
    }
    j = i;
  }
  regExpString = `^${regExpString}$`;
  return new RegExp(regExpString);
 }
--- a/std/path/glob_test.ts
+++ b/std/path/glob_test.ts
@ -44,7 +44,14 @@ function match(
 Deno.test({
  name: "[path] globToRegExp() Basic RegExp",
  fn(): void {
-    assertEquals(globToRegExp(""), /^$/);
+    assertEquals(globToRegExp("*.js", { os: "linux" }), /^[^/]*\.js\/*$/);
  },
 });
 Deno.test({
  name: "[path] globToRegExp() Empty glob",
  fn(): void {
    assertEquals(globToRegExp(""), /(?!)/);
    assertEquals(globToRegExp("*.js", { os: "linux" }), /^[^/]*\.js\/*$/);
  },
 });
@ -108,27 +115,6 @@ Deno.test({
        { extended: false, globstar: false },
      ),
    );
    assert(
      match(
        "[[:digit:]]/bar.txt",
        "1/bar.txt",
        { extended: false, globstar: false },
      ),
    );
    assert(
      match(
        "[[:digit:]b]/bar.txt",
        "b/bar.txt",
        { extended: false, globstar: false },
      ),
    );
    assert(
      match(
        "[![:digit:]b]/bar.txt",
        "a/bar.txt",
        { extended: false, globstar: false },
      ),
    );
    assert(
      !match(
        "[[:alnum:]]/bar.txt",
@ -136,20 +122,48 @@ Deno.test({
        { extended: false, globstar: false },
      ),
    );
-    assert(
+    for (const c of "09AGZagz") {
-      !match(
+      assert(match("[[:alnum:]]", c, { extended: false, globstar: false }), c);
-        "[[:digit:]]/bar.txt",
+    }
-        "a/bar.txt",
+    for (const c of "AGZagz") {
-        { extended: false, globstar: false },
+      assert(match("[[:alpha:]]", c, { extended: false, globstar: false }), c);
-      ),
+    }
-    );
+    for (const c of "\x00\x20\x7F") {
-    assert(
+      assert(match("[[:ascii:]]", c, { extended: false, globstar: false }), c);
-      !match(
+    }
-        "[[:digit:]b]/bar.txt",
+    for (const c of "\t ") {
-        "a/bar.txt",
+      assert(match("[[:blank:]]", c, { extended: false, globstar: false }), c);
-        { extended: false, globstar: false },
+    }
-      ),
+    for (const c of "\x00\x1F\x7F") {
-    );
+      assert(match("[[:cntrl:]]", c, { extended: false, globstar: false }), c);
    }
    for (const c of "09") {
      assert(match("[[:digit:]]", c, { extended: false, globstar: false }), c);
    }
    for (const c of "\x21\x7E") {
      assert(match("[[:graph:]]", c, { extended: false, globstar: false }), c);
    }
    for (const c of "az") {
      assert(match("[[:lower:]]", c, { extended: false, globstar: false }), c);
    }
    for (const c of "\x20\x7E") {
      assert(match("[[:print:]]", c, { extended: false, globstar: false }), c);
    }
    for (const c of "!\"#$%&'()*+,-./:;<=>?@[\\]^_‘{|}~") {
      assert(match("[[:punct:]]", c, { extended: false, globstar: false }), c);
    }
    for (const c of "\t\n\v\f\r ") {
      assert(match("[[:space:]]", c, { extended: false, globstar: false }), c);
    }
    for (const c of "AZ") {
      assert(match("[[:upper:]]", c, { extended: false, globstar: false }), c);
    }
    for (const c of "09AZaz_") {
      assert(match("[[:word:]]", c, { extended: false, globstar: false }), c);
    }
    for (const c of "09AFaf") {
      assert(match("[[:xdigit:]]", c, { extended: false, globstar: false }), c);
    }
  },
 });
@ -367,8 +381,11 @@ Deno.test({
  name: "[path] globToRegExp() Special RegExp characters in range",
  fn(): void {
    // Excluding characters checked in the previous test.
-    assertEquals(globToRegExp("[\\$^.=]", { os: "linux" }), /^[\\$^.=]\/*$/);
+    assertEquals(globToRegExp("[\\\\$^.=]", { os: "linux" }), /^[\\$^.=]\/*$/);
-    assertEquals(globToRegExp("[!\\$^.=]", { os: "linux" }), /^[^\\$^.=]\/*$/);
+    assertEquals(
      globToRegExp("[!\\\\$^.=]", { os: "linux" }),
      /^[^\\$^.=]\/*$/,
    );
    assertEquals(globToRegExp("[^^]", { os: "linux" }), /^[\^^]\/*$/);
  },
 });
@ -409,6 +426,53 @@ Deno.test({
  },
 });
 Deno.test({
  name: "[path] globToRegExp() Unclosed groups",
  fn() {
    assert(match("{foo,bar}/[ab", "foo/[ab"));
    assert(match("{foo,bar}/{foo,bar", "foo/{foo,bar"));
    assert(match("{foo,bar}/?(foo|bar", "foo/?(foo|bar"));
    assert(match("{foo,bar}/@(foo|bar", "foo/@(foo|bar"));
    assert(match("{foo,bar}/*(foo|bar", "foo/*(foo|bar"));
    assert(match("{foo,bar}/+(foo|bar", "foo/+(foo|bar"));
    assert(match("{foo,bar}/!(foo|bar", "foo/!(foo|bar"));
    assert(match("{foo,bar}/?({)}", "foo/?({)}"));
    assert(match("{foo,bar}/{?(})", "foo/{?(})"));
  },
 });
 Deno.test({
  name: "[path] globToRegExp() Escape glob characters",
  fn() {
    assert(match("\\[ab]", "[ab]", { os: "linux" }));
    assert(match("`[ab]", "[ab]", { os: "windows" }));
    assert(match("\\{foo,bar}", "{foo,bar}", { os: "linux" }));
    assert(match("`{foo,bar}", "{foo,bar}", { os: "windows" }));
    assert(match("\\?(foo|bar)", "?(foo|bar)", { os: "linux" }));
    assert(match("`?(foo|bar)", "?(foo|bar)", { os: "windows" }));
    assert(match("\\@(foo|bar)", "@(foo|bar)", { os: "linux" }));
    assert(match("`@(foo|bar)", "@(foo|bar)", { os: "windows" }));
    assert(match("\\*(foo|bar)", "*(foo|bar)", { os: "linux" }));
    assert(match("`*(foo|bar)", "*(foo|bar)", { os: "windows" }));
    assert(match("\\+(foo|bar)", "+(foo|bar)", { os: "linux" }));
    assert(match("`+(foo|bar)", "+(foo|bar)", { os: "windows" }));
    assert(match("\\!(foo|bar)", "!(foo|bar)", { os: "linux" }));
    assert(match("`!(foo|bar)", "!(foo|bar)", { os: "windows" }));
    assert(match("@\\(foo|bar)", "@(foo|bar)", { os: "linux" }));
    assert(match("@`(foo|bar)", "@(foo|bar)", { os: "windows" }));
    assert(match("{foo,bar}/[ab]\\", "foo/[ab]\\", { os: "linux" }));
    assert(match("{foo,bar}/[ab]`", "foo/[ab]`", { os: "windows" }));
  },
 });
 Deno.test({
  name: "[path] globToRegExp() Dangling escape prefix",
  fn() {
    assert(match("{foo,bar}/[ab]\\", "foo/[ab]\\", { os: "linux" }));
    assert(match("{foo,bar}/[ab]`", "foo/[ab]`", { os: "windows" }));
  },
 });
 Deno.test({
  name: "[path] GlobToRegExpOptions::extended",
  fn() {