Fix URL parentheses rendering

2026-03-18 04:41:12 -07:00 · 2026-03-16 00:29:27 +01:00 · 2026-03-16 00:29:27 +01:00 · 0cde688945
commit 0cde688945
parent dda4c7fb10
2 changed files with 338 additions and 4 deletions
--- a/core/modules/parsers/wikiparser/rules/extlink.js
+++ b/core/modules/parsers/wikiparser/rules/extlink.js
@ -30,21 +30,33 @@ exports.parse = function() {
 	// Move past the match
 	var start = this.parser.pos;
 	this.parser.pos = this.matchRegExp.lastIndex;
+	// Extend the match to include balanced closing parentheses
+	var url = this.match[0];
+	while(this.parser.pos < this.parser.sourceLength && this.parser.source.charAt(this.parser.pos) === ")") {
+		var opens = (url.match(/\(/g) || []).length;
+		var closes = (url.match(/\)/g) || []).length;
+		if(opens > closes) {
+			url += ")";
+			this.parser.pos++;
+		} else {
+			break;
+		}
+	}
 	// Create the link unless it is suppressed
-	if(this.match[0].substr(0,1) === "~") {
-		return [{type: "text", text: this.match[0].substr(1), start: start, end: this.parser.pos}];
+	if(url.substr(0,1) === "~") {
+		return [{type: "text", text: url.substr(1), start: start, end: this.parser.pos}];
 	} else {
 		return [{
 			type: "element",
 			tag: "a",
 			attributes: {
-				href: {type: "string", value: this.match[0]},
+				href: {type: "string", value: url},
 				"class": {type: "string", value: "tc-tiddlylink-external"},
 				target: {type: "string", value: "_blank"},
 				rel: {type: "string", value: "noopener noreferrer"}
 			},
 			children: [{
-				type: "text", text: this.match[0], start: start, end: this.parser.pos
+				type: "text", text: url, start: start, end: this.parser.pos
 			}]
 		}];
 	}
--- a/editions/test/tiddlers/tests/test-extlink-parser.js
+++ b/editions/test/tiddlers/tests/test-extlink-parser.js
@ -0,0 +1,322 @@
+/*\
+title: test-extlink-parser.js
+type: application/javascript
+tags: [[$:/tags/test-spec]]
+
+Tests for the extlink wikitext parser rule.
+URL validity is judged against RFC 1738 (https://datatracker.ietf.org/doc/html/rfc1738).
+
+RFC 1738 character classes:
+  safe       = "$" | "-" | "_" | "." | "+" | "!" | "*" | "'" | "(" | ")"  | ","
+  reserved   = ";" | "/" | "?" | ":" | "@" | "=" | "&"
+  unsafe     = " " | "<" | ">" | "#" | "%" | "{" | "}" | "|" | "\" | "^" | "~" | "[" | "]" | "`"
+  escape     = "%" hex hex
+  unreserved = alpha | digit | safe | extra
+  uchar      = unreserved | escape
+  xchar      = unreserved | reserved | escape
+
+HTTP URL syntax:  http://<host>:<port>/<path>?<searchpart>
+FTP URL syntax:   ftp://<user>:<password>@<host>:<port>/<cwd1>/.../<cwdN>/<name>;type=<typecode>
+FILE URL syntax:  file://<host>/<path>
+MAILTO URL syntax: mailto:<rfc822-addr-spec>
+
+Note: RFC 1738 does not define https: or data: schemes (those came later).
+The '#' character is explicitly unsafe per RFC 1738 — fragment identifiers
+are not considered part of the URL.
+
+\*/
+
+"use strict";
+
+describe("ExtLink parser tests", function() {
+
+	var wiki = new $tw.Wiki();
+
+	// Helper: parse wikitext and return the href of the first link found
+	var getFirstLinkHref = function(text) {
+		var tree = wiki.parseText("text/vnd.tiddlywiki", text).tree;
+		// The extlink produces: p > a[href]
+		if(tree[0] && tree[0].children) {
+			for(var i = 0; i < tree[0].children.length; i++) {
+				var node = tree[0].children[i];
+				if(node.type === "element" && node.tag === "a" && node.attributes && node.attributes.href) {
+					return node.attributes.href.value;
+				}
+			}
+		}
+		return null;
+	};
+
+	// =====================================================================
+	// Basic valid URLs (RFC 1738 compliant)
+	// =====================================================================
+
+	it("should parse basic HTTP and HTTPS URLs", function() {
+		// RFC 1738 §3.3: http://<host>:<port>/<path>?<searchpart>
+		expect(getFirstLinkHref("https://www.tiddlywiki.com/")).toEqual("https://www.tiddlywiki.com/");
+		expect(getFirstLinkHref("http://example.com/path")).toEqual("http://example.com/path");
+	});
+
+	it("should parse URL that is just protocol and domain", function() {
+		expect(getFirstLinkHref("https://example.com")).toEqual("https://example.com");
+	});
+
+	it("should parse URLs with port numbers", function() {
+		// RFC 1738 §3.3: port is optional, defaults to 80 for HTTP
+		expect(getFirstLinkHref("https://localhost:8080/path")).toEqual("https://localhost:8080/path");
+	});
+
+	it("should parse other RFC 1738 schemes", function() {
+		// RFC 1738 §3.2: FTP
+		expect(getFirstLinkHref("ftp://files.example.com/pub/")).toEqual("ftp://files.example.com/pub/");
+		// RFC 1738 §3.5: MAILTO
+		expect(getFirstLinkHref("mailto:user@example.com")).toEqual("mailto:user@example.com");
+		// RFC 1738 §3.10: FILE
+		expect(getFirstLinkHref("file:///home/user/doc.txt")).toEqual("file:///home/user/doc.txt");
+	});
+
+	// =====================================================================
+	// RFC 1738 safe characters: $ - _ . + ! * ' ( ) ,
+	// These should all be valid within a URL path
+	// =====================================================================
+
+	it("should include safe characters in URL paths per RFC 1738", function() {
+		// Parentheses are RFC 1738 safe characters
+		expect(getFirstLinkHref("https://en.wikipedia.org/wiki/Specials_(Unicode_block)")).toEqual("https://en.wikipedia.org/wiki/Specials_(Unicode_block)");
+		expect(getFirstLinkHref("https://example.com/wiki/A_(B)_C")).toEqual("https://example.com/wiki/A_(B)_C");
+		// Plus sign is a safe character
+		expect(getFirstLinkHref("https://example.com/search?q=hello+world")).toEqual("https://example.com/search?q=hello+world");
+		// Exclamation mark is a safe character
+		expect(getFirstLinkHref("https://example.com/path!/sub")).toEqual("https://example.com/path!/sub");
+	});
+
+	it("should handle deeply nested balanced parentheses", function() {
+		// Parentheses are safe chars per RFC 1738; nesting should work
+		expect(getFirstLinkHref("https://example.com/a(b(c(d)e)f)g")).toEqual("https://example.com/a(b(c(d)e)f)g");
+	});
+
+	it("should handle consecutive parenthesized segments", function() {
+		expect(getFirstLinkHref("https://example.com/(a)(b)(c)")).toEqual("https://example.com/(a)(b)(c)");
+	});
+
+	// =====================================================================
+	// RFC 1738 reserved characters: ; / ? : @ = &
+	// Allowed unencoded when used for their reserved purpose
+	// =====================================================================
+
+	it("should include reserved characters used in query strings", function() {
+		// RFC 1738 §3.3: <searchpart> uses reserved chars ? = &
+		expect(getFirstLinkHref("https://example.com/path?foo=bar&baz=qux")).toEqual("https://example.com/path?foo=bar&baz=qux");
+		expect(getFirstLinkHref("https://example.com?q=hello")).toEqual("https://example.com?q=hello");
+	});
+
+	it("should include reserved characters used for authentication", function() {
+		// RFC 1738 §3.1: //<user>:<password>@<host>
+		expect(getFirstLinkHref("https://user:password@example.com/path")).toEqual("https://user:password@example.com/path");
+	});
+
+	it("should include @ in path segments", function() {
+		expect(getFirstLinkHref("https://example.com/@user/repo")).toEqual("https://example.com/@user/repo");
+	});
+
+	it("should include semicolons in URLs", function() {
+		// RFC 1738 §3.2: FTP uses ;type=
+		expect(getFirstLinkHref("https://example.com/path;param=val")).toEqual("https://example.com/path;param=val");
+	});
+
+	// =====================================================================
+	// RFC 1738 unsafe characters: space < > " # % { } | \ ^ ~ [ ] `
+	// The parser should exclude these from matched URLs
+	// =====================================================================
+
+	it("should stop at RFC 1738 unsafe characters", function() {
+		// RFC 1738 §2.2: Characters that are unsafe for various reasons
+		expect(getFirstLinkHref("https://example.com/path<rest")).toEqual("https://example.com/path");
+		expect(getFirstLinkHref("https://example.com/path>rest")).toEqual("https://example.com/path");
+		expect(getFirstLinkHref("https://example.com/path{rest")).toEqual("https://example.com/path");
+		expect(getFirstLinkHref("https://example.com/path}rest")).toEqual("https://example.com/path");
+		expect(getFirstLinkHref("https://example.com/path[rest")).toEqual("https://example.com/path");
+		expect(getFirstLinkHref("https://example.com/path]rest")).toEqual("https://example.com/path");
+		expect(getFirstLinkHref("https://example.com/path`rest")).toEqual("https://example.com/path");
+		expect(getFirstLinkHref("https://example.com/path|rest")).toEqual("https://example.com/path");
+		expect(getFirstLinkHref('https://example.com/path"rest')).toEqual("https://example.com/path");
+		expect(getFirstLinkHref("https://example.com/path\\rest")).toEqual("https://example.com/path");
+		expect(getFirstLinkHref("https://example.com/path^rest")).toEqual("https://example.com/path");
+	});
+
+	it("should stop at whitespace (space is unsafe per RFC 1738)", function() {
+		expect(getFirstLinkHref("https://example.com/path rest")).toEqual("https://example.com/path");
+		expect(getFirstLinkHref("https://example.com/path\nnext line")).toEqual("https://example.com/path");
+	});
+
+	it("should handle URL wrapped in unsafe delimiters", function() {
+		// RFC 1738 §2.2: angle brackets and square brackets are unsafe
+		// Tests that leading unsafe chars don't prevent URL matching
+		expect(getFirstLinkHref("<https://example.com>")).toEqual("https://example.com");
+		expect(getFirstLinkHref("[https://example.com]")).toEqual("https://example.com");
+	});
+
+	// =====================================================================
+	// RFC 1738 escape sequences (%HH)
+	// =====================================================================
+
+	it("should include percent-encoded characters", function() {
+		// RFC 1738 §2.2: unsafe chars should be encoded as %HH
+		expect(getFirstLinkHref("https://example.com/path%20with%20spaces/file%2Fname")).toEqual("https://example.com/path%20with%20spaces/file%2Fname");
+	});
+
+	// =====================================================================
+	// Fragment identifiers (#)
+	// RFC 1738 says '#' is unsafe and fragments are NOT part of the URL.
+	// However, the parser includes '#' in URLs (common modern practice).
+	// =====================================================================
+
+	it("should include # in URLs (deviates from RFC 1738 which says # is unsafe)", function() {
+		// RFC 1738 §2.2: '#' is unsafe, fragment identifiers are not part of the URL
+		// Parser behavior: includes # and everything after it as part of the URL
+		expect(getFirstLinkHref("https://example.com#anchor")).toEqual("https://example.com#anchor");
+	});
+
+	it("should drop bare trailing # due to word boundary anchor", function() {
+		// '#' is non-word, no word char follows, so regex (?:\/|\b) backtracks
+		expect(getFirstLinkHref("https://example.com/#")).toEqual("https://example.com/");
+	});
+
+	it("should include # with query in reverse order (technically invalid per RFC 1738)", function() {
+		expect(getFirstLinkHref("https://example.com/path#frag?query=1")).toEqual("https://example.com/path#frag?query=1");
+	});
+
+	// =====================================================================
+	// Balanced parenthesis handling
+	// The parser extends regex matches to consume balanced closing parens.
+	// Parentheses are safe chars per RFC 1738, so they are valid in URLs.
+	// =====================================================================
+
+	it("should not include surrounding parentheses as part of the URL", function() {
+		// Common prose pattern: (https://example.com) — parens belong to prose, not URL
+		expect(getFirstLinkHref("(https://example.com)")).toEqual("https://example.com");
+		expect(getFirstLinkHref("visit (https://example.com) for info")).toEqual("https://example.com");
+	});
+
+	it("should not consume unmatched trailing close-parens", function() {
+		// URL has 1 open, text has 2 closes — only one should be consumed
+		expect(getFirstLinkHref("https://example.com/wiki/A_(B))")).toEqual("https://example.com/wiki/A_(B)");
+	});
+
+	it("should not consume any close-paren when URL has no open-parens", function() {
+		expect(getFirstLinkHref("https://example.com/path)")).toEqual("https://example.com/path");
+	});
+
+	it("should consume trailing close-paren to balance open-paren in URL", function() {
+		expect(getFirstLinkHref("https://example.com/(open)")).toEqual("https://example.com/(open)");
+	});
+
+	it("should handle many trailing close-parens after balanced content", function() {
+		// 2 opens, 5 closes: only 2 should be consumed by balanced-paren loop
+		expect(getFirstLinkHref("https://example.com/a(b(c)d))))")).toEqual("https://example.com/a(b(c)d)");
+	});
+
+	it("should handle URL with mixed parentheses and query/fragment", function() {
+		expect(getFirstLinkHref("https://example.com/wiki/A_(B)?action=edit#top")).toEqual("https://example.com/wiki/A_(B)?action=edit#top");
+	});
+
+	it("should handle URL with single open paren but no close paren", function() {
+		// Open paren is a safe char per RFC 1738 — included as-is
+		expect(getFirstLinkHref("https://example.com/path(open")).toEqual("https://example.com/path(open");
+	});
+
+	it("should handle URL surrounded by double parentheses", function() {
+		var href = getFirstLinkHref("((https://example.com))");
+		// Parser may or may not find the link depending on rule ordering
+		expect(href === null || href === "https://example.com").toBe(true);
+	});
+
+	// =====================================================================
+	// Regex word boundary anchor (?:\/|\b) edge cases
+	// The regex requires URLs to end with '/' or at a word boundary.
+	// This causes truncation when a URL ends with non-word characters
+	// that aren't followed by word characters.
+	// =====================================================================
+
+	it("should not include empty parentheses due to word boundary anchor", function() {
+		// '(' and ')' are non-word; regex backtracks to trailing '/'
+		// RFC 1738: () are safe chars and should be valid, but the regex anchor prevents it
+		expect(getFirstLinkHref("https://example.com/()")).toEqual("https://example.com/");
+	});
+
+	it("should truncate data URIs at trailing = due to word boundary anchor", function() {
+		// '=' is a reserved char per RFC 1738 and should be valid in URLs.
+		// The regex (?:\/|\b) anchor backtracks past '=' since it's non-word
+		// followed by end-of-string (no word boundary).
+		expect(getFirstLinkHref("data:text/plain;base64,SGVsbG8=")).toEqual("data:text/plain;base64,SGVsbG8");
+	});
+
+	it("should match bare protocol https:// (regex allows it since // ends with /)", function() {
+		// RFC 1738 §3.3: HTTP requires <host>, so https:// is invalid per spec.
+		// Parser matches it because '//' satisfies the (?:\/|\b) anchor.
+		expect(getFirstLinkHref("https://")).toEqual("https://");
+	});
+
+	it("should handle URL ending with trailing period", function() {
+		// '.' is a safe char per RFC 1738, valid in URLs
+		// But trailing period at end of sentence is ambiguous
+		var href = getFirstLinkHref("Visit https://example.com/path.");
+		expect(href).not.toBeNull();
+	});
+
+	it("should handle URL ending with a comma", function() {
+		// ',' is a safe char per RFC 1738, valid in URLs
+		var href = getFirstLinkHref("See https://example.com/path, for details");
+		expect(href).not.toBeNull();
+	});
+
+	// =====================================================================
+	// Tilde (~) suppression
+	// TiddlyWiki uses ~ prefix to suppress automatic linking.
+	// Note: ~ is an unsafe char per RFC 1738.
+	// =====================================================================
+
+	it("should suppress links preceded by ~", function() {
+		expect(getFirstLinkHref("~https://example.com")).toEqual(null);
+	});
+
+	it("should suppress link with ~ even if URL has parens", function() {
+		expect(getFirstLinkHref("~https://en.wikipedia.org/wiki/Specials_(Unicode_block)")).toEqual(null);
+	});
+
+	it("should handle adjacent suppressed and real links", function() {
+		// First URL is suppressed with ~, second should be found
+		expect(getFirstLinkHref("~https://suppressed.com https://real.com")).toEqual("https://real.com");
+	});
+
+	it("should handle tilde in URL path (not at start — no suppression)", function() {
+		// ~ in the path is not a suppression prefix
+		// RFC 1738: ~ is unsafe, but commonly used in practice for user dirs
+		expect(getFirstLinkHref("https://example.com/~user/page")).toEqual("https://example.com/~user/page");
+	});
+
+	// =====================================================================
+	// Miscellaneous edge cases
+	// =====================================================================
+
+	it("should handle multiple URLs in the same text (first link returned)", function() {
+		expect(getFirstLinkHref("See https://first.com and https://second.com")).toEqual("https://first.com");
+	});
+
+	it("should handle very long URLs", function() {
+		var longPath = "";
+		for(var i = 0; i < 200; i++) { longPath += "/seg" + i; }
+		var longUrl = "https://example.com" + longPath;
+		expect(getFirstLinkHref(longUrl)).toEqual(longUrl);
+	});
+
+	it("should handle URL with double slashes in path", function() {
+		expect(getFirstLinkHref("https://example.com//double//slash")).toEqual("https://example.com//double//slash");
+	});
+
+	it("should handle URL with unicode path segments", function() {
+		// RFC 1738 §2.2: non-ASCII chars should be encoded, but parser accepts them
+		expect(getFirstLinkHref("https://example.com/日本語/page")).toEqual("https://example.com/日本語/page");
+	});
+
+});