mirror of
https://github.com/Jermolene/TiddlyWiki5.git
synced 2026-03-18 04:41:12 -07:00
Fix URL parentheses rendering
This commit is contained in:
parent
dda4c7fb10
commit
0cde688945
2 changed files with 338 additions and 4 deletions
|
|
@ -30,21 +30,33 @@ exports.parse = function() {
|
|||
// Move past the match
|
||||
var start = this.parser.pos;
|
||||
this.parser.pos = this.matchRegExp.lastIndex;
|
||||
// Extend the match to include balanced closing parentheses
|
||||
var url = this.match[0];
|
||||
while(this.parser.pos < this.parser.sourceLength && this.parser.source.charAt(this.parser.pos) === ")") {
|
||||
var opens = (url.match(/\(/g) || []).length;
|
||||
var closes = (url.match(/\)/g) || []).length;
|
||||
if(opens > closes) {
|
||||
url += ")";
|
||||
this.parser.pos++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Create the link unless it is suppressed
|
||||
if(this.match[0].substr(0,1) === "~") {
|
||||
return [{type: "text", text: this.match[0].substr(1), start: start, end: this.parser.pos}];
|
||||
if(url.substr(0,1) === "~") {
|
||||
return [{type: "text", text: url.substr(1), start: start, end: this.parser.pos}];
|
||||
} else {
|
||||
return [{
|
||||
type: "element",
|
||||
tag: "a",
|
||||
attributes: {
|
||||
href: {type: "string", value: this.match[0]},
|
||||
href: {type: "string", value: url},
|
||||
"class": {type: "string", value: "tc-tiddlylink-external"},
|
||||
target: {type: "string", value: "_blank"},
|
||||
rel: {type: "string", value: "noopener noreferrer"}
|
||||
},
|
||||
children: [{
|
||||
type: "text", text: this.match[0], start: start, end: this.parser.pos
|
||||
type: "text", text: url, start: start, end: this.parser.pos
|
||||
}]
|
||||
}];
|
||||
}
|
||||
|
|
|
|||
322
editions/test/tiddlers/tests/test-extlink-parser.js
Normal file
322
editions/test/tiddlers/tests/test-extlink-parser.js
Normal file
|
|
@ -0,0 +1,322 @@
|
|||
/*\
|
||||
title: test-extlink-parser.js
|
||||
type: application/javascript
|
||||
tags: [[$:/tags/test-spec]]
|
||||
|
||||
Tests for the extlink wikitext parser rule.
|
||||
URL validity is judged against RFC 1738 (https://datatracker.ietf.org/doc/html/rfc1738).
|
||||
|
||||
RFC 1738 character classes:
|
||||
safe = "$" | "-" | "_" | "." | "+"
extra = "!" | "*" | "'" | "(" | ")" | ","
(the tests in this file refer to the safe and extra classes together as "safe characters")
|
||||
reserved = ";" | "/" | "?" | ":" | "@" | "=" | "&"
|
||||
unsafe = " " | "<" | ">" | <"> | "#" | "%" | "{" | "}" | "|" | "\" | "^" | "~" | "[" | "]" | "`"
|
||||
escape = "%" hex hex
|
||||
unreserved = alpha | digit | safe | extra
|
||||
uchar = unreserved | escape
|
||||
xchar = unreserved | reserved | escape
|
||||
|
||||
HTTP URL syntax: http://<host>:<port>/<path>?<searchpart>
|
||||
FTP URL syntax: ftp://<user>:<password>@<host>:<port>/<cwd1>/.../<cwdN>/<name>;type=<typecode>
|
||||
FILE URL syntax: file://<host>/<path>
|
||||
MAILTO URL syntax: mailto:<rfc822-addr-spec>
|
||||
|
||||
Note: RFC 1738 does not define https: or data: schemes (those came later).
|
||||
The '#' character is explicitly unsafe per RFC 1738 — fragment identifiers
|
||||
are not considered part of the URL.
|
||||
|
||||
\*/
|
||||
|
||||
"use strict";
|
||||
|
||||
describe("ExtLink parser tests", function() {

	// Fresh wiki instance used only to run the wikitext parser
	var wiki = new $tw.Wiki();

	/*
	Parse `text` as wikitext ("text/vnd.tiddlywiki") and return the href of the first link found.

	Only the direct children of the first top-level node of the parse tree are inspected
	(the extlink rule is expected to produce: p > a[href]). Links nested deeper, or located
	in later top-level nodes, are not seen by this helper.

	Returns the href attribute value of the first <a> element found, or null if there is none.
	*/
	var getFirstLinkHref = function(text) {
		var tree = wiki.parseText("text/vnd.tiddlywiki", text).tree;
		if(tree[0] && tree[0].children) {
			for(var i = 0; i < tree[0].children.length; i++) {
				var node = tree[0].children[i];
				if(node.type === "element" && node.tag === "a" && node.attributes && node.attributes.href) {
					return node.attributes.href.value;
				}
			}
		}
		return null;
	};

	// =====================================================================
	// Basic valid URLs (RFC 1738 compliant)
	// =====================================================================

	it("should parse basic HTTP and HTTPS URLs", function() {
		// RFC 1738 §3.3: http://<host>:<port>/<path>?<searchpart>
		expect(getFirstLinkHref("https://www.tiddlywiki.com/")).toEqual("https://www.tiddlywiki.com/");
		expect(getFirstLinkHref("http://example.com/path")).toEqual("http://example.com/path");
	});

	it("should parse URL that is just protocol and domain", function() {
		expect(getFirstLinkHref("https://example.com")).toEqual("https://example.com");
	});

	it("should parse URLs with port numbers", function() {
		// RFC 1738 §3.3: port is optional, defaults to 80 for HTTP
		expect(getFirstLinkHref("https://localhost:8080/path")).toEqual("https://localhost:8080/path");
	});

	it("should parse other RFC 1738 schemes", function() {
		// RFC 1738 §3.2: FTP
		expect(getFirstLinkHref("ftp://files.example.com/pub/")).toEqual("ftp://files.example.com/pub/");
		// RFC 1738 §3.5: MAILTO
		expect(getFirstLinkHref("mailto:user@example.com")).toEqual("mailto:user@example.com");
		// RFC 1738 §3.10: FILE
		expect(getFirstLinkHref("file:///home/user/doc.txt")).toEqual("file:///home/user/doc.txt");
	});

	// =====================================================================
	// RFC 1738 safe characters: $ - _ . + ! * ' ( ) ,
	// These should all be valid within a URL path
	// =====================================================================

	it("should include safe characters in URL paths per RFC 1738", function() {
		// Parentheses are RFC 1738 safe characters
		expect(getFirstLinkHref("https://en.wikipedia.org/wiki/Specials_(Unicode_block)")).toEqual("https://en.wikipedia.org/wiki/Specials_(Unicode_block)");
		expect(getFirstLinkHref("https://example.com/wiki/A_(B)_C")).toEqual("https://example.com/wiki/A_(B)_C");
		// Plus sign is a safe character
		expect(getFirstLinkHref("https://example.com/search?q=hello+world")).toEqual("https://example.com/search?q=hello+world");
		// Exclamation mark is a safe character
		expect(getFirstLinkHref("https://example.com/path!/sub")).toEqual("https://example.com/path!/sub");
	});

	it("should handle deeply nested balanced parentheses", function() {
		// Parentheses are safe chars per RFC 1738; nesting should work
		expect(getFirstLinkHref("https://example.com/a(b(c(d)e)f)g")).toEqual("https://example.com/a(b(c(d)e)f)g");
	});

	it("should handle consecutive parenthesized segments", function() {
		expect(getFirstLinkHref("https://example.com/(a)(b)(c)")).toEqual("https://example.com/(a)(b)(c)");
	});

	// =====================================================================
	// RFC 1738 reserved characters: ; / ? : @ = &
	// Allowed unencoded when used for their reserved purpose
	// =====================================================================

	it("should include reserved characters used in query strings", function() {
		// RFC 1738 §3.3: <searchpart> uses reserved chars ? = &
		expect(getFirstLinkHref("https://example.com/path?foo=bar&baz=qux")).toEqual("https://example.com/path?foo=bar&baz=qux");
		expect(getFirstLinkHref("https://example.com?q=hello")).toEqual("https://example.com?q=hello");
	});

	it("should include reserved characters used for authentication", function() {
		// RFC 1738 §3.1: //<user>:<password>@<host>
		expect(getFirstLinkHref("https://user:password@example.com/path")).toEqual("https://user:password@example.com/path");
	});

	it("should include @ in path segments", function() {
		expect(getFirstLinkHref("https://example.com/@user/repo")).toEqual("https://example.com/@user/repo");
	});

	it("should include semicolons in URLs", function() {
		// RFC 1738 §3.2: FTP uses ;type=
		expect(getFirstLinkHref("https://example.com/path;param=val")).toEqual("https://example.com/path;param=val");
	});

	// =====================================================================
	// RFC 1738 unsafe characters: space < > " # % { } | \ ^ ~ [ ] `
	// The parser should exclude these from matched URLs
	// =====================================================================

	it("should stop at RFC 1738 unsafe characters", function() {
		// RFC 1738 §2.2: Characters that are unsafe for various reasons
		expect(getFirstLinkHref("https://example.com/path<rest")).toEqual("https://example.com/path");
		expect(getFirstLinkHref("https://example.com/path>rest")).toEqual("https://example.com/path");
		expect(getFirstLinkHref("https://example.com/path{rest")).toEqual("https://example.com/path");
		expect(getFirstLinkHref("https://example.com/path}rest")).toEqual("https://example.com/path");
		expect(getFirstLinkHref("https://example.com/path[rest")).toEqual("https://example.com/path");
		expect(getFirstLinkHref("https://example.com/path]rest")).toEqual("https://example.com/path");
		expect(getFirstLinkHref("https://example.com/path`rest")).toEqual("https://example.com/path");
		expect(getFirstLinkHref("https://example.com/path|rest")).toEqual("https://example.com/path");
		expect(getFirstLinkHref('https://example.com/path"rest')).toEqual("https://example.com/path");
		expect(getFirstLinkHref("https://example.com/path\\rest")).toEqual("https://example.com/path");
		expect(getFirstLinkHref("https://example.com/path^rest")).toEqual("https://example.com/path");
	});

	it("should stop at whitespace (space is unsafe per RFC 1738)", function() {
		expect(getFirstLinkHref("https://example.com/path rest")).toEqual("https://example.com/path");
		expect(getFirstLinkHref("https://example.com/path\nnext line")).toEqual("https://example.com/path");
	});

	it("should handle URL wrapped in unsafe delimiters", function() {
		// RFC 1738 §2.2: angle brackets and square brackets are unsafe
		// Tests that leading unsafe chars don't prevent URL matching
		expect(getFirstLinkHref("<https://example.com>")).toEqual("https://example.com");
		expect(getFirstLinkHref("[https://example.com]")).toEqual("https://example.com");
	});

	// =====================================================================
	// RFC 1738 escape sequences (%HH)
	// =====================================================================

	it("should include percent-encoded characters", function() {
		// RFC 1738 §2.2: unsafe chars should be encoded as %HH
		expect(getFirstLinkHref("https://example.com/path%20with%20spaces/file%2Fname")).toEqual("https://example.com/path%20with%20spaces/file%2Fname");
	});

	// =====================================================================
	// Fragment identifiers (#)
	// RFC 1738 says '#' is unsafe and fragments are NOT part of the URL.
	// However, the parser includes '#' in URLs (common modern practice).
	// =====================================================================

	it("should include # in URLs (deviates from RFC 1738 which says # is unsafe)", function() {
		// RFC 1738 §2.2: '#' is unsafe, fragment identifiers are not part of the URL
		// Parser behavior: includes # and everything after it as part of the URL
		expect(getFirstLinkHref("https://example.com#anchor")).toEqual("https://example.com#anchor");
	});

	it("should drop bare trailing # due to word boundary anchor", function() {
		// '#' is non-word, no word char follows, so regex (?:\/|\b) backtracks
		expect(getFirstLinkHref("https://example.com/#")).toEqual("https://example.com/");
	});

	it("should include # with query in reverse order (technically invalid per RFC 1738)", function() {
		expect(getFirstLinkHref("https://example.com/path#frag?query=1")).toEqual("https://example.com/path#frag?query=1");
	});

	// =====================================================================
	// Balanced parenthesis handling
	// The parser extends regex matches to consume balanced closing parens.
	// Parentheses are safe chars per RFC 1738, so they are valid in URLs.
	// =====================================================================

	it("should not include surrounding parentheses as part of the URL", function() {
		// Common prose pattern: (https://example.com) — parens belong to prose, not URL
		expect(getFirstLinkHref("(https://example.com)")).toEqual("https://example.com");
		expect(getFirstLinkHref("visit (https://example.com) for info")).toEqual("https://example.com");
	});

	it("should not consume unmatched trailing close-parens", function() {
		// URL has 1 open, text has 2 closes — only one should be consumed
		expect(getFirstLinkHref("https://example.com/wiki/A_(B))")).toEqual("https://example.com/wiki/A_(B)");
	});

	it("should not consume any close-paren when URL has no open-parens", function() {
		expect(getFirstLinkHref("https://example.com/path)")).toEqual("https://example.com/path");
	});

	it("should consume trailing close-paren to balance open-paren in URL", function() {
		expect(getFirstLinkHref("https://example.com/(open)")).toEqual("https://example.com/(open)");
	});

	it("should handle many trailing close-parens after balanced content", function() {
		// 2 opens, 5 closes: only the 2 closes needed to balance the opens end up in the URL
		// (the first is part of the regex match, the second is added by the balanced-paren loop)
		expect(getFirstLinkHref("https://example.com/a(b(c)d))))")).toEqual("https://example.com/a(b(c)d)");
	});

	it("should handle URL with mixed parentheses and query/fragment", function() {
		expect(getFirstLinkHref("https://example.com/wiki/A_(B)?action=edit#top")).toEqual("https://example.com/wiki/A_(B)?action=edit#top");
	});

	it("should handle URL with single open paren but no close paren", function() {
		// Open paren is a safe char per RFC 1738 — included as-is
		expect(getFirstLinkHref("https://example.com/path(open")).toEqual("https://example.com/path(open");
	});

	it("should handle URL surrounded by double parentheses", function() {
		var href = getFirstLinkHref("((https://example.com))");
		// Parser may or may not find the link depending on rule ordering
		expect(href === null || href === "https://example.com").toBe(true);
	});

	// =====================================================================
	// Regex word boundary anchor (?:\/|\b) edge cases
	// The regex requires URLs to end with '/' or at a word boundary.
	// This causes truncation when a URL ends with non-word characters
	// that aren't followed by word characters.
	// =====================================================================

	it("should not include empty parentheses due to word boundary anchor", function() {
		// '(' and ')' are non-word; regex backtracks to trailing '/'
		// RFC 1738: () are safe chars and should be valid, but the regex anchor prevents it
		expect(getFirstLinkHref("https://example.com/()")).toEqual("https://example.com/");
	});

	it("should truncate data URIs at trailing = due to word boundary anchor", function() {
		// '=' is a reserved char per RFC 1738 and should be valid in URLs.
		// The regex (?:\/|\b) anchor backtracks past '=' since it's non-word
		// followed by end-of-string (no word boundary).
		expect(getFirstLinkHref("data:text/plain;base64,SGVsbG8=")).toEqual("data:text/plain;base64,SGVsbG8");
	});

	it("should match bare protocol https:// (regex allows it since // ends with /)", function() {
		// RFC 1738 §3.3: HTTP requires <host>, so https:// is invalid per spec.
		// Parser matches it because '//' satisfies the (?:\/|\b) anchor.
		expect(getFirstLinkHref("https://")).toEqual("https://");
	});

	it("should handle URL ending with trailing period", function() {
		// '.' is a safe char per RFC 1738, valid in URLs, but a trailing period at the
		// end of a sentence is ambiguous. Here '.' is a non-word char followed by
		// end-of-text, so the (?:\/|\b) anchor backtracks and leaves it out of the link.
		expect(getFirstLinkHref("Visit https://example.com/path.")).toEqual("https://example.com/path");
	});

	it("should handle URL ending with a comma", function() {
		// ',' is a safe char per RFC 1738, valid in URLs. Here it is a non-word char
		// followed by a space, so the (?:\/|\b) anchor backtracks and leaves it out of the link.
		expect(getFirstLinkHref("See https://example.com/path, for details")).toEqual("https://example.com/path");
	});

	// =====================================================================
	// Tilde (~) suppression
	// TiddlyWiki uses ~ prefix to suppress automatic linking.
	// Note: ~ is an unsafe char per RFC 1738.
	// =====================================================================

	it("should suppress links preceded by ~", function() {
		expect(getFirstLinkHref("~https://example.com")).toBeNull();
	});

	it("should suppress link with ~ even if URL has parens", function() {
		expect(getFirstLinkHref("~https://en.wikipedia.org/wiki/Specials_(Unicode_block)")).toBeNull();
	});

	it("should handle adjacent suppressed and real links", function() {
		// First URL is suppressed with ~, second should be found
		expect(getFirstLinkHref("~https://suppressed.com https://real.com")).toEqual("https://real.com");
	});

	it("should handle tilde in URL path (not at start — no suppression)", function() {
		// ~ in the path is not a suppression prefix
		// RFC 1738: ~ is unsafe, but commonly used in practice for user dirs
		expect(getFirstLinkHref("https://example.com/~user/page")).toEqual("https://example.com/~user/page");
	});

	// =====================================================================
	// Miscellaneous edge cases
	// =====================================================================

	it("should handle multiple URLs in the same text (first link returned)", function() {
		expect(getFirstLinkHref("See https://first.com and https://second.com")).toEqual("https://first.com");
	});

	it("should handle very long URLs", function() {
		// Build a path of 200 segments: /seg0/seg1/.../seg199
		var longPath = "";
		for(var i = 0; i < 200; i++) { longPath += "/seg" + i; }
		var longUrl = "https://example.com" + longPath;
		expect(getFirstLinkHref(longUrl)).toEqual(longUrl);
	});

	it("should handle URL with double slashes in path", function() {
		expect(getFirstLinkHref("https://example.com//double//slash")).toEqual("https://example.com//double//slash");
	});

	it("should handle URL with unicode path segments", function() {
		// RFC 1738 §2.2: non-ASCII chars should be encoded, but parser accepts them
		expect(getFirstLinkHref("https://example.com/日本語/page")).toEqual("https://example.com/日本語/page");
	});

});
|
||||
Loading…
Add table
Add a link
Reference in a new issue