Merge pull request #2397 from thelounge/xpaw/linkify-it

Switch to `linkify-it` for url detection in messages
2024-05-20 07:06:36 +02:00 · 2018-05-26 16:44:18 -04:00 · 2018-05-26 16:44:18 -04:00 · 19cce592c8
parent 0277ac1a73 97dfdbf7c0
commit 19cce592c8
7 changed files with 343 additions and 102 deletions
--- a/client/js/libs/handlebars/ircmessageparser/findLinks.js
+++ b/client/js/libs/handlebars/ircmessageparser/findLinks.js
@ -1,13 +1,30 @@
 "use strict";

-const URI = require("urijs");
+const LinkifyIt = require("linkify-it");

-// Known schemes to detect in a text. If a text contains `foo...bar://foo.com`,
-// the parsed scheme should be `foo...bar` but if it contains
-// `foo...http://foo.com`, we assume the scheme to extract will be `http`.
+LinkifyIt.prototype.normalize = function normalize(match) {
+	if (!match.schema) {
+		match.schema = "https:";
+		match.url = "https://" + match.url;
+	}
+
+	if (match.schema === "//") {
+		match.schema = "https:";
+		match.url = "https:" + match.url;
+	}
+
+	if (match.schema === "mailto:" && !/^mailto:/i.test(match.url)) {
+		match.url = "mailto:" + match.url;
+	}
+};
+
+const linkify = LinkifyIt()
+	.tlds(require("tlds"))
+	.tlds("onion", true);
+
+// Known schemes to detect in text
 const commonSchemes = [
-	"http", "https",
-	"ftp", "sftp",
+	"sftp",
 	"smb", "file",
 	"irc", "ircs",
 	"svn", "git",
@ -15,54 +32,22 @@ const commonSchemes = [
 	"svn+ssh", "ssh",
 ];

+for (const schema of commonSchemes) {
+	linkify.add(schema + ":", "http:");
+}
+
 function findLinks(text) {
-	const result = [];
+	const matches = linkify.match(text);

-	// URI.withinString() identifies URIs within text, e.g. to translate them to
-	// <a>-Tags.
-	// See https://medialize.github.io/URI.js/docs.html#static-withinString
-	// In our case, we store each URI encountered in a result array.
-	try {
-		URI.withinString(text, function(url, start, end) {
-			let parsedScheme;
-
-			try {
-				// Extract the scheme of the URL detected, if there is one
-				parsedScheme = URI(url).scheme().toLowerCase();
-			} catch (e) {
-				// URI may throw an exception for malformed urls,
-				// as to why withinString finds these in the first place is a mystery
-				return;
-			}
-
-			// Check if the scheme of the detected URL matches a common one above.
-			// In a URL like `foo..http://example.com`, the scheme would be `foo..http`,
-			// so we need to clean up the end of the scheme and filter out the rest.
-			const matchedScheme = commonSchemes.find((scheme) => parsedScheme.endsWith(scheme));
-
-			// A known scheme was found, extract the unknown part from the URL
-			if (matchedScheme) {
-				const prefix = parsedScheme.length - matchedScheme.length;
-				start += prefix;
-				url = url.slice(prefix);
-			}
-
-			// The URL matched but does not start with a scheme (`www.foo.com`), add it
-			if (!parsedScheme.length) {
-				url = "http://" + url;
-			}
-
-			result.push({
-				start: start,
-				end: end,
-				link: url,
-			});
-		});
-	} catch (e) {
-		// withinString is wrapped in a try/catch due to https://github.com/medialize/URI.js/issues/359
+	if (!matches) {
+		return [];
 	}

-	return result;
+	return matches.map((url) => ({
+		start: url.index,
+		end: url.lastIndex,
+		link: url.url,
+	}));
 }

 module.exports = findLinks;
--- a/package.json
+++ b/package.json
@ -36,7 +36,7 @@
  ],
  "license": "MIT",
  "engines": {
-    "node": ">=6"
+    "node": ">=6.13.0"
  },
  "dependencies": {
    "bcryptjs": "2.4.3",
@ -46,6 +46,7 @@
    "express": "4.16.3",
    "fs-extra": "6.0.1",
    "irc-framework": "2.11.0",
+    "linkify-it": "2.0.3",
    "lodash": "4.17.10",
    "mime-types": "2.1.18",
    "moment": "2.22.1",
@ -58,6 +59,7 @@
    "spdy": "3.4.7",
    "sqlite3": "4.0.0",
    "thelounge-ldapjs-non-maintained-fork": "1.0.2",
+    "tlds": "1.203.1",
    "ua-parser-js": "0.7.18",
    "urijs": "1.19.1",
    "uuid": "3.2.1",
--- a/src/plugins/irc-events/link.js
+++ b/src/plugins/irc-events/link.js
@ -2,8 +2,7 @@

 const cheerio = require("cheerio");
 const request = require("request");
-const url = require("url");
-const URI = require("urijs");
+const URL = require("url").URL;
 const mime = require("mime-types");
 const Helper = require("../../helper");
 const cleanIrcMessage = require("../../../client/js/libs/handlebars/ircmessageparser/cleanIrcMessage");
@ -11,7 +10,6 @@ const findLinks = require("../../../client/js/libs/handlebars/ircmessageparser/f
 const storage = require("../storage");

 const mediaTypeRegex = /^(audio|video)\/.+/;
-const linkRegex = /^https?:\/\//;

 // Fix ECDH curve client compatibility in Node v8/v9
 // This is fixed in Node 10, but The Lounge supports LTS versions
@ -33,26 +31,36 @@ module.exports = function(client, chan, msg) {
 	// Remove all IRC formatting characters before searching for links
 	const cleanText = cleanIrcMessage(msg.text);

-	// We will only try to prefetch http(s) links
-	const links = findLinks(cleanText).filter((w) => linkRegex.test(w.link));
+	msg.previews = findLinks(cleanText).reduce((cleanLinks, link) => {
+		const url = normalizeURL(link.link);

-	if (links.length === 0) {
-		return;
-	}
+		// If the URL is invalid and cannot be normalized, don't fetch it
+		if (url === null) {
+			return cleanLinks;
+		}

-	msg.previews = Array.from(new Set( // Remove duplicate links
-		links.map((link) => link.link)
-	)).map((link) => ({
-		type: "loading",
-		head: "",
-		body: "",
-		thumb: "",
-		link: link,
-		shown: true,
-	})).slice(0, 5); // Only preview the first 5 URLs in message to avoid abuse
+		// Only fetch the first 5 valid links in a message to avoid abuse
+		if (cleanLinks.length > 4) {
+			return cleanLinks;
+		}

-	msg.previews.forEach((preview) => {
-		fetch(normalizeURL(preview.link), {
+		// Do not fetch the same link twice
+		if (cleanLinks.some((l) => l.link === link.link)) {
+			return cleanLinks;
+		}
+
+		const preview = {
+			type: "loading",
+			head: "",
+			body: "",
+			thumb: "",
+			link: link.link, // Send original matched link to the client
+			shown: true,
+		};
+
+		cleanLinks.push(preview);
+
+		fetch(url, {
 			accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
 			language: client.language,
 		}, function(res, err) {
@ -69,7 +77,9 @@ module.exports = function(client, chan, msg) {

 			parse(msg, preview, res, client);
 		});
-	});
+
+		return cleanLinks;
+	}, []);
 };

 function parseHtml(preview, res, client) {
@ -94,18 +104,14 @@ function parseHtml(preview, res, client) {
 					|| $('link[rel="image_src"]').attr("href")
 					|| "";

+				// Make sure thumbnail is a valid and absolute url
 				if (preview.thumb.length) {
-					preview.thumb = url.resolve(preview.link, preview.thumb);
-				}
-
-				// Make sure thumbnail is a valid url
-				if (!linkRegex.test(preview.thumb)) {
-					preview.thumb = "";
+					preview.thumb = normalizeURL(preview.thumb, preview.link) || "";
 				}

 				// Verify that thumbnail pic exists and is under allowed size
 				if (preview.thumb.length) {
-					fetch(normalizeURL(preview.thumb), {language: client.language}, (resThumb) => {
+					fetch(preview.thumb, {language: client.language}, (resThumb) => {
 						if (resThumb === null
 						|| !(/^image\/.+/.test(resThumb.type))
 						|| resThumb.size > (Helper.config.prefetchMaxImageSize * 1024)) {
@ -135,16 +141,19 @@ function parseHtmlMedia($, preview, res, client) {

 				if (mediaTypeRegex.test(mimeType)) {
 					// If we match a clean video or audio tag, parse that as a preview instead
-					const mediaUrl = $($(`meta[property="og:${type}"]`).get(i)).attr("content");
+					let mediaUrl = $($(`meta[property="og:${type}"]`).get(i)).attr("content");

 					// Make sure media is a valid url
-					if (!mediaUrl.startsWith("https://")) {
+					mediaUrl = normalizeURL(mediaUrl, preview.link, true);
+
+					// Skip media that did not resolve to a valid https url
+					if (!mediaUrl) {
 						return;
 					}

 					foundMedia = true;

-					fetch(normalizeURL(mediaUrl), {
+					fetch(mediaUrl, {
 						accept: type === "video" ?
 							"video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5" :
 							"audio/webm, audio/ogg, audio/wav, audio/*;q=0.9, application/ogg;q=0.7, video/*;q=0.6; */*;q=0.5",
@ -361,6 +370,31 @@ function fetch(uri, headers, cb) {
 		});
 }

-function normalizeURL(header) {
-	return URI(header).normalize().toString();
+function normalizeURL(link, baseLink, disallowHttp = false) {
+	try {
+		const url = new URL(link, baseLink);
+
+		// Only fetch http and https links
+		if (url.protocol !== "http:" && url.protocol !== "https:") {
+			return null;
+		}
+
+		if (disallowHttp && url.protocol === "http:") {
+			return null;
+		}
+
+		// Do not fetch links without a hostname or ones that contain credentials (username or password)
+		if (!url.hostname || url.username || url.password) {
+			return null;
+		}
+
+		// Drop hash from the url, if any
+		url.hash = "";
+
+		return url.toString();
+	} catch (e) {
+		// if an exception was thrown, the url is not valid
+	}
+
+	return null;
 }
--- a/test/client/js/libs/handlebars/ircmessageparser/findLinks.js
+++ b/test/client/js/libs/handlebars/ircmessageparser/findLinks.js
@ -22,7 +22,7 @@ describe("findLinks", () => {
 		const expected = [{
 			start: 0,
 			end: 24,
-			link: "http://www.nooooooooooooooo.com",
+			link: "https://www.nooooooooooooooo.com",
 		}];

 		const actual = findLinks(input);
@ -46,7 +46,7 @@ describe("findLinks", () => {
 	it("should find urls in strings starting with www", () => {
 		const input = "use www.duckduckgo.com for privacy reasons";
 		const expected = [{
-			link: "http://www.duckduckgo.com",
+			link: "https://www.duckduckgo.com",
 			start: 4,
 			end: 22,
 		}];
@ -69,12 +69,12 @@ describe("findLinks", () => {
 		expect(actual).to.deep.equal(expected);
 	});

-	it("should find urls with starting with www. and odd surroundings", () => {
-		const input = ".:www.github.com:.";
+	it("should find urls with starting with http:// and odd surroundings", () => {
+		const input = ".:http://www.github.com:. .:www.github.com:.";
 		const expected = [{
 			link: "http://www.github.com",
 			start: 2,
-			end: 16,
+			end: 23,
 		}];

 		const actual = findLinks(input);
@ -94,7 +94,7 @@ describe("findLinks", () => {
 	it("should handle multiple www. correctly", () => {
 		const input = "www.www.test.com";
 		const expected = [{
-			link: "http://www.www.test.com",
+			link: "https://www.www.test.com",
 			start: 0,
 			end: 16,
 		}];
@ -104,16 +104,150 @@ describe("findLinks", () => {
 		expect(actual).to.deep.equal(expected);
 	});

+	it("should find domains without www. but valid tld", () => {
+		const input = "google.com google.lv google.museum";
+		const expected = [{
+			link: "https://google.com",
+			start: 0,
+			end: 10,
+		}, {
+			link: "https://google.lv",
+			start: 11,
+			end: 20,
+		}, {
+			link: "https://google.museum",
+			start: 21,
+			end: 34,
+		}];
+
+		const actual = findLinks(input);
+
+		expect(actual).to.deep.equal(expected);
+	});
+
+	it("should find .onion domains", () => {
+		const input = "facebookcorewwwi.onion/test?url";
+		const expected = [{
+			link: "https://facebookcorewwwi.onion/test?url",
+			start: 0,
+			end: 31,
+		}];
+
+		const actual = findLinks(input);
+
+		expect(actual).to.deep.equal(expected);
+	});
+
+	it("should not consider invalid TLDs as domains", () => {
+		const input = "google.wtfgugl google.xx www.google.wtfgugl www.google.xx";
+		const expected = [];
+
+		const actual = findLinks(input);
+
+		expect(actual).to.deep.equal(expected);
+	});
+
+	it("should consider invalid TLDs as domains if protocol is specified", () => {
+		const input = "http://google.wtfgugl http://google.xx http://www.google.wtfgugl http://www.google.xx";
+		const expected = [{
+			link: "http://google.wtfgugl",
+			start: 0,
+			end: 21,
+		}, {
+			link: "http://google.xx",
+			start: 22,
+			end: 38,
+		}, {
+			link: "http://www.google.wtfgugl",
+			start: 39,
+			end: 64,
+		}, {
+			link: "http://www.google.xx",
+			start: 65,
+			end: 85,
+		}];
+
+		const actual = findLinks(input);
+
+		expect(actual).to.deep.equal(expected);
+	});
+
+	it("should correctly stop at punctuation", () => { // Issue #2351
+		const input =
+			"https://en.wikipedia.org/wiki/Dig! " +
+			"https://en.wikipedia.org/wiki/Dig? " +
+			"https://en.wikipedia.org/wiki/Dig. " +
+			"https://www.google.com* " +
+			"https://www.google.com/test* " +
+			"https://www.google.com@ " +
+			"https://www.google.com/test@ " +
+			"https://www.google.com! ";
+		const expected = [{
+			link: "https://en.wikipedia.org/wiki/Dig",
+			start: 0,
+			end: 33,
+		}, {
+			link: "https://en.wikipedia.org/wiki/Dig",
+			start: 35,
+			end: 68,
+		}, {
+			link: "https://en.wikipedia.org/wiki/Dig",
+			start: 70,
+			end: 103,
+		}, {
+			link: "https://www.google.com",
+			start: 105,
+			end: 127,
+		}, {
+			link: "https://www.google.com/test*",
+			start: 129,
+			end: 157,
+		}, {
+			link: "https://www.google.com",
+			start: 158,
+			end: 180,
+		}, {
+			link: "https://www.google.com/test@",
+			start: 182,
+			end: 210,
+		}, {
+			link: "https://www.google.com",
+			start: 211,
+			end: 233,
+		}];
+
+		const actual = findLinks(input);
+
+		expect(actual).to.deep.equal(expected);
+	});
+
+	it("should correctly stop at apostrophe", () => {
+		const input = "https://www.google.com's www.google.com's google.com's"; // Issue #1302
+		const expected = [{
+			link: "https://www.google.com",
+			start: 0,
+			end: 22,
+		}, {
+			link: "https://www.google.com",
+			start: 25,
+			end: 39,
+		}, {
+			link: "https://google.com",
+			start: 42,
+			end: 52,
+		}];
+
+		const actual = findLinks(input);
+
+		expect(actual).to.deep.equal(expected);
+	});
+
 	it("does not find invalid urls", () => {
 		const input = "www.example.com ssh://-oProxyCommand=whois"; // Issue #1412
 		const expected = [{
 			start: 0,
 			end: 15,
-			link: "http://www.example.com",
-		}, {
-			end: 42,
-			start: 16,
-			link: "ssh://-oProxyCommand=whois",
+			link: "https://www.example.com",
 		}];

 		const actual = findLinks(input);
@ -124,7 +258,11 @@ describe("findLinks", () => {
 		const expected2 = [{
 			start: 0,
 			end: 15,
-			link: "http://www.example.com",
+			link: "https://www.example.com",
+		}, {
+			start: 16,
+			end: 57,
+			link: "http://root:'some%pass'@hostname/database",
 		}];

 		const actual2 = findLinks(input2);
@ -137,7 +275,11 @@ describe("findLinks", () => {
 		const expected = [{
 			start: 0,
 			end: 15,
-			link: "http://www.example.com",
+			link: "https://www.example.com",
+		}, {
+			start: 16,
+			end: 29,
+			link: "http://a:%p@c",
 		}, {
 			start: 30,
 			end: 51,
@ -148,4 +290,17 @@ describe("findLinks", () => {

 		expect(actual).to.deep.equal(expected);
 	});
+
+	it("should add protocol to protocol-aware urls", () => {
+		const input = "//example.com";
+		const expected = [{
+			link: "https://example.com",
+			start: 0,
+			end: 13,
+		}];
+
+		const actual = findLinks(input);
+
+		expect(actual).to.deep.equal(expected);
+	});
 });
--- a/test/client/js/libs/handlebars/parse.js
+++ b/test/client/js/libs/handlebars/parse.js
@ -7,7 +7,7 @@ describe("parse Handlebars helper", () => {
 	it("should not introduce xss", () => {
 		const testCases = [{
 			input: "<img onerror='location.href=\"//youtube.com\"'>",
-			expected: "&lt;img onerror&#x3D;&#x27;location.href&#x3D;&quot;//youtube.com&quot;&#x27;&gt;",
+			expected: "&lt;img onerror&#x3D;&#x27;location.href&#x3D;&quot;<a href=\"https://youtube.com\" target=\"_blank\" rel=\"noopener\">//youtube.com</a>&quot;&#x27;&gt;",
 		}, {
 			input: '#&">bug',
 			expected: '<span class="inline-channel" role="button" tabindex="0" data-chan="#&amp;&quot;&gt;bug">#&amp;&quot;&gt;bug</span>',
@ -41,7 +41,7 @@ describe("parse Handlebars helper", () => {
 		}, {
 			input: "www.nooooooooooooooo.com",
 			expected:
-				'<a href="http://www.nooooooooooooooo.com" target="_blank" rel="noopener">' +
+				'<a href="https://www.nooooooooooooooo.com" target="_blank" rel="noopener">' +
 					"www.nooooooooooooooo.com" +
 				"</a>",
 		}, {
@ -56,7 +56,7 @@ describe("parse Handlebars helper", () => {
 			input: "use www.duckduckgo.com for privacy reasons",
 			expected:
 				"use " +
-				'<a href="http://www.duckduckgo.com" target="_blank" rel="noopener">' +
+				'<a href="https://www.duckduckgo.com" target="_blank" rel="noopener">' +
 					"www.duckduckgo.com" +
 				"</a>" +
 				" for privacy reasons",
@ -101,7 +101,7 @@ describe("parse Handlebars helper", () => {
 			input: "abc (www.example.com)",
 			expected:
 				"abc (" +
-				'<a href="http://www.example.com" target="_blank" rel="noopener">' +
+				'<a href="https://www.example.com" target="_blank" rel="noopener">' +
 					"www.example.com" +
 				"</a>" +
 				")",
@ -114,7 +114,7 @@ describe("parse Handlebars helper", () => {
 		}, {
 			input: "www.example.com/Test_(Page)",
 			expected:
-				'<a href="http://www.example.com/Test_(Page)" target="_blank" rel="noopener">' +
+				'<a href="https://www.example.com/Test_(Page)" target="_blank" rel="noopener">' +
 					"www.example.com/Test_(Page)" +
 				"</a>",
 		}];
--- a/test/plugins/link.js
+++ b/test/plugins/link.js
@ -371,4 +371,55 @@ describe("Link plugin", function() {
 			}
 		});
 	});
+
+	it("should fetch protocol-aware links", function(done) {
+		const message = this.irc.createMessage({
+			text: "//localhost:9002",
+		});
+
+		link(this.irc, this.network.channels[0], message);
+
+		this.irc.once("msg:preview", function(data) {
+			expect(data.preview.link).to.equal("https://localhost:9002");
+			done();
+		});
+	});
+
+	it("should de-duplicate links", function(done) {
+		const message = this.irc.createMessage({
+			text: "//localhost:9002 https://localhost:9002 https://localhost:9002",
+		});
+
+		link(this.irc, this.network.channels[0], message);
+
+		expect(message.previews).to.deep.equal([{
+			type: "loading",
+			head: "",
+			body: "",
+			thumb: "",
+			link: "https://localhost:9002",
+			shown: true,
+		}]);
+
+		this.irc.once("msg:preview", function(data) {
+			expect(data.preview.link).to.equal("https://localhost:9002");
+			done();
+		});
+	});
+
+	it("should not try to fetch links with wrong protocol", function() {
+		const message = this.irc.createMessage({
+			text: "ssh://example.com ftp://example.com irc://example.com http:////////example.com",
+		});
+
+		expect(message.previews).to.be.empty;
+	});
+
+	it("should not try to fetch links with username or password", function() {
+		const message = this.irc.createMessage({
+			text: "http://root:'some%pass'@hostname/database http://a:%p@c http://a:%p@example.com http://test@example.com",
+		});
+
+		expect(message.previews).to.be.empty;
+	});
 });
--- a/yarn.lock
+++ b/yarn.lock
@ -4459,6 +4459,12 @@ levn@^0.3.0, levn@~0.3.0:
    prelude-ls "~1.1.2"
    type-check "~0.3.2"

+linkify-it@2.0.3:
+  version "2.0.3"
+  resolved "https://registry.yarnpkg.com/linkify-it/-/linkify-it-2.0.3.tgz#d94a4648f9b1c179d64fa97291268bdb6ce9434f"
+  dependencies:
+    uc.micro "^1.0.1"
+
 listr-silent-renderer@^1.1.1:
  version "1.1.1"
  resolved "https://registry.yarnpkg.com/listr-silent-renderer/-/listr-silent-renderer-1.1.1.tgz#924b5a3757153770bf1a8e3fbf74b8bbf3f9242e"
@ -7602,6 +7608,10 @@ timers-browserify@^2.0.4:
  dependencies:
    setimmediate "^1.0.4"

+tlds@1.203.1:
+  version "1.203.1"
+  resolved "https://registry.yarnpkg.com/tlds/-/tlds-1.203.1.tgz#4dc9b02f53de3315bc98b80665e13de3edfc1dfc"
+
 tmp@^0.0.33:
  version "0.0.33"
  resolved "https://registry.yarnpkg.com/tmp/-/tmp-0.0.33.tgz#6d34335889768d21b2bcda0aa277ced3b1bfadf9"
@ -7719,6 +7729,10 @@ ua-parser-js@0.7.18:
  version "0.7.18"
  resolved "https://registry.yarnpkg.com/ua-parser-js/-/ua-parser-js-0.7.18.tgz#a7bfd92f56edfb117083b69e31d2aa8882d4b1ed"

+uc.micro@^1.0.1:
+  version "1.0.5"
+  resolved "https://registry.yarnpkg.com/uc.micro/-/uc.micro-1.0.5.tgz#0c65f15f815aa08b560a61ce8b4db7ffc3f45376"
+
 uglify-es@^3.3.4:
  version "3.3.9"
  resolved "https://registry.yarnpkg.com/uglify-es/-/uglify-es-3.3.9.tgz#0c1c4f0700bed8dbc124cdb304d2592ca203e677"