From dd563eb752f0883e295b6042db3387f802d5871f Mon Sep 17 00:00:00 2001
From: dumbmoron
Date: Sat, 9 Dec 2023 11:00:54 +0000
Subject: [PATCH] api: rework url parsing

- tlds are now parsed and validated correctly (e.g. ".co.uk" works now)
- url patterns are pre-compiled instead of being compiled for every request
- aliases are computed in a safe manner using the URL object where possible
---
 package.json                            |   1 +
 src/modules/api.js                      |  43 ++++----
 src/modules/config.js                   |   9 ++
 src/modules/processing/hostOverrides.js | 124 ++++++++++++++++++------
 src/modules/sub/utils.js                |  34 +------
 5 files changed, 130 insertions(+), 81 deletions(-)

diff --git a/package.json b/package.json
index c4b03eb4..ed639ba2 100644
--- a/package.json
+++ b/package.json
@@ -36,6 +36,7 @@
         "hls-parser": "^0.10.7",
         "nanoid": "^4.0.2",
         "node-cache": "^5.1.2",
+        "psl": "^1.9.0",
         "set-cookie-parser": "2.6.0",
         "undici": "^5.19.1",
         "url-pattern": "1.0.3",
diff --git a/src/modules/api.js b/src/modules/api.js
index 62e9a7c6..19f657cc 100644
--- a/src/modules/api.js
+++ b/src/modules/api.js
@@ -1,35 +1,34 @@
-import UrlPattern from "url-pattern";
+import { services } from "./config.js";
 
-import { services as patterns } from "./config.js";
-
-import { cleanURL, apiJSON } from "./sub/utils.js";
+import { apiJSON } from "./sub/utils.js";
 import { errorUnsupported } from "./sub/errors.js";
 import loc from "../localization/manager.js";
 import match from "./processing/match.js";
-import hostOverrides from "./processing/hostOverrides.js";
+import { getHost, hasValidHostname, normalizeURL } from "./processing/hostOverrides.js";
 
 export async function getJSON(originalURL, lang, obj) {
     try {
-        let patternMatch, url = encodeURI(decodeURIComponent(originalURL)),
-            hostname = new URL(url).hostname.split('.'),
-            host = hostname[hostname.length - 2];
+        const url = normalizeURL(decodeURIComponent(originalURL));
+        // service key (e.g. "youtube") of the normalized url; undefined if the hostname is unparsable
+        const host = getHost(url);
 
-        if (!url.startsWith('https://')) return apiJSON(0, { t: errorUnsupported(lang) });
-
-        let overrides = hostOverrides(host, url);
-        host = overrides.host;
-        url = overrides.url;
-
-        if (!(host && host.length < 20 && host in patterns && patterns[host]["enabled"])) return apiJSON(0, { t: errorUnsupported(lang) });
-
-        let pathToMatch = cleanURL(url, host).split(`.${patterns[host]['tld'] ? patterns[host]['tld'] : "com"}/`)[1].replace('.', '');
-        for (let i in patterns[host]["patterns"]) {
-            patternMatch = new UrlPattern(patterns[host]["patterns"][i]).match(pathToMatch);
-            if (patternMatch) break
+        if (!hasValidHostname(url) || !services[host].enabled) {
+            return apiJSON(0, { t: errorUnsupported(lang) });
         }
-        if (!patternMatch) return apiJSON(0, { t: errorUnsupported(lang) });
 
-        return await match(host, patternMatch, url, lang, obj)
+        let patternMatch;
+        for (const pattern of services[host].patterns) {
+            patternMatch = pattern.match(
+                url.pathname.substring(1) + url.search
+            );
+            if (patternMatch) break;
+        }
+
+        if (!patternMatch) {
+            return apiJSON(0, { t: errorUnsupported(lang) });
+        }
+
+        return await match(host, patternMatch, url.toString(), lang, obj)
     } catch (e) {
         return apiJSON(0, { t: loc(lang, 'ErrorSomethingWentWrong') })
     }
diff --git a/src/modules/config.js b/src/modules/config.js
index 6fbe9d43..a0525ae8 100644
--- a/src/modules/config.js
+++ b/src/modules/config.js
@@ -1,8 +1,17 @@
+import UrlPattern from "url-pattern";
 import { loadJSON } from "./sub/loadFromFs.js";
 const config = loadJSON("./src/config.json");
 const packageJson = loadJSON("./package.json");
 const servicesConfigJson = loadJSON("./src/modules/processing/servicesConfig.json");
 
+Object.values(servicesConfigJson.config).forEach(service => {
+    service.patterns = service.patterns.map(
+        pattern => new UrlPattern(pattern, {
+            segmentValueCharset: UrlPattern.defaultOptions.segmentValueCharset + '\\.'
+        })
+    )
+})
+
 export const
     services = servicesConfigJson.config,
     audioIgnore = servicesConfigJson.audioIgnore,
diff --git a/src/modules/processing/hostOverrides.js b/src/modules/processing/hostOverrides.js
index 88553e35..86d45add 100644
--- a/src/modules/processing/hostOverrides.js
+++ b/src/modules/processing/hostOverrides.js
@@ -1,48 +1,114 @@
-export default function (inHost, inURL) {
-    let host = String(inHost);
-    let url = String(inURL);
+import { services } from "../config.js";
+import { strict as assert } from "node:assert";
+import psl from "psl";
 
-    switch(host) {
+// rewrites alternate hosts/paths (youtu.be, x.com, clips.twitch.tv, ...) to their canonical form;
+// returns the resulting URL together with the service key of its (possibly changed) hostname
+export function aliasURL(url) {
+    assert(url instanceof URL);
+
+    const host = psl.parse(url.hostname);
+    const parts = url.pathname.split('/');
+
+    switch (host.sld) {
         case "youtube":
-            if (url.startsWith("https://youtube.com/live/") || url.startsWith("https://www.youtube.com/live/")) {
-                url = url.split("?")[0].replace("www.", "");
-                url = `https://youtube.com/watch?v=${url.replace("https://youtube.com/live/", "")}`
-            }
-            if (url.includes('youtube.com/shorts/')) {
-                url = url.split('?')[0].replace('shorts/', 'watch?v=');
+            if (url.pathname.startsWith('/live/') || url.pathname.startsWith('/shorts/')) {
+                url.pathname = '/watch';
+                // ['', 'live' || 'shorts', id, ...rest]
+                url.search = `?v=${encodeURIComponent(parts[2])}`
             }
             break;
         case "youtu":
-            if (url.startsWith("https://youtu.be/")) {
-                host = "youtube";
-                url = `https://youtube.com/watch?v=${url.replace("https://youtu.be/", "")}`
+            if (url.hostname === 'youtu.be' && parts.length === 2) {
+                /* youtu.be urls can be weird, e.g. https://youtu.be///asdasd// still works
+                ** but we only care about the 1st segment of the path */
+                url = new URL(`https://youtube.com/watch?v=${
+                    encodeURIComponent(parts[1])
+                }`)
             }
             break;
+
         case "vxtwitter":
         case "x":
-            if (url.startsWith("https://x.com/")) {
-                host = "twitter";
-                url = url.replace("https://x.com/", "https://twitter.com/")
-            }
-            if (url.startsWith("https://vxtwitter.com/")) {
-                host = "twitter";
-                url = url.replace("https://vxtwitter.com/", "https://twitter.com/")
+            if (['x.com', 'vxtwitter.com'].includes(url.hostname)) {
+                url.hostname = 'twitter.com'
             }
             break;
+
         case "tumblr":
-            if (!url.includes("blog/view")) {
-                if (url.slice(-1) === '/') url = url.slice(0, -1);
-                url = url.replace(url.split('/')[5], '')
+            if (!url.pathname.includes("/blog/view")) {
+                if (url.pathname.endsWith('/'))
+                    url.pathname = url.pathname.slice(0, -1);
+                // parts is ['', seg1, seg2, slug, ...]: keep the first two path segments, drop the slug
+                if (parts[3]) url.pathname = parts.slice(0, 3).join('/')
             }
             break;
+
         case "twitch":
-            if (url.includes('clips.twitch.tv')) {
-                url = url.split('?')[0].replace('clips.twitch.tv/', 'twitch.tv/_/clip/');
+            if (url.hostname === 'clips.twitch.tv' && parts.length >= 2) {
+                url = new URL(`https://twitch.tv/_/clip/${parts[1]}`);
             }
             break;
     }
-    return {
-        host: host,
-        url: url
-    }
+
+    // re-derive the host: the alias may have moved the url to another service (youtu.be -> youtube)
+    return { url, host: getHost(url) }
 }
+
+// strips query/hash (keeping only the parameters a service needs) and any trailing slash; mutates url
+export function cleanURL({ url, host }) {
+    assert(url instanceof URL);
+    let stripQuery = true;
+
+    if (host === 'pinterest') {
+        url.hostname = 'pinterest.com'
+    } else if (host === 'vk' && url.pathname.includes('/clip')) {
+        if (url.searchParams.get('z'))
+            url.search = '?z=' + encodeURIComponent(url.searchParams.get('z'));
+        stripQuery = false;
+    } else if (host === 'youtube' && url.searchParams.get('v')) {
+        url.search = '?v=' + encodeURIComponent(url.searchParams.get('v'));
+        stripQuery = false;
+    }
+
+    if (stripQuery) {
+        url.search = url.hash = ''
+    }
+
+    if (url.pathname.endsWith('/'))
+        url.pathname = url.pathname.slice(0, -1);
+
+    return url
+}
+
+// parses a url string (tolerating a "https//" typo), then aliases and cleans it; throws on invalid urls
+export function normalizeURL(url) {
+    return cleanURL(
+        aliasURL(
+            new URL(url.replace(/^https\/\//, 'https://'))
+        )
+    );
+}
+
+// second-level domain of the url's hostname as reported by psl, used as the key into `services`
+export function getHost(url) {
+    return psl.parse(url.hostname).sld;
+}
+
+// true if the hostname maps to a known service with the expected tld and an allowed subdomain
+export function hasValidHostname(url) {
+    const host = psl.parse(url.hostname);
+    if (host.error) return false;
+
+    const service = services[host.sld];
+    if (!service) return false;
+
+    if ((service.tld ?? 'com') !== host.tld) return false;
+
+    const anySubdomainAllowed = service.subdomains === '*';
+    const validSubdomain = [null, 'www', ...(service.subdomains ?? [])].includes(host.subdomain);
+    if (!validSubdomain && !anySubdomainAllowed)
+        return false;
+
+    return true;
+}
\ No newline at end of file
diff --git a/src/modules/sub/utils.js b/src/modules/sub/utils.js
index e165a68a..ef64d07b 100644
--- a/src/modules/sub/utils.js
+++ b/src/modules/sub/utils.js
@@ -52,29 +52,7 @@ export function metadataManager(obj) {
     for (let i in keys) { if (tags.includes(keys[i])) commands.push('-metadata', `${keys[i]}=${obj[keys[i]]}`) }
     return commands;
 }
-export function cleanURL(url, host) {
-    switch (host) {
-        case "vk":
-            url = url.includes('clip') ? url.split('&')[0] : url.split('?')[0];
-            break;
-        case "youtube":
-            url = url.split('&')[0];
-            break;
-        case "tiktok":
-            url = url.replace(/@([a-zA-Z]+(\.[a-zA-Z]+)+)/, "@a")
-        case "pinterest":
-            url = url.replace(/:\/\/(?:www.)pinterest(?:\.[a-z.]+)/, "://pinterest.com")
-        default:
-            url = url.split('?')[0];
-            if (url.substring(url.length - 1) === "/") url = url.substring(0, url.length - 1);
-            break;
-    }
-    for (let i in forbiddenChars) {
-        url = url.replaceAll(forbiddenChars[i], '')
-    }
-    url = url.replace('https//', 'https://')
-    return url.slice(0, 128)
-}
+
 export function cleanString(string) {
     for (let i in forbiddenCharsString) {
         string = string.replaceAll("/", "_").replaceAll(forbiddenCharsString[i], '')
@@ -121,13 +99,9 @@ export function checkJSONPost(obj) {
             }
         }
 
-        if (def.dubLang) def.dubLang = verifyLanguageCode(obj.dubLang);
-
-        obj["url"] = decodeURIComponent(String(obj["url"]));
-        let hostname = obj["url"].replace("https://", "").replace(' ', '').split('&')[0].split("/")[0].split("."),
-            host = hostname[hostname.length - 2];
-        def["url"] = encodeURIComponent(cleanURL(obj["url"], host));
-
+        if (def.dubLang)
+            def.dubLang = verifyLanguageCode(obj.dubLang);
+        def.url = obj.url;
         return def
     } catch (e) {
         return false