2022-12-27 19:13:50 +00:00
// @unimport-disable
2023-01-08 06:21:09 +00:00
import type { mastodon } from 'masto'
2022-12-04 22:10:10 +00:00
import type { Node } from 'ultrahtml'
2023-01-07 22:42:17 +00:00
import { DOCUMENT_NODE , ELEMENT_NODE , TEXT_NODE , h , parse , render } from 'ultrahtml'
2023-01-02 04:53:53 +00:00
import { findAndReplaceEmojisInText } from '@iconify/utils'
2023-01-09 22:08:42 +00:00
import { decode } from 'tiny-decode'
2023-01-02 04:53:53 +00:00
import { emojiRegEx , getEmojiAttributes } from '../config/emojis'
2022-12-27 18:37:22 +00:00
2023-01-07 09:31:48 +00:00
/**
 * Options accepted by `parseMastodonHTML`.
 */
export interface ContentParseOptions {
  /** Custom emojis keyed by shortcode; `:shortcode:` occurrences in text become `<img>` elements. */
  emojis?: Record<string, mastodon.v1.CustomEmoji>
  /** Convert fenced/inline code and inline Markdown syntax. Defaults to `true`. */
  markdown?: boolean
  /** Replace Unicode emoji found in text with `<img>` elements. Defaults to `true`. */
  replaceUnicodeEmoji?: boolean
  /** Extra tree transforms, run right after the sanitizer and before the built-in ones. */
  astTransforms?: Transform[]
  /** Replace mention links with mention `<span>` nodes. Defaults to `false`. */
  convertMentionLink?: boolean
}
// Class names that survive sanitizing on <a> and <span>: anything prefixed with
// h-, p-, u-, dt- or e-, plus mention, hashtag, ellipsis and invisible.
const sanitizerBasicClasses = filterClasses(/^(h-\S*|p-\S*|u-\S*|dt-\S*|e-\S*|mention|hashtag|ellipsis|invisible)$/u)
// Element allow-list: elements not listed here are removed from the tree, and
// for listed elements only the attributes named below are kept, each one
// after passing through its filter function.
const sanitizer = sanitize({
  // Allow basic elements as seen in https://github.com/mastodon/mastodon/blob/17f79082b098e05b68d6f0d38fabb3ac121879a9/lib/sanitize_ext/sanitize_config.rb
  br: {},
  p: {},
  a: {
    href: filterHref(),
    class: sanitizerBasicClasses,
    // `rel` and `target` are forced to fixed values, whatever the input carried
    rel: set('nofollow noopener noreferrer'),
    target: set('_blank'),
  },
  span: {
    class: sanitizerBasicClasses,
  },
  // Allow elements potentially created for Markdown code blocks above
  pre: {},
  code: {
    class: filterClasses(/^language-\w+$/),
  },
})
2022-11-30 05:27:24 +00:00
/**
 * Parse raw HTML from Mastodon server to AST,
 * with interop of custom emojis and inline Markdown syntax
 *
 * @param html Raw status HTML.
 * @param options Parsing options, see `ContentParseOptions`.
 * @returns The parsed document node after all transforms have been applied.
 */
export function parseMastodonHTML(
  html: string,
  options: ContentParseOptions = {},
) {
  const {
    markdown = true,
    replaceUnicodeEmoji = true,
    convertMentionLink = false,
  } = options

  if (markdown) {
    // `htmlToText` returns decoded plain text, so it must be escaped again before
    // being spliced back into the HTML string. `&` is handled first so the
    // entities produced for `<` and `>` are not escaped a second time.
    const toEscapedCode = (raw: string) => htmlToText(raw)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')

    // Handle code blocks
    html = html
      // Fenced blocks (``` or ~~~) with an optional language right after the fence
      .replace(/>(```|~~~)(\w*)([\s\S]+?)\1/g, (_1, _2, lang: string, raw: string) => {
        const classes = lang ? ` class="language-${lang}"` : ''
        return `><pre><code${classes}>${toEscapedCode(raw)}</code></pre>`
      })
      // Inline code on a single line; an empty pair of backticks is dropped
      .replace(/`([^`\n]*)`/g, (_1, raw) => {
        return raw ? `<code>${toEscapedCode(raw)}</code>` : ''
      })
  }

  // Always sanitize the raw HTML data *after* it has been modified
  const transforms: Transform[] = [
    sanitizer,
    ...options.astTransforms || [],
  ]

  if (replaceUnicodeEmoji)
    transforms.push(transformUnicodeEmoji)

  if (markdown)
    transforms.push(transformMarkdown)

  if (convertMentionLink)
    transforms.push(transformMentionLink)

  transforms.push(replaceCustomEmoji(options.emojis || {}))

  transforms.push(transformParagraphs)

  return transformSync(parse(html), transforms)
}
2023-01-02 04:53:53 +00:00
/**
 * Converts raw HTML from a Mastodon server into HTML for the Tiptap editor.
 *
 * Markdown handling and mention conversion are switched on, Unicode emoji
 * replacement is switched off, and the resulting tree is rendered back to HTML.
 */
export function convertMastodonHTML(html: string, customEmojis: Record<string, mastodon.v1.CustomEmoji> = {}) {
  const editorOptions: ContentParseOptions = {
    emojis: customEmojis,
    markdown: true,
    replaceUnicodeEmoji: false,
    convertMentionLink: true,
  }
  return render(parseMastodonHTML(html, editorOptions))
}
2022-11-25 16:17:15 +00:00
/**
 * Parse an HTML string and serialize its top-level nodes to text via
 * `treeToText`, trimming surrounding whitespace from the result.
 */
export function htmlToText(html: string) {
  const roots = parse(html).children as Node[]
  const parts = roots.map(root => treeToText(root))
  return parts.join('').trim()
}
2022-11-30 06:50:47 +00:00
/**
 * Serialize a node and its descendants to text, writing Markdown-style
 * markers around the formatting elements it recognizes.
 */
export function treeToText(input: Node): string {
  // Text nodes hold entity-encoded content
  if (input.type === TEXT_NODE)
    return decode(input.value)

  const tag = input.name
  if (tag === 'br')
    return '\n'

  let prefix = ''
  let suffix = ''

  switch (tag) {
    case 'p':
    case 'pre':
      prefix = '\n'
      break
    case 'code':
      if (input.parent?.name === 'pre') {
        // Code block: re-create the fence, carrying over the language if any
        const lang = input.attributes.class?.replace('language-', '')
        prefix = `\`\`\`${lang || ''}\n`
        suffix = '\n```'
      }
      else {
        prefix = '`'
        suffix = '`'
      }
      break
    case 'b':
    case 'strong':
      prefix = '**'
      suffix = '**'
      break
    case 'i':
    case 'em':
      prefix = '*'
      suffix = '*'
      break
    case 'del':
      prefix = '~~'
      suffix = '~~'
      break
  }

  const inner = 'children' in input
    ? (input.children as Node[]).map(child => treeToText(child)).join('')
    : ''

  if (tag === 'img') {
    const cls = input.attributes.class
    // Custom emoji go back to their :shortcode: form
    if (cls?.includes('custom-emoji'))
      return `:${input.attributes['data-emoji-id']}:`
    // Unicode emoji images carry the original character(s) in `alt`
    if (cls?.includes('iconify-emoji'))
      return input.attributes.alt
  }

  return prefix + inner + suffix
}
2023-01-02 04:53:53 +00:00
2023-01-05 07:21:09 +00:00
// Contract for a tree transform: it is handed one ultrahtml Node and returns
// whatever should take that node's place in the tree:
//   - a Node or a string (strings get converted to text nodes),
//   - an array mixing both, which is spliced in place of the node,
//   - null, which removes the node from the tree.
// By the time a node is handed over, its children have already been transformed.
type Transform = (node: Node) => Node | string | (Node | string)[] | null
// Applies a chain of transforms to a parsed HTML tree. Every transform gets
// its own full pass over the tree, in the order given. Within a pass children
// are handled before their parent, and the root node itself is never handed
// to a transform.
function transformSync(doc: Node, transforms: Transform[]) {
  function visit(node: Node, transform: Transform, isRoot = false) {
    if (Array.isArray(node.children)) {
      const replaced: (Node | string)[] = []
      for (const child of node.children) {
        const outcome = visit(child, transform)
        if (Array.isArray(outcome))
          replaced.push(...outcome)
        else if (outcome)
          replaced.push(outcome)
      }

      // Turn strings into text nodes and re-attach everything to this parent
      node.children = replaced.map((entry) => {
        if (typeof entry !== 'string') {
          entry.parent = node
          return entry
        }
        return { type: TEXT_NODE, value: entry, parent: node }
      })
    }

    if (isRoot)
      return node
    return transform(node)
  }

  let current = doc
  for (const transform of transforms)
    current = visit(current, transform, true) as Node
  return current
}
// Builds a tree transform that enforces an element/attribute allow-list.
// Per element, each listed attribute maps to a function that receives the
// current value (or undefined) and returns the value to keep, or undefined
// to leave the attribute out.
type AttrSanitizers = Record<string, (value: string | undefined) => string | undefined>
function sanitize(allowedElements: Record<string, AttrSanitizers>): Transform {
  // Own-property check so inherited names never count as allowed tags
  const isAllowed = (tag: string) => Object.prototype.hasOwnProperty.call(allowedElements, tag)

  return (node) => {
    // Only element nodes are subject to the allow-list
    if (node.type !== ELEMENT_NODE)
      return node

    if (!isAllowed(node.name))
      return null

    // Rebuild the attribute map from scratch; unlisted attributes are dropped
    const kept: Record<string, string> = {}
    Object.entries(allowedElements[node.name]).forEach(([attr, clean]) => {
      const cleaned = clean(node.attributes[attr])
      if (cleaned !== undefined)
        kept[attr] = cleaned
    })

    node.attributes = kept
    return node
  }
}
// Returns an attribute filter for `class` values: the value is split on
// whitespace and only the class names matching `allowed` are kept,
// joined by single spaces. A missing or empty value yields undefined.
function filterClasses(allowed: RegExp) {
  return (c: string | undefined) => {
    if (!c)
      return undefined

    const kept: string[] = []
    for (const cls of c.split(/\s/g)) {
      if (allowed.test(cls))
        kept.push(cls)
    }
    return kept.join(' ')
  }
}
// Returns an attribute filter that ignores the incoming value and always
// yields `value`.
function set(value: string) {
  return function constant() {
    return value
  }
}
// Returns an attribute filter for `href` values:
//   - undefined stays undefined,
//   - values starting with `/` or `.` are passed through untouched,
//   - values that do not parse as a URL are dropped (undefined),
//   - URLs with an allowed protocol are returned in serialized form,
//   - everything else is replaced by `#`.
function filterHref() {
  const allowedProtocols = [
    'http:',
    'https:',
    'dat:',
    'dweb:',
    'ipfs:',
    'ipns:',
    'ssb:',
    'gopher:',
    'xmpp:',
    'magnet:',
    'gemini:',
  ]

  // Allow relative links
  const isRelative = (value: string) => value.startsWith('/') || value.startsWith('.')

  return (href: string | undefined) => {
    if (href === undefined)
      return undefined

    if (isRelative(href))
      return href

    let parsed: URL
    try {
      parsed = new URL(href)
    }
    catch (err) {
      // A TypeError is how `new URL` reports an unparsable value; anything else is unexpected
      if (!(err instanceof TypeError))
        throw err
      return undefined
    }

    return allowedProtocols.includes(parsed.protocol) ? parsed.toString() : '#'
  }
}
2023-01-07 09:31:48 +00:00
// Tree transform: splits a text node around the Unicode emoji found in it and
// puts an <img> element in place of each emoji. Non-text nodes and text
// without any emoji are returned untouched.
function transformUnicodeEmoji(node: Node) {
  if (node.type !== TEXT_NODE)
    return node

  const pieces: (string | Node)[] = []
  let cursor = 0

  // NOTE(review): `result` looks like the text accumulated up to the current
  // match - confirm against the @iconify/utils callback contract.
  findAndReplaceEmojisInText(emojiRegEx, node.value, (match, result) => {
    const { src, alt, class: className } = getEmojiAttributes(match)
    pieces.push(
      result.slice(cursor),
      h('img', { src, alt, class: className }),
    )
    cursor = result.length + match.match.length
    return undefined
  })

  if (!pieces.length)
    return node

  // Trailing text after the last emoji; empty strings are filtered out below
  pieces.push(node.value.slice(cursor))
  return pieces.filter(Boolean)
}
2023-01-08 06:21:09 +00:00
// Builds a tree transform that replaces `:shortcode:` occurrences in text nodes
// with <img class="custom-emoji"> elements for the shortcodes present in
// `customEmojis`. Unknown shortcodes are kept as literal `:shortcode:` text.
function replaceCustomEmoji(customEmojis: Record<string, mastodon.v1.CustomEmoji>): Transform {
  return (node) => {
    if (node.type !== TEXT_NODE)
      return node

    // Splitting on a regex with one capture group alternates plain text
    // (even indices) with captured shortcodes (odd indices).
    const split = node.value.split(/:([\w-]+?):/g)
    if (split.length === 1)
      return node

    return split.map((name, i) => {
      if (i % 2 === 0)
        return name

      // Own-property lookup only: shortcodes such as `constructor`, `toString`
      // or `__proto__` would otherwise resolve to members inherited from
      // Object.prototype and produce an <img> without a usable `src`.
      const emoji = Object.prototype.hasOwnProperty.call(customEmojis, name)
        ? customEmojis[name]
        : undefined
      if (!emoji)
        return `:${name}:`

      return h('img', { 'src': emoji.url, 'alt': `:${name}:`, 'class': 'custom-emoji', 'data-emoji-id': name })
    }).filter(Boolean)
  }
}
2023-01-07 09:31:48 +00:00
// Inline Markdown patterns, each paired with a builder for the node replacing
// a match. The builder receives the content of capture group 1 after it has
// itself been run through `_markdownProcess`.
// Order matters: when several patterns match at the same position,
// `_markdownProcess` keeps the one listed first, which is why `***` comes
// before `**` and `**` before `*`.
// All patterns carry the `g` flag because `_markdownProcess` drives them via `lastIndex`.
const _markdownReplacements: [RegExp, (c: (string | Node)[]) => Node][] = [
  // ***bold italic***
  [/\*\*\*(.*?)\*\*\*/g, c => h('b', null, [h('em', null, c)])],
  // **bold**
  [/\*\*(.*?)\*\*/g, c => h('b', null, c)],
  // *italic*
  [/\*(.*?)\*/g, c => h('em', null, c)],
  // ~~strikethrough~~
  [/~~(.*?)~~/g, c => h('del', null, c)],
  // `inline code`
  [/`([^`]+?)`/g, c => h('code', null, c)],
  // transform @username@twitter.com as links
  [/\B@([a-zA-Z0-9_]+)@twitter\.com\b/gi, c => h('a', { href: `https://twitter.com/${c}`, target: '_blank', rel: 'nofollow noopener noreferrer', class: 'mention external' }, `@${c}@twitter.com`)],
]
2023-01-05 07:21:09 +00:00
2023-01-07 09:31:48 +00:00
// Converts inline Markdown inside a plain string into a list of strings and
// nodes. Repeatedly picks the earliest match among `_markdownReplacements`
// (ties go to the pattern listed first), emits the text before it, then the
// replacement node built from the recursively processed inner content.
function _markdownProcess(value: string) {
  const output: (string | Node)[] = []
  let cursor = 0

  for (;;) {
    let earliest: { match: RegExpMatchArray; replacer: (c: (string | Node)[]) => Node } | undefined

    for (const [pattern, build] of _markdownReplacements) {
      // The patterns are shared and global, so the search start is set explicitly
      pattern.lastIndex = cursor

      const hit = pattern.exec(value)
      if (!hit)
        continue
      if (earliest === undefined || hit.index < earliest.match.index!)
        earliest = { match: hit, replacer: build }
    }

    if (earliest === undefined)
      break

    const { match, replacer } = earliest
    output.push(
      value.slice(cursor, match.index),
      replacer(_markdownProcess(match[1])),
    )
    cursor = match.index! + match[0].length
  }

  // Remaining text after the last match; empty strings are filtered out
  output.push(value.slice(cursor))
  return output.filter(Boolean)
}
// Tree transform: runs inline Markdown processing on text nodes and leaves
// every other node as it is.
function transformMarkdown(node: Node) {
  return node.type === TEXT_NODE
    ? _markdownProcess(node.value)
    : node
}
2023-01-07 22:42:17 +00:00
// Tree transform: for top level paragraphs, inject an empty <p> after the
// paragraph to preserve status paragraphs in our editor (except for the last one).
function transformParagraphs(node: Node): Node | Node[] {
  const parent = node.parent
  if (parent?.type !== DOCUMENT_NODE || node.name !== 'p')
    return node

  const isLastChild = parent.children.at(-1) === node
  return isLastChild ? node : [node, h('p')]
}
2023-01-08 09:39:11 +00:00
// Tree transform: replaces <a> elements whose class contains `mention`, and
// whose href matches `UserLinkRE`, with a TipTap mention <span>. Every other
// node is returned unchanged.
// NOTE(review): `UserLinkRE` is defined outside this section; the code relies
// on it capturing the server first and the username second.
function transformMentionLink(node: Node): string | Node | (string | Node)[] | null {
  const isMention = node.name === 'a' && node.attributes.class?.includes('mention')
  if (!isMention)
    return node

  const href = node.attributes.href
  const matchUser = href ? href.match(UserLinkRE) : null
  if (!matchUser)
    return node

  const [, server, username] = matchUser
  // Leading subdomains are dropped, keeping the last two dot-separated labels
  const handle = `${username}@${server.replace(/(.+\.)(.+\..+)/, '$2')}`
  // convert to TipTap mention node
  return h('span', { 'data-type': 'mention', 'data-id': handle }, handle)
}