mirror of
https://github.com/cheeaun/phanpy.git
synced 2025-02-24 08:48:47 +01:00
Further improve lang detection perf
This commit is contained in:
parent
f9a73777e7
commit
7546b42c7c
3 changed files with 53 additions and 5 deletions
|
@ -1866,7 +1866,16 @@ const Textarea = forwardRef((props, ref) => {
|
|||
// Newline to prevent multiple line breaks at the end from being collapsed, no idea why
|
||||
}, 500);
|
||||
|
||||
const debouncedAutoDetectLanguage = useDebouncedCallback((text) => {
|
||||
const debouncedAutoDetectLanguage = useDebouncedCallback(() => {
|
||||
// Make use of the highlightRef to get the DOM
|
||||
// Clone the dom
|
||||
const dom = composeHighlightRef.current?.cloneNode(true);
|
||||
if (!dom) return;
|
||||
// Remove mark
|
||||
dom.querySelectorAll('mark').forEach((mark) => {
|
||||
mark.remove();
|
||||
});
|
||||
const text = dom.innerText?.trim();
|
||||
if (!text) return;
|
||||
const langs = detectLangs(text);
|
||||
if (langs?.length) {
|
||||
|
@ -1875,7 +1884,7 @@ const Textarea = forwardRef((props, ref) => {
|
|||
languages: langs,
|
||||
});
|
||||
}
|
||||
}, 1000);
|
||||
}, 2000);
|
||||
|
||||
return (
|
||||
<text-expander
|
||||
|
@ -1944,7 +1953,7 @@ const Textarea = forwardRef((props, ref) => {
|
|||
autoResizeTextarea(target);
|
||||
props.onInput?.(e);
|
||||
throttleHighlightText(text);
|
||||
debouncedAutoDetectLanguage(text);
|
||||
debouncedAutoDetectLanguage();
|
||||
}}
|
||||
style={{
|
||||
width: '100%',
|
||||
|
|
|
@ -161,6 +161,8 @@ const SIZE_CLASS = {
|
|||
};
|
||||
|
||||
const detectLang = mem((text) => {
|
||||
text = text?.trim();
|
||||
|
||||
// Ref: https://github.com/komodojp/tinyld/blob/develop/docs/benchmark.md
|
||||
// 500 characters should be enough for now; it is also the default max chars for Mastodon
|
||||
if (text?.length > 500) {
|
||||
|
@ -284,7 +286,40 @@ function Status({
|
|||
emojiReactions,
|
||||
} = status;
|
||||
|
||||
let languageAutoDetected = content && detectLang(getHTMLText(content));
|
||||
const [languageAutoDetected, setLanguageAutoDetected] = useState(null);
|
||||
useEffect(() => {
|
||||
if (!content) return;
|
||||
if (_language) return;
|
||||
let timer;
|
||||
timer = setTimeout(() => {
|
||||
let detected = detectLang(
|
||||
getHTMLText(content, {
|
||||
preProcess: (dom) => {
|
||||
// Remove anything that can skew the language detection
|
||||
|
||||
// Remove .mention, .hashtag, pre, code, a:has(.invisible)
|
||||
dom
|
||||
.querySelectorAll(
|
||||
'.mention, .hashtag, pre, code, a:has(.invisible)',
|
||||
)
|
||||
.forEach((a) => {
|
||||
a.remove();
|
||||
});
|
||||
|
||||
// Remove links whose text starts with http:// or https://
|
||||
dom.querySelectorAll('a').forEach((a) => {
|
||||
const text = a.innerText.trim();
|
||||
if (text.startsWith('https://') || text.startsWith('http://')) {
|
||||
a.remove();
|
||||
}
|
||||
});
|
||||
},
|
||||
}),
|
||||
);
|
||||
setLanguageAutoDetected(detected);
|
||||
}, 1000);
|
||||
return () => clearTimeout(timer);
|
||||
}, [content, _language]);
|
||||
const language = _language || languageAutoDetected;
|
||||
|
||||
// if (!mediaAttachments?.length) mediaFirst = false;
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
import mem from './mem';
|
||||
|
||||
const div = document.createElement('div');
|
||||
function getHTMLText(html) {
|
||||
function getHTMLText(html, opts) {
|
||||
if (!html) return '';
|
||||
const { preProcess } = opts || {};
|
||||
|
||||
div.innerHTML = html
|
||||
.replace(/<\/p>/g, '</p>\n\n')
|
||||
.replace(/<\/li>/g, '</li>\n');
|
||||
|
@ -10,6 +12,8 @@ function getHTMLText(html) {
|
|||
br.replaceWith('\n');
|
||||
});
|
||||
|
||||
preProcess?.(div);
|
||||
|
||||
// MASTODON-SPECIFIC classes
|
||||
// Remove .invisible
|
||||
div.querySelectorAll('.invisible').forEach((el) => {
|
||||
|
|
Loading…
Reference in a new issue