2023-03-12 15:00:57 +00:00
// GoToSocial
// Copyright (C) GoToSocial Authors admin@gotosocial.org
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
2021-08-10 12:32:39 +01:00
package dereferencing
import (
2021-08-25 14:34:33 +01:00
"context"
2023-06-24 08:32:10 +01:00
"errors"
"net/http"
2021-08-10 12:32:39 +01:00
"net/url"
2022-07-19 09:47:55 +01:00
"codeberg.org/gruf/go-kv"
2023-06-03 10:35:15 +01:00
"github.com/superseriousbusiness/activity/pub"
2021-08-10 12:32:39 +01:00
"github.com/superseriousbusiness/gotosocial/internal/ap"
2021-12-07 12:31:39 +00:00
"github.com/superseriousbusiness/gotosocial/internal/config"
2023-06-24 08:32:10 +01:00
"github.com/superseriousbusiness/gotosocial/internal/db"
"github.com/superseriousbusiness/gotosocial/internal/gtscontext"
2023-05-28 13:08:35 +01:00
"github.com/superseriousbusiness/gotosocial/internal/gtserror"
2022-09-25 12:09:41 +01:00
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
2022-07-19 09:47:55 +01:00
"github.com/superseriousbusiness/gotosocial/internal/log"
2021-08-10 12:32:39 +01:00
)
2022-09-25 12:09:41 +01:00
// maxIter defines how many iterations of descendants or
// ancestors we are willing to follow before returning error.
// It is the loop bound used by both DereferenceStatusAncestors
// and DereferenceStatusDescendants in this file.
const maxIter = 1000
2023-10-23 10:58:13 +01:00
func ( d * Dereferencer ) dereferenceThread ( ctx context . Context , username string , statusIRI * url . URL , status * gtsmodel . Status , statusable ap . Statusable ) {
2022-09-25 12:09:41 +01:00
// Ensure that ancestors have been fully dereferenced
2023-06-24 08:32:10 +01:00
if err := d . DereferenceStatusAncestors ( ctx , username , status ) ; err != nil {
log . Error ( ctx , err )
2021-08-10 12:32:39 +01:00
}
2022-09-25 12:09:41 +01:00
// Ensure that descendants have been fully dereferenced
2023-06-24 08:32:10 +01:00
if err := d . DereferenceStatusDescendants ( ctx , username , statusIRI , statusable ) ; err != nil {
log . Error ( ctx , err )
2021-08-10 12:32:39 +01:00
}
}
2023-10-23 10:58:13 +01:00
// DereferenceStatusAncestors iterates upwards from the given status, using InReplyToURI, to ensure that as many parent statuses as possible are dereferenced.
func ( d * Dereferencer ) DereferenceStatusAncestors ( ctx context . Context , username string , status * gtsmodel . Status ) error {
2023-09-05 11:22:02 +01:00
// Start log entry with fields
l := log . WithContext ( ctx ) .
WithFields ( kv . Fields {
{ "username" , username } ,
{ "original" , status . URI } ,
} ... )
// Keep track of already dereferenced statuses
// for this ancestor thread to prevent recursion.
derefdStatuses := make ( map [ string ] struct { } , 10 )
2023-06-24 08:32:10 +01:00
// Mark given status as the one
// we're currently working on.
2023-09-05 11:22:02 +01:00
current := status
2021-08-10 12:32:39 +01:00
2022-09-25 12:09:41 +01:00
for i := 0 ; i < maxIter ; i ++ {
2023-06-24 08:32:10 +01:00
if current . InReplyToURI == "" {
// Status has no parent, we've
// reached the top of the chain.
2021-08-10 12:32:39 +01:00
return nil
}
2022-06-11 15:25:41 +01:00
2023-09-05 11:22:02 +01:00
// Add new log fields for this iteration.
l = l . WithFields ( kv . Fields {
{ "current" , current . URI } ,
{ "parent" , current . InReplyToURI } ,
} ... )
l . Trace ( "following status ancestors" )
// Check whether this parent has already been deref'd.
if _ , ok := derefdStatuses [ current . InReplyToURI ] ; ok {
l . Warn ( "self referencing status ancestors" )
return nil
}
// Add this status URI to map of deref'd.
derefdStatuses [ current . URI ] = struct { } { }
2023-06-24 08:32:10 +01:00
if current . InReplyToID != "" {
// We already have an InReplyToID set. This means
// the status's parent has, at some point, been
// inserted into the database, either because it
// is a status from our instance, or a status from
// remote that we've dereferenced before, or found
// out about in some other way.
//
// Working on this assumption, check if the parent
// status exists, either as a copy pinned on the
// current status, or in the database.
if current . InReplyTo != nil {
// We have the parent already, and the child
// doesn't need to be updated; keep iterating
// from this parent upwards.
current = current . InReplyTo
continue
}
2021-08-10 12:32:39 +01:00
2023-06-24 08:32:10 +01:00
// Parent isn't pinned to this status (yet), see
// if we can get it from the db (we should be
// able to, since it has an ID already).
parent , err := d . state . DB . GetStatusByID (
gtscontext . SetBarebones ( ctx ) ,
current . InReplyToID ,
)
if err != nil && ! errors . Is ( err , db . ErrNoEntries ) {
// Real db error, stop.
return gtserror . Newf ( "db error getting status %s: %w" , current . InReplyToID , err )
2022-09-25 12:09:41 +01:00
}
2023-06-24 08:32:10 +01:00
if parent != nil {
// We got the parent from the db, and the child
// doesn't need to be updated; keep iterating
// from this parent upwards.
current . InReplyTo = parent
current = parent
continue
2022-09-25 12:09:41 +01:00
}
2023-06-24 08:32:10 +01:00
// If we arrive here, we know this child *did* have
// a parent at some point, but it no longer exists in
// the database, presumably because it's been deleted
// by another action.
//
// TODO: clean this up in a nightly task.
2023-09-05 11:22:02 +01:00
l . Warn ( "orphaned status (parent no longer exists)" )
2023-06-24 08:32:10 +01:00
return nil // Cannot iterate further.
}
2022-09-25 12:09:41 +01:00
2023-06-24 08:32:10 +01:00
// If we reach this point, we know the status has
// an InReplyToURI set, but it doesn't yet have an
// InReplyToID, which means that the parent status
// has not yet been dereferenced.
inReplyToURI , err := url . Parse ( current . InReplyToURI )
if err != nil || inReplyToURI == nil {
// Parent URI is not something we can handle.
2023-09-05 11:22:02 +01:00
l . Warn ( "orphaned status (invalid InReplyToURI)" )
2023-06-24 08:32:10 +01:00
return nil //nolint:nilerr
}
2022-09-25 12:09:41 +01:00
2023-06-24 08:32:10 +01:00
// Parent URI is valid, try to get it.
// getStatusByURI guards against the following conditions:
2023-09-05 11:22:02 +01:00
// - refetching recently fetched statuses (recursion!)
2023-06-24 08:32:10 +01:00
// - remote domain is blocked (will return unretrievable)
2023-09-05 11:22:02 +01:00
// - any http type error for a new status returns unretrievable
2023-06-24 08:32:10 +01:00
parent , _ , err := d . getStatusByURI ( ctx , username , inReplyToURI )
if err == nil {
// We successfully fetched the parent.
// Update current status with new info.
current . InReplyToID = parent . ID
current . InReplyToAccountID = parent . AccountID
if err := d . state . DB . UpdateStatus (
ctx , current ,
"in_reply_to_id" ,
"in_reply_to_account_id" ,
) ; err != nil {
return gtserror . Newf ( "db error updating status %s: %w" , current . ID , err )
}
// Mark parent as next status to
// work on, and keep iterating.
current = parent
continue
}
// We could not fetch the parent, check if we can do anything
// useful with the error. For example, HTTP status code returned
// from remote may indicate that the parent has been deleted.
switch code := gtserror . StatusCode ( err ) ; {
2023-06-24 12:59:28 +01:00
case code == http . StatusGone :
2023-06-24 08:32:10 +01:00
// 410 means the status has definitely been deleted.
// Update this status to reflect that, then bail.
2023-09-05 11:22:02 +01:00
l . Debug ( "orphaned status: parent returned 410 Gone" )
2023-06-24 08:32:10 +01:00
current . InReplyToURI = ""
if err := d . state . DB . UpdateStatus (
ctx , current ,
"in_reply_to_uri" ,
) ; err != nil {
return gtserror . Newf ( "db error updating status %s: %w" , current . ID , err )
2022-09-25 12:09:41 +01:00
}
2023-09-05 11:22:02 +01:00
2023-06-24 08:32:10 +01:00
return nil
case code != 0 :
2023-09-05 11:22:02 +01:00
// We had a code, but not one indicating deletion, log the code
// but don't return error or update the status; we can try again later.
l . Warnf ( "orphaned status: http error dereferencing parent: %v)" , err )
2023-06-24 08:32:10 +01:00
return nil
2021-08-10 12:32:39 +01:00
2023-06-24 08:32:10 +01:00
case gtserror . Unretrievable ( err ) :
// Not retrievable for some other reason, so just
2023-09-05 11:22:02 +01:00
// bail for now; we can try again later if necessary.
l . Warnf ( "orphaned status: parent unretrievable: %v)" , err )
2023-06-24 08:32:10 +01:00
return nil
default :
// Some other error that stops us in our tracks.
return gtserror . Newf ( "error dereferencing parent %s: %w" , current . InReplyToURI , err )
2022-09-25 12:09:41 +01:00
}
2021-08-10 12:32:39 +01:00
}
2023-06-24 08:32:10 +01:00
return gtserror . Newf ( "reached %d ancestor iterations for %q" , maxIter , status . URI )
2021-08-10 12:32:39 +01:00
}
2023-10-23 10:58:13 +01:00
// DereferenceStatusDescendents iterates downwards from the given status, using its replies, to ensure that as many children statuses as possible are dereferenced.
func ( d * Dereferencer ) DereferenceStatusDescendants ( ctx context . Context , username string , statusIRI * url . URL , parent ap . Statusable ) error {
2023-09-05 11:22:02 +01:00
statusIRIStr := statusIRI . String ( )
2022-07-19 09:47:55 +01:00
2022-09-25 12:09:41 +01:00
// Start log entry with fields
2023-02-17 11:02:29 +00:00
l := log . WithContext ( ctx ) .
WithFields ( kv . Fields {
{ "username" , username } ,
2023-09-05 11:22:02 +01:00
{ "status" , statusIRIStr } ,
2023-02-17 11:02:29 +00:00
} ... )
2021-08-10 12:32:39 +01:00
2022-09-25 12:09:41 +01:00
// Log function start
l . Trace ( "beginning" )
2023-09-05 11:22:02 +01:00
// OUR instance hostname.
localhost := config . GetHost ( )
// Keep track of already dereferenced collection
// pages for this thread to prevent recursion.
derefdPages := make ( map [ string ] struct { } , 10 )
// frame represents a single stack frame when
// iteratively derefencing status descendants.
2022-09-25 12:09:41 +01:00
type frame struct {
2023-09-05 11:22:02 +01:00
// page is the current activity streams
// collection page we are on (as we often
// push a frame to stack mid-paging).
2023-09-23 18:28:12 +01:00
page ap . CollectionPageIterator
2023-09-05 11:22:02 +01:00
// pageURI is the URI string of
// the frame's collection page
// (is useful for logging).
pageURI string
2021-08-10 12:32:39 +01:00
}
2022-09-25 12:09:41 +01:00
var (
2023-09-05 11:22:02 +01:00
// current stack frame
2022-09-25 12:09:41 +01:00
current * frame
// stack is a list of "shelved" descendand iterator
// frames. this is pushed to when a child status frame
// is found that we need to further iterate down, and
// popped from into 'current' when that child's tree
// of further descendants is exhausted.
stack = [ ] * frame {
2023-09-05 11:22:02 +01:00
func ( ) * frame {
// Start input frame is built from the first input.
2023-09-23 18:28:12 +01:00
page , pageURI := getAttachedStatusCollectionPage ( parent )
2023-09-05 11:22:02 +01:00
if page == nil {
return nil
}
return & frame { page : page , pageURI : pageURI }
} ( ) ,
2022-09-25 12:09:41 +01:00
}
2021-08-10 12:32:39 +01:00
2022-09-25 12:09:41 +01:00
// popStack will remove and return the top frame
// from the stack, or nil if currently empty.
popStack = func ( ) * frame {
if len ( stack ) == 0 {
return nil
}
2021-08-10 12:32:39 +01:00
2022-09-25 12:09:41 +01:00
// Get frame index
idx := len ( stack ) - 1
2021-08-10 12:32:39 +01:00
2022-09-25 12:09:41 +01:00
// Pop last frame
frame := stack [ idx ]
stack = stack [ : idx ]
2021-08-10 12:32:39 +01:00
2022-09-25 12:09:41 +01:00
return frame
}
)
2021-08-10 12:32:39 +01:00
2022-09-25 12:09:41 +01:00
stackLoop :
for i := 0 ; i < maxIter ; i ++ {
// Pop next frame, nil means we are at end
if current = popStack ( ) ; current == nil {
return nil
2021-08-10 12:32:39 +01:00
}
2022-09-26 09:14:36 +01:00
pageLoop :
for {
2023-09-05 11:22:02 +01:00
l . Tracef ( "following collection page: %s" , current . pageURI )
2022-09-25 12:09:41 +01:00
itemLoop :
2022-09-26 09:50:14 +01:00
for {
2023-09-23 18:28:12 +01:00
// Get next item from page iter.
next := current . page . NextItem ( )
if next == nil {
2022-09-26 09:50:14 +01:00
break itemLoop
}
2021-08-10 12:32:39 +01:00
2023-06-03 10:35:15 +01:00
// Check for available IRI on item
2023-09-23 18:28:12 +01:00
itemIRI , _ := pub . ToId ( next )
2022-09-25 12:09:41 +01:00
if itemIRI == nil {
continue itemLoop
}
2023-09-05 11:22:02 +01:00
if itemIRI . Host == localhost {
2022-09-25 12:09:41 +01:00
// This child is one of ours,
continue itemLoop
}
2023-05-12 10:15:54 +01:00
// Dereference the remote status and store in the database.
2023-06-24 08:32:10 +01:00
// getStatusByURI guards against the following conditions:
2023-09-05 11:22:02 +01:00
// - refetching recently fetched statuses (recursion!)
2023-06-24 08:32:10 +01:00
// - remote domain is blocked (will return unretrievable)
2023-09-05 11:22:02 +01:00
// - any http type error for a new status returns unretrievable
2023-05-12 10:15:54 +01:00
_ , statusable , err := d . getStatusByURI ( ctx , username , itemIRI )
2022-09-25 12:09:41 +01:00
if err != nil {
2023-06-24 08:32:10 +01:00
if ! gtserror . Unretrievable ( err ) {
l . Errorf ( "error dereferencing remote status %s: %v" , itemIRI , err )
}
2023-05-12 10:15:54 +01:00
continue itemLoop
}
if statusable == nil {
2023-09-05 11:22:02 +01:00
// A nil statusable return from
// getStatusByURI() indicates a
// remote status that was already
// dereferenced recently (so no
// need to go through descendents).
continue itemLoop
}
2023-09-23 18:28:12 +01:00
// Extract any attached collection + ID URI from status.
page , pageURI := getAttachedStatusCollectionPage ( statusable )
2023-09-05 11:22:02 +01:00
if page == nil {
2022-09-25 12:09:41 +01:00
continue itemLoop
2021-08-10 12:32:39 +01:00
}
2022-09-25 12:09:41 +01:00
// Put current and next frame at top of stack
stack = append ( stack , current , & frame {
2023-09-05 11:22:02 +01:00
pageURI : pageURI ,
page : page ,
2022-09-25 12:09:41 +01:00
} )
2022-09-26 08:39:59 +01:00
// Now start at top of loop
continue stackLoop
2021-08-10 12:32:39 +01:00
}
2023-09-23 18:28:12 +01:00
// Get the next page from iterator.
next := current . page . NextPage ( )
if next == nil || ! next . IsIRI ( ) {
2022-09-25 12:09:41 +01:00
continue stackLoop
}
2023-09-23 18:28:12 +01:00
// Get the next page IRI.
nextURI := next . GetIRI ( )
nextURIStr := nextURI . String ( )
2023-09-05 11:22:02 +01:00
// Check whether this page has already been deref'd.
2023-09-23 18:28:12 +01:00
if _ , ok := derefdPages [ nextURIStr ] ; ok {
l . Warnf ( "self referencing collection page(s): %s" , nextURIStr )
2022-09-26 09:14:36 +01:00
continue stackLoop
}
2022-09-25 12:09:41 +01:00
2023-09-05 11:22:02 +01:00
// Mark this collection page as deref'd.
2023-09-23 18:28:12 +01:00
derefdPages [ nextURIStr ] = struct { } { }
2023-09-05 11:22:02 +01:00
// Dereference this next collection page by its IRI.
2023-05-12 10:15:54 +01:00
collectionPage , err := d . dereferenceCollectionPage ( ctx ,
username ,
2023-09-23 18:28:12 +01:00
nextURI ,
2023-05-12 10:15:54 +01:00
)
2022-09-25 12:09:41 +01:00
if err != nil {
2023-09-23 18:28:12 +01:00
l . Errorf ( "error dereferencing collection page %q: %s" , nextURIStr , err )
2022-09-25 12:09:41 +01:00
continue stackLoop
}
2023-09-05 11:22:02 +01:00
// Set the next collection page.
2022-09-25 12:09:41 +01:00
current . page = collectionPage
2023-09-23 18:28:12 +01:00
current . pageURI = nextURIStr
2022-09-26 09:14:36 +01:00
continue pageLoop
2021-08-10 12:32:39 +01:00
}
}
2023-09-05 11:22:02 +01:00
return gtserror . Newf ( "reached %d descendant iterations for %q" , maxIter , statusIRIStr )
}