diff --git a/README.md b/README.md
index 40e98f415..b11f44d94 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,8 @@ Here's a screenshot of the instance landing page! Check out the project's [offic
- [Known Issues](#known-issues)
- [Installing GoToSocial](#installing-gotosocial)
- [Supported Platforms](#supported-platforms)
- - [FreeBSD](#freebsd)
+ - [64-bit](#64-bit)
+ - [BSDs](#bsds)
- [32-bit](#32-bit)
- [OpenBSD](#openbsd)
- [Stable Releases](#stable-releases)
@@ -434,6 +435,7 @@ The following open source libraries, frameworks, and tools are used by GoToSocia
- [superseriousbusiness/exif-terminator](https://codeberg.org/superseriousbusiness/exif-terminator); EXIF data removal. [GNU AGPL v3 LICENSE](https://spdx.org/licenses/AGPL-3.0-or-later.html).
- [superseriousbusiness/httpsig](https://github.com/superseriousbusiness/httpsig) forked from [go-fed/httpsig](https://github.com/go-fed/httpsig); secure HTTP signature library. [BSD-3-Clause License](https://spdx.org/licenses/BSD-3-Clause.html).
- [superseriousbusiness/oauth2](https://github.com/superseriousbusiness/oauth2) forked from [go-oauth2/oauth2](https://github.com/go-oauth2/oauth2); OAuth server framework and token handling. [MIT License](https://spdx.org/licenses/MIT.html).
+- [temoto/robotstxt](https://github.com/temoto/robotstxt); robots.txt parsing. [MIT License](https://spdx.org/licenses/MIT.html).
- [tdewolff/minify](https://github.com/tdewolff/minify); HTML minification for Markdown-submitted posts. [MIT License](https://spdx.org/licenses/MIT.html).
- [uber-go/automaxprocs](https://github.com/uber-go/automaxprocs); GOMAXPROCS automation. [MIT License](https://spdx.org/licenses/MIT.html).
- [ulule/limiter](https://github.com/ulule/limiter); http rate limit middleware. [MIT License](https://spdx.org/licenses/MIT.html).
diff --git a/go.mod b/go.mod
index 59c924a09..b7d816a63 100644
--- a/go.mod
+++ b/go.mod
@@ -60,6 +60,7 @@ require (
github.com/superseriousbusiness/oauth2/v4 v4.3.2-SSB.0.20230227143000-f4900831d6c8
github.com/tdewolff/minify/v2 v2.21.3
github.com/technologize/otel-go-contrib v1.1.1
+ github.com/temoto/robotstxt v1.1.2
github.com/tetratelabs/wazero v1.8.2
github.com/tomnomnom/linkheader v0.0.0-20180905144013-02ca5825eb80
github.com/ulule/limiter/v3 v3.11.2
diff --git a/go.sum b/go.sum
index 3a1613c89..a3481ce82 100644
--- a/go.sum
+++ b/go.sum
@@ -540,6 +540,8 @@ github.com/tdewolff/test v1.0.11-0.20240106005702-7de5f7df4739 h1:IkjBCtQOOjIn03
github.com/tdewolff/test v1.0.11-0.20240106005702-7de5f7df4739/go.mod h1:XPuWBzvdUzhCuxWO1ojpXsyzsA5bFoS3tO/Q3kFuTG8=
github.com/technologize/otel-go-contrib v1.1.1 h1:wZH9aSPNWZWIkEh3vfaKfMb15AJ80jJ1aVj/4GZdqIw=
github.com/technologize/otel-go-contrib v1.1.1/go.mod h1:dCN/wj2WyUO8aFZFdIN+6tfJHImjTML/8r2YVYAy3So=
+github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
+github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
github.com/tetratelabs/wazero v1.8.2 h1:yIgLR/b2bN31bjxwXHD8a3d+BogigR952csSDdLYEv4=
github.com/tetratelabs/wazero v1.8.2/go.mod h1:yAI0XTsMBhREkM/YDAK/zNou3GoiAce1P6+rp/wQhjs=
github.com/tidwall/btree v0.0.0-20191029221954-400434d76274 h1:G6Z6HvJuPjG6XfNGi/feOATzeJrfgTNJY+rGrHbA04E=
diff --git a/internal/api/util/mime.go b/internal/api/util/mime.go
index 4d8946e5d..da96be786 100644
--- a/internal/api/util/mime.go
+++ b/internal/api/util/mime.go
@@ -36,6 +36,8 @@
TextHTML = `text/html`
TextCSS = `text/css`
TextCSV = `text/csv`
+ TextPlain = `text/plain`
+ UTF8 = `utf-8`
)
// JSONContentType returns whether is application/json(;charset=utf-8)? content-type.
@@ -74,6 +76,14 @@ func XMLXRDContentType(ct string) bool {
p[0] == AppXMLXRD
}
+// TextPlainContentType returns whether is text/plain(;charset=utf-8)? content-type.
+func TextPlainContentType(ct string) bool {
+ p := splitContentType(ct)
+ p, ok := isUTF8ContentType(p)
+ return ok && len(p) == 1 &&
+ p[0] == TextPlain
+}
+
// ASContentType returns whether is valid ActivityStreams content-types:
// - application/activity+json
// - application/ld+json;profile=https://w3.org/ns/activitystreams
@@ -118,7 +128,7 @@ func NodeInfo2ContentType(ct string) bool {
// type parts list, removes it and returns whether is utf-8.
func isUTF8ContentType(p []string) ([]string, bool) {
const charset = "charset="
- const charsetUTF8 = charset + "utf-8"
+ const charsetUTF8 = charset + UTF8
for i, part := range p {
// Only handle charset slice parts.
diff --git a/internal/federation/dereferencing/instance.go b/internal/federation/dereferencing/instance.go
index 90ce074cd..66d0a21be 100644
--- a/internal/federation/dereferencing/instance.go
+++ b/internal/federation/dereferencing/instance.go
@@ -19,20 +19,20 @@
import (
"context"
- "fmt"
"net/url"
+ "github.com/superseriousbusiness/gotosocial/internal/gtserror"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
)
func (d *Dereferencer) GetRemoteInstance(ctx context.Context, username string, remoteInstanceURI *url.URL) (*gtsmodel.Instance, error) {
if blocked, err := d.state.DB.IsDomainBlocked(ctx, remoteInstanceURI.Host); blocked || err != nil {
- return nil, fmt.Errorf("GetRemoteInstance: domain %s is blocked", remoteInstanceURI.Host)
+ return nil, gtserror.Newf("domain %s is blocked", remoteInstanceURI.Host)
}
transport, err := d.transportController.NewTransportForUsername(ctx, username)
if err != nil {
- return nil, fmt.Errorf("transport err: %s", err)
+ return nil, gtserror.Newf("transport err: %w", err)
}
return transport.DereferenceInstance(ctx, remoteInstanceURI)
diff --git a/internal/federation/dereferencing/instance_test.go b/internal/federation/dereferencing/instance_test.go
new file mode 100644
index 000000000..15f075479
--- /dev/null
+++ b/internal/federation/dereferencing/instance_test.go
@@ -0,0 +1,94 @@
+// GoToSocial
+// Copyright (C) GoToSocial Authors admin@gotosocial.org
+// SPDX-License-Identifier: AGPL-3.0-or-later
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+package dereferencing_test
+
+import (
+ "context"
+ "net/url"
+ "testing"
+
+ "github.com/stretchr/testify/suite"
+ "github.com/superseriousbusiness/gotosocial/internal/gtscontext"
+ "github.com/superseriousbusiness/gotosocial/testrig"
+)
+
+type InstanceTestSuite struct {
+ DereferencerStandardTestSuite
+}
+
+func (suite *InstanceTestSuite) TestDerefInstance() {
+ type testCase struct {
+ instanceIRI *url.URL
+ expectedSoftware string
+ }
+
+ for _, tc := range []testCase{
+ {
+ // Fossbros anonymous doesn't shield their nodeinfo or
+			// well-known or anything, so we should be able to fetch.
+ instanceIRI: testrig.URLMustParse("https://fossbros-anonymous.io"),
+ expectedSoftware: "Hellsoft 6.6.6",
+ },
+ {
+ // Furtive nerds forbids /nodeinfo using
+			// robots.txt, so we should get bare minimum only.
+ //
+ // Debug-level logs should show something like:
+ //
+ // - "can't fetch /nodeinfo/2.1: robots.txt disallows it"
+ instanceIRI: testrig.URLMustParse("https://furtive-nerds.example.org"),
+ expectedSoftware: "",
+ },
+ {
+ // Robotic furtive nerds forbids *everything* using
+			// robots.txt, so we should get bare minimum only.
+ //
+ // Debug-level logs should show something like:
+ //
+ // - "can't fetch api/v1/instance: robots.txt disallows it"
+ // - "can't fetch .well-known/nodeinfo: robots.txt disallows it"
+ instanceIRI: testrig.URLMustParse("https://robotic.furtive-nerds.example.org"),
+ expectedSoftware: "",
+ },
+ {
+ // Really furtive nerds forbids .well-known/nodeinfo using
+			// X-Robots-Tag headers, so we should get bare minimum only.
+ //
+ // Debug-level logs should show something like:
+ //
+			// - "can't use fetched .well-known/nodeinfo: robots tags disallow it"
+ instanceIRI: testrig.URLMustParse("https://really.furtive-nerds.example.org"),
+ expectedSoftware: "",
+ },
+ } {
+ instance, err := suite.dereferencer.GetRemoteInstance(
+ gtscontext.SetFastFail(context.Background()),
+ suite.testAccounts["admin_account"].Username,
+ tc.instanceIRI,
+ )
+ if err != nil {
+ suite.FailNow(err.Error())
+ }
+
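+		// An empty version means only the bare-minimum
+		// instance model could be built for this domain.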
+ suite.Equal(tc.expectedSoftware, instance.Version)
+ }
+}
+
+func TestInstanceTestSuite(t *testing.T) {
+ suite.Run(t, new(InstanceTestSuite))
+}
diff --git a/internal/transport/derefinstance.go b/internal/transport/derefinstance.go
index bbeb51000..e7971093d 100644
--- a/internal/transport/derefinstance.go
+++ b/internal/transport/derefinstance.go
@@ -25,6 +25,7 @@
"io"
"net/http"
"net/url"
+ "slices"
"strings"
apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model"
@@ -35,18 +36,29 @@
"github.com/superseriousbusiness/gotosocial/internal/log"
"github.com/superseriousbusiness/gotosocial/internal/util"
"github.com/superseriousbusiness/gotosocial/internal/validate"
+ "github.com/temoto/robotstxt"
)
func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gtsmodel.Instance, error) {
+ // Try to fetch robots.txt to check
+ // if we're allowed to try endpoints:
+ //
+ // - /api/v1/instance
+ // - /.well-known/nodeinfo
+ // - /nodeinfo/2.0|2.1 endpoints
+ robotsTxt, err := t.DereferenceRobots(ctx, iri.Scheme, iri.Host)
+ if err != nil {
+ log.Debugf(ctx, "couldn't fetch robots.txt from %s: %v", iri.Host, err)
+ }
+
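+	// NOTE: failing to fetch robots.txt is non-fatal; robotsTxt stays
+	// nil and the lookups below treat that as "no restrictions".
+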
var i *gtsmodel.Instance
- var err error
// First try to dereference using /api/v1/instance.
// This will provide the most complete picture of an instance, and avoid unnecessary api calls.
//
// This will only work with Mastodon-api compatible instances: Mastodon, some Pleroma instances, GoToSocial.
log.Debugf(ctx, "trying to dereference instance %s by /api/v1/instance", iri.Host)
- i, err = dereferenceByAPIV1Instance(ctx, t, iri)
+ i, err = t.dereferenceByAPIV1Instance(ctx, iri, robotsTxt)
if err == nil {
log.Debugf(ctx, "successfully dereferenced instance using /api/v1/instance")
return i, nil
@@ -56,7 +68,7 @@ func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gts
// If that doesn't work, try to dereference using /.well-known/nodeinfo.
// This will involve two API calls and return less info overall, but should be more widely compatible.
log.Debugf(ctx, "trying to dereference instance %s by /.well-known/nodeinfo", iri.Host)
- i, err = dereferenceByNodeInfo(ctx, t, iri)
+ i, err = t.dereferenceByNodeInfo(ctx, iri, robotsTxt)
if err == nil {
log.Debugf(ctx, "successfully dereferenced instance using /.well-known/nodeinfo")
return i, nil
@@ -77,11 +89,23 @@ func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gts
}, nil
}
-func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL) (*gtsmodel.Instance, error) {
+func (t *transport) dereferenceByAPIV1Instance(
+ ctx context.Context,
+ iri *url.URL,
+ robotsTxt *robotstxt.RobotsData,
+) (*gtsmodel.Instance, error) {
+ const path = "api/v1/instance"
+
+ // Bail if we're not allowed to fetch this endpoint.
+ if robotsTxt != nil && !robotsTxt.TestAgent("/"+path, t.controller.userAgent) {
+ err := gtserror.Newf("can't fetch %s: robots.txt disallows it", path)
+ return nil, gtserror.SetNotPermitted(err)
+ }
+
cleanIRI := &url.URL{
Scheme: iri.Scheme,
Host: iri.Host,
- Path: "api/v1/instance",
+ Path: path,
}
// Build IRI just once
@@ -105,6 +129,18 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL)
return nil, gtserror.NewFromResponse(resp)
}
+ // Ensure that we can use data returned from this endpoint.
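+	//
+	// A single X-Robots-Tag value may hold several comma-separated
+	// directives (e.g. "noindex,nofollow"), so any value containing
+	// "noindex" is treated as a refusal to use this response.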
+ robots := resp.Header.Values("X-Robots-Tag")
+ if slices.ContainsFunc(
+ robots,
+ func(key string) bool {
+ return strings.Contains(key, "noindex")
+ },
+ ) {
+		err := gtserror.Newf("can't use fetched %s: robots tags disallow it", path)
+ return nil, gtserror.SetNotPermitted(err)
+ }
+
// Ensure that the incoming request content-type is expected.
if ct := resp.Header.Get("Content-Type"); !apiutil.JSONContentType(ct) {
err := gtserror.Newf("non json response type: %s", ct)
@@ -118,7 +154,8 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL)
return nil, errors.New("response bytes was len 0")
}
- // try to parse the returned bytes directly into an Instance model
+ // Try to parse the returned bytes
+ // directly into an Instance model.
apiResp := &apimodel.InstanceV1{}
if err := json.Unmarshal(b, apiResp); err != nil {
return nil, err
@@ -149,24 +186,32 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL)
return i, nil
}
-func dereferenceByNodeInfo(c context.Context, t *transport, iri *url.URL) (*gtsmodel.Instance, error) {
- niIRI, err := callNodeInfoWellKnown(c, t, iri)
+func (t *transport) dereferenceByNodeInfo(
+ ctx context.Context,
+ iri *url.URL,
+ robotsTxt *robotstxt.RobotsData,
+) (*gtsmodel.Instance, error) {
+ // Retrieve the nodeinfo IRI from .well-known/nodeinfo.
+ niIRI, err := t.callNodeInfoWellKnown(ctx, iri, robotsTxt)
if err != nil {
- return nil, fmt.Errorf("dereferenceByNodeInfo: error during initial call to well-known nodeinfo: %s", err)
+ return nil, gtserror.Newf("error during initial call to .well-known: %w", err)
}
- ni, err := callNodeInfo(c, t, niIRI)
+ // Use the returned nodeinfo IRI to make a followup call.
+ ni, err := t.callNodeInfo(ctx, niIRI, robotsTxt)
if err != nil {
- return nil, fmt.Errorf("dereferenceByNodeInfo: error doing second call to nodeinfo uri %s: %s", niIRI.String(), err)
+ return nil, gtserror.Newf("error during call to %s: %w", niIRI.String(), err)
}
- // we got a response of some kind! take what we can from it...
+ // We got a response of some kind!
+ //
+ // Start building out the bare minimum
+ // instance model, we'll add to it if we can.
id, err := id.NewRandomULID()
if err != nil {
- return nil, fmt.Errorf("dereferenceByNodeInfo: error creating new id for instance %s: %s", iri.Host, err)
+ return nil, gtserror.Newf("error creating new id for instance %s: %w", iri.Host, err)
}
- // this is the bare minimum instance we'll return, and we'll add more stuff to it if we can
	i := &gtsmodel.Instance{
ID: id,
Domain: iri.Host,
@@ -234,11 +279,23 @@ func dereferenceByNodeInfo(c context.Context, t *transport, iri *url.URL) (*gtsm
return i, nil
}
-func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*url.URL, error) {
+func (t *transport) callNodeInfoWellKnown(
+ ctx context.Context,
+ iri *url.URL,
+ robotsTxt *robotstxt.RobotsData,
+) (*url.URL, error) {
+ const path = ".well-known/nodeinfo"
+
+ // Bail if we're not allowed to fetch this endpoint.
+ if robotsTxt != nil && !robotsTxt.TestAgent("/"+path, t.controller.userAgent) {
+ err := gtserror.Newf("can't fetch %s: robots.txt disallows it", path)
+ return nil, gtserror.SetNotPermitted(err)
+ }
+
cleanIRI := &url.URL{
Scheme: iri.Scheme,
Host: iri.Host,
- Path: ".well-known/nodeinfo",
+ Path: path,
}
// Build IRI just once
@@ -261,7 +318,19 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur
return nil, gtserror.NewFromResponse(resp)
}
- // Ensure that the incoming request content-type is expected.
+ // Ensure that we can use data returned from this endpoint.
+ robots := resp.Header.Values("X-Robots-Tag")
+ if slices.ContainsFunc(
+ robots,
+ func(key string) bool {
+ return strings.Contains(key, "noindex")
+ },
+ ) {
+		err := gtserror.Newf("can't use fetched %s: robots tags disallow it", path)
+ return nil, gtserror.SetNotPermitted(err)
+ }
+
+ // Ensure that the returned content-type is expected.
if ct := resp.Header.Get("Content-Type"); !apiutil.JSONContentType(ct) {
err := gtserror.Newf("non json response type: %s", ct)
return nil, gtserror.SetMalformed(err)
@@ -279,7 +348,8 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur
return nil, gtserror.Newf("could not unmarshal server response as WellKnownResponse: %w", err)
}
- // look through the links for the first one that matches the nodeinfo schema, this is what we need
+ // Look through the links for the first one that
+ // matches nodeinfo schema, this is what we need.
var nodeinfoHref *url.URL
for _, l := range wellKnownResp.Links {
if l.Href == "" || !strings.HasPrefix(l.Rel, "http://nodeinfo.diaspora.software/ns/schema/2") {
@@ -297,7 +367,23 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur
return nodeinfoHref, nil
}
-func callNodeInfo(ctx context.Context, t *transport, iri *url.URL) (*apimodel.Nodeinfo, error) {
+func (t *transport) callNodeInfo(
+ ctx context.Context,
+ iri *url.URL,
+ robotsTxt *robotstxt.RobotsData,
+) (*apimodel.Nodeinfo, error) {
+ // Normalize robots.txt test path.
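+	// (Rules are matched against "/"-prefixed request paths,
+	// so make sure the path we test has a leading "/".)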
+ testPath := iri.Path
+ if !strings.HasPrefix(testPath, "/") {
+ testPath = "/" + testPath
+ }
+
+ // Bail if we're not allowed to fetch this endpoint.
+ if robotsTxt != nil && !robotsTxt.TestAgent(testPath, t.controller.userAgent) {
+ err := gtserror.Newf("can't fetch %s: robots.txt disallows it", testPath)
+ return nil, gtserror.SetNotPermitted(err)
+ }
+
// Build IRI just once
iriStr := iri.String()
@@ -324,6 +410,18 @@ func callNodeInfo(ctx context.Context, t *transport, iri *url.URL) (*apimodel.No
return nil, gtserror.SetMalformed(err)
}
+ // Ensure that we can use data returned from this endpoint.
+ robots := resp.Header.Values("X-Robots-Tag")
+ if slices.ContainsFunc(
+ robots,
+ func(key string) bool {
+ return strings.Contains(key, "noindex")
+ },
+ ) {
+		err := gtserror.Newf("can't use fetched %s: robots tags disallow it", iri.Path)
+ return nil, gtserror.SetNotPermitted(err)
+ }
+
b, err := io.ReadAll(resp.Body)
if err != nil {
return nil, err
diff --git a/internal/transport/derefrobots.go b/internal/transport/derefrobots.go
new file mode 100644
index 000000000..d6c4f3058
--- /dev/null
+++ b/internal/transport/derefrobots.go
@@ -0,0 +1,91 @@
+// GoToSocial
+// Copyright (C) GoToSocial Authors admin@gotosocial.org
+// SPDX-License-Identifier: AGPL-3.0-or-later
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+package transport
+
+import (
+ "context"
+ "net/http"
+ "net/url"
+
+ "codeberg.org/gruf/go-bytesize"
+ "codeberg.org/gruf/go-iotools"
+ apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util"
+ "github.com/superseriousbusiness/gotosocial/internal/gtserror"
+ "github.com/temoto/robotstxt"
+)
+
+func (t *transport) DereferenceRobots(ctx context.Context, protocol string, host string) (*robotstxt.RobotsData, error) {
+ robotsIRI := &url.URL{
+ Scheme: protocol,
+ Host: host,
+ Path: "robots.txt",
+ }
+
+ // Build IRI just once
+ iriStr := robotsIRI.String()
+
+ // Prepare new HTTP request to endpoint
+ req, err := http.NewRequestWithContext(ctx, "GET", iriStr, nil)
+ if err != nil {
+ return nil, err
+ }
+
+ // We want text/plain utf-8 encoding.
+ //
+ // https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method
+ req.Header.Add("Accept", apiutil.TextPlain)
+ req.Header.Add("Accept-Charset", apiutil.UTF8)
+
+ // Perform the HTTP request
+ rsp, err := t.GET(req)
+ if err != nil {
+ return nil, err
+ }
+
+ // Ensure a non-error status response.
+ if rsp.StatusCode != http.StatusOK {
+ err := gtserror.NewFromResponse(rsp)
+ _ = rsp.Body.Close() // close early.
+ return nil, err
+ }
+
+ // Ensure that the incoming request content-type is expected.
+ if ct := rsp.Header.Get("Content-Type"); !apiutil.TextPlainContentType(ct) {
+ err := gtserror.Newf("non text/plain response: %s", ct)
+ _ = rsp.Body.Close() // close early.
+ return nil, gtserror.SetMalformed(err)
+ }
+
+ // Limit the robots.txt size to 500KiB
+ //
+ // https://www.rfc-editor.org/rfc/rfc9309.html#name-limits
+ const maxsz = int64(500 * bytesize.KiB)
+
+ // Check body claims to be within size limit.
+ if rsp.ContentLength > maxsz {
+ _ = rsp.Body.Close() // close early.
+ sz := bytesize.Size(maxsz) //nolint:gosec
+ return nil, gtserror.Newf("robots.txt body exceeds max size %s", sz)
+ }
+
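+	// An unknown Content-Length (-1) passes the check above;
+	// the reader limit below still caps what we actually read.
+	//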
+ // Update response body with maximum size.
+ rsp.Body, _, _ = iotools.UpdateReadCloserLimit(rsp.Body, maxsz)
+ defer rsp.Body.Close()
+
+ return robotstxt.FromResponse(rsp)
+}
diff --git a/testrig/transportcontroller.go b/testrig/transportcontroller.go
index b886e5c40..00f8ad2a6 100644
--- a/testrig/transportcontroller.go
+++ b/testrig/transportcontroller.go
@@ -133,6 +133,12 @@ func NewMockHTTPClient(do func(req *http.Request) (*http.Response, error), relat
responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = WebfingerResponse(req)
} else if strings.Contains(reqURLString, ".well-known/host-meta") {
responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = HostMetaResponse(req)
+ } else if strings.Contains(reqURLString, ".well-known/nodeinfo") {
+ responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = WellKnownNodeInfoResponse(req)
+ } else if strings.Contains(reqURLString, "/robots.txt") {
+ responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = RobotsTxtResponse(req)
+ } else if strings.Contains(reqURLString, "/nodeinfo/2.1") {
+ responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = NodeInfoResponse(req)
} else if strings.Contains(reqURLString, "lists.example.org") {
responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = DomainPermissionSubscriptionResponse(req)
} else if note, ok := mockHTTPClient.TestRemoteStatuses[reqURLString]; ok {
@@ -318,6 +324,162 @@ func HostMetaResponse(req *http.Request) (
return
}
+func WellKnownNodeInfoResponse(req *http.Request) (
+ responseCode int,
+ responseBytes []byte,
+ responseContentType string,
+ responseContentLength int,
+ extraHeaders map[string]string,
+) {
+ var wkr *apimodel.WellKnownResponse
+
+ switch req.URL.String() {
+ case "https://fossbros-anonymous.io/.well-known/nodeinfo":
+ wkr = &apimodel.WellKnownResponse{
+ Links: []apimodel.Link{
+ {
+ Rel: "http://nodeinfo.diaspora.software/ns/schema/2.1",
+ Href: "https://fossbros-anonymous.io/nodeinfo/2.1",
+ },
+ },
+ }
+ case "https://furtive-nerds.example.org/.well-known/nodeinfo":
+ wkr = &apimodel.WellKnownResponse{
+ Links: []apimodel.Link{
+ {
+ Rel: "http://nodeinfo.diaspora.software/ns/schema/2.1",
+ Href: "https://furtive-nerds.example.org/nodeinfo/2.1",
+ },
+ },
+ }
+ case "https://really.furtive-nerds.example.org/.well-known/nodeinfo":
+ wkr = &apimodel.WellKnownResponse{
+ Links: []apimodel.Link{
+ {
+ Rel: "http://nodeinfo.diaspora.software/ns/schema/2.1",
+ Href: "https://really.furtive-nerds.example.org/nodeinfo/2.1",
+ },
+ },
+ }
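+		// Simulate an instance that forbids use of its .well-known
+		// response via X-Robots-Tag headers.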
+ extraHeaders = map[string]string{"X-Robots-Tag": "noindex,nofollow"}
+ default:
+ log.Debugf(nil, "nodeinfo response not available for %s", req.URL)
+ responseCode = http.StatusNotFound
+ responseBytes = []byte(``)
+ responseContentType = "application/json"
+ responseContentLength = len(responseBytes)
+ return
+ }
+
+ niJSON, err := json.Marshal(wkr)
+ if err != nil {
+ panic(err)
+ }
+ responseCode = http.StatusOK
+ responseBytes = niJSON
+ responseContentType = "application/json"
+ responseContentLength = len(niJSON)
+
+ return
+}
+
+func NodeInfoResponse(req *http.Request) (
+ responseCode int,
+ responseBytes []byte,
+ responseContentType string,
+ responseContentLength int,
+ extraHeaders map[string]string,
+) {
+ var ni *apimodel.Nodeinfo
+
+ switch req.URL.String() {
+ case "https://fossbros-anonymous.io/nodeinfo/2.1":
+ ni = &apimodel.Nodeinfo{
+ Version: "2.1",
+ Software: apimodel.NodeInfoSoftware{
+ Name: "Hellsoft",
+ Version: "6.6.6",
+ Repository: "https://forge.hellsoft.fossbros-anonymous.io",
+ Homepage: "https://hellsoft.fossbros-anonymous.io",
+ },
+ Protocols: []string{"activitypub"},
+ }
+ case "https://furtive-nerds.example.org/nodeinfo/2.1":
+ ni = &apimodel.Nodeinfo{
+ Version: "2.1",
+ Software: apimodel.NodeInfoSoftware{
+ Name: "GoToSocial",
+ Version: "1.3.1.2",
+ Repository: "https://github.com/superseriousbusiness/gotosocial",
+ Homepage: "https://docs.gotosocial.org",
+ },
+ Protocols: []string{"activitypub"},
+ }
+ case "https://really.furtive-nerds.example.org/nodeinfo/2.1":
+ ni = &apimodel.Nodeinfo{
+ Version: "2.1",
+ Software: apimodel.NodeInfoSoftware{
+ Name: "GoToSocial",
+ Version: "1.3.1.2",
+ Repository: "https://github.com/superseriousbusiness/gotosocial",
+ Homepage: "https://docs.gotosocial.org",
+ },
+ Protocols: []string{"activitypub"},
+ }
+ default:
+ log.Debugf(nil, "nodeinfo response not available for %s", req.URL)
+ responseCode = http.StatusNotFound
+ responseBytes = []byte(``)
+ responseContentType = "application/json"
+ responseContentLength = len(responseBytes)
+ return
+ }
+
+ niJSON, err := json.Marshal(ni)
+ if err != nil {
+ panic(err)
+ }
+ responseCode = http.StatusOK
+ responseBytes = niJSON
+ responseContentType = "application/json"
+ responseContentLength = len(niJSON)
+
+ return
+}
+
+func RobotsTxtResponse(req *http.Request) (
+ responseCode int,
+ responseBytes []byte,
+ responseContentType string,
+ responseContentLength int,
+ extraHeaders map[string]string,
+) {
+ var robots string
+
+ switch req.URL.String() {
+ case "https://furtive-nerds.example.org/robots.txt":
+ // Disallow nodeinfo.
+ robots = "User-agent: *\nDisallow: /nodeinfo"
+ case "https://robotic.furtive-nerds.example.org/robots.txt":
+ // Disallow everything.
+ robots = "User-agent: *\nDisallow: /"
+ default:
+ log.Debugf(nil, "robots response not available for %s", req.URL)
+ responseCode = http.StatusNotFound
+ responseBytes = []byte(``)
+ responseContentType = "text/plain"
+ responseContentLength = len(responseBytes)
+ return
+ }
+
+ responseCode = http.StatusOK
+ responseBytes = []byte(robots)
+ responseContentType = "text/plain"
+ responseContentLength = len(responseBytes)
+
+ return
+}
+
func WebfingerResponse(req *http.Request) (
responseCode int,
responseBytes []byte,
diff --git a/vendor/github.com/temoto/robotstxt/.gitignore b/vendor/github.com/temoto/robotstxt/.gitignore
new file mode 100644
index 000000000..6205f9eae
--- /dev/null
+++ b/vendor/github.com/temoto/robotstxt/.gitignore
@@ -0,0 +1,15 @@
+*.cgo?.*
+*.o
+*.so
+*.sublime-*
+*.zip
+.DS_Store
+.idea/
+.tags*
+_cgo_*
+_gofuzz/crashers/
+_gofuzz/suppressions/
+_obj
+_test
+coverage.txt
+robots.txt-check/robots.txt-check
diff --git a/vendor/github.com/temoto/robotstxt/.golangci.yml b/vendor/github.com/temoto/robotstxt/.golangci.yml
new file mode 100644
index 000000000..24e5858fa
--- /dev/null
+++ b/vendor/github.com/temoto/robotstxt/.golangci.yml
@@ -0,0 +1,20 @@
+linters:
+ enable:
+ - goconst
+ - gofmt
+ - gosec
+ - maligned
+ - prealloc
+ - staticcheck
+ disable:
+ - deadcode
+ - structcheck
+ - varcheck
+
+linters-settings:
+ gofmt:
+ simplify: true
+ govet:
+ check-shadowing: true
+ maligned:
+ suggest-new: true
diff --git a/vendor/github.com/temoto/robotstxt/.travis.yml b/vendor/github.com/temoto/robotstxt/.travis.yml
new file mode 100644
index 000000000..ad90dac37
--- /dev/null
+++ b/vendor/github.com/temoto/robotstxt/.travis.yml
@@ -0,0 +1,30 @@
+cache:
+ go: true
+ directories:
+ - $HOME/.cache
+ - $HOME/bin
+ - $HOME/gopath/pkg/mod
+language: go
+go:
+- 1.11
+- 1.12
+- 1.13
+- 1.14
+- 1.x
+- master
+install: true
+script: GO111MODULE=on go test -race
+
+matrix:
+ include:
+ - go: 1.x
+ env: task=coverage
+ script: GO111MODULE=on go test -race -covermode=atomic -coverprofile=coverage.txt
+ after_success: bash <(curl -s https://codecov.io/bash)
+ - go: 1.x
+ env: task=bench
+ script: GO111MODULE=on ./script/bench
+ - go: 1.x
+ install: curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | bash -s -- -b $HOME/bin v1.19.1
+ env: task=clean
+ script: GO111MODULE=on ./script/clean
diff --git a/vendor/github.com/temoto/robotstxt/LICENSE b/vendor/github.com/temoto/robotstxt/LICENSE
new file mode 100644
index 000000000..c125145b6
--- /dev/null
+++ b/vendor/github.com/temoto/robotstxt/LICENSE
@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) 2010 Sergey Shepelev
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/vendor/github.com/temoto/robotstxt/README.rst b/vendor/github.com/temoto/robotstxt/README.rst
new file mode 100644
index 000000000..92f1ae161
--- /dev/null
+++ b/vendor/github.com/temoto/robotstxt/README.rst
@@ -0,0 +1,115 @@
+What
+====
+
+This is a robots.txt exclusion protocol implementation for Go language (golang).
+
+
+Build
+=====
+
+To build and run tests run `go test` in source directory.
+
+
+Contribute
+==========
+
+Warm welcome.
+
+* If desired, add your name in README.rst, section Who.
+* Run `script/test && script/clean && echo ok`
+* You can ignore linter warnings, but everything else must pass.
+* Send your change as pull request or just a regular patch to current maintainer (see section Who).
+
+Thank you.
+
+
+Usage
+=====
+
+As usual, no special installation is required, just
+
+ import "github.com/temoto/robotstxt"
+
+run `go get` and you're ready.
+
+1. Parse
+^^^^^^^^
+
+First of all, you need to parse robots.txt data. You can do it with
+functions `FromBytes(body []byte) (*RobotsData, error)` or same for `string`::
+
+ robots, err := robotstxt.FromBytes([]byte("User-agent: *\nDisallow:"))
+ robots, err := robotstxt.FromString("User-agent: *\nDisallow:")
+
+As of 2012-10-03, `FromBytes` is the most efficient method, everything else
+is a wrapper for this core function.
+
+There are few convenient constructors for various purposes:
+
+* `FromResponse(*http.Response) (*RobotsData, error)` to init robots data
+from HTTP response. It *does not* call `response.Body.Close()`::
+
+ robots, err := robotstxt.FromResponse(resp)
+ resp.Body.Close()
+ if err != nil {
+ log.Println("Error parsing robots.txt:", err.Error())
+ }
+
+* `FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error)` or
+`FromStatusAndString` if you prefer to read bytes (string) yourself.
+Passing status code applies following logic in line with Google's interpretation
+of robots.txt files:
+
+ * status 2xx -> parse body with `FromBytes` and apply rules listed there.
+ * status 4xx -> allow all (even 401/403, as recommended by Google).
+ * other (5xx) -> disallow all, consider this a temporary unavailability.
+
+2. Query
+^^^^^^^^
+
+Parsing robots.txt content builds a kind of logic database, which you can
+query with `(r *RobotsData) TestAgent(url, agent string) (bool)`.
+
+Explicit passing of agent is useful if you want to query for different agents. For
+single agent users there is an efficient option: `RobotsData.FindGroup(userAgent string)`
+returns a structure with `.Test(path string)` method and `.CrawlDelay time.Duration`.
+
+Simple query with explicit user agent. Each call will scan all rules.
+
+::
+
+ allow := robots.TestAgent("/", "FooBot")
+
+Or query several paths against same user agent for performance.
+
+::
+
+ group := robots.FindGroup("BarBot")
+ group.Test("/")
+ group.Test("/download.mp3")
+ group.Test("/news/article-2012-1")
+
+
+Who
+===
+
+Honorable contributors (in undefined order):
+
+ * Ilya Grigorik (igrigorik)
+ * Martin Angers (PuerkitoBio)
+ * Micha Gorelick (mynameisfiber)
+
+Initial commit and other: Sergey Shepelev temotor@gmail.com
+
+
+Flair
+=====
+
+.. image:: https://travis-ci.org/temoto/robotstxt.svg?branch=master
+ :target: https://travis-ci.org/temoto/robotstxt
+
+.. image:: https://codecov.io/gh/temoto/robotstxt/branch/master/graph/badge.svg
+ :target: https://codecov.io/gh/temoto/robotstxt
+
+.. image:: https://goreportcard.com/badge/github.com/temoto/robotstxt
+ :target: https://goreportcard.com/report/github.com/temoto/robotstxt
diff --git a/vendor/github.com/temoto/robotstxt/codecov.yml b/vendor/github.com/temoto/robotstxt/codecov.yml
new file mode 100644
index 000000000..b80be28f6
--- /dev/null
+++ b/vendor/github.com/temoto/robotstxt/codecov.yml
@@ -0,0 +1,2 @@
+codecov:
+ token: 6bf9c7eb-69ff-4b74-8464-e2fb452d0f04
diff --git a/vendor/github.com/temoto/robotstxt/fuzz.go b/vendor/github.com/temoto/robotstxt/fuzz.go
new file mode 100644
index 000000000..de4b0587a
--- /dev/null
+++ b/vendor/github.com/temoto/robotstxt/fuzz.go
@@ -0,0 +1,29 @@
+// +build gofuzz
+
+package robotstxt
+
+import "testing/quick"
+
+func Fuzz(data []byte) int {
+ r, err := FromBytes(data)
+ if err != nil {
+ if r != nil {
+ panic("r != nil on error")
+ }
+ return 0
+ }
+
+ // FindGroup must never return nil
+ f1 := func(agent string) bool { return r.FindGroup(agent) != nil }
+ if err := quick.Check(f1, nil); err != nil {
+ panic(err)
+ }
+
+ // just check TestAgent doesn't panic
+ f2 := func(path, agent string) bool { r.TestAgent(path, agent); return true }
+ if err := quick.Check(f2, nil); err != nil {
+ panic(err)
+ }
+
+ return 1
+}
diff --git a/vendor/github.com/temoto/robotstxt/parser.go b/vendor/github.com/temoto/robotstxt/parser.go
new file mode 100644
index 000000000..46eb6b184
--- /dev/null
+++ b/vendor/github.com/temoto/robotstxt/parser.go
@@ -0,0 +1,271 @@
+package robotstxt
+
+// Comments explaining the logic are taken from either the google's spec:
+// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
+//
+// or the Wikipedia's entry on robots.txt:
+// http://en.wikipedia.org/wiki/Robots.txt
+
+import (
+ "fmt"
+ "io"
+ "math"
+ "regexp"
+ "strconv"
+ "strings"
+ "time"
+)
+
+type lineType uint
+
+const (
+ lIgnore lineType = iota
+ lUnknown
+ lUserAgent
+ lAllow
+ lDisallow
+ lCrawlDelay
+ lSitemap
+ lHost
+)
+
+type parser struct {
+ tokens []string
+ pos int
+}
+
+type lineInfo struct {
+ t lineType // Type of line key
+ k string // String representation of the type of key
+ vs string // String value of the key
+ vf float64 // Float value of the key
+ vr *regexp.Regexp // Regexp value of the key
+}
+
+func newParser(tokens []string) *parser {
+ return &parser{tokens: tokens}
+}
+
+func parseGroupMap(groups map[string]*Group, agents []string, fun func(*Group)) {
+ var g *Group
+ for _, a := range agents {
+ if g = groups[a]; g == nil {
+ g = new(Group)
+ groups[a] = g
+ }
+ fun(g)
+ }
+}
+
+func (p *parser) parseAll() (groups map[string]*Group, host string, sitemaps []string, errs []error) {
+ groups = make(map[string]*Group, 16)
+ agents := make([]string, 0, 4)
+ isEmptyGroup := true
+
+ // Reset internal fields, tokens are assigned at creation time, never change
+ p.pos = 0
+
+ for {
+ if li, err := p.parseLine(); err != nil {
+ if err == io.EOF {
+ break
+ }
+ errs = append(errs, err)
+ } else {
+ switch li.t {
+ case lUserAgent:
+ // Two successive user-agent lines are part of the same group.
+ if !isEmptyGroup {
+ // End previous group
+ agents = make([]string, 0, 4)
+ }
+ if len(agents) == 0 {
+ isEmptyGroup = true
+ }
+ agents = append(agents, li.vs)
+
+ case lDisallow:
+ // Error if no current group
+ if len(agents) == 0 {
+ errs = append(errs, fmt.Errorf("Disallow before User-agent at token #%d.", p.pos))
+ } else {
+ isEmptyGroup = false
+ var r *rule
+ if li.vr != nil {
+ r = &rule{"", false, li.vr}
+ } else {
+ r = &rule{li.vs, false, nil}
+ }
+ parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) })
+ }
+
+ case lAllow:
+ // Error if no current group
+ if len(agents) == 0 {
+ errs = append(errs, fmt.Errorf("Allow before User-agent at token #%d.", p.pos))
+ } else {
+ isEmptyGroup = false
+ var r *rule
+ if li.vr != nil {
+ r = &rule{"", true, li.vr}
+ } else {
+ r = &rule{li.vs, true, nil}
+ }
+ parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) })
+ }
+
+ case lHost:
+ host = li.vs
+
+ case lSitemap:
+ sitemaps = append(sitemaps, li.vs)
+
+ case lCrawlDelay:
+ if len(agents) == 0 {
+ errs = append(errs, fmt.Errorf("Crawl-delay before User-agent at token #%d.", p.pos))
+ } else {
+ isEmptyGroup = false
+ delay := time.Duration(li.vf * float64(time.Second))
+ parseGroupMap(groups, agents, func(g *Group) { g.CrawlDelay = delay })
+ }
+ }
+ }
+ }
+ return
+}
+
+func (p *parser) parseLine() (li *lineInfo, err error) {
+ t1, ok1 := p.popToken()
+ if !ok1 {
+ // proper EOF
+ return nil, io.EOF
+ }
+
+ t2, ok2 := p.peekToken()
+ if !ok2 {
+ // EOF, no value associated with the token, so ignore token and return
+ return nil, io.EOF
+ }
+
+ // Helper closure for all string-based tokens, common behaviour:
+ // - Consume t2 token
+ // - If empty, return unknown line info
+ // - Otherwise return the specified line info
+ returnStringVal := func(t lineType) (*lineInfo, error) {
+ p.popToken()
+ if t2 != "" {
+ return &lineInfo{t: t, k: t1, vs: t2}, nil
+ }
+ return &lineInfo{t: lIgnore}, nil
+ }
+
+ // Helper closure for all path tokens (allow/disallow), common behaviour:
+ // - Consume t2 token
+ // - If empty, return unknown line info
+ // - Otherwise, normalize the path (add leading "/" if missing, remove trailing "*")
+ // - Detect if wildcards are present, if so, compile into a regexp
+ // - Return the specified line info
+ returnPathVal := func(t lineType) (*lineInfo, error) {
+ p.popToken()
+ if t2 != "" {
+ if !strings.HasPrefix(t2, "*") && !strings.HasPrefix(t2, "/") {
+ t2 = "/" + t2
+ }
+ t2 = strings.TrimRightFunc(t2, isAsterisk)
+ // From google's spec:
+ // Google, Bing, Yahoo, and Ask support a limited form of
+ // "wildcards" for path values. These are:
+ // * designates 0 or more instances of any valid character
+ // $ designates the end of the URL
+ if strings.ContainsAny(t2, "*$") {
+ // Must compile a regexp, this is a pattern.
+ // Escape string before compile.
+ t2 = regexp.QuoteMeta(t2)
+ t2 = strings.Replace(t2, `\*`, `.*`, -1)
+ t2 = strings.Replace(t2, `\$`, `$`, -1)
+ if r, e := regexp.Compile(t2); e != nil {
+ return nil, e
+ } else {
+ return &lineInfo{t: t, k: t1, vr: r}, nil
+ }
+ } else {
+ // Simple string path
+ return &lineInfo{t: t, k: t1, vs: t2}, nil
+ }
+ }
+ return &lineInfo{t: lIgnore}, nil
+ }
+
+ switch strings.ToLower(t1) {
+ case tokEOL:
+ // Don't consume t2 and continue parsing
+ return &lineInfo{t: lIgnore}, nil
+
+ case "user-agent", "useragent":
+ // From google's spec:
+ // Handling of elements with simple errors / typos (eg "useragent"
+ // instead of "user-agent") is undefined and may be interpreted as correct
+ // directives by some user-agents.
+ // The user-agent is non-case-sensitive.
+ t2 = strings.ToLower(t2)
+ return returnStringVal(lUserAgent)
+
+ case "disallow":
+ // From google's spec:
+ // When no path is specified, the directive is ignored (so an empty Disallow
+ // CAN be an allow, since allow is the default. The actual result depends
+ // on the other rules in the group).
+ return returnPathVal(lDisallow)
+
+ case "allow":
+ // From google's spec:
+ // When no path is specified, the directive is ignored.
+ return returnPathVal(lAllow)
+
+ case "host":
+ // Host directive to specify main site mirror
+ // Read more: https://help.yandex.com/webmaster/controlling-robot/robots-txt.xml#host
+ return returnStringVal(lHost)
+
+ case "sitemap":
+ // Non-group field, applies to the host as a whole, not to a specific user-agent
+ return returnStringVal(lSitemap)
+
+ case "crawl-delay", "crawldelay":
+ // From http://en.wikipedia.org/wiki/Robots_exclusion_standard#Nonstandard_extensions
+ // Several major crawlers support a Crawl-delay parameter, set to the
+ // number of seconds to wait between successive requests to the same server.
+ p.popToken()
+ if cd, e := strconv.ParseFloat(t2, 64); e != nil {
+ return nil, e
+ } else if cd < 0 || math.IsInf(cd, 0) || math.IsNaN(cd) {
+ return nil, fmt.Errorf("Crawl-delay invalid value '%s'", t2)
+ } else {
+ return &lineInfo{t: lCrawlDelay, k: t1, vf: cd}, nil
+ }
+ }
+
+ // Consume t2 token
+ p.popToken()
+ return &lineInfo{t: lUnknown, k: t1}, nil
+}
+
+func (p *parser) popToken() (tok string, ok bool) {
+ tok, ok = p.peekToken()
+ if !ok {
+ return
+ }
+ p.pos++
+ return tok, true
+}
+
+func (p *parser) peekToken() (tok string, ok bool) {
+ if p.pos >= len(p.tokens) {
+ return "", false
+ }
+ return p.tokens[p.pos], true
+}
+
+func isAsterisk(r rune) bool {
+ return r == '*'
+}
diff --git a/vendor/github.com/temoto/robotstxt/robotstxt.go b/vendor/github.com/temoto/robotstxt/robotstxt.go
new file mode 100644
index 000000000..52d3637c6
--- /dev/null
+++ b/vendor/github.com/temoto/robotstxt/robotstxt.go
@@ -0,0 +1,227 @@
+// Package robotstxt implements the robots.txt Exclusion Protocol
+// as specified in http://www.robotstxt.org/wc/robots.html
+// with various extensions.
+package robotstxt
+
+// Comments explaining the logic are taken from either the Google's spec:
+// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
+
+import (
+ "bytes"
+ "errors"
+ "io/ioutil"
+ "net/http"
+ "regexp"
+ "strconv"
+ "strings"
+ "time"
+)
+
+type RobotsData struct {
+ // private
+ groups map[string]*Group
+ allowAll bool
+ disallowAll bool
+ Host string
+ Sitemaps []string
+}
+
+type Group struct {
+ rules []*rule
+ Agent string
+ CrawlDelay time.Duration
+}
+
+type rule struct {
+ path string
+ allow bool
+ pattern *regexp.Regexp
+}
+
+type ParseError struct {
+ Errs []error
+}
+
+func newParseError(errs []error) *ParseError {
+ return &ParseError{errs}
+}
+
+func (e ParseError) Error() string {
+ var b bytes.Buffer
+
+ b.WriteString("Parse error(s): " + "\n")
+ for _, er := range e.Errs {
+ b.WriteString(er.Error() + "\n")
+ }
+ return b.String()
+}
+
+var allowAll = &RobotsData{allowAll: true}
+var disallowAll = &RobotsData{disallowAll: true}
+var emptyGroup = &Group{}
+
+func FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error) {
+ switch {
+ case statusCode >= 200 && statusCode < 300:
+ return FromBytes(body)
+
+ // From https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
+ //
+ // Google treats all 4xx errors in the same way and assumes that no valid
+ // robots.txt file exists. It is assumed that there are no restrictions.
+ // This is a "full allow" for crawling. Note: this includes 401
+ // "Unauthorized" and 403 "Forbidden" HTTP result codes.
+ case statusCode >= 400 && statusCode < 500:
+ return allowAll, nil
+
+ // From Google's spec:
+ // Server errors (5xx) are seen as temporary errors that result in a "full
+ // disallow" of crawling.
+ case statusCode >= 500 && statusCode < 600:
+ return disallowAll, nil
+ }
+
+ return nil, errors.New("Unexpected status: " + strconv.Itoa(statusCode))
+}
+
+func FromStatusAndString(statusCode int, body string) (*RobotsData, error) {
+ return FromStatusAndBytes(statusCode, []byte(body))
+}
+
+func FromResponse(res *http.Response) (*RobotsData, error) {
+ if res == nil {
+ // Edge case, if res is nil, return nil data
+ return nil, nil
+ }
+ buf, e := ioutil.ReadAll(res.Body)
+ if e != nil {
+ return nil, e
+ }
+ return FromStatusAndBytes(res.StatusCode, buf)
+}
+
+func FromBytes(body []byte) (r *RobotsData, err error) {
+ var errs []error
+
+ // special case (probably not worth optimization?)
+ trimmed := bytes.TrimSpace(body)
+ if len(trimmed) == 0 {
+ return allowAll, nil
+ }
+
+ sc := newByteScanner("bytes", true)
+ //sc.Quiet = !print_errors
+ sc.feed(body, true)
+ tokens := sc.scanAll()
+
+ // special case worth optimization
+ if len(tokens) == 0 {
+ return allowAll, nil
+ }
+
+ r = &RobotsData{}
+ parser := newParser(tokens)
+ r.groups, r.Host, r.Sitemaps, errs = parser.parseAll()
+ if len(errs) > 0 {
+ return nil, newParseError(errs)
+ }
+
+ return r, nil
+}
+
+func FromString(body string) (r *RobotsData, err error) {
+ return FromBytes([]byte(body))
+}
+
+func (r *RobotsData) TestAgent(path, agent string) bool {
+ if r.allowAll {
+ return true
+ }
+ if r.disallowAll {
+ return false
+ }
+
+ // Find a group of rules that applies to this agent
+ // From Google's spec:
+ // The user-agent is non-case-sensitive.
+ g := r.FindGroup(agent)
+ return g.Test(path)
+}
+
+// FindGroup searches block of declarations for specified user-agent.
+// From Google's spec:
+// Only one group of group-member records is valid for a particular crawler.
+// The crawler must determine the correct group of records by finding the group
+// with the most specific user-agent that still matches. All other groups of
+// records are ignored by the crawler. The user-agent is non-case-sensitive.
+// The order of the groups within the robots.txt file is irrelevant.
+func (r *RobotsData) FindGroup(agent string) (ret *Group) {
+ var prefixLen int
+
+ agent = strings.ToLower(agent)
+ if ret = r.groups["*"]; ret != nil {
+ // Weakest match possible
+ prefixLen = 1
+ }
+ for a, g := range r.groups {
+ if a != "*" && strings.HasPrefix(agent, a) {
+ if l := len(a); l > prefixLen {
+ prefixLen = l
+ ret = g
+ }
+ }
+ }
+
+ if ret == nil {
+ return emptyGroup
+ }
+ return
+}
+
+func (g *Group) Test(path string) bool {
+ if r := g.findRule(path); r != nil {
+ return r.allow
+ }
+
+ // From Google's spec:
+ // By default, there are no restrictions for crawling for the designated crawlers.
+ return true
+}
+
+// From Google's spec:
+// The path value is used as a basis to determine whether or not a rule applies
+// to a specific URL on a site. With the exception of wildcards, the path is
+// used to match the beginning of a URL (and any valid URLs that start with the
+// same path).
+//
+// At a group-member level, in particular for allow and disallow directives,
+// the most specific rule based on the length of the [path] entry will trump
+// the less specific (shorter) rule. The order of precedence for rules with
+// wildcards is undefined.
+func (g *Group) findRule(path string) (ret *rule) {
+ var prefixLen int
+
+ for _, r := range g.rules {
+ if r.pattern != nil {
+ if r.pattern.MatchString(path) {
+ // Consider this a match equal to the length of the pattern.
+ // From Google's spec:
+ // The order of precedence for rules with wildcards is undefined.
+ if l := len(r.pattern.String()); l > prefixLen {
+ prefixLen = l
+ ret = r
+ }
+ }
+ } else if r.path == "/" && prefixLen == 0 {
+ // Weakest match possible
+ prefixLen = 1
+ ret = r
+ } else if strings.HasPrefix(path, r.path) {
+ if l := len(r.path); l > prefixLen {
+ prefixLen = l
+ ret = r
+ }
+ }
+ }
+ return
+}
diff --git a/vendor/github.com/temoto/robotstxt/scanner.go b/vendor/github.com/temoto/robotstxt/scanner.go
new file mode 100644
index 000000000..6bd98c2ec
--- /dev/null
+++ b/vendor/github.com/temoto/robotstxt/scanner.go
@@ -0,0 +1,185 @@
+package robotstxt
+
+import (
+ "bytes"
+ "fmt"
+ "go/token"
+ "os"
+ "sync"
+ "unicode/utf8"
+)
+
+type byteScanner struct {
+ pos token.Position
+ buf []byte
+ ErrorCount int
+ ch rune
+ Quiet bool
+ keyTokenFound bool
+ lastChunk bool
+}
+
+const tokEOL = "\n"
+
+var WhitespaceChars = []rune{' ', '\t', '\v'}
+var tokBuffers = sync.Pool{New: func() interface{} { return bytes.NewBuffer(make([]byte, 32)) }}
+
+func newByteScanner(srcname string, quiet bool) *byteScanner {
+ return &byteScanner{
+ Quiet: quiet,
+ ch: -1,
+ pos: token.Position{Filename: srcname},
+ }
+}
+
+func (s *byteScanner) feed(input []byte, end bool) {
+ s.buf = input
+ s.pos.Offset = 0
+ s.pos.Line = 1
+ s.pos.Column = 1
+ s.lastChunk = end
+
+ // Read first char into look-ahead buffer `s.ch`.
+ if !s.nextChar() {
+ return
+ }
+
+ // Skip UTF-8 byte order mark
+ if s.ch == 65279 {
+ s.nextChar()
+ s.pos.Column = 1
+ }
+}
+
+func (s *byteScanner) GetPosition() token.Position {
+ return s.pos
+}
+
+func (s *byteScanner) scan() string {
+ // Note Offset > len, not >=, so we can scan last character.
+ if s.lastChunk && s.pos.Offset > len(s.buf) {
+ return ""
+ }
+
+ s.skipSpace()
+
+ if s.ch == -1 {
+ return ""
+ }
+
+ // EOL
+ if s.isEol() {
+ s.keyTokenFound = false
+ // skip subsequent newline chars
+ for s.ch != -1 && s.isEol() {
+ s.nextChar()
+ }
+ // emit newline as separate token
+ return tokEOL
+ }
+
+ // skip comments
+ if s.ch == '#' {
+ s.keyTokenFound = false
+ s.skipUntilEol()
+ if s.ch == -1 {
+ return ""
+ }
+ // emit newline as separate token
+ return tokEOL
+ }
+
+ // else we found something
+ tok := tokBuffers.Get().(*bytes.Buffer)
+ defer tokBuffers.Put(tok)
+ tok.Reset()
+ tok.WriteRune(s.ch)
+ s.nextChar()
+ for s.ch != -1 && !s.isSpace() && !s.isEol() {
+ // Do not consider ":" to be a token separator if a first key token
+ // has already been found on this line (avoid cutting an absolute URL
+ // after the "http:")
+ if s.ch == ':' && !s.keyTokenFound {
+ s.nextChar()
+ s.keyTokenFound = true
+ break
+ }
+
+ tok.WriteRune(s.ch)
+ s.nextChar()
+ }
+ return tok.String()
+}
+
+func (s *byteScanner) scanAll() []string {
+ results := make([]string, 0, 64) // random guess of average tokens length
+ for {
+ token := s.scan()
+ if token != "" {
+ results = append(results, token)
+ } else {
+ break
+ }
+ }
+ return results
+}
+
+func (s *byteScanner) error(pos token.Position, msg string) {
+ s.ErrorCount++
+ if !s.Quiet {
+ fmt.Fprintf(os.Stderr, "robotstxt from %s: %s\n", pos.String(), msg)
+ }
+}
+
+func (s *byteScanner) isEol() bool {
+ return s.ch == '\n' || s.ch == '\r'
+}
+
+func (s *byteScanner) isSpace() bool {
+ for _, r := range WhitespaceChars {
+ if s.ch == r {
+ return true
+ }
+ }
+ return false
+}
+
+func (s *byteScanner) skipSpace() {
+ for s.ch != -1 && s.isSpace() {
+ s.nextChar()
+ }
+}
+
+func (s *byteScanner) skipUntilEol() {
+ for s.ch != -1 && !s.isEol() {
+ s.nextChar()
+ }
+ // skip subsequent newline chars
+ for s.ch != -1 && s.isEol() {
+ s.nextChar()
+ }
+}
+
+// Reads next Unicode char.
+func (s *byteScanner) nextChar() bool {
+ if s.pos.Offset >= len(s.buf) {
+ s.ch = -1
+ return false
+ }
+ s.pos.Column++
+ if s.ch == '\n' {
+ s.pos.Line++
+ s.pos.Column = 1
+ }
+ r, w := rune(s.buf[s.pos.Offset]), 1
+ if r >= 0x80 {
+ r, w = utf8.DecodeRune(s.buf[s.pos.Offset:])
+ if r == utf8.RuneError && w == 1 {
+ s.error(s.pos, "illegal UTF-8 encoding")
+ }
+ }
+ s.pos.Column++
+ s.pos.Offset += w
+ s.ch = r
+ return true
+}
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 49ca611b2..04314f34f 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -412,6 +412,8 @@ github.com/jackc/puddle/v2/internal/genstack
# github.com/jessevdk/go-flags v1.5.0
## explicit; go 1.15
github.com/jessevdk/go-flags
+# github.com/jimsmart/grobotstxt v1.0.3
+## explicit; go 1.14
# github.com/jinzhu/inflection v1.0.0
## explicit
github.com/jinzhu/inflection
@@ -831,6 +833,9 @@ github.com/tdewolff/parse/v2/strconv
# github.com/technologize/otel-go-contrib v1.1.1
## explicit; go 1.17
github.com/technologize/otel-go-contrib/otelginmetrics
+# github.com/temoto/robotstxt v1.1.2
+## explicit; go 1.11
+github.com/temoto/robotstxt
# github.com/tetratelabs/wazero v1.8.2
## explicit; go 1.21
github.com/tetratelabs/wazero