diff --git a/README.md b/README.md index 40e98f415..b11f44d94 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,8 @@ Here's a screenshot of the instance landing page! Check out the project's [offic - [Known Issues](#known-issues) - [Installing GoToSocial](#installing-gotosocial) - [Supported Platforms](#supported-platforms) - - [FreeBSD](#freebsd) + - [64-bit](#64-bit) + - [BSDs](#bsds) - [32-bit](#32-bit) - [OpenBSD](#openbsd) - [Stable Releases](#stable-releases) @@ -434,6 +435,7 @@ The following open source libraries, frameworks, and tools are used by GoToSocia - [superseriousbusiness/exif-terminator](https://codeberg.org/superseriousbusiness/exif-terminator); EXIF data removal. [GNU AGPL v3 LICENSE](https://spdx.org/licenses/AGPL-3.0-or-later.html). - [superseriousbusiness/httpsig](https://github.com/superseriousbusiness/httpsig) forked from [go-fed/httpsig](https://github.com/go-fed/httpsig); secure HTTP signature library. [BSD-3-Clause License](https://spdx.org/licenses/BSD-3-Clause.html). - [superseriousbusiness/oauth2](https://github.com/superseriousbusiness/oauth2) forked from [go-oauth2/oauth2](https://github.com/go-oauth2/oauth2); OAuth server framework and token handling. [MIT License](https://spdx.org/licenses/MIT.html). +- [temoto/robotstxt](https://github.com/temoto/robotstxt); robots.txt parsing. [MIT License](https://spdx.org/licenses/MIT.html). - [tdewolff/minify](https://github.com/tdewolff/minify); HTML minification for Markdown-submitted posts. [MIT License](https://spdx.org/licenses/MIT.html). - [uber-go/automaxprocs](https://github.com/uber-go/automaxprocs); GOMAXPROCS automation. [MIT License](https://spdx.org/licenses/MIT.html). - [ulule/limiter](https://github.com/ulule/limiter); http rate limit middleware. [MIT License](https://spdx.org/licenses/MIT.html). 
diff --git a/go.mod b/go.mod index 59c924a09..b7d816a63 100644 --- a/go.mod +++ b/go.mod @@ -60,6 +60,7 @@ require ( github.com/superseriousbusiness/oauth2/v4 v4.3.2-SSB.0.20230227143000-f4900831d6c8 github.com/tdewolff/minify/v2 v2.21.3 github.com/technologize/otel-go-contrib v1.1.1 + github.com/temoto/robotstxt v1.1.2 github.com/tetratelabs/wazero v1.8.2 github.com/tomnomnom/linkheader v0.0.0-20180905144013-02ca5825eb80 github.com/ulule/limiter/v3 v3.11.2 diff --git a/go.sum b/go.sum index 3a1613c89..a3481ce82 100644 --- a/go.sum +++ b/go.sum @@ -540,6 +540,8 @@ github.com/tdewolff/test v1.0.11-0.20240106005702-7de5f7df4739 h1:IkjBCtQOOjIn03 github.com/tdewolff/test v1.0.11-0.20240106005702-7de5f7df4739/go.mod h1:XPuWBzvdUzhCuxWO1ojpXsyzsA5bFoS3tO/Q3kFuTG8= github.com/technologize/otel-go-contrib v1.1.1 h1:wZH9aSPNWZWIkEh3vfaKfMb15AJ80jJ1aVj/4GZdqIw= github.com/technologize/otel-go-contrib v1.1.1/go.mod h1:dCN/wj2WyUO8aFZFdIN+6tfJHImjTML/8r2YVYAy3So= +github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= +github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= github.com/tetratelabs/wazero v1.8.2 h1:yIgLR/b2bN31bjxwXHD8a3d+BogigR952csSDdLYEv4= github.com/tetratelabs/wazero v1.8.2/go.mod h1:yAI0XTsMBhREkM/YDAK/zNou3GoiAce1P6+rp/wQhjs= github.com/tidwall/btree v0.0.0-20191029221954-400434d76274 h1:G6Z6HvJuPjG6XfNGi/feOATzeJrfgTNJY+rGrHbA04E= diff --git a/internal/api/util/mime.go b/internal/api/util/mime.go index 4d8946e5d..da96be786 100644 --- a/internal/api/util/mime.go +++ b/internal/api/util/mime.go @@ -36,6 +36,8 @@ TextHTML = `text/html` TextCSS = `text/css` TextCSV = `text/csv` + TextPlain = `text/plain` + UTF8 = `utf-8` ) // JSONContentType returns whether is application/json(;charset=utf-8)? content-type. @@ -74,6 +76,14 @@ func XMLXRDContentType(ct string) bool { p[0] == AppXMLXRD } +// TextPlainContentType returns whether is text/plain(;charset=utf-8)? content-type. +func TextPlainContentType(ct string) bool { + p := splitContentType(ct) + p, ok := isUTF8ContentType(p) + return ok && len(p) == 1 && + p[0] == TextPlain +} + // ASContentType returns whether is valid ActivityStreams content-types: // - application/activity+json // - application/ld+json;profile=https://w3.org/ns/activitystreams @@ -118,7 +128,7 @@ func NodeInfo2ContentType(ct string) bool { // type parts list, removes it and returns whether is utf-8. func isUTF8ContentType(p []string) ([]string, bool) { const charset = "charset=" - const charsetUTF8 = charset + "utf-8" + const charsetUTF8 = charset + UTF8 for i, part := range p { // Only handle charset slice parts. 
diff --git a/internal/federation/dereferencing/instance.go b/internal/federation/dereferencing/instance.go index 90ce074cd..66d0a21be 100644 --- a/internal/federation/dereferencing/instance.go +++ b/internal/federation/dereferencing/instance.go @@ -19,20 +19,20 @@ import ( "context" - "fmt" "net/url" + "github.com/superseriousbusiness/gotosocial/internal/gtserror" "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" ) func (d *Dereferencer) GetRemoteInstance(ctx context.Context, username string, remoteInstanceURI *url.URL) (*gtsmodel.Instance, error) { if blocked, err := d.state.DB.IsDomainBlocked(ctx, remoteInstanceURI.Host); blocked || err != nil { - return nil, fmt.Errorf("GetRemoteInstance: domain %s is blocked", remoteInstanceURI.Host) + return nil, gtserror.Newf("domain %s is blocked", remoteInstanceURI.Host) } transport, err := d.transportController.NewTransportForUsername(ctx, username) if err != nil { - return nil, fmt.Errorf("transport err: %s", err) + return nil, gtserror.Newf("transport err: %w", err) } return transport.DereferenceInstance(ctx, remoteInstanceURI) diff --git a/internal/federation/dereferencing/instance_test.go b/internal/federation/dereferencing/instance_test.go new file mode 100644 index 000000000..15f075479 --- /dev/null +++ b/internal/federation/dereferencing/instance_test.go @@ -0,0 +1,94 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +package dereferencing_test + +import ( + "context" + "net/url" + "testing" + + "github.com/stretchr/testify/suite" + "github.com/superseriousbusiness/gotosocial/internal/gtscontext" + "github.com/superseriousbusiness/gotosocial/testrig" +) + +type InstanceTestSuite struct { + DereferencerStandardTestSuite +} + +func (suite *InstanceTestSuite) TestDerefInstance() { + type testCase struct { + instanceIRI *url.URL + expectedSoftware string + } + + for _, tc := range []testCase{ + { + // Fossbros anonymous doesn't shield their nodeinfo or + // well-known or anything so we should be able to fetch. + instanceIRI: testrig.URLMustParse("https://fossbros-anonymous.io"), + expectedSoftware: "Hellsoft 6.6.6", + }, + { + // Furtive nerds forbids /nodeinfo using + // robots.txt so we should get bare minimum only. + // + // Debug-level logs should show something like: + // + // - "can't fetch /nodeinfo/2.1: robots.txt disallows it" + instanceIRI: testrig.URLMustParse("https://furtive-nerds.example.org"), + expectedSoftware: "", + }, + { + // Robotic furtive nerds forbids *everything* using + // robots.txt so we should get bare minimum only. 
+ // + // Debug-level logs should show something like: + // + // - "can't fetch api/v1/instance: robots.txt disallows it" + // - "can't fetch .well-known/nodeinfo: robots.txt disallows it" + instanceIRI: testrig.URLMustParse("https://robotic.furtive-nerds.example.org"), + expectedSoftware: "", + }, + { + // Really furtive nerds forbids .well-known/nodeinfo using + // X-Robots-Tag headers, so we should get bare minimum only. + // + // Debug-level logs should show something like: + // + // - "can't use fetched .well-known/nodeinfo: robots tags disallows it" + instanceIRI: testrig.URLMustParse("https://really.furtive-nerds.example.org"), + expectedSoftware: "", + }, + } { + instance, err := suite.dereferencer.GetRemoteInstance( + gtscontext.SetFastFail(context.Background()), + suite.testAccounts["admin_account"].Username, + tc.instanceIRI, + ) + if err != nil { + suite.FailNow(err.Error()) + } + + suite.Equal(tc.expectedSoftware, instance.Version) + } +} + +func TestInstanceTestSuite(t *testing.T) { + suite.Run(t, new(InstanceTestSuite)) +} diff --git a/internal/transport/derefinstance.go b/internal/transport/derefinstance.go index bbeb51000..e7971093d 100644 --- a/internal/transport/derefinstance.go +++ b/internal/transport/derefinstance.go @@ -25,6 +25,7 @@ "io" "net/http" "net/url" + "slices" "strings" apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model" @@ -35,18 +36,29 @@ "github.com/superseriousbusiness/gotosocial/internal/log" "github.com/superseriousbusiness/gotosocial/internal/util" "github.com/superseriousbusiness/gotosocial/internal/validate" + "github.com/temoto/robotstxt" ) func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gtsmodel.Instance, error) { + // Try to fetch robots.txt to check + // if we're allowed to try endpoints: + // + // - /api/v1/instance + // - /.well-known/nodeinfo + // - /nodeinfo/2.0|2.1 endpoints + robotsTxt, err := t.DereferenceRobots(ctx, iri.Scheme, iri.Host) + if err != nil { + log.Debugf(ctx, "couldn't fetch robots.txt from %s: %v", iri.Host, err) + } + var i *gtsmodel.Instance - var err error // First try to dereference using /api/v1/instance. // This will provide the most complete picture of an instance, and avoid unnecessary api calls. // // This will only work with Mastodon-api compatible instances: Mastodon, some Pleroma instances, GoToSocial. log.Debugf(ctx, "trying to dereference instance %s by /api/v1/instance", iri.Host) - i, err = dereferenceByAPIV1Instance(ctx, t, iri) + i, err = t.dereferenceByAPIV1Instance(ctx, iri, robotsTxt) if err == nil { log.Debugf(ctx, "successfully dereferenced instance using /api/v1/instance") return i, nil @@ -56,7 +68,7 @@ func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gts // If that doesn't work, try to dereference using /.well-known/nodeinfo. // This will involve two API calls and return less info overall, but should be more widely compatible.
log.Debugf(ctx, "trying to dereference instance %s by /.well-known/nodeinfo", iri.Host) - i, err = dereferenceByNodeInfo(ctx, t, iri) + i, err = t.dereferenceByNodeInfo(ctx, iri, robotsTxt) if err == nil { log.Debugf(ctx, "successfully dereferenced instance using /.well-known/nodeinfo") return i, nil @@ -77,11 +89,23 @@ func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gts }, nil } -func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL) (*gtsmodel.Instance, error) { +func (t *transport) dereferenceByAPIV1Instance( + ctx context.Context, + iri *url.URL, + robotsTxt *robotstxt.RobotsData, +) (*gtsmodel.Instance, error) { + const path = "api/v1/instance" + + // Bail if we're not allowed to fetch this endpoint. + if robotsTxt != nil && !robotsTxt.TestAgent("/"+path, t.controller.userAgent) { + err := gtserror.Newf("can't fetch %s: robots.txt disallows it", path) + return nil, gtserror.SetNotPermitted(err) + } + cleanIRI := &url.URL{ Scheme: iri.Scheme, Host: iri.Host, - Path: "api/v1/instance", + Path: path, } // Build IRI just once @@ -105,6 +129,18 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL) return nil, gtserror.NewFromResponse(resp) } + // Ensure that we can use data returned from this endpoint. + robots := resp.Header.Values("X-Robots-Tag") + if slices.ContainsFunc( + robots, + func(key string) bool { + return strings.Contains(key, "noindex") + }, + ) { + err := gtserror.Newf("can't use fetched %s: robots tags disallows it", path) + return nil, gtserror.SetNotPermitted(err) + } + // Ensure that the incoming request content-type is expected. if ct := resp.Header.Get("Content-Type"); !apiutil.JSONContentType(ct) { err := gtserror.Newf("non json response type: %s", ct) @@ -118,7 +154,8 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL) return nil, errors.New("response bytes was len 0") } - // try to parse the returned bytes directly into an Instance model + // Try to parse the returned bytes + // directly into an Instance model. apiResp := &apimodel.InstanceV1{} if err := json.Unmarshal(b, apiResp); err != nil { return nil, err @@ -149,24 +186,32 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL) return i, nil } -func dereferenceByNodeInfo(c context.Context, t *transport, iri *url.URL) (*gtsmodel.Instance, error) { - niIRI, err := callNodeInfoWellKnown(c, t, iri) +func (t *transport) dereferenceByNodeInfo( + ctx context.Context, + iri *url.URL, + robotsTxt *robotstxt.RobotsData, +) (*gtsmodel.Instance, error) { + // Retrieve the nodeinfo IRI from .well-known/nodeinfo. + niIRI, err := t.callNodeInfoWellKnown(ctx, iri, robotsTxt) if err != nil { - return nil, fmt.Errorf("dereferenceByNodeInfo: error during initial call to well-known nodeinfo: %s", err) + return nil, gtserror.Newf("error during initial call to .well-known: %w", err) } - ni, err := callNodeInfo(c, t, niIRI) + // Use the returned nodeinfo IRI to make a followup call. + ni, err := t.callNodeInfo(ctx, niIRI, robotsTxt) if err != nil { - return nil, fmt.Errorf("dereferenceByNodeInfo: error doing second call to nodeinfo uri %s: %s", niIRI.String(), err) + return nil, gtserror.Newf("error during call to %s: %w", niIRI.String(), err) } - // we got a response of some kind! take what we can from it... + // We got a response of some kind! + // + // Start building out the bare minimum + // instance model, we'll add to it if we can. 
id, err := id.NewRandomULID() if err != nil { - return nil, fmt.Errorf("dereferenceByNodeInfo: error creating new id for instance %s: %s", iri.Host, err) + return nil, gtserror.Newf("error creating new id for instance %s: %w", iri.Host, err) } - // this is the bare minimum instance we'll return, and we'll add more stuff to it if we can i := &gtsmodel.Instance{ ID: id, Domain: iri.Host, @@ -234,11 +279,23 @@ func dereferenceByNodeInfo(c context.Context, t *transport, iri *url.URL) (*gtsm return i, nil } -func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*url.URL, error) { +func (t *transport) callNodeInfoWellKnown( + ctx context.Context, + iri *url.URL, + robotsTxt *robotstxt.RobotsData, +) (*url.URL, error) { + const path = ".well-known/nodeinfo" + + // Bail if we're not allowed to fetch this endpoint. + if robotsTxt != nil && !robotsTxt.TestAgent("/"+path, t.controller.userAgent) { + err := gtserror.Newf("can't fetch %s: robots.txt disallows it", path) + return nil, gtserror.SetNotPermitted(err) + } + cleanIRI := &url.URL{ Scheme: iri.Scheme, Host: iri.Host, - Path: ".well-known/nodeinfo", + Path: path, } // Build IRI just once @@ -261,7 +318,19 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur return nil, gtserror.NewFromResponse(resp) } - // Ensure that the incoming request content-type is expected. + // Ensure that we can use data returned from this endpoint. + robots := resp.Header.Values("X-Robots-Tag") + if slices.ContainsFunc( + robots, + func(key string) bool { + return strings.Contains(key, "noindex") + }, + ) { + err := gtserror.Newf("can't use fetched %s: robots tags disallows it", path) + return nil, gtserror.SetNotPermitted(err) + } + + // Ensure that the returned content-type is expected. if ct := resp.Header.Get("Content-Type"); !apiutil.JSONContentType(ct) { err := gtserror.Newf("non json response type: %s", ct) return nil, gtserror.SetMalformed(err) } @@ -279,7 +348,8 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur return nil, gtserror.Newf("could not unmarshal server response as WellKnownResponse: %w", err) } - // look through the links for the first one that matches the nodeinfo schema, this is what we need + // Look through the links for the first one that + // matches nodeinfo schema, this is what we need. var nodeinfoHref *url.URL for _, l := range wellKnownResp.Links { if l.Href == "" || !strings.HasPrefix(l.Rel, "http://nodeinfo.diaspora.software/ns/schema/2") { @@ -297,7 +367,23 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur return nodeinfoHref, nil } -func callNodeInfo(ctx context.Context, t *transport, iri *url.URL) (*apimodel.Nodeinfo, error) { +func (t *transport) callNodeInfo( + ctx context.Context, + iri *url.URL, + robotsTxt *robotstxt.RobotsData, +) (*apimodel.Nodeinfo, error) { + // Normalize robots.txt test path. + testPath := iri.Path + if !strings.HasPrefix(testPath, "/") { + testPath = "/" + testPath + } + + // Bail if we're not allowed to fetch this endpoint. + if robotsTxt != nil && !robotsTxt.TestAgent(testPath, t.controller.userAgent) { + err := gtserror.Newf("can't fetch %s: robots.txt disallows it", testPath) + return nil, gtserror.SetNotPermitted(err) + } + // Build IRI just once iriStr := iri.String() @@ -324,6 +410,18 @@ func callNodeInfo(ctx context.Context, t *transport, iri *url.URL) (*apimodel.No return nil, gtserror.SetMalformed(err) } + // Ensure that we can use data returned from this endpoint.
+ robots := resp.Header.Values("X-Robots-Tag") + if slices.ContainsFunc( + robots, + func(key string) bool { + return strings.Contains(key, "noindex") + }, + ) { + err := gtserror.Newf("can't use fetched %s: robots tags disallows it", iri.Path) + return nil, gtserror.SetNotPermitted(err) + } + b, err := io.ReadAll(resp.Body) if err != nil { return nil, err diff --git a/internal/transport/derefrobots.go b/internal/transport/derefrobots.go new file mode 100644 index 000000000..d6c4f3058 --- /dev/null +++ b/internal/transport/derefrobots.go @@ -0,0 +1,91 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +package transport + +import ( + "context" + "net/http" + "net/url" + + "codeberg.org/gruf/go-bytesize" + "codeberg.org/gruf/go-iotools" + apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util" + "github.com/superseriousbusiness/gotosocial/internal/gtserror" + "github.com/temoto/robotstxt" +) + +func (t *transport) DereferenceRobots(ctx context.Context, protocol string, host string) (*robotstxt.RobotsData, error) { + robotsIRI := &url.URL{ + Scheme: protocol, + Host: host, + Path: "robots.txt", + } + + // Build IRI just once + iriStr := robotsIRI.String() + + // Prepare new HTTP request to endpoint + req, err := http.NewRequestWithContext(ctx, "GET", iriStr, nil) + if err != nil { + return nil, err + } + + // We want text/plain utf-8 encoding. + // + // https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method + req.Header.Add("Accept", apiutil.TextPlain) + req.Header.Add("Accept-Charset", apiutil.UTF8) + + // Perform the HTTP request + rsp, err := t.GET(req) + if err != nil { + return nil, err + } + + // Ensure a non-error status response. + if rsp.StatusCode != http.StatusOK { + err := gtserror.NewFromResponse(rsp) + _ = rsp.Body.Close() // close early. + return nil, err + } + + // Ensure that the incoming request content-type is expected. + if ct := rsp.Header.Get("Content-Type"); !apiutil.TextPlainContentType(ct) { + err := gtserror.Newf("non text/plain response: %s", ct) + _ = rsp.Body.Close() // close early. + return nil, gtserror.SetMalformed(err) + } + + // Limit the robots.txt size to 500KiB + // + // https://www.rfc-editor.org/rfc/rfc9309.html#name-limits + const maxsz = int64(500 * bytesize.KiB) + + // Check body claims to be within size limit. + if rsp.ContentLength > maxsz { + _ = rsp.Body.Close() // close early. + sz := bytesize.Size(maxsz) //nolint:gosec + return nil, gtserror.Newf("robots.txt body exceeds max size %s", sz) + } + + // Update response body with maximum size. 
+ rsp.Body, _, _ = iotools.UpdateReadCloserLimit(rsp.Body, maxsz) + defer rsp.Body.Close() + + return robotstxt.FromResponse(rsp) +} diff --git a/testrig/transportcontroller.go b/testrig/transportcontroller.go index b886e5c40..00f8ad2a6 100644 --- a/testrig/transportcontroller.go +++ b/testrig/transportcontroller.go @@ -133,6 +133,12 @@ func NewMockHTTPClient(do func(req *http.Request) (*http.Response, error), relat responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = WebfingerResponse(req) } else if strings.Contains(reqURLString, ".well-known/host-meta") { responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = HostMetaResponse(req) + } else if strings.Contains(reqURLString, ".well-known/nodeinfo") { + responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = WellKnownNodeInfoResponse(req) + } else if strings.Contains(reqURLString, "/robots.txt") { + responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = RobotsTxtResponse(req) + } else if strings.Contains(reqURLString, "/nodeinfo/2.1") { + responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = NodeInfoResponse(req) } else if strings.Contains(reqURLString, "lists.example.org") { responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = DomainPermissionSubscriptionResponse(req) } else if note, ok := mockHTTPClient.TestRemoteStatuses[reqURLString]; ok { @@ -318,6 +324,162 @@ func HostMetaResponse(req *http.Request) ( return } +func WellKnownNodeInfoResponse(req *http.Request) ( + responseCode int, + responseBytes []byte, + responseContentType string, + responseContentLength int, + extraHeaders map[string]string, +) { + var wkr *apimodel.WellKnownResponse + + switch req.URL.String() { + case "https://fossbros-anonymous.io/.well-known/nodeinfo": + wkr = &apimodel.WellKnownResponse{ + Links: []apimodel.Link{ + { + Rel: "http://nodeinfo.diaspora.software/ns/schema/2.1", + Href: "https://fossbros-anonymous.io/nodeinfo/2.1", + }, + }, + } + case "https://furtive-nerds.example.org/.well-known/nodeinfo": + wkr = &apimodel.WellKnownResponse{ + Links: []apimodel.Link{ + { + Rel: "http://nodeinfo.diaspora.software/ns/schema/2.1", + Href: "https://furtive-nerds.example.org/nodeinfo/2.1", + }, + }, + } + case "https://really.furtive-nerds.example.org/.well-known/nodeinfo": + wkr = &apimodel.WellKnownResponse{ + Links: []apimodel.Link{ + { + Rel: "http://nodeinfo.diaspora.software/ns/schema/2.1", + Href: "https://really.furtive-nerds.example.org/nodeinfo/2.1", + }, + }, + } + extraHeaders = map[string]string{"X-Robots-Tag": "noindex,nofollow"} + default: + log.Debugf(nil, "nodeinfo response not available for %s", req.URL) + responseCode = http.StatusNotFound + responseBytes = []byte(``) + responseContentType = "application/json" + responseContentLength = len(responseBytes) + return + } + + niJSON, err := json.Marshal(wkr) + if err != nil { + panic(err) + } + responseCode = http.StatusOK + responseBytes = niJSON + responseContentType = "application/json" + responseContentLength = len(niJSON) + + return +} + +func NodeInfoResponse(req *http.Request) ( + responseCode int, + responseBytes []byte, + responseContentType string, + responseContentLength int, + extraHeaders map[string]string, +) { + var ni *apimodel.Nodeinfo + + switch req.URL.String() { + case "https://fossbros-anonymous.io/nodeinfo/2.1": + ni = &apimodel.Nodeinfo{ + Version: "2.1", + Software: 
apimodel.NodeInfoSoftware{ + Name: "Hellsoft", + Version: "6.6.6", + Repository: "https://forge.hellsoft.fossbros-anonymous.io", + Homepage: "https://hellsoft.fossbros-anonymous.io", + }, + Protocols: []string{"activitypub"}, + } + case "https://furtive-nerds.example.org/nodeinfo/2.1": + ni = &apimodel.Nodeinfo{ + Version: "2.1", + Software: apimodel.NodeInfoSoftware{ + Name: "GoToSocial", + Version: "1.3.1.2", + Repository: "https://github.com/superseriousbusiness/gotosocial", + Homepage: "https://docs.gotosocial.org", + }, + Protocols: []string{"activitypub"}, + } + case "https://really.furtive-nerds.example.org/nodeinfo/2.1": + ni = &apimodel.Nodeinfo{ + Version: "2.1", + Software: apimodel.NodeInfoSoftware{ + Name: "GoToSocial", + Version: "1.3.1.2", + Repository: "https://github.com/superseriousbusiness/gotosocial", + Homepage: "https://docs.gotosocial.org", + }, + Protocols: []string{"activitypub"}, + } + default: + log.Debugf(nil, "nodeinfo response not available for %s", req.URL) + responseCode = http.StatusNotFound + responseBytes = []byte(``) + responseContentType = "application/json" + responseContentLength = len(responseBytes) + return + } + + niJSON, err := json.Marshal(ni) + if err != nil { + panic(err) + } + responseCode = http.StatusOK + responseBytes = niJSON + responseContentType = "application/json" + responseContentLength = len(niJSON) + + return +} + +func RobotsTxtResponse(req *http.Request) ( + responseCode int, + responseBytes []byte, + responseContentType string, + responseContentLength int, + extraHeaders map[string]string, +) { + var robots string + + switch req.URL.String() { + case "https://furtive-nerds.example.org/robots.txt": + // Disallow nodeinfo. + robots = "User-agent: *\nDisallow: /nodeinfo" + case "https://robotic.furtive-nerds.example.org/robots.txt": + // Disallow everything. 
+ robots = "User-agent: *\nDisallow: /" + default: + log.Debugf(nil, "robots response not available for %s", req.URL) + responseCode = http.StatusNotFound + responseBytes = []byte(``) + responseContentType = "text/plain" + responseContentLength = len(responseBytes) + return + } + + responseCode = http.StatusOK + responseBytes = []byte(robots) + responseContentType = "text/plain" + responseContentLength = len(responseBytes) + + return +} + func WebfingerResponse(req *http.Request) ( responseCode int, responseBytes []byte, diff --git a/vendor/github.com/temoto/robotstxt/.gitignore b/vendor/github.com/temoto/robotstxt/.gitignore new file mode 100644 index 000000000..6205f9eae --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/.gitignore @@ -0,0 +1,15 @@ +*.cgo?.* +*.o +*.so +*.sublime-* +*.zip +.DS_Store +.idea/ +.tags* +_cgo_* +_gofuzz/crashers/ +_gofuzz/suppressions/ +_obj +_test +coverage.txt +robots.txt-check/robots.txt-check diff --git a/vendor/github.com/temoto/robotstxt/.golangci.yml b/vendor/github.com/temoto/robotstxt/.golangci.yml new file mode 100644 index 000000000..24e5858fa --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/.golangci.yml @@ -0,0 +1,20 @@ +linters: + enable: + - goconst + - gofmt + - gosec + - maligned + - prealloc + - staticcheck + disable: + - deadcode + - structcheck + - varcheck + +linters-settings: + gofmt: + simplify: true + govet: + check-shadowing: true + maligned: + suggest-new: true diff --git a/vendor/github.com/temoto/robotstxt/.travis.yml b/vendor/github.com/temoto/robotstxt/.travis.yml new file mode 100644 index 000000000..ad90dac37 --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/.travis.yml @@ -0,0 +1,30 @@ +cache: + go: true + directories: + - $HOME/.cache + - $HOME/bin + - $HOME/gopath/pkg/mod +language: go +go: +- 1.11 +- 1.12 +- 1.13 +- 1.14 +- 1.x +- master +install: true +script: GO111MODULE=on go test -race + +matrix: + include: + - go: 1.x + env: task=coverage + script: GO111MODULE=on go test -race -covermode=atomic -coverprofile=coverage.txt + after_success: bash <(curl -s https://codecov.io/bash) + - go: 1.x + env: task=bench + script: GO111MODULE=on ./script/bench + - go: 1.x + install: curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | bash -s -- -b $HOME/bin v1.19.1 + env: task=clean + script: GO111MODULE=on ./script/clean diff --git a/vendor/github.com/temoto/robotstxt/LICENSE b/vendor/github.com/temoto/robotstxt/LICENSE new file mode 100644 index 000000000..c125145b6 --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/LICENSE @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) 2010 Sergey Shepelev + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/vendor/github.com/temoto/robotstxt/README.rst b/vendor/github.com/temoto/robotstxt/README.rst new file mode 100644 index 000000000..92f1ae161 --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/README.rst @@ -0,0 +1,115 @@ +What +==== + +This is a robots.txt exclusion protocol implementation for Go language (golang). + + +Build +===== + +To build and run tests run `go test` in source directory. + + +Contribute +========== + +Warm welcome. + +* If desired, add your name in README.rst, section Who. +* Run `script/test && script/clean && echo ok` +* You can ignore linter warnings, but everything else must pass. +* Send your change as pull request or just a regular patch to current maintainer (see section Who). + +Thank you. + + +Usage +===== + +As usual, no special installation is required, just + + import "github.com/temoto/robotstxt" + +run `go get` and you're ready. + +1. Parse +^^^^^^^^ + +First of all, you need to parse robots.txt data. You can do it with +functions `FromBytes(body []byte) (*RobotsData, error)` or same for `string`:: + + robots, err := robotstxt.FromBytes([]byte("User-agent: *\nDisallow:")) + robots, err := robotstxt.FromString("User-agent: *\nDisallow:") + +As of 2012-10-03, `FromBytes` is the most efficient method, everything else +is a wrapper for this core function. + +There are few convenient constructors for various purposes: + +* `FromResponse(*http.Response) (*RobotsData, error)` to init robots data +from HTTP response. It *does not* call `response.Body.Close()`:: + + robots, err := robotstxt.FromResponse(resp) + resp.Body.Close() + if err != nil { + log.Println("Error parsing robots.txt:", err.Error()) + } + +* `FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error)` or +`FromStatusAndString` if you prefer to read bytes (string) yourself. +Passing status code applies following logic in line with Google's interpretation +of robots.txt files: + + * status 2xx -> parse body with `FromBytes` and apply rules listed there. + * status 4xx -> allow all (even 401/403, as recommended by Google). + * other (5xx) -> disallow all, consider this a temporary unavailability. + +2. Query +^^^^^^^^ + +Parsing robots.txt content builds a kind of logic database, which you can +query with `(r *RobotsData) TestAgent(url, agent string) (bool)`. + +Explicit passing of agent is useful if you want to query for different agents. For +single agent users there is an efficient option: `RobotsData.FindGroup(userAgent string)` +returns a structure with `.Test(path string)` method and `.CrawlDelay time.Duration`. + +Simple query with explicit user agent. Each call will scan all rules. + +:: + + allow := robots.TestAgent("/", "FooBot") + +Or query several paths against same user agent for performance. + +:: + + group := robots.FindGroup("BarBot") + group.Test("/") + group.Test("/download.mp3") + group.Test("/news/article-2012-1") + + +Who +=== + +Honorable contributors (in undefined order): + + * Ilya Grigorik (igrigorik) + * Martin Angers (PuerkitoBio) + * Micha Gorelick (mynameisfiber) + +Initial commit and other: Sergey Shepelev temotor@gmail.com + + +Flair +===== + +.. image:: https://travis-ci.org/temoto/robotstxt.svg?branch=master + :target: https://travis-ci.org/temoto/robotstxt + +.. 
image:: https://codecov.io/gh/temoto/robotstxt/branch/master/graph/badge.svg + :target: https://codecov.io/gh/temoto/robotstxt + +.. image:: https://goreportcard.com/badge/github.com/temoto/robotstxt + :target: https://goreportcard.com/report/github.com/temoto/robotstxt diff --git a/vendor/github.com/temoto/robotstxt/codecov.yml b/vendor/github.com/temoto/robotstxt/codecov.yml new file mode 100644 index 000000000..b80be28f6 --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/codecov.yml @@ -0,0 +1,2 @@ +codecov: + token: 6bf9c7eb-69ff-4b74-8464-e2fb452d0f04 diff --git a/vendor/github.com/temoto/robotstxt/fuzz.go b/vendor/github.com/temoto/robotstxt/fuzz.go new file mode 100644 index 000000000..de4b0587a --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/fuzz.go @@ -0,0 +1,29 @@ +// +build gofuzz + +package robotstxt + +import "testing/quick" + +func Fuzz(data []byte) int { + r, err := FromBytes(data) + if err != nil { + if r != nil { + panic("r != nil on error") + } + return 0 + } + + // FindGroup must never return nil + f1 := func(agent string) bool { return r.FindGroup(agent) != nil } + if err := quick.Check(f1, nil); err != nil { + panic(err) + } + + // just check TestAgent doesn't panic + f2 := func(path, agent string) bool { r.TestAgent(path, agent); return true } + if err := quick.Check(f2, nil); err != nil { + panic(err) + } + + return 1 +} diff --git a/vendor/github.com/temoto/robotstxt/parser.go b/vendor/github.com/temoto/robotstxt/parser.go new file mode 100644 index 000000000..46eb6b184 --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/parser.go @@ -0,0 +1,271 @@ +package robotstxt + +// Comments explaining the logic are taken from either the google's spec: +// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt +// +// or the Wikipedia's entry on robots.txt: +// http://en.wikipedia.org/wiki/Robots.txt + +import ( + "fmt" + "io" + "math" + "regexp" + "strconv" + "strings" + "time" +) + +type lineType uint + +const ( + lIgnore lineType = iota + lUnknown + lUserAgent + lAllow + lDisallow + lCrawlDelay + lSitemap + lHost +) + +type parser struct { + tokens []string + pos int +} + +type lineInfo struct { + t lineType // Type of line key + k string // String representation of the type of key + vs string // String value of the key + vf float64 // Float value of the key + vr *regexp.Regexp // Regexp value of the key +} + +func newParser(tokens []string) *parser { + return &parser{tokens: tokens} +} + +func parseGroupMap(groups map[string]*Group, agents []string, fun func(*Group)) { + var g *Group + for _, a := range agents { + if g = groups[a]; g == nil { + g = new(Group) + groups[a] = g + } + fun(g) + } +} + +func (p *parser) parseAll() (groups map[string]*Group, host string, sitemaps []string, errs []error) { + groups = make(map[string]*Group, 16) + agents := make([]string, 0, 4) + isEmptyGroup := true + + // Reset internal fields, tokens are assigned at creation time, never change + p.pos = 0 + + for { + if li, err := p.parseLine(); err != nil { + if err == io.EOF { + break + } + errs = append(errs, err) + } else { + switch li.t { + case lUserAgent: + // Two successive user-agent lines are part of the same group. 
+ if !isEmptyGroup { + // End previous group + agents = make([]string, 0, 4) + } + if len(agents) == 0 { + isEmptyGroup = true + } + agents = append(agents, li.vs) + + case lDisallow: + // Error if no current group + if len(agents) == 0 { + errs = append(errs, fmt.Errorf("Disallow before User-agent at token #%d.", p.pos)) + } else { + isEmptyGroup = false + var r *rule + if li.vr != nil { + r = &rule{"", false, li.vr} + } else { + r = &rule{li.vs, false, nil} + } + parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) }) + } + + case lAllow: + // Error if no current group + if len(agents) == 0 { + errs = append(errs, fmt.Errorf("Allow before User-agent at token #%d.", p.pos)) + } else { + isEmptyGroup = false + var r *rule + if li.vr != nil { + r = &rule{"", true, li.vr} + } else { + r = &rule{li.vs, true, nil} + } + parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) }) + } + + case lHost: + host = li.vs + + case lSitemap: + sitemaps = append(sitemaps, li.vs) + + case lCrawlDelay: + if len(agents) == 0 { + errs = append(errs, fmt.Errorf("Crawl-delay before User-agent at token #%d.", p.pos)) + } else { + isEmptyGroup = false + delay := time.Duration(li.vf * float64(time.Second)) + parseGroupMap(groups, agents, func(g *Group) { g.CrawlDelay = delay }) + } + } + } + } + return +} + +func (p *parser) parseLine() (li *lineInfo, err error) { + t1, ok1 := p.popToken() + if !ok1 { + // proper EOF + return nil, io.EOF + } + + t2, ok2 := p.peekToken() + if !ok2 { + // EOF, no value associated with the token, so ignore token and return + return nil, io.EOF + } + + // Helper closure for all string-based tokens, common behaviour: + // - Consume t2 token + // - If empty, return unknown line info + // - Otherwise return the specified line info + returnStringVal := func(t lineType) (*lineInfo, error) { + p.popToken() + if t2 != "" { + return &lineInfo{t: t, k: t1, vs: t2}, nil + } + return &lineInfo{t: lIgnore}, nil + } + + // Helper closure for all path tokens (allow/disallow), common behaviour: + // - Consume t2 token + // - If empty, return unknown line info + // - Otherwise, normalize the path (add leading "/" if missing, remove trailing "*") + // - Detect if wildcards are present, if so, compile into a regexp + // - Return the specified line info + returnPathVal := func(t lineType) (*lineInfo, error) { + p.popToken() + if t2 != "" { + if !strings.HasPrefix(t2, "*") && !strings.HasPrefix(t2, "/") { + t2 = "/" + t2 + } + t2 = strings.TrimRightFunc(t2, isAsterisk) + // From google's spec: + // Google, Bing, Yahoo, and Ask support a limited form of + // "wildcards" for path values. These are: + // * designates 0 or more instances of any valid character + // $ designates the end of the URL + if strings.ContainsAny(t2, "*$") { + // Must compile a regexp, this is a pattern. + // Escape string before compile. 
+ t2 = regexp.QuoteMeta(t2) + t2 = strings.Replace(t2, `\*`, `.*`, -1) + t2 = strings.Replace(t2, `\$`, `$`, -1) + if r, e := regexp.Compile(t2); e != nil { + return nil, e + } else { + return &lineInfo{t: t, k: t1, vr: r}, nil + } + } else { + // Simple string path + return &lineInfo{t: t, k: t1, vs: t2}, nil + } + } + return &lineInfo{t: lIgnore}, nil + } + + switch strings.ToLower(t1) { + case tokEOL: + // Don't consume t2 and continue parsing + return &lineInfo{t: lIgnore}, nil + + case "user-agent", "useragent": + // From google's spec: + // Handling of elements with simple errors / typos (eg "useragent" + // instead of "user-agent") is undefined and may be interpreted as correct + // directives by some user-agents. + // The user-agent is non-case-sensitive. + t2 = strings.ToLower(t2) + return returnStringVal(lUserAgent) + + case "disallow": + // From google's spec: + // When no path is specified, the directive is ignored (so an empty Disallow + // CAN be an allow, since allow is the default. The actual result depends + // on the other rules in the group). + return returnPathVal(lDisallow) + + case "allow": + // From google's spec: + // When no path is specified, the directive is ignored. + return returnPathVal(lAllow) + + case "host": + // Host directive to specify main site mirror + // Read more: https://help.yandex.com/webmaster/controlling-robot/robots-txt.xml#host + return returnStringVal(lHost) + + case "sitemap": + // Non-group field, applies to the host as a whole, not to a specific user-agent + return returnStringVal(lSitemap) + + case "crawl-delay", "crawldelay": + // From http://en.wikipedia.org/wiki/Robots_exclusion_standard#Nonstandard_extensions + // Several major crawlers support a Crawl-delay parameter, set to the + // number of seconds to wait between successive requests to the same server. + p.popToken() + if cd, e := strconv.ParseFloat(t2, 64); e != nil { + return nil, e + } else if cd < 0 || math.IsInf(cd, 0) || math.IsNaN(cd) { + return nil, fmt.Errorf("Crawl-delay invalid value '%s'", t2) + } else { + return &lineInfo{t: lCrawlDelay, k: t1, vf: cd}, nil + } + } + + // Consume t2 token + p.popToken() + return &lineInfo{t: lUnknown, k: t1}, nil +} + +func (p *parser) popToken() (tok string, ok bool) { + tok, ok = p.peekToken() + if !ok { + return + } + p.pos++ + return tok, true +} + +func (p *parser) peekToken() (tok string, ok bool) { + if p.pos >= len(p.tokens) { + return "", false + } + return p.tokens[p.pos], true +} + +func isAsterisk(r rune) bool { + return r == '*' +} diff --git a/vendor/github.com/temoto/robotstxt/robotstxt.go b/vendor/github.com/temoto/robotstxt/robotstxt.go new file mode 100644 index 000000000..52d3637c6 --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/robotstxt.go @@ -0,0 +1,227 @@ +// Package robotstxt implements the robots.txt Exclusion Protocol +// as specified in http://www.robotstxt.org/wc/robots.html +// with various extensions. 
+package robotstxt + +// Comments explaining the logic are taken from either the Google's spec: +// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt + +import ( + "bytes" + "errors" + "io/ioutil" + "net/http" + "regexp" + "strconv" + "strings" + "time" +) + +type RobotsData struct { + // private + groups map[string]*Group + allowAll bool + disallowAll bool + Host string + Sitemaps []string +} + +type Group struct { + rules []*rule + Agent string + CrawlDelay time.Duration +} + +type rule struct { + path string + allow bool + pattern *regexp.Regexp +} + +type ParseError struct { + Errs []error +} + +func newParseError(errs []error) *ParseError { + return &ParseError{errs} +} + +func (e ParseError) Error() string { + var b bytes.Buffer + + b.WriteString("Parse error(s): " + "\n") + for _, er := range e.Errs { + b.WriteString(er.Error() + "\n") + } + return b.String() +} + +var allowAll = &RobotsData{allowAll: true} +var disallowAll = &RobotsData{disallowAll: true} +var emptyGroup = &Group{} + +func FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error) { + switch { + case statusCode >= 200 && statusCode < 300: + return FromBytes(body) + + // From https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt + // + // Google treats all 4xx errors in the same way and assumes that no valid + // robots.txt file exists. It is assumed that there are no restrictions. + // This is a "full allow" for crawling. Note: this includes 401 + // "Unauthorized" and 403 "Forbidden" HTTP result codes. + case statusCode >= 400 && statusCode < 500: + return allowAll, nil + + // From Google's spec: + // Server errors (5xx) are seen as temporary errors that result in a "full + // disallow" of crawling. + case statusCode >= 500 && statusCode < 600: + return disallowAll, nil + } + + return nil, errors.New("Unexpected status: " + strconv.Itoa(statusCode)) +} + +func FromStatusAndString(statusCode int, body string) (*RobotsData, error) { + return FromStatusAndBytes(statusCode, []byte(body)) +} + +func FromResponse(res *http.Response) (*RobotsData, error) { + if res == nil { + // Edge case, if res is nil, return nil data + return nil, nil + } + buf, e := ioutil.ReadAll(res.Body) + if e != nil { + return nil, e + } + return FromStatusAndBytes(res.StatusCode, buf) +} + +func FromBytes(body []byte) (r *RobotsData, err error) { + var errs []error + + // special case (probably not worth optimization?) + trimmed := bytes.TrimSpace(body) + if len(trimmed) == 0 { + return allowAll, nil + } + + sc := newByteScanner("bytes", true) + //sc.Quiet = !print_errors + sc.feed(body, true) + tokens := sc.scanAll() + + // special case worth optimization + if len(tokens) == 0 { + return allowAll, nil + } + + r = &RobotsData{} + parser := newParser(tokens) + r.groups, r.Host, r.Sitemaps, errs = parser.parseAll() + if len(errs) > 0 { + return nil, newParseError(errs) + } + + return r, nil +} + +func FromString(body string) (r *RobotsData, err error) { + return FromBytes([]byte(body)) +} + +func (r *RobotsData) TestAgent(path, agent string) bool { + if r.allowAll { + return true + } + if r.disallowAll { + return false + } + + // Find a group of rules that applies to this agent + // From Google's spec: + // The user-agent is non-case-sensitive. + g := r.FindGroup(agent) + return g.Test(path) +} + +// FindGroup searches block of declarations for specified user-agent. +// From Google's spec: +// Only one group of group-member records is valid for a particular crawler. 
+// The crawler must determine the correct group of records by finding the group +// with the most specific user-agent that still matches. All other groups of +// records are ignored by the crawler. The user-agent is non-case-sensitive. +// The order of the groups within the robots.txt file is irrelevant. +func (r *RobotsData) FindGroup(agent string) (ret *Group) { + var prefixLen int + + agent = strings.ToLower(agent) + if ret = r.groups["*"]; ret != nil { + // Weakest match possible + prefixLen = 1 + } + for a, g := range r.groups { + if a != "*" && strings.HasPrefix(agent, a) { + if l := len(a); l > prefixLen { + prefixLen = l + ret = g + } + } + } + + if ret == nil { + return emptyGroup + } + return +} + +func (g *Group) Test(path string) bool { + if r := g.findRule(path); r != nil { + return r.allow + } + + // From Google's spec: + // By default, there are no restrictions for crawling for the designated crawlers. + return true +} + +// From Google's spec: +// The path value is used as a basis to determine whether or not a rule applies +// to a specific URL on a site. With the exception of wildcards, the path is +// used to match the beginning of a URL (and any valid URLs that start with the +// same path). +// +// At a group-member level, in particular for allow and disallow directives, +// the most specific rule based on the length of the [path] entry will trump +// the less specific (shorter) rule. The order of precedence for rules with +// wildcards is undefined. +func (g *Group) findRule(path string) (ret *rule) { + var prefixLen int + + for _, r := range g.rules { + if r.pattern != nil { + if r.pattern.MatchString(path) { + // Consider this a match equal to the length of the pattern. + // From Google's spec: + // The order of precedence for rules with wildcards is undefined. + if l := len(r.pattern.String()); l > prefixLen { + prefixLen = l + ret = r + } + } + } else if r.path == "/" && prefixLen == 0 { + // Weakest match possible + prefixLen = 1 + ret = r + } else if strings.HasPrefix(path, r.path) { + if l := len(r.path); l > prefixLen { + prefixLen = l + ret = r + } + } + } + return +} diff --git a/vendor/github.com/temoto/robotstxt/scanner.go b/vendor/github.com/temoto/robotstxt/scanner.go new file mode 100644 index 000000000..6bd98c2ec --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/scanner.go @@ -0,0 +1,185 @@ +package robotstxt + +import ( + "bytes" + "fmt" + "go/token" + "os" + "sync" + "unicode/utf8" +) + +type byteScanner struct { + pos token.Position + buf []byte + ErrorCount int + ch rune + Quiet bool + keyTokenFound bool + lastChunk bool +} + +const tokEOL = "\n" + +var WhitespaceChars = []rune{' ', '\t', '\v'} +var tokBuffers = sync.Pool{New: func() interface{} { return bytes.NewBuffer(make([]byte, 32)) }} + +func newByteScanner(srcname string, quiet bool) *byteScanner { + return &byteScanner{ + Quiet: quiet, + ch: -1, + pos: token.Position{Filename: srcname}, + } +} + +func (s *byteScanner) feed(input []byte, end bool) { + s.buf = input + s.pos.Offset = 0 + s.pos.Line = 1 + s.pos.Column = 1 + s.lastChunk = end + + // Read first char into look-ahead buffer `s.ch`. + if !s.nextChar() { + return + } + + // Skip UTF-8 byte order mark + if s.ch == 65279 { + s.nextChar() + s.pos.Column = 1 + } +} + +func (s *byteScanner) GetPosition() token.Position { + return s.pos +} + +func (s *byteScanner) scan() string { + // Note Offset > len, not >=, so we can scan last character. 
+ if s.lastChunk && s.pos.Offset > len(s.buf) { + return "" + } + + s.skipSpace() + + if s.ch == -1 { + return "" + } + + // EOL + if s.isEol() { + s.keyTokenFound = false + // skip subsequent newline chars + for s.ch != -1 && s.isEol() { + s.nextChar() + } + // emit newline as separate token + return tokEOL + } + + // skip comments + if s.ch == '#' { + s.keyTokenFound = false + s.skipUntilEol() + if s.ch == -1 { + return "" + } + // emit newline as separate token + return tokEOL + } + + // else we found something + tok := tokBuffers.Get().(*bytes.Buffer) + defer tokBuffers.Put(tok) + tok.Reset() + tok.WriteRune(s.ch) + s.nextChar() + for s.ch != -1 && !s.isSpace() && !s.isEol() { + // Do not consider ":" to be a token separator if a first key token + // has already been found on this line (avoid cutting an absolute URL + // after the "http:") + if s.ch == ':' && !s.keyTokenFound { + s.nextChar() + s.keyTokenFound = true + break + } + + tok.WriteRune(s.ch) + s.nextChar() + } + return tok.String() +} + +func (s *byteScanner) scanAll() []string { + results := make([]string, 0, 64) // random guess of average tokens length + for { + token := s.scan() + if token != "" { + results = append(results, token) + } else { + break + } + } + return results +} + +func (s *byteScanner) error(pos token.Position, msg string) { + s.ErrorCount++ + if !s.Quiet { + fmt.Fprintf(os.Stderr, "robotstxt from %s: %s\n", pos.String(), msg) + } +} + +func (s *byteScanner) isEol() bool { + return s.ch == '\n' || s.ch == '\r' +} + +func (s *byteScanner) isSpace() bool { + for _, r := range WhitespaceChars { + if s.ch == r { + return true + } + } + return false +} + +func (s *byteScanner) skipSpace() { + for s.ch != -1 && s.isSpace() { + s.nextChar() + } +} + +func (s *byteScanner) skipUntilEol() { + for s.ch != -1 && !s.isEol() { + s.nextChar() + } + // skip subsequent newline chars + for s.ch != -1 && s.isEol() { + s.nextChar() + } +} + +// Reads next Unicode char. +func (s *byteScanner) nextChar() bool { + if s.pos.Offset >= len(s.buf) { + s.ch = -1 + return false + } + s.pos.Column++ + if s.ch == '\n' { + s.pos.Line++ + s.pos.Column = 1 + } + r, w := rune(s.buf[s.pos.Offset]), 1 + if r >= 0x80 { + r, w = utf8.DecodeRune(s.buf[s.pos.Offset:]) + if r == utf8.RuneError && w == 1 { + s.error(s.pos, "illegal UTF-8 encoding") + } + } + s.pos.Column++ + s.pos.Offset += w + s.ch = r + return true +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 49ca611b2..04314f34f 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -412,6 +412,8 @@ github.com/jackc/puddle/v2/internal/genstack # github.com/jessevdk/go-flags v1.5.0 ## explicit; go 1.15 github.com/jessevdk/go-flags +# github.com/jimsmart/grobotstxt v1.0.3 +## explicit; go 1.14 # github.com/jinzhu/inflection v1.0.0 ## explicit github.com/jinzhu/inflection @@ -831,6 +833,9 @@ github.com/tdewolff/parse/v2/strconv # github.com/technologize/otel-go-contrib v1.1.1 ## explicit; go 1.17 github.com/technologize/otel-go-contrib/otelginmetrics +# github.com/temoto/robotstxt v1.1.2 +## explicit; go 1.11 +github.com/temoto/robotstxt # github.com/tetratelabs/wazero v1.8.2 ## explicit; go 1.21 github.com/tetratelabs/wazero