[bug] respect X-Robots-Tag and robots.txt on api/v1/instance and nodeinfo (#3756)

* feat: check X-Robots-Tag when accessing /api/v1/instance or /nodeinfo endpoints
* chore: go fmt ./...
* Check robots.txt as well, add tests

---------

Co-authored-by: tobi <tobi.smethurst@protonmail.com>

parent: 2c95fd4115
commit: d0de3ad492

20 changed files with 1404 additions and 24 deletions
README.md (file name lost in this view; hunk context matches the project README)

@@ -43,7 +43,8 @@ Here's a screenshot of the instance landing page! Check out the project's [offic
 - [Known Issues](#known-issues)
 - [Installing GoToSocial](#installing-gotosocial)
 - [Supported Platforms](#supported-platforms)
-- [FreeBSD](#freebsd)
+- [64-bit](#64-bit)
+- [BSDs](#bsds)
 - [32-bit](#32-bit)
 - [OpenBSD](#openbsd)
 - [Stable Releases](#stable-releases)
@@ -434,6 +435,7 @@ The following open source libraries, frameworks, and tools are used by GoToSocia
 - [superseriousbusiness/exif-terminator](https://codeberg.org/superseriousbusiness/exif-terminator); EXIF data removal. [GNU AGPL v3 LICENSE](https://spdx.org/licenses/AGPL-3.0-or-later.html).
 - [superseriousbusiness/httpsig](https://github.com/superseriousbusiness/httpsig) forked from [go-fed/httpsig](https://github.com/go-fed/httpsig); secure HTTP signature library. [BSD-3-Clause License](https://spdx.org/licenses/BSD-3-Clause.html).
 - [superseriousbusiness/oauth2](https://github.com/superseriousbusiness/oauth2) forked from [go-oauth2/oauth2](https://github.com/go-oauth2/oauth2); OAuth server framework and token handling. [MIT License](https://spdx.org/licenses/MIT.html).
+- [temoto/robotstxt](https://github.com/temoto/robotstxt); robots.txt parsing. [MIT License](https://spdx.org/licenses/MIT.html).
 - [tdewolff/minify](https://github.com/tdewolff/minify); HTML minification for Markdown-submitted posts. [MIT License](https://spdx.org/licenses/MIT.html).
 - [uber-go/automaxprocs](https://github.com/uber-go/automaxprocs); GOMAXPROCS automation. [MIT License](https://spdx.org/licenses/MIT.html).
 - [ulule/limiter](https://github.com/ulule/limiter); http rate limit middleware. [MIT License](https://spdx.org/licenses/MIT.html).
go.mod (1 addition)

@@ -60,6 +60,7 @@ require (
 	github.com/superseriousbusiness/oauth2/v4 v4.3.2-SSB.0.20230227143000-f4900831d6c8
 	github.com/tdewolff/minify/v2 v2.21.3
 	github.com/technologize/otel-go-contrib v1.1.1
+	github.com/temoto/robotstxt v1.1.2
 	github.com/tetratelabs/wazero v1.8.2
 	github.com/tomnomnom/linkheader v0.0.0-20180905144013-02ca5825eb80
 	github.com/ulule/limiter/v3 v3.11.2
go.sum (generated; 2 additions)

@@ -540,6 +540,8 @@ github.com/tdewolff/test v1.0.11-0.20240106005702-7de5f7df4739 h1:IkjBCtQOOjIn03
 github.com/tdewolff/test v1.0.11-0.20240106005702-7de5f7df4739/go.mod h1:XPuWBzvdUzhCuxWO1ojpXsyzsA5bFoS3tO/Q3kFuTG8=
 github.com/technologize/otel-go-contrib v1.1.1 h1:wZH9aSPNWZWIkEh3vfaKfMb15AJ80jJ1aVj/4GZdqIw=
 github.com/technologize/otel-go-contrib v1.1.1/go.mod h1:dCN/wj2WyUO8aFZFdIN+6tfJHImjTML/8r2YVYAy3So=
+github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
+github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
 github.com/tetratelabs/wazero v1.8.2 h1:yIgLR/b2bN31bjxwXHD8a3d+BogigR952csSDdLYEv4=
 github.com/tetratelabs/wazero v1.8.2/go.mod h1:yAI0XTsMBhREkM/YDAK/zNou3GoiAce1P6+rp/wQhjs=
 github.com/tidwall/btree v0.0.0-20191029221954-400434d76274 h1:G6Z6HvJuPjG6XfNGi/feOATzeJrfgTNJY+rGrHbA04E=
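The only new module dependency is temoto/robotstxt, which supplies the robots.txt parsing and matching used throughout the commit. A minimal sketch of the two calls the new code leans on, FromString for parsing and TestAgent for queries (the rules and agent string below are made up for illustration):

package main

import (
    "fmt"

    "github.com/temoto/robotstxt"
)

func main() {
    // Parse a robots.txt body; these rules disallow /nodeinfo
    // for every user agent.
    robots, err := robotstxt.FromString("User-agent: *\nDisallow: /nodeinfo")
    if err != nil {
        panic(err)
    }

    // TestAgent reports whether the given path may be fetched
    // by the given user agent.
    fmt.Println(robots.TestAgent("/nodeinfo/2.1", "gotosocial"))    // false
    fmt.Println(robots.TestAgent("/api/v1/instance", "gotosocial")) // true
}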
(file name lost in this view; MIME content-type constants and helpers in internal/api/util)

@@ -36,6 +36,8 @@
 	TextHTML  = `text/html`
 	TextCSS   = `text/css`
 	TextCSV   = `text/csv`
+	TextPlain = `text/plain`
+	UTF8      = `utf-8`
 )

 // JSONContentType returns whether is application/json(;charset=utf-8)? content-type.

@@ -74,6 +76,14 @@ func XMLXRDContentType(ct string) bool {
 		p[0] == AppXMLXRD
 }

+// TextPlainContentType returns whether is text/plain(;charset=utf-8)? content-type.
+func TextPlainContentType(ct string) bool {
+	p := splitContentType(ct)
+	p, ok := isUTF8ContentType(p)
+	return ok && len(p) == 1 &&
+		p[0] == TextPlain
+}
+
 // ASContentType returns whether is valid ActivityStreams content-types:
 // - application/activity+json
 // - application/ld+json;profile=https://w3.org/ns/activitystreams

@@ -118,7 +128,7 @@ func NodeInfo2ContentType(ct string) bool {
 // type parts list, removes it and returns whether is utf-8.
 func isUTF8ContentType(p []string) ([]string, bool) {
 	const charset = "charset="
-	const charsetUTF8 = charset + "utf-8"
+	const charsetUTF8 = charset + UTF8
 	for i, part := range p {

 		// Only handle charset slice parts.
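The new TextPlain and UTF8 constants feed the TextPlainContentType helper added above, which accepts text/plain with an optional utf-8 charset parameter. Since splitContentType and isUTF8ContentType aren't shown in this diff, here is a self-contained approximation of the same check (the function name and splitting logic are illustrative, not the repo's own):

package main

import (
    "fmt"
    "strings"
)

// textPlainOK mirrors TextPlainContentType's behavior: accept
// "text/plain" with an optional "charset=utf-8" parameter. This
// standalone version splits on ";" and trims spaces as a stand-in
// for the repo's splitContentType helper.
func textPlainOK(ct string) bool {
    parts := strings.Split(strings.ToLower(ct), ";")
    for i := range parts {
        parts[i] = strings.TrimSpace(parts[i])
    }
    // Strip an optional utf-8 charset parameter.
    if len(parts) == 2 && parts[1] == "charset=utf-8" {
        parts = parts[:1]
    }
    return len(parts) == 1 && parts[0] == "text/plain"
}

func main() {
    fmt.Println(textPlainOK("text/plain"))                // true
    fmt.Println(textPlainOK("text/plain; charset=utf-8")) // true
    fmt.Println(textPlainOK("text/html"))                 // false
}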
(file name lost in this view; the dereferencer's GetRemoteInstance in internal/federation/dereferencing)

@@ -19,20 +19,20 @@

 import (
 	"context"
-	"fmt"
 	"net/url"

+	"github.com/superseriousbusiness/gotosocial/internal/gtserror"
 	"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
 )

 func (d *Dereferencer) GetRemoteInstance(ctx context.Context, username string, remoteInstanceURI *url.URL) (*gtsmodel.Instance, error) {
 	if blocked, err := d.state.DB.IsDomainBlocked(ctx, remoteInstanceURI.Host); blocked || err != nil {
-		return nil, fmt.Errorf("GetRemoteInstance: domain %s is blocked", remoteInstanceURI.Host)
+		return nil, gtserror.Newf("domain %s is blocked", remoteInstanceURI.Host)
 	}

 	transport, err := d.transportController.NewTransportForUsername(ctx, username)
 	if err != nil {
-		return nil, fmt.Errorf("transport err: %s", err)
+		return nil, gtserror.Newf("transport err: %w", err)
 	}

 	return transport.DereferenceInstance(ctx, remoteInstanceURI)
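Beyond the robots.txt work, this hunk swaps fmt.Errorf with the %s verb for the internal gtserror.Newf helper with %w, so the wrapped error stays matchable by callers. A standard-library sketch of why the verb matters (the sentinel error here is illustrative):

package main

import (
    "errors"
    "fmt"
)

var errBlocked = errors.New("domain is blocked")

func main() {
    // %s flattens the error into text: the sentinel is lost.
    flat := fmt.Errorf("transport err: %s", errBlocked)
    fmt.Println(errors.Is(flat, errBlocked)) // false

    // %w wraps the error: callers can still match it.
    wrapped := fmt.Errorf("transport err: %w", errBlocked)
    fmt.Println(errors.Is(wrapped, errBlocked)) // true
}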
internal/federation/dereferencing/instance_test.go (new file, 94 lines)

@@ -0,0 +1,94 @@
// GoToSocial
// Copyright (C) GoToSocial Authors admin@gotosocial.org
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package dereferencing_test

import (
    "context"
    "net/url"
    "testing"

    "github.com/stretchr/testify/suite"
    "github.com/superseriousbusiness/gotosocial/internal/gtscontext"
    "github.com/superseriousbusiness/gotosocial/testrig"
)

type InstanceTestSuite struct {
    DereferencerStandardTestSuite
}

func (suite *InstanceTestSuite) TestDerefInstance() {
    type testCase struct {
        instanceIRI      *url.URL
        expectedSoftware string
    }

    for _, tc := range []testCase{
        {
            // Fossbros anonymous doesn't shield their nodeinfo or
            // well-known or anything so we should be able to fetch.
            instanceIRI:      testrig.URLMustParse("https://fossbros-anonymous.io"),
            expectedSoftware: "Hellsoft 6.6.6",
        },
        {
            // Furtive nerds forbids /nodeinfo using
            // robots.txt so we should get bare minimum only.
            //
            // Debug-level logs should show something like:
            //
            //   - "can't fetch /nodeinfo/2.1: robots.txt disallows it"
            instanceIRI:      testrig.URLMustParse("https://furtive-nerds.example.org"),
            expectedSoftware: "",
        },
        {
            // Robotic furtive nerds forbids *everything* using
            // robots.txt so we should get bare minimum only.
            //
            // Debug-level logs should show something like:
            //
            //   - "can't fetch api/v1/instance: robots.txt disallows it"
            //   - "can't fetch .well-known/nodeinfo: robots.txt disallows it"
            instanceIRI:      testrig.URLMustParse("https://robotic.furtive-nerds.example.org"),
            expectedSoftware: "",
        },
        {
            // Really furtive nerds forbids .well-known/nodeinfo using
            // X-Robots-Tag headers, so we should get bare minimum only.
            //
            // Debug-level logs should show something like:
            //
            //   - "can't use fetched .well-known/nodeinfo: robots tags disallows it"
            instanceIRI:      testrig.URLMustParse("https://really.furtive-nerds.example.org"),
            expectedSoftware: "",
        },
    } {
        instance, err := suite.dereferencer.GetRemoteInstance(
            gtscontext.SetFastFail(context.Background()),
            suite.testAccounts["admin_account"].Username,
            tc.instanceIRI,
        )
        if err != nil {
            suite.FailNow(err.Error())
        }

        suite.Equal(tc.expectedSoftware, instance.Version)
    }
}

func TestInstanceTestSuite(t *testing.T) {
    suite.Run(t, new(InstanceTestSuite))
}
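Assuming a standard Go test setup, the new suite can be run on its own from the repository root with something like go test ./internal/federation/dereferencing/ -run TestInstanceTestSuite.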
(file name lost in this view; instance dereferencing in internal/transport)

@@ -25,6 +25,7 @@
 	"io"
 	"net/http"
 	"net/url"
+	"slices"
 	"strings"

 	apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model"

@@ -35,18 +36,29 @@
 	"github.com/superseriousbusiness/gotosocial/internal/log"
 	"github.com/superseriousbusiness/gotosocial/internal/util"
 	"github.com/superseriousbusiness/gotosocial/internal/validate"
+	"github.com/temoto/robotstxt"
 )

 func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gtsmodel.Instance, error) {
+	// Try to fetch robots.txt to check
+	// if we're allowed to try endpoints:
+	//
+	//   - /api/v1/instance
+	//   - /.well-known/nodeinfo
+	//   - /nodeinfo/2.0|2.1 endpoints
+	robotsTxt, err := t.DereferenceRobots(ctx, iri.Scheme, iri.Host)
+	if err != nil {
+		log.Debugf(ctx, "couldn't fetch robots.txt from %s: %v", iri.Host, err)
+	}
+
 	var i *gtsmodel.Instance
-	var err error

 	// First try to dereference using /api/v1/instance.
 	// This will provide the most complete picture of an instance, and avoid unnecessary api calls.
 	//
 	// This will only work with Mastodon-api compatible instances: Mastodon, some Pleroma instances, GoToSocial.
 	log.Debugf(ctx, "trying to dereference instance %s by /api/v1/instance", iri.Host)
-	i, err = dereferenceByAPIV1Instance(ctx, t, iri)
+	i, err = t.dereferenceByAPIV1Instance(ctx, iri, robotsTxt)
 	if err == nil {
 		log.Debugf(ctx, "successfully dereferenced instance using /api/v1/instance")
 		return i, nil

@@ -56,7 +68,7 @@ func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gts
 	// If that doesn't work, try to dereference using /.well-known/nodeinfo.
 	// This will involve two API calls and return less info overall, but should be more widely compatible.
 	log.Debugf(ctx, "trying to dereference instance %s by /.well-known/nodeinfo", iri.Host)
-	i, err = dereferenceByNodeInfo(ctx, t, iri)
+	i, err = t.dereferenceByNodeInfo(ctx, iri, robotsTxt)
 	if err == nil {
 		log.Debugf(ctx, "successfully dereferenced instance using /.well-known/nodeinfo")
 		return i, nil
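The pattern above: fetch robots.txt once per instance dereference, log and carry on with a nil RobotsData if that fails, then gate each endpoint fetch on it. A standalone sketch of that flow using the same temoto/robotstxt calls (the host, user agent, and plain http.Get are illustrative; the real code goes through the signing transport):

package main

import (
    "fmt"
    "net/http"

    "github.com/temoto/robotstxt"
)

func fetchRobots(host string) *robotstxt.RobotsData {
    resp, err := http.Get("https://" + host + "/robots.txt")
    if err != nil {
        // Like the code above, treat a missing robots.txt
        // as "no restrictions" rather than a hard failure.
        return nil
    }
    defer resp.Body.Close()
    robots, err := robotstxt.FromResponse(resp)
    if err != nil {
        return nil
    }
    return robots
}

func main() {
    robots := fetchRobots("example.org")
    const path = "/api/v1/instance"
    if robots != nil && !robots.TestAgent(path, "gotosocial") {
        fmt.Println("robots.txt disallows", path)
        return
    }
    fmt.Println("allowed to fetch", path)
}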
@@ -77,11 +89,23 @@ func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gts
 	}, nil
 }

-func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL) (*gtsmodel.Instance, error) {
+func (t *transport) dereferenceByAPIV1Instance(
+	ctx context.Context,
+	iri *url.URL,
+	robotsTxt *robotstxt.RobotsData,
+) (*gtsmodel.Instance, error) {
+	const path = "api/v1/instance"
+
+	// Bail if we're not allowed to fetch this endpoint.
+	if robotsTxt != nil && !robotsTxt.TestAgent("/"+path, t.controller.userAgent) {
+		err := gtserror.Newf("can't fetch %s: robots.txt disallows it", path)
+		return nil, gtserror.SetNotPermitted(err)
+	}
+
 	cleanIRI := &url.URL{
 		Scheme: iri.Scheme,
 		Host:   iri.Host,
-		Path:   "api/v1/instance",
+		Path:   path,
 	}

 	// Build IRI just once

@@ -105,6 +129,18 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL)
 		return nil, gtserror.NewFromResponse(resp)
 	}

+	// Ensure that we can use data returned from this endpoint.
+	robots := resp.Header.Values("X-Robots-Tag")
+	if slices.ContainsFunc(
+		robots,
+		func(key string) bool {
+			return strings.Contains(key, "noindex")
+		},
+	) {
+		err := gtserror.Newf("can't use fetched %s: robots tags disallows it", path)
+		return nil, gtserror.SetNotPermitted(err)
+	}
+
 	// Ensure that the incoming request content-type is expected.
 	if ct := resp.Header.Get("Content-Type"); !apiutil.JSONContentType(ct) {
 		err := gtserror.Newf("non json response type: %s", ct)
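The same X-Robots-Tag guard recurs in each fetch helper of this file: if any value of the response's X-Robots-Tag header contains "noindex", the fetched data is discarded. A self-contained sketch of the check (the function name is illustrative):

package main

import (
    "fmt"
    "net/http"
    "slices"
    "strings"
)

// disallowedByRobotsTag reports whether any X-Robots-Tag header
// value contains "noindex", matching the guard in the diff above.
func disallowedByRobotsTag(h http.Header) bool {
    return slices.ContainsFunc(
        h.Values("X-Robots-Tag"),
        func(v string) bool { return strings.Contains(v, "noindex") },
    )
}

func main() {
    h := http.Header{}
    h.Add("X-Robots-Tag", "noindex, nofollow")
    fmt.Println(disallowedByRobotsTag(h)) // true
}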
@@ -118,7 +154,8 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL)
 		return nil, errors.New("response bytes was len 0")
 	}

-	// try to parse the returned bytes directly into an Instance model
+	// Try to parse the returned bytes
+	// directly into an Instance model.
 	apiResp := &apimodel.InstanceV1{}
 	if err := json.Unmarshal(b, apiResp); err != nil {
 		return nil, err

@@ -149,24 +186,32 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL)
 	return i, nil
 }

-func dereferenceByNodeInfo(c context.Context, t *transport, iri *url.URL) (*gtsmodel.Instance, error) {
-	niIRI, err := callNodeInfoWellKnown(c, t, iri)
+func (t *transport) dereferenceByNodeInfo(
+	ctx context.Context,
+	iri *url.URL,
+	robotsTxt *robotstxt.RobotsData,
+) (*gtsmodel.Instance, error) {
+	// Retrieve the nodeinfo IRI from .well-known/nodeinfo.
+	niIRI, err := t.callNodeInfoWellKnown(ctx, iri, robotsTxt)
 	if err != nil {
-		return nil, fmt.Errorf("dereferenceByNodeInfo: error during initial call to well-known nodeinfo: %s", err)
+		return nil, gtserror.Newf("error during initial call to .well-known: %w", err)
 	}

-	ni, err := callNodeInfo(c, t, niIRI)
+	// Use the returned nodeinfo IRI to make a followup call.
+	ni, err := t.callNodeInfo(ctx, niIRI, robotsTxt)
 	if err != nil {
-		return nil, fmt.Errorf("dereferenceByNodeInfo: error doing second call to nodeinfo uri %s: %s", niIRI.String(), err)
+		return nil, gtserror.Newf("error during call to %s: %w", niIRI.String(), err)
 	}

-	// we got a response of some kind! take what we can from it...
+	// We got a response of some kind!
+	//
+	// Start building out the bare minimum
+	// instance model, we'll add to it if we can.
 	id, err := id.NewRandomULID()
 	if err != nil {
-		return nil, fmt.Errorf("dereferenceByNodeInfo: error creating new id for instance %s: %s", iri.Host, err)
+		return nil, gtserror.Newf("error creating new id for instance %s: %w", iri.Host, err)
 	}

-	// this is the bare minimum instance we'll return, and we'll add more stuff to it if we can
 	i := &gtsmodel.Instance{
 		ID:     id,
 		Domain: iri.Host,
@@ -234,11 +279,23 @@ func dereferenceByNodeInfo(c context.Context, t *transport, iri *url.URL) (*gtsm
 	return i, nil
 }

-func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*url.URL, error) {
+func (t *transport) callNodeInfoWellKnown(
+	ctx context.Context,
+	iri *url.URL,
+	robotsTxt *robotstxt.RobotsData,
+) (*url.URL, error) {
+	const path = ".well-known/nodeinfo"
+
+	// Bail if we're not allowed to fetch this endpoint.
+	if robotsTxt != nil && !robotsTxt.TestAgent("/"+path, t.controller.userAgent) {
+		err := gtserror.Newf("can't fetch %s: robots.txt disallows it", path)
+		return nil, gtserror.SetNotPermitted(err)
+	}
+
 	cleanIRI := &url.URL{
 		Scheme: iri.Scheme,
 		Host:   iri.Host,
-		Path:   ".well-known/nodeinfo",
+		Path:   path,
 	}

 	// Build IRI just once

@@ -261,7 +318,19 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur
 		return nil, gtserror.NewFromResponse(resp)
 	}

-	// Ensure that the incoming request content-type is expected.
+	// Ensure that we can use data returned from this endpoint.
+	robots := resp.Header.Values("X-Robots-Tag")
+	if slices.ContainsFunc(
+		robots,
+		func(key string) bool {
+			return strings.Contains(key, "noindex")
+		},
+	) {
+		err := gtserror.Newf("can't use fetched %s: robots tags disallows it", path)
+		return nil, gtserror.SetNotPermitted(err)
+	}
+
+	// Ensure that the returned content-type is expected.
 	if ct := resp.Header.Get("Content-Type"); !apiutil.JSONContentType(ct) {
 		err := gtserror.Newf("non json response type: %s", ct)
 		return nil, gtserror.SetMalformed(err)

@@ -279,7 +348,8 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur
 		return nil, gtserror.Newf("could not unmarshal server response as WellKnownResponse: %w", err)
 	}

-	// look through the links for the first one that matches the nodeinfo schema, this is what we need
+	// Look through the links for the first one that
+	// matches nodeinfo schema, this is what we need.
 	var nodeinfoHref *url.URL
 	for _, l := range wellKnownResp.Links {
 		if l.Href == "" || !strings.HasPrefix(l.Rel, "http://nodeinfo.diaspora.software/ns/schema/2") {
@@ -297,7 +367,23 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur
 	return nodeinfoHref, nil
 }

-func callNodeInfo(ctx context.Context, t *transport, iri *url.URL) (*apimodel.Nodeinfo, error) {
+func (t *transport) callNodeInfo(
+	ctx context.Context,
+	iri *url.URL,
+	robotsTxt *robotstxt.RobotsData,
+) (*apimodel.Nodeinfo, error) {
+	// Normalize robots.txt test path.
+	testPath := iri.Path
+	if !strings.HasPrefix(testPath, "/") {
+		testPath = "/" + testPath
+	}
+
+	// Bail if we're not allowed to fetch this endpoint.
+	if robotsTxt != nil && !robotsTxt.TestAgent(testPath, t.controller.userAgent) {
+		err := gtserror.Newf("can't fetch %s: robots.txt disallows it", testPath)
+		return nil, gtserror.SetNotPermitted(err)
+	}
+
 	// Build IRI just once
 	iriStr := iri.String()

@@ -324,6 +410,18 @@ func callNodeInfo(ctx context.Context, t *transport, iri *url.URL) (*apimodel.No
 		return nil, gtserror.SetMalformed(err)
 	}

+	// Ensure that we can use data returned from this endpoint.
+	robots := resp.Header.Values("X-Robots-Tag")
+	if slices.ContainsFunc(
+		robots,
+		func(key string) bool {
+			return strings.Contains(key, "noindex")
+		},
+	) {
+		err := gtserror.Newf("can't use fetched %s: robots tags disallows it", iri.Path)
+		return nil, gtserror.SetNotPermitted(err)
+	}
+
 	b, err := io.ReadAll(resp.Body)
 	if err != nil {
 		return nil, err
internal/transport/derefrobots.go (new file, 91 lines)

@@ -0,0 +1,91 @@
// GoToSocial
// Copyright (C) GoToSocial Authors admin@gotosocial.org
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package transport

import (
    "context"
    "net/http"
    "net/url"

    "codeberg.org/gruf/go-bytesize"
    "codeberg.org/gruf/go-iotools"
    apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util"
    "github.com/superseriousbusiness/gotosocial/internal/gtserror"
    "github.com/temoto/robotstxt"
)

func (t *transport) DereferenceRobots(ctx context.Context, protocol string, host string) (*robotstxt.RobotsData, error) {
    robotsIRI := &url.URL{
        Scheme: protocol,
        Host:   host,
        Path:   "robots.txt",
    }

    // Build IRI just once
    iriStr := robotsIRI.String()

    // Prepare new HTTP request to endpoint
    req, err := http.NewRequestWithContext(ctx, "GET", iriStr, nil)
    if err != nil {
        return nil, err
    }

    // We want text/plain utf-8 encoding.
    //
    // https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method
    req.Header.Add("Accept", apiutil.TextPlain)
    req.Header.Add("Accept-Charset", apiutil.UTF8)

    // Perform the HTTP request
    rsp, err := t.GET(req)
    if err != nil {
        return nil, err
    }

    // Ensure a non-error status response.
    if rsp.StatusCode != http.StatusOK {
        err := gtserror.NewFromResponse(rsp)
        _ = rsp.Body.Close() // close early.
        return nil, err
    }

    // Ensure that the incoming request content-type is expected.
    if ct := rsp.Header.Get("Content-Type"); !apiutil.TextPlainContentType(ct) {
        err := gtserror.Newf("non text/plain response: %s", ct)
        _ = rsp.Body.Close() // close early.
        return nil, gtserror.SetMalformed(err)
    }

    // Limit the robots.txt size to 500KiB
    //
    // https://www.rfc-editor.org/rfc/rfc9309.html#name-limits
    const maxsz = int64(500 * bytesize.KiB)

    // Check body claims to be within size limit.
    if rsp.ContentLength > maxsz {
        _ = rsp.Body.Close()       // close early.
        sz := bytesize.Size(maxsz) //nolint:gosec
        return nil, gtserror.Newf("robots.txt body exceeds max size %s", sz)
    }

    // Update response body with maximum size.
    rsp.Body, _, _ = iotools.UpdateReadCloserLimit(rsp.Body, maxsz)
    defer rsp.Body.Close()

    return robotstxt.FromResponse(rsp)
}
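DereferenceRobots caps the response body at 500KiB, following the limits section of RFC 9309, before handing it to robotstxt.FromResponse. A standard-library approximation of the same guard, with io.LimitReader standing in for the gruf iotools helper used above:

package main

import (
    "fmt"
    "io"
    "strings"

    "github.com/temoto/robotstxt"
)

func main() {
    const maxsz = 500 * 1024 // 500KiB, per RFC 9309 guidance.

    body := strings.NewReader("User-agent: *\nDisallow: /nodeinfo\n")

    // Read at most maxsz bytes of the robots.txt body.
    b, err := io.ReadAll(io.LimitReader(body, maxsz))
    if err != nil {
        panic(err)
    }

    robots, err := robotstxt.FromBytes(b)
    if err != nil {
        panic(err)
    }
    fmt.Println(robots.TestAgent("/nodeinfo/2.1", "gotosocial")) // false
}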
(file name lost in this view; the testrig's mock HTTP client)

@@ -133,6 +133,12 @@ func NewMockHTTPClient(do func(req *http.Request) (*http.Response, error), relat
 		responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = WebfingerResponse(req)
 	} else if strings.Contains(reqURLString, ".well-known/host-meta") {
 		responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = HostMetaResponse(req)
+	} else if strings.Contains(reqURLString, ".well-known/nodeinfo") {
+		responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = WellKnownNodeInfoResponse(req)
+	} else if strings.Contains(reqURLString, "/robots.txt") {
+		responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = RobotsTxtResponse(req)
+	} else if strings.Contains(reqURLString, "/nodeinfo/2.1") {
+		responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = NodeInfoResponse(req)
 	} else if strings.Contains(reqURLString, "lists.example.org") {
 		responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = DomainPermissionSubscriptionResponse(req)
 	} else if note, ok := mockHTTPClient.TestRemoteStatuses[reqURLString]; ok {

@@ -318,6 +324,162 @@ func HostMetaResponse(req *http.Request) (
 	return
 }

+func WellKnownNodeInfoResponse(req *http.Request) (
+	responseCode int,
+	responseBytes []byte,
+	responseContentType string,
+	responseContentLength int,
+	extraHeaders map[string]string,
+) {
+	var wkr *apimodel.WellKnownResponse
+
+	switch req.URL.String() {
+	case "https://fossbros-anonymous.io/.well-known/nodeinfo":
+		wkr = &apimodel.WellKnownResponse{
+			Links: []apimodel.Link{
+				{
+					Rel:  "http://nodeinfo.diaspora.software/ns/schema/2.1",
+					Href: "https://fossbros-anonymous.io/nodeinfo/2.1",
+				},
+			},
+		}
+	case "https://furtive-nerds.example.org/.well-known/nodeinfo":
+		wkr = &apimodel.WellKnownResponse{
+			Links: []apimodel.Link{
+				{
+					Rel:  "http://nodeinfo.diaspora.software/ns/schema/2.1",
+					Href: "https://furtive-nerds.example.org/nodeinfo/2.1",
+				},
+			},
+		}
+	case "https://really.furtive-nerds.example.org/.well-known/nodeinfo":
+		wkr = &apimodel.WellKnownResponse{
+			Links: []apimodel.Link{
+				{
+					Rel:  "http://nodeinfo.diaspora.software/ns/schema/2.1",
+					Href: "https://really.furtive-nerds.example.org/nodeinfo/2.1",
+				},
+			},
+		}
+		extraHeaders = map[string]string{"X-Robots-Tag": "noindex,nofollow"}
+	default:
+		log.Debugf(nil, "nodeinfo response not available for %s", req.URL)
+		responseCode = http.StatusNotFound
+		responseBytes = []byte(``)
+		responseContentType = "application/json"
+		responseContentLength = len(responseBytes)
+		return
+	}
+
+	niJSON, err := json.Marshal(wkr)
+	if err != nil {
+		panic(err)
+	}
+	responseCode = http.StatusOK
+	responseBytes = niJSON
+	responseContentType = "application/json"
+	responseContentLength = len(niJSON)
+
+	return
+}
+
+func NodeInfoResponse(req *http.Request) (
+	responseCode int,
+	responseBytes []byte,
+	responseContentType string,
+	responseContentLength int,
+	extraHeaders map[string]string,
+) {
+	var ni *apimodel.Nodeinfo
+
+	switch req.URL.String() {
+	case "https://fossbros-anonymous.io/nodeinfo/2.1":
+		ni = &apimodel.Nodeinfo{
+			Version: "2.1",
+			Software: apimodel.NodeInfoSoftware{
+				Name:       "Hellsoft",
+				Version:    "6.6.6",
+				Repository: "https://forge.hellsoft.fossbros-anonymous.io",
+				Homepage:   "https://hellsoft.fossbros-anonymous.io",
+			},
+			Protocols: []string{"activitypub"},
+		}
+	case "https://furtive-nerds.example.org/nodeinfo/2.1":
+		ni = &apimodel.Nodeinfo{
+			Version: "2.1",
+			Software: apimodel.NodeInfoSoftware{
+				Name:       "GoToSocial",
+				Version:    "1.3.1.2",
+				Repository: "https://github.com/superseriousbusiness/gotosocial",
+				Homepage:   "https://docs.gotosocial.org",
+			},
+			Protocols: []string{"activitypub"},
+		}
+	case "https://really.furtive-nerds.example.org/nodeinfo/2.1":
+		ni = &apimodel.Nodeinfo{
+			Version: "2.1",
+			Software: apimodel.NodeInfoSoftware{
+				Name:       "GoToSocial",
+				Version:    "1.3.1.2",
+				Repository: "https://github.com/superseriousbusiness/gotosocial",
+				Homepage:   "https://docs.gotosocial.org",
+			},
+			Protocols: []string{"activitypub"},
+		}
+	default:
+		log.Debugf(nil, "nodeinfo response not available for %s", req.URL)
+		responseCode = http.StatusNotFound
+		responseBytes = []byte(``)
+		responseContentType = "application/json"
+		responseContentLength = len(responseBytes)
+		return
+	}
+
+	niJSON, err := json.Marshal(ni)
+	if err != nil {
+		panic(err)
+	}
+	responseCode = http.StatusOK
+	responseBytes = niJSON
+	responseContentType = "application/json"
+	responseContentLength = len(niJSON)
+
+	return
+}
+
+func RobotsTxtResponse(req *http.Request) (
+	responseCode int,
+	responseBytes []byte,
+	responseContentType string,
+	responseContentLength int,
+	extraHeaders map[string]string,
+) {
+	var robots string
+
+	switch req.URL.String() {
+	case "https://furtive-nerds.example.org/robots.txt":
+		// Disallow nodeinfo.
+		robots = "User-agent: *\nDisallow: /nodeinfo"
+	case "https://robotic.furtive-nerds.example.org/robots.txt":
+		// Disallow everything.
+		robots = "User-agent: *\nDisallow: /"
+	default:
+		log.Debugf(nil, "robots response not available for %s", req.URL)
+		responseCode = http.StatusNotFound
+		responseBytes = []byte(``)
+		responseContentType = "text/plain"
+		responseContentLength = len(responseBytes)
+		return
+	}
+
+	responseCode = http.StatusOK
+	responseBytes = []byte(robots)
+	responseContentType = "text/plain"
+	responseContentLength = len(responseBytes)
+
+	return
+}
+
 func WebfingerResponse(req *http.Request) (
 	responseCode int,
 	responseBytes []byte,
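Taken together, the mock responses define four test domains, one per scenario exercised by TestDerefInstance:

fossbros-anonymous.io               no shielding                      full nodeinfo (Hellsoft 6.6.6)
furtive-nerds.example.org           robots.txt: Disallow: /nodeinfo   bare minimum instance
robotic.furtive-nerds.example.org   robots.txt: Disallow: /           bare minimum instance
really.furtive-nerds.example.org    X-Robots-Tag: noindex,nofollow    bare minimum instance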
vendor/github.com/temoto/robotstxt/.gitignore (generated, vendored; new file, 15 lines)

@@ -0,0 +1,15 @@
*.cgo?.*
*.o
*.so
*.sublime-*
*.zip
.DS_Store
.idea/
.tags*
_cgo_*
_gofuzz/crashers/
_gofuzz/suppressions/
_obj
_test
coverage.txt
robots.txt-check/robots.txt-check
vendor/github.com/temoto/robotstxt/.golangci.yml (generated, vendored; new file, 20 lines)

@@ -0,0 +1,20 @@
linters:
  enable:
    - goconst
    - gofmt
    - gosec
    - maligned
    - prealloc
    - staticcheck
  disable:
    - deadcode
    - structcheck
    - varcheck

linters-settings:
  gofmt:
    simplify: true
  govet:
    check-shadowing: true
  maligned:
    suggest-new: true
vendor/github.com/temoto/robotstxt/.travis.yml (generated, vendored; new file, 30 lines)

@@ -0,0 +1,30 @@
cache:
  go: true
  directories:
    - $HOME/.cache
    - $HOME/bin
    - $HOME/gopath/pkg/mod
language: go
go:
  - 1.11
  - 1.12
  - 1.13
  - 1.14
  - 1.x
  - master
install: true
script: GO111MODULE=on go test -race

matrix:
  include:
    - go: 1.x
      env: task=coverage
      script: GO111MODULE=on go test -race -covermode=atomic -coverprofile=coverage.txt
      after_success: bash <(curl -s https://codecov.io/bash)
    - go: 1.x
      env: task=bench
      script: GO111MODULE=on ./script/bench
    - go: 1.x
      install: curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | bash -s -- -b $HOME/bin v1.19.1
      env: task=clean
      script: GO111MODULE=on ./script/clean
vendor/github.com/temoto/robotstxt/LICENSE (generated, vendored; new file, 21 lines)

@@ -0,0 +1,21 @@
The MIT License

Copyright (c) 2010 Sergey Shepelev <temotor@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
vendor/github.com/temoto/robotstxt/README.rst (generated, vendored; new file, 115 lines)

@@ -0,0 +1,115 @@
What
====

This is a robots.txt exclusion protocol implementation for Go language (golang).


Build
=====

To build and run tests run `go test` in source directory.


Contribute
==========

Warm welcome.

* If desired, add your name in README.rst, section Who.
* Run `script/test && script/clean && echo ok`
* You can ignore linter warnings, but everything else must pass.
* Send your change as pull request or just a regular patch to current maintainer (see section Who).

Thank you.


Usage
=====

As usual, no special installation is required, just

    import "github.com/temoto/robotstxt"

run `go get` and you're ready.

1. Parse
^^^^^^^^

First of all, you need to parse robots.txt data. You can do it with
functions `FromBytes(body []byte) (*RobotsData, error)` or same for `string`::

    robots, err := robotstxt.FromBytes([]byte("User-agent: *\nDisallow:"))
    robots, err := robotstxt.FromString("User-agent: *\nDisallow:")

As of 2012-10-03, `FromBytes` is the most efficient method, everything else
is a wrapper for this core function.

There are few convenient constructors for various purposes:

* `FromResponse(*http.Response) (*RobotsData, error)` to init robots data
  from HTTP response. It *does not* call `response.Body.Close()`::

    robots, err := robotstxt.FromResponse(resp)
    resp.Body.Close()
    if err != nil {
        log.Println("Error parsing robots.txt:", err.Error())
    }

* `FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error)` or
  `FromStatusAndString` if you prefer to read bytes (string) yourself.
  Passing status code applies following logic in line with Google's interpretation
  of robots.txt files:

  * status 2xx -> parse body with `FromBytes` and apply rules listed there.
  * status 4xx -> allow all (even 401/403, as recommended by Google).
  * other (5xx) -> disallow all, consider this a temporary unavailability.

2. Query
^^^^^^^^

Parsing robots.txt content builds a kind of logic database, which you can
query with `(r *RobotsData) TestAgent(url, agent string) (bool)`.

Explicit passing of agent is useful if you want to query for different agents. For
single agent users there is an efficient option: `RobotsData.FindGroup(userAgent string)`
returns a structure with `.Test(path string)` method and `.CrawlDelay time.Duration`.

Simple query with explicit user agent. Each call will scan all rules.

::

    allow := robots.TestAgent("/", "FooBot")

Or query several paths against same user agent for performance.

::

    group := robots.FindGroup("BarBot")
    group.Test("/")
    group.Test("/download.mp3")
    group.Test("/news/article-2012-1")


Who
===

Honorable contributors (in undefined order):

* Ilya Grigorik (igrigorik)
* Martin Angers (PuerkitoBio)
* Micha Gorelick (mynameisfiber)

Initial commit and other: Sergey Shepelev temotor@gmail.com


Flair
=====

.. image:: https://travis-ci.org/temoto/robotstxt.svg?branch=master
    :target: https://travis-ci.org/temoto/robotstxt

.. image:: https://codecov.io/gh/temoto/robotstxt/branch/master/graph/badge.svg
    :target: https://codecov.io/gh/temoto/robotstxt

.. image:: https://goreportcard.com/badge/github.com/temoto/robotstxt
    :target: https://goreportcard.com/report/github.com/temoto/robotstxt
vendor/github.com/temoto/robotstxt/codecov.yml (generated, vendored; new file, 2 lines)

@@ -0,0 +1,2 @@
codecov:
  token: 6bf9c7eb-69ff-4b74-8464-e2fb452d0f04
vendor/github.com/temoto/robotstxt/fuzz.go (generated, vendored; new file, 29 lines)

@@ -0,0 +1,29 @@
// +build gofuzz

package robotstxt

import "testing/quick"

func Fuzz(data []byte) int {
    r, err := FromBytes(data)
    if err != nil {
        if r != nil {
            panic("r != nil on error")
        }
        return 0
    }

    // FindGroup must never return nil
    f1 := func(agent string) bool { return r.FindGroup(agent) != nil }
    if err := quick.Check(f1, nil); err != nil {
        panic(err)
    }

    // just check TestAgent doesn't panic
    f2 := func(path, agent string) bool { r.TestAgent(path, agent); return true }
    if err := quick.Check(f2, nil); err != nil {
        panic(err)
    }

    return 1
}
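The integer return codes follow the usual go-fuzz convention: returning 1 marks an input as interesting (here, one that parsed successfully) so the fuzzer favors it when mutating the corpus, while 0 leaves its priority unchanged.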
271
vendor/github.com/temoto/robotstxt/parser.go
generated
vendored
Normal file
271
vendor/github.com/temoto/robotstxt/parser.go
generated
vendored
Normal file
|
@ -0,0 +1,271 @@
|
||||||
|
package robotstxt
|
||||||
|
|
||||||
|
// Comments explaining the logic are taken from either the google's spec:
|
||||||
|
// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
|
||||||
|
//
|
||||||
|
// or the Wikipedia's entry on robots.txt:
|
||||||
|
// http://en.wikipedia.org/wiki/Robots.txt
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"math"
|
||||||
|
"regexp"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
type lineType uint
|
||||||
|
|
||||||
|
const (
|
||||||
|
lIgnore lineType = iota
|
||||||
|
lUnknown
|
||||||
|
lUserAgent
|
||||||
|
lAllow
|
||||||
|
lDisallow
|
||||||
|
lCrawlDelay
|
||||||
|
lSitemap
|
||||||
|
lHost
|
||||||
|
)
|
||||||
|
|
||||||
|
type parser struct {
|
||||||
|
tokens []string
|
||||||
|
pos int
|
||||||
|
}
|
||||||
|
|
||||||
|
type lineInfo struct {
|
||||||
|
t lineType // Type of line key
|
||||||
|
k string // String representation of the type of key
|
||||||
|
vs string // String value of the key
|
||||||
|
vf float64 // Float value of the key
|
||||||
|
vr *regexp.Regexp // Regexp value of the key
|
||||||
|
}
|
||||||
|
|
||||||
|
func newParser(tokens []string) *parser {
|
||||||
|
return &parser{tokens: tokens}
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseGroupMap(groups map[string]*Group, agents []string, fun func(*Group)) {
|
||||||
|
var g *Group
|
||||||
|
for _, a := range agents {
|
||||||
|
if g = groups[a]; g == nil {
|
||||||
|
g = new(Group)
|
||||||
|
groups[a] = g
|
||||||
|
}
|
||||||
|
fun(g)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *parser) parseAll() (groups map[string]*Group, host string, sitemaps []string, errs []error) {
|
||||||
|
groups = make(map[string]*Group, 16)
|
||||||
|
agents := make([]string, 0, 4)
|
||||||
|
isEmptyGroup := true
|
||||||
|
|
||||||
|
// Reset internal fields, tokens are assigned at creation time, never change
|
||||||
|
p.pos = 0
|
||||||
|
|
||||||
|
for {
|
||||||
|
if li, err := p.parseLine(); err != nil {
|
||||||
|
if err == io.EOF {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
errs = append(errs, err)
|
||||||
|
} else {
|
||||||
|
switch li.t {
|
||||||
|
case lUserAgent:
|
||||||
|
// Two successive user-agent lines are part of the same group.
|
||||||
|
if !isEmptyGroup {
|
||||||
|
// End previous group
|
||||||
|
agents = make([]string, 0, 4)
|
||||||
|
}
|
||||||
|
if len(agents) == 0 {
|
||||||
|
isEmptyGroup = true
|
||||||
|
}
|
||||||
|
agents = append(agents, li.vs)
|
||||||
|
|
||||||
|
case lDisallow:
|
||||||
|
// Error if no current group
|
||||||
|
if len(agents) == 0 {
|
||||||
|
errs = append(errs, fmt.Errorf("Disallow before User-agent at token #%d.", p.pos))
|
||||||
|
} else {
|
||||||
|
isEmptyGroup = false
|
||||||
|
var r *rule
|
||||||
|
if li.vr != nil {
|
||||||
|
r = &rule{"", false, li.vr}
|
||||||
|
} else {
|
||||||
|
r = &rule{li.vs, false, nil}
|
||||||
|
}
|
||||||
|
parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) })
|
||||||
|
}
|
||||||
|
|
||||||
|
case lAllow:
|
||||||
|
// Error if no current group
|
||||||
|
if len(agents) == 0 {
|
||||||
|
errs = append(errs, fmt.Errorf("Allow before User-agent at token #%d.", p.pos))
|
||||||
|
} else {
|
||||||
|
isEmptyGroup = false
|
||||||
|
var r *rule
|
||||||
|
if li.vr != nil {
|
||||||
|
r = &rule{"", true, li.vr}
|
||||||
|
} else {
|
||||||
|
r = &rule{li.vs, true, nil}
|
||||||
|
}
|
||||||
|
parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) })
|
||||||
|
}
|
||||||
|
|
||||||
|
case lHost:
|
||||||
|
host = li.vs
|
||||||
|
|
||||||
|
case lSitemap:
|
||||||
|
sitemaps = append(sitemaps, li.vs)
|
||||||
|
|
||||||
|
case lCrawlDelay:
|
||||||
|
if len(agents) == 0 {
|
||||||
|
errs = append(errs, fmt.Errorf("Crawl-delay before User-agent at token #%d.", p.pos))
|
||||||
|
} else {
|
||||||
|
isEmptyGroup = false
|
||||||
|
delay := time.Duration(li.vf * float64(time.Second))
|
||||||
|
parseGroupMap(groups, agents, func(g *Group) { g.CrawlDelay = delay })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return
|
}

func (p *parser) parseLine() (li *lineInfo, err error) {
	t1, ok1 := p.popToken()
	if !ok1 {
		// proper EOF
		return nil, io.EOF
	}

	t2, ok2 := p.peekToken()
	if !ok2 {
		// EOF, no value associated with the token, so ignore token and return
		return nil, io.EOF
	}

	// Helper closure for all string-based tokens, common behaviour:
	// - Consume t2 token
	// - If empty, return ignore line info
	// - Otherwise return the specified line info
	returnStringVal := func(t lineType) (*lineInfo, error) {
		p.popToken()
		if t2 != "" {
			return &lineInfo{t: t, k: t1, vs: t2}, nil
		}
		return &lineInfo{t: lIgnore}, nil
	}

	// Helper closure for all path tokens (allow/disallow), common behaviour:
	// - Consume t2 token
	// - If empty, return ignore line info
	// - Otherwise, normalize the path (add leading "/" if missing, remove trailing "*")
	// - Detect if wildcards are present; if so, compile into a regexp
	// - Return the specified line info
	returnPathVal := func(t lineType) (*lineInfo, error) {
		p.popToken()
		if t2 != "" {
			if !strings.HasPrefix(t2, "*") && !strings.HasPrefix(t2, "/") {
				t2 = "/" + t2
			}
			t2 = strings.TrimRightFunc(t2, isAsterisk)
			// From Google's spec:
			// Google, Bing, Yahoo, and Ask support a limited form of
			// "wildcards" for path values. These are:
			//   * designates 0 or more instances of any valid character
			//   $ designates the end of the URL
			if strings.ContainsAny(t2, "*$") {
				// Must compile a regexp, this is a pattern.
				// Escape string before compile.
				t2 = regexp.QuoteMeta(t2)
				t2 = strings.Replace(t2, `\*`, `.*`, -1)
				t2 = strings.Replace(t2, `\$`, `$`, -1)
				if r, e := regexp.Compile(t2); e != nil {
					return nil, e
				} else {
					return &lineInfo{t: t, k: t1, vr: r}, nil
				}
			} else {
				// Simple string path
				return &lineInfo{t: t, k: t1, vs: t2}, nil
			}
		}
		return &lineInfo{t: lIgnore}, nil
	}

	switch strings.ToLower(t1) {
	case tokEOL:
		// Don't consume t2 and continue parsing
		return &lineInfo{t: lIgnore}, nil

	case "user-agent", "useragent":
		// From Google's spec:
		// Handling of <field> elements with simple errors / typos (eg "useragent"
		// instead of "user-agent") is undefined and may be interpreted as correct
		// directives by some user-agents.
		// The user-agent is non-case-sensitive.
		t2 = strings.ToLower(t2)
		return returnStringVal(lUserAgent)

	case "disallow":
		// From Google's spec:
		// When no path is specified, the directive is ignored (so an empty Disallow
		// CAN be an allow, since allow is the default. The actual result depends
		// on the other rules in the group).
		return returnPathVal(lDisallow)

	case "allow":
		// From Google's spec:
		// When no path is specified, the directive is ignored.
		return returnPathVal(lAllow)

	case "host":
		// Host directive to specify main site mirror
		// Read more: https://help.yandex.com/webmaster/controlling-robot/robots-txt.xml#host
		return returnStringVal(lHost)

	case "sitemap":
		// Non-group field, applies to the host as a whole, not to a specific user-agent
		return returnStringVal(lSitemap)

	case "crawl-delay", "crawldelay":
		// From http://en.wikipedia.org/wiki/Robots_exclusion_standard#Nonstandard_extensions
		// Several major crawlers support a Crawl-delay parameter, set to the
		// number of seconds to wait between successive requests to the same server.
		p.popToken()
		if cd, e := strconv.ParseFloat(t2, 64); e != nil {
			return nil, e
		} else if cd < 0 || math.IsInf(cd, 0) || math.IsNaN(cd) {
			return nil, fmt.Errorf("Crawl-delay invalid value '%s'", t2)
		} else {
			return &lineInfo{t: lCrawlDelay, k: t1, vf: cd}, nil
		}
	}

	// Consume t2 token
	p.popToken()
	return &lineInfo{t: lUnknown, k: t1}, nil
}

func (p *parser) popToken() (tok string, ok bool) {
	tok, ok = p.peekToken()
	if !ok {
		return
	}
	p.pos++
	return tok, true
}

func (p *parser) peekToken() (tok string, ok bool) {
	if p.pos >= len(p.tokens) {
		return "", false
	}
	return p.tokens[p.pos], true
}

func isAsterisk(r rune) bool {
	return r == '*'
}
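The wildcard handling in returnPathVal above is easy to check in isolation. Below is a small standalone sketch (not part of the vendored file; the pathToPattern helper name and the example pattern are mine) that applies the same QuoteMeta/Replace sequence:

package main

import (
	"fmt"
	"regexp"
	"strings"
)

// pathToPattern repeats the transformation from returnPathVal: escape the
// whole path, then re-enable the two supported wildcards, "*" (any run of
// characters) and "$" (end of URL).
func pathToPattern(path string) (*regexp.Regexp, error) {
	p := regexp.QuoteMeta(path)
	p = strings.Replace(p, `\*`, `.*`, -1)
	p = strings.Replace(p, `\$`, `$`, -1)
	return regexp.Compile(p)
}

func main() {
	re, err := pathToPattern("/api/*/instance$")
	if err != nil {
		panic(err)
	}
	fmt.Println(re)                                       // /api/.*/instance$
	fmt.Println(re.MatchString("/api/v1/instance"))       // true
	fmt.Println(re.MatchString("/api/v1/instance/peers")) // false: "$" anchors the end
}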
227 vendor/github.com/temoto/robotstxt/robotstxt.go generated vendored Normal file
@@ -0,0 +1,227 @@
// Package robotstxt implements the robots.txt Exclusion Protocol
// as specified in http://www.robotstxt.org/wc/robots.html
// with various extensions.
package robotstxt

// Comments explaining the logic are taken from the Google spec:
// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt

import (
	"bytes"
	"errors"
	"io/ioutil"
	"net/http"
	"regexp"
	"strconv"
	"strings"
	"time"
)

type RobotsData struct {
	// private
	groups      map[string]*Group
	allowAll    bool
	disallowAll bool
	Host        string
	Sitemaps    []string
}

type Group struct {
	rules      []*rule
	Agent      string
	CrawlDelay time.Duration
}

type rule struct {
	path    string
	allow   bool
	pattern *regexp.Regexp
}

type ParseError struct {
	Errs []error
}

func newParseError(errs []error) *ParseError {
	return &ParseError{errs}
}

func (e ParseError) Error() string {
	var b bytes.Buffer

	b.WriteString("Parse error(s): " + "\n")
	for _, er := range e.Errs {
		b.WriteString(er.Error() + "\n")
	}
	return b.String()
}

var allowAll = &RobotsData{allowAll: true}
var disallowAll = &RobotsData{disallowAll: true}
var emptyGroup = &Group{}

func FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error) {
	switch {
	case statusCode >= 200 && statusCode < 300:
		return FromBytes(body)

	// From https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
	//
	// Google treats all 4xx errors in the same way and assumes that no valid
	// robots.txt file exists. It is assumed that there are no restrictions.
	// This is a "full allow" for crawling. Note: this includes 401
	// "Unauthorized" and 403 "Forbidden" HTTP result codes.
	case statusCode >= 400 && statusCode < 500:
		return allowAll, nil

	// From Google's spec:
	// Server errors (5xx) are seen as temporary errors that result in a "full
	// disallow" of crawling.
	case statusCode >= 500 && statusCode < 600:
		return disallowAll, nil
	}

	return nil, errors.New("Unexpected status: " + strconv.Itoa(statusCode))
}

func FromStatusAndString(statusCode int, body string) (*RobotsData, error) {
	return FromStatusAndBytes(statusCode, []byte(body))
}
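// A quick illustration of the status-code policy above (a sketch, using only
// functions declared in this file; the path and agent strings are arbitrary):
//
//	d, _ := FromStatusAndString(404, "")
//	d.TestAgent("/private", "somebot") // true: 4xx is a "full allow"
//
//	d, _ = FromStatusAndString(503, "")
//	d.TestAgent("/private", "somebot") // false: 5xx is a "full disallow"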
func FromResponse(res *http.Response) (*RobotsData, error) {
	if res == nil {
		// Edge case, if res is nil, return nil data
		return nil, nil
	}
	buf, e := ioutil.ReadAll(res.Body)
	if e != nil {
		return nil, e
	}
	return FromStatusAndBytes(res.StatusCode, buf)
}

func FromBytes(body []byte) (r *RobotsData, err error) {
	var errs []error

	// special case (probably not worth optimization?)
	trimmed := bytes.TrimSpace(body)
	if len(trimmed) == 0 {
		return allowAll, nil
	}

	sc := newByteScanner("bytes", true)
	//sc.Quiet = !print_errors
	sc.feed(body, true)
	tokens := sc.scanAll()

	// special case worth optimization
	if len(tokens) == 0 {
		return allowAll, nil
	}

	r = &RobotsData{}
	parser := newParser(tokens)
	r.groups, r.Host, r.Sitemaps, errs = parser.parseAll()
	if len(errs) > 0 {
		return nil, newParseError(errs)
	}

	return r, nil
}

func FromString(body string) (r *RobotsData, err error) {
	return FromBytes([]byte(body))
}

func (r *RobotsData) TestAgent(path, agent string) bool {
	if r.allowAll {
		return true
	}
	if r.disallowAll {
		return false
	}

	// Find a group of rules that applies to this agent.
	// From Google's spec:
	// The user-agent is non-case-sensitive.
	g := r.FindGroup(agent)
	return g.Test(path)
}

// FindGroup searches the blocks of declarations for the specified user-agent.
// From Google's spec:
// Only one group of group-member records is valid for a particular crawler.
// The crawler must determine the correct group of records by finding the group
// with the most specific user-agent that still matches. All other groups of
// records are ignored by the crawler. The user-agent is non-case-sensitive.
// The order of the groups within the robots.txt file is irrelevant.
func (r *RobotsData) FindGroup(agent string) (ret *Group) {
	var prefixLen int

	agent = strings.ToLower(agent)
	if ret = r.groups["*"]; ret != nil {
		// Weakest match possible
		prefixLen = 1
	}
	for a, g := range r.groups {
		if a != "*" && strings.HasPrefix(agent, a) {
			if l := len(a); l > prefixLen {
				prefixLen = l
				ret = g
			}
		}
	}

	if ret == nil {
		return emptyGroup
	}
	return
}
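// For illustration, a sketch assuming the parsed data declares groups named
// "googlebot-news" and "*": matching is by prefix on the lowercased agent
// string, and the longest declared name wins, with "*" as the fallback.
//
//	r.FindGroup("Googlebot-News/2.1") // the "googlebot-news" group
//	r.FindGroup("UnknownBot/1.0")     // the "*" group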
func (g *Group) Test(path string) bool {
	if r := g.findRule(path); r != nil {
		return r.allow
	}

	// From Google's spec:
	// By default, there are no restrictions for crawling for the designated crawlers.
	return true
}

// From Google's spec:
// The path value is used as a basis to determine whether or not a rule applies
// to a specific URL on a site. With the exception of wildcards, the path is
// used to match the beginning of a URL (and any valid URLs that start with the
// same path).
//
// At a group-member level, in particular for allow and disallow directives,
// the most specific rule based on the length of the [path] entry will trump
// the less specific (shorter) rule. The order of precedence for rules with
// wildcards is undefined.
func (g *Group) findRule(path string) (ret *rule) {
	var prefixLen int

	for _, r := range g.rules {
		if r.pattern != nil {
			if r.pattern.MatchString(path) {
				// Consider this a match equal to the length of the pattern.
				// From Google's spec:
				// The order of precedence for rules with wildcards is undefined.
				if l := len(r.pattern.String()); l > prefixLen {
					prefixLen = l
					ret = r
				}
			}
		} else if r.path == "/" && prefixLen == 0 {
			// Weakest match possible
			prefixLen = 1
			ret = r
		} else if strings.HasPrefix(path, r.path) {
			if l := len(r.path); l > prefixLen {
				prefixLen = l
				ret = r
			}
		}
	}
	return
}
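Taken together, robotstxt.go exposes a small end-to-end API: fetch the file, build a RobotsData, then query it per agent. A minimal usage sketch (the URL and agent string are placeholders, and error handling is abbreviated to panics):

package main

import (
	"fmt"
	"net/http"

	"github.com/temoto/robotstxt"
)

func main() {
	// FromResponse reads the body and applies the 2xx/4xx/5xx rules
	// implemented in FromStatusAndBytes.
	res, err := http.Get("https://example.org/robots.txt")
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()

	robots, err := robotstxt.FromResponse(res)
	if err != nil {
		panic(err)
	}

	// One-shot check: find the most specific matching group, then test a path.
	fmt.Println(robots.TestAgent("/api/v1/instance", "GoToSocial"))

	// Or keep the group around to reuse it and to read its crawl delay.
	group := robots.FindGroup("GoToSocial")
	fmt.Println(group.Test("/nodeinfo/2.0"), group.CrawlDelay)
}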
185 vendor/github.com/temoto/robotstxt/scanner.go generated vendored Normal file
@@ -0,0 +1,185 @@
package robotstxt

import (
	"bytes"
	"fmt"
	"go/token"
	"os"
	"sync"
	"unicode/utf8"
)

type byteScanner struct {
	pos           token.Position
	buf           []byte
	ErrorCount    int
	ch            rune
	Quiet         bool
	keyTokenFound bool
	lastChunk     bool
}

const tokEOL = "\n"

var WhitespaceChars = []rune{' ', '\t', '\v'}
var tokBuffers = sync.Pool{New: func() interface{} { return bytes.NewBuffer(make([]byte, 32)) }}

func newByteScanner(srcname string, quiet bool) *byteScanner {
	return &byteScanner{
		Quiet: quiet,
		ch:    -1,
		pos:   token.Position{Filename: srcname},
	}
}

func (s *byteScanner) feed(input []byte, end bool) {
	s.buf = input
	s.pos.Offset = 0
	s.pos.Line = 1
	s.pos.Column = 1
	s.lastChunk = end

	// Read first char into look-ahead buffer `s.ch`.
	if !s.nextChar() {
		return
	}

	// Skip UTF-8 byte order mark
	if s.ch == 65279 {
		s.nextChar()
		s.pos.Column = 1
	}
}

func (s *byteScanner) GetPosition() token.Position {
	return s.pos
}

func (s *byteScanner) scan() string {
	// Note Offset > len, not >=, so we can scan last character.
	if s.lastChunk && s.pos.Offset > len(s.buf) {
		return ""
	}

	s.skipSpace()

	if s.ch == -1 {
		return ""
	}

	// EOL
	if s.isEol() {
		s.keyTokenFound = false
		// skip subsequent newline chars
		for s.ch != -1 && s.isEol() {
			s.nextChar()
		}
		// emit newline as separate token
		return tokEOL
	}

	// skip comments
	if s.ch == '#' {
		s.keyTokenFound = false
		s.skipUntilEol()
		if s.ch == -1 {
			return ""
		}
		// emit newline as separate token
		return tokEOL
	}

	// else we found something
	tok := tokBuffers.Get().(*bytes.Buffer)
	defer tokBuffers.Put(tok)
	tok.Reset()
	tok.WriteRune(s.ch)
	s.nextChar()
	for s.ch != -1 && !s.isSpace() && !s.isEol() {
		// Do not consider ":" to be a token separator if a first key token
		// has already been found on this line (avoid cutting an absolute URL
		// after the "http:")
		if s.ch == ':' && !s.keyTokenFound {
			s.nextChar()
			s.keyTokenFound = true
			break
		}

		tok.WriteRune(s.ch)
		s.nextChar()
	}
	return tok.String()
}

func (s *byteScanner) scanAll() []string {
	results := make([]string, 0, 64) // random guess of average tokens length
	for {
		token := s.scan()
		if token != "" {
			results = append(results, token)
		} else {
			break
		}
	}
	return results
}

func (s *byteScanner) error(pos token.Position, msg string) {
	s.ErrorCount++
	if !s.Quiet {
		fmt.Fprintf(os.Stderr, "robotstxt from %s: %s\n", pos.String(), msg)
	}
}

func (s *byteScanner) isEol() bool {
	return s.ch == '\n' || s.ch == '\r'
}

func (s *byteScanner) isSpace() bool {
	for _, r := range WhitespaceChars {
		if s.ch == r {
			return true
		}
	}
	return false
}

func (s *byteScanner) skipSpace() {
	for s.ch != -1 && s.isSpace() {
		s.nextChar()
	}
}

func (s *byteScanner) skipUntilEol() {
	for s.ch != -1 && !s.isEol() {
		s.nextChar()
	}
	// skip subsequent newline chars
	for s.ch != -1 && s.isEol() {
		s.nextChar()
	}
}

// Reads next Unicode char.
func (s *byteScanner) nextChar() bool {
	if s.pos.Offset >= len(s.buf) {
		s.ch = -1
		return false
	}
	s.pos.Column++
	if s.ch == '\n' {
		s.pos.Line++
		s.pos.Column = 1
	}
	r, w := rune(s.buf[s.pos.Offset]), 1
	if r >= 0x80 {
		r, w = utf8.DecodeRune(s.buf[s.pos.Offset:])
		if r == utf8.RuneError && w == 1 {
			s.error(s.pos, "illegal UTF-8 encoding")
		}
	}
	s.pos.Column++
	s.pos.Offset += w
	s.ch = r
	return true
}
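The keyTokenFound flag above is what keeps absolute URLs intact: only the first ":" on a line separates the key token from its value. A small sketch exercising this through the exported API (the sitemap URL is a placeholder):

package main

import (
	"fmt"

	"github.com/temoto/robotstxt"
)

func main() {
	// The "https:" inside the value is not treated as another separator,
	// so the full URL survives tokenization.
	robots, err := robotstxt.FromString("Sitemap: https://example.org/sitemap.xml\n")
	if err != nil {
		panic(err)
	}
	fmt.Println(robots.Sitemaps) // [https://example.org/sitemap.xml]
}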
5 vendor/modules.txt vendored
@@ -412,6 +412,8 @@ github.com/jackc/puddle/v2/internal/genstack
# github.com/jessevdk/go-flags v1.5.0
## explicit; go 1.15
github.com/jessevdk/go-flags
# github.com/jimsmart/grobotstxt v1.0.3
## explicit; go 1.14
# github.com/jinzhu/inflection v1.0.0
## explicit
github.com/jinzhu/inflection
@@ -831,6 +833,9 @@ github.com/tdewolff/parse/v2/strconv
# github.com/technologize/otel-go-contrib v1.1.1
## explicit; go 1.17
github.com/technologize/otel-go-contrib/otelginmetrics
# github.com/temoto/robotstxt v1.1.2
## explicit; go 1.11
github.com/temoto/robotstxt
# github.com/tetratelabs/wazero v1.8.2
## explicit; go 1.21
github.com/tetratelabs/wazero