diff --git a/internal/middleware/extraheaders.go b/internal/middleware/extraheaders.go index c75b65551..fb91bcc93 100644 --- a/internal/middleware/extraheaders.go +++ b/internal/middleware/extraheaders.go @@ -44,5 +44,12 @@ func ExtraHeaders() gin.HandlerFunc { // // See: https://github.com/patcg-individual-drafts/topics c.Header("Permissions-Policy", "browsing-topics=()") + + // Some AI scrapers respect the following tags to opt out + // of their crawling and datasets. + c.Header("X-Robots-Tag", "noimageai") + // c.Header calls .Set(), but we want to emit the header + // twice, not override it. + c.Writer.Header().Add("X-Robots-Tag", "noai") } } diff --git a/internal/web/robots.go b/internal/web/robots.go index 39708eb55..3309de97c 100644 --- a/internal/web/robots.go +++ b/internal/web/robots.go @@ -43,15 +43,24 @@ User-agent: cohere-ai User-agent: Diffbot User-agent: FacebookBot +User-agent: facebookexternalhit User-agent: FriendlyCrawler User-agent: Google-Extended User-agent: GoogleOther +User-agent: GoogleOther-Image +User-agent: GoogleOther-Video User-agent: GPTBot User-agent: ImagesiftBot User-agent: img2dataset +User-agent: Meta-ExternalAgent +User-agent: OAI-SearchBot User-agent: omgili User-agent: omgilibot User-agent: PerplexityBot +User-agent: PetalBot +User-agent: Scrapy +User-agent: Timpibot +User-agent: VelenPublicWebCrawler User-agent: YouBot Disallow: /