From 0cce2c083809ad6e64182fd4dc05ddec7d73323b Mon Sep 17 00:00:00 2001
From: Daenney <daenney@users.noreply.github.com>
Date: Sat, 30 Sep 2023 21:44:57 +0200
Subject: [PATCH] [feature] Block a bunch of "AI" crawlers (#2239)

* [feature] Block Google Bard/AI crawlers

* [feature] Block the other OpenAI crawler

* [feature] Block Common Crawl crawler

Common Crawl is used in research, but it also gleefully advertises itself
as a training source for LLMs, including GPT-3.

Fixes: #2240

* [feature] Block Omgilibot

Used by some shady big web data engine company.

* [feature] Block Meta's language model crawler

* [feature] Block well-known.dev crawler
---
 internal/web/robots.go | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/internal/web/robots.go b/internal/web/robots.go
index a79c31034..aee4d1a55 100644
--- a/internal/web/robots.go
+++ b/internal/web/robots.go
@@ -34,6 +34,36 @@ const (
 User-agent: GPTBot
 Disallow: /
 
+# As of September 2023, GPTBot and ChatGPT-User are equivalent. But there's no telling
+# when OpenAI might decide to change that, so block this one too.
+User-agent: ChatGPT-User
+Disallow: /
+
+# And a giant fuck you to Google Bard and their other generative AI ventures too.
+# https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers
+User-agent: Google-Extended
+Disallow: /
+
+# Block CommonCrawl. Used in training LLMs and specifically GPT-3.
+# https://commoncrawl.org/faq
+User-agent: CCBot
+Disallow: /
+
+# Block Omgili/Webz.io, a "Big Web Data" engine.
+# https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/
+User-agent: Omgilibot
+Disallow: /
+
+# Block FacebookBot, because Meta.
+# https://developers.facebook.com/docs/sharing/bot
+User-agent: FacebookBot
+Disallow: /
+
+# Well-known.dev crawler. Indexes stuff under /.well-known.
+# https://well-known.dev/about/
+User-agent: WellKnownBot
+Disallow: /
+
 # Rules for everything else.
 User-agent: *
 Crawl-delay: 500