diff --git a/internal/web/robots.go b/internal/web/robots.go index a79c31034..aee4d1a55 100644 --- a/internal/web/robots.go +++ b/internal/web/robots.go @@ -34,6 +34,36 @@ const ( User-agent: GPTBot Disallow: / +# As of September 2023, GPTBot and ChatGPT-User are equivalent. But there's no telling +# when OpenAI might decide to change that, so block this one too. +User-agent: ChatGPT-User +Disallow: / + +# And a giant fuck you to Google Bard and their other generative AI ventures too. +# https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers +User-agent: Google-Extended +Disallow: / + +# Block CommonCrawl. Used in training LLMs and specifically GPT-3. +# https://commoncrawl.org/faq +User-agent: CCBot +Disallow: / + +# Block Omgili/Webz.io, a "Big Web Data" engine. +# https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/ +User-agent: Omgilibot +Disallow: / + +# Block FacebookBot, because Meta. +# https://developers.facebook.com/docs/sharing/bot +User-agent: FacebookBot +Disallow: / + +# Well-known.dev crawler. Indexes stuff under /.well-known. +# https://well-known.dev/about/ +User-agent: WellKnownBot +Disallow: / + # Rules for everything else. User-agent: * Crawl-delay: 500