From 0e77e2daa1c48bcab9e57d79573401c732211ea0 Mon Sep 17 00:00:00 2001 From: gil Date: Tue, 25 Jun 2024 10:18:16 -0500 Subject: [PATCH] Add `robots.txt` --- .eleventy.js | 1 + README.md | 6 +-- src/res/robots.txt | 130 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 134 insertions(+), 3 deletions(-) create mode 100644 src/res/robots.txt diff --git a/.eleventy.js b/.eleventy.js index 45d4b5d..45b3a7f 100644 --- a/.eleventy.js +++ b/.eleventy.js @@ -11,6 +11,7 @@ module.exports = function (eleventyConfig) { eleventyConfig.addPassthroughCopy({ "./src/res/css/**/*.css": "css" }); eleventyConfig.addPassthroughCopy({ "./src/res/font": "font" }); eleventyConfig.addPassthroughCopy({ "./src/res/img": "img" }); + eleventyConfig.addPassthroughCopy({ "./src/res/*.txt": "."}); eleventyConfig.addWatchTarget("./src/res/css"); return { diff --git a/README.md b/README.md index 1f65dd9..a382331 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Todo -- [x] Add colophon and licensing info (kinda) +- [x] Add colophon and licensing info - [x] Clean up and polish landing page - [ ] Do some i18n? (e.g., English, German, Tagalog, Bisaya, Spanish, etc.) - [ ] Add more graphical stuff to site so it doesn't look so bland @@ -10,8 +10,8 @@ - [ ] Global background - [ ] Custom badge - [ ] Make a webring with ringfairy and join webrings -- [x] Redo about page (kinda) -- [ ] Add blog post +- [x] Redo about page +- [x] Add blog post # Changelog diff --git a/src/res/robots.txt b/src/res/robots.txt new file mode 100644 index 0000000..b9ddede --- /dev/null +++ b/src/res/robots.txt @@ -0,0 +1,130 @@ +# Currently borrowed from seirdy.one. Will add my own blocks +User-agent: * + +# I opt out of online advertising so malware that injects ads on my site won't +# get paid. You should do the same. my ads.txt file contains a standard +# placeholder to forbid any compliant ad networks from paying for ad placement +# on my domain. 
+User-agent: Adsbot +Disallow: / +Allow: /ads.txt +Allow: /app-ads.txt + +# By allowing us access, you enable the maximum number +# of advertisers to confidently purchase advertising space on your pages. Our +# comprehensive data insights help advertisers understand the suitability and +# context of your content, ensuring that their ads align with your audience's +# interests and needs. This alignment leads to improved user experiences, +# increased engagement, and ultimately, higher revenue potential for your +# publication. (https://www.peer39.com/crawler-notice) +# --> fuck off. +User-agent: peer39_crawler +User-agent: peer39_crawler/1.0 +Disallow: / + +## IP-violation scanners ## + +# The next three are borrowed from https://www.videolan.org/robots.txt + +# > This robot collects content from the Internet for the sole purpose of +# helping educational institutions prevent plagiarism. [...] we compare student +# papers against the content we find on the Internet to see if we can find +# similarities. (http://www.turnitin.com/robot/crawlerinfo.html) +# --> fuck off. +User-agent: TurnitinBot +Disallow: / + +# > NameProtect engages in crawling activity in search of a wide range of brand +# and other intellectual property violations that may be of interest to our +# clients. (http://www.nameprotect.com/botinfo.html) +# --> fuck off. +User-agent: NPBot +Disallow: / + +# iThenticate is a new service we have developed to combat the piracy of +# intellectual property and ensure the originality of written work for +# publishers, non-profit agencies, corporations, and newspapers. +# (http://www.slysearch.com/) +# --> fuck off. +User-agent: SlySearch +Disallow: / + +# BLEXBot assists internet marketers to get information on the link structure +# of sites and their interlinking on the web, to avoid any technical and +# possible legal issues and improve overall online experience. +# (http://webmeup-crawler.com/) +# --> fuck off. 
+User-agent: BLEXBot +Disallow: / + +# Providing Intellectual Property professionals with superior brand protection +# services by artfully merging the latest technology with expert analysis. +# (https://www.checkmarknetwork.com/spider.html/) +# "The Internet is just way to big to effectively police alone." (ACTUAL quote) +# --> fuck off. +User-agent: CheckMarkNetwork/1.0 (+https://www.checkmarknetwork.com/spider.html) +Disallow: / + +# Stop trademark violations and affiliate non-compliance in paid search. +# Automatically monitor your partner and affiliates’ online marketing to +# protect yourself from harmful brand violations and regulatory risks. We +# regularly crawl websites on behalf of our clients to ensure content +# compliance with brand and regulatory guidelines. +# (https://www.brandverity.com/why-is-brandverity-visiting-me) +# --> fuck off. +User-agent: BrandVerity/1.0 +Disallow: / + +## Misc. icky stuff ## + +# Pipl assembles online identity information from multiple independent sources +# to create the most complete picture of a digital identity and connect it to +# real people and their offline identity records. When all the fragments of +# online identity data are collected, connected, and corroborated, the result +# is a more trustworthy identity. +# --> fuck off. +User-agent: PiplBot +Disallow: / + +## Gen-AI data scrapers ## + +# Eat shit, OpenAI. +User-agent: ChatGPT-User +User-agent: GPTBot +Disallow: / + +# Official way to opt-out of Google's generative AI training: +# +User-agent: Google-Extended +Disallow: / + +# Official way to opt-out of LLM training by Apple +# +User-agent: Applebot-Extended +Disallow: / + +# Anthropic-AI crawler posted guidance after a long period of crawling without opt-out documentation: +User-agent: ClaudeBot +Disallow: / + +# FacebookBot crawls public web pages to improve language models for our speech +# recognition technology. +# +User-agent: FacebookBot +Disallow: / + +# I'm not blocking CCBot for now. 
It publishes a free index for anyone to use. +# Google used this to train the initial version of Bard (now called Gemini). +# I allow CCBot since its index is also used for upstart/hobbyist search engines +# like Alexandria and for genuinely useful academic work I personally like. +# I allow Owler for similar reasons: +# +# . +# Omgilibot/Omgili is similar to CCBot, except it sells the scrape results. +# I'm not familiar enough with Omgili to make a call here. +# In the long run, my embedded robots meta-tags and headers could cover gen-AI + +# I don't block cohere-ai or Perplexitybot: they don't appear to actually +# scrape data for LLM training purposes. The crawling powers search engines +# with integrated pre-trained LLMs. +# TODO: investigate whether YouBot scrapes to train its own in-house LLM. \ No newline at end of file