# Cyclesite — UK Bike Marketplace
# https://www.cyclesite.co.uk
#
# This file is the canonical robots.txt body served from
# frontend-next/app/robots.txt/route.ts. Edit here directly — the legacy
# `frontend/scripts/build-robots.mjs` generator is no longer the source of
# truth (frontend-next is the canonical SSR codebase as of 2026-05-01).
#
# AI & LLM Policy: Cyclesite WELCOMES responsible AI crawling.
# See: https://www.cyclesite.co.uk/llms.txt
# See: https://www.cyclesite.co.uk/ai.txt
# See: https://www.cyclesite.co.uk/.well-known/ai.json
# See: https://www.cyclesite.co.uk/.well-known/agents.json
# See: https://www.cyclesite.co.uk/.well-known/mcp.json            (one-line MCP pointer)
# See: https://www.cyclesite.co.uk/.well-known/mcp/server-card.json (full MCP server card)
# MCP Streamable HTTP endpoint: https://www.cyclesite.co.uk/api/mcp-server
# npm install:                  npx -y cyclesite-mcp-server
#
# NOTE on /auth/ — intentionally NOT blocked for search engines
# (Googlebot / Bingbot / etc.). Those pages emit
# <meta name="robots" content="noindex"> server-side via the SEO
# component. Blocking crawl here would prevent Google from seeing
# the directive, causing URLs to be indexed as "URL submitted"
# with no snippet (documented anti-pattern — see SEO incident memo
# 2026-04-13). Let Google crawl /auth/*, read the noindex, and
# drop the URL cleanly.
#
# AI training crawlers (GPTBot, Amazonbot, Meta-ExternalAgent etc.)
# are a different story — they aren't search engines, so the "must
# crawl to see noindex" rationale doesn't apply, and they don't need
# indexing-status feedback at all. Until 2026-05-11 they were
# fetching /auth/sign-in ~5800 times/day (GPTBot alone: 3680 hits ≈
# 637 MB egress). That's wasted Railway bandwidth on a page that
# returns noindex,nofollow and has no useful AI-training content.
# Disallow: /auth/ is added in the AI-crawlers group below.

# Default rules — applies to all user-agents not listed below
#
# ai-train=no (2026-06-17, was =yes 2026-06-10..2026-06-17): commercial-
# protection posture. We WANT to be found and cited live — search=yes and
# ai-input=yes keep full search visibility and the AI-answer/citation upside
# (OAI-SearchBot, PerplexityBot, Claude-User, Googlebot etc. all stay
# allowed). But our first-party data (sold/asking-price corpus, valuation
# curves, listings) is the moat, so we do NOT release it as free LLM training
# data. Training and bulk reuse are reserved for commercial licensing — see
# /ai.txt §2 and /legal/data-licence (partnerships@cyclesite.co.uk). As of
# 2026-06-18 Google-Extended is ALLOWED (to court Gemini/Vertex grounding
# citations, which it gates); the training-opt-out is now carried by the
# Content-Signal ai-train=no header below rather than a hard Disallow.
# Applebot-Extended (Apple training, little citation upside) stays Disallowed in
# the section below.
# Fallback group: governs every crawler that has no more specific
# User-agent group in this file (e.g. Googlebot, which has no named group here).
User-agent: *
Content-Signal: search=yes, ai-input=yes, ai-train=no
# Explicit allow-all baseline. The Disallow rules that follow still apply
# because RFC 9309 resolves conflicts by the longest matching path.
Allow: /
# Trailing-slash patterns match only URLs beneath that directory, so
# `Disallow: /ai/` leaves /ai.txt (referenced at the top of this file) crawlable.
Disallow: /account/
Disallow: /admin/
Disallow: /seller/
Disallow: /dealer/
Disallow: /business/
Disallow: /ai/
Disallow: /dev/
# No trailing slash: plain prefix match, so this also covers /swipe/... and
# /swipe?... (and any other path that merely starts with "/swipe").
Disallow: /swipe
# /compare-bikes pair removed 2026-05-17: the `Disallow: /compare-bikes`
# line blocked Google from crawling the bare hub URL (the
# `Allow: /compare-bikes/` trailing-slash variant only re-allows
# /compare-bikes/<slug>, not the bare /compare-bikes). The hub is a real
# indexable page (`<meta robots content="index, follow">`, in sitemap)
# serving "Compare Bikes Side by Side" content — the rules were a relic
# of an old policy intent. Audit 2026-05-17.
# /buy, /buy?, /bikes-for-sale?, /search? — un-blocked 2026-05-10.
# The original Disallow rules pre-dated the SSR cutover.
# Three problems they caused:
#   1. /buy and /buy?location=<city> 301-redirect to /bikes-for-sale and
#      /locations/<city>. Disallow blocked Google from FETCHING the URL,
#      so it never followed the redirect — historical clicks (e.g. GSC
#      data showed clicks on /buy?location=hereford) became zombie URLs
#      stuck in the index, never passing PageRank to the canonical
#      destination.
#   2. /bikes-for-sale?page=2 etc. emit <link rel="canonical"> pointing
#      at /bikes-for-sale. Crawl + canonical is the standard pagination
#      pattern; Disallow short-circuited Google's ability to consolidate.
#   3. /search?q=… emits noindex,follow + canonical /search. Google needs
#      to crawl to read the noindex; Disallow makes Google guess (or
#      worse, "URL submitted, content unknown" warnings).
# Verified canonical tags + noindex (where appropriate) are emitted on
# every variant before un-blocking. Audit 2026-05-10.
Disallow: /checkout/
Disallow: /billing/
# NOTE(review): prefix match — this also blocks any shop URL whose slug begins
# with "map" (e.g. /bike-shops/maple-...). Confirm no such slugs exist, or
# anchor the rule if only the map view is meant.
Disallow: /bike-shops/map
Disallow: /bike-shops/*/reviews
# The Allow lines below are already implied by `Allow: /`; they document the
# intent that the bike-history hub, checker and linked-listings stay crawlable.
Allow: /bike-history
Allow: /bike-history/check
Allow: /bike-history/*/linked-listings
# These patterns are longer than `Allow: /bike-history`, so under
# longest-match precedence they win for the claim / add-event URLs.
Disallow: /bike-history/*/claim
Disallow: /bike-history/*/add-event
# /stolen-bikes/report un-blocked 2026-05-17: real indexable page
# (`Report a Stolen Bike UK`, in sitemap at priority 0.75) that the
# team wants Google to find. The previous Disallow was a
# sitemap-vs-robots conflict that surfaced as GSC "Submitted URL
# blocked by robots.txt" warnings.
Disallow: /stolen-bikes/search
Disallow: /contact/ticket
Disallow: /contact-seller/
Disallow: /contact-retailer/
Disallow: /support/report-abuse
# Listing sub-views. `*` matches any run of characters (including "/") and
# every pattern is an unanchored prefix.
# NOTE(review): if listing URLs have the shape /listing/<id>/<slug>, a slug that
# starts with "spec", "seller", "share", "similar", "photos" or "history"
# (e.g. /listing/123/specialized-allez) is blocked too — verify against the
# real URL scheme.
Disallow: /listing/*/photos
Disallow: /listing/*/spec
Disallow: /listing/*/history
Disallow: /listing/*/seller
Disallow: /listing/*/similar
Disallow: /listing/*/share
# Redundant with `Allow: /`, kept as an explicit statement of intent.
Allow: /forum
Allow: /forum/*
# Community/forum surface — Reddit-shaped public discussions.
# Every post page is a high-value cycling Q&A; we want every search and
# AI engine to crawl, render, and cite. Markdown alternates at /markdown
# are the AI-preferred path.
Allow: /community
Allow: /community/*
Allow: /c/*
Allow: /u/*

# =====================================================
# TRAINING-ONLY OPT-OUT (updated 2026-06-18)
# These tokens gate ONLY AI-model training and carry zero search/discovery
# cost; the matching search crawlers keep their access elsewhere in this file
# (Googlebot via the default `*` group, Applebot via the AI / LLM crawlers group).
#
# 2026-06-18 change: Google-Extended is now ALLOWED. GA4 showed Gemini and
# Perplexity sending ~0 referral while ChatGPT became our #3 source; courting
# Gemini/Vertex grounding citations (which Google-Extended gates) is worth more
# than the training opt-out it also covered. The commercial-protection posture
# is now carried solely by the Content-Signal `ai-train=no` header + /ai.txt §2
# + /legal/data-licence (the softer, still-honoured signal) — see the AI
# crawlers group below. Applebot-Extended stays Disallowed: Apple Intelligence
# *training* carries little citation upside (Applebot itself, allowed, already
# powers Siri/Spotlight citation), so keeping that opt-out costs no live reach.
# Training / bulk reuse remains reserved for commercial licensing
# (/ai.txt §2, /legal/data-licence, partnerships@cyclesite.co.uk).
# =====================================================
# Google-Extended matches this dedicated group, so it never reads the `*`
# group or the AI / LLM crawlers group. The ai-train=no Content-Signal — the
# only training opt-out left for this token now that it is allowed — must
# therefore be restated here, otherwise no signal is declared for it at all.
User-agent: Google-Extended
Content-Signal: search=yes, ai-input=yes, ai-train=no
Allow: /

# Denies the whole site to the Applebot-Extended token only. The plain
# `Applebot` token is listed separately in the AI / LLM crawlers group and
# is not affected by this rule.
User-agent: Applebot-Extended
Disallow: /

# =====================================================
# AI / LLM CRAWLERS — welcomed with the same block list as *, plus /auth/
# CCBot (Common Crawl) stays crawlable for discovery and live citation, but
# the group Content-Signal below now declares ai-train=no (2026-06-17
# commercial-protection posture): training and bulk reuse are licence-gated,
# not free. To HARD-block training-corpus crawlers (CCBot, GPTBot, ClaudeBot)
# entirely, give them their own Disallow groups — deliberately NOT done here,
# to preserve discovery and live citation. Bytespider remains blocked
# separately due to documented rate-limit and robots.txt violations.
# =====================================================
# One shared group: every token listed here gets the rules that follow.
# Perplexity's user-triggered fetcher identifies as `Perplexity-User`
# (hyphenated). The previous `PerplexityUser` spelling matched no crawler, so
# that agent silently fell through to the `*` group and never saw this
# group's /auth/ block.
# NOTE(review): BingPreview, Applebot, YandexBot and Baiduspider are search
# crawlers; listing them here also applies `Disallow: /auth/` to them, so
# they cannot read the /auth/ noindex tag — confirm that is intended.
User-agent: GPTBot
User-agent: ChatGPT-User
User-agent: OAI-SearchBot
User-agent: GoogleOther
User-agent: GoogleOther-Image
User-agent: GoogleOther-Video
User-agent: Anthropic-AI
User-agent: ClaudeBot
User-agent: Claude-Web
User-agent: Claude-User
User-agent: BingPreview
User-agent: Applebot
User-agent: Meta-ExternalAgent
User-agent: Meta-ExternalFetcher
User-agent: FacebookBot
User-agent: PerplexityBot
User-agent: Perplexity-User
User-agent: cohere-ai
User-agent: YouBot
User-agent: Amazonbot
User-agent: PetalBot
User-agent: Diffbot
User-agent: AI2Bot
User-agent: MistralBot
User-agent: xAI-Grok
User-agent: DeepSeekBot
User-agent: InflectionBot
User-agent: CCBot
User-agent: YandexBot
User-agent: Baiduspider
# ai-train=no — see the rationale on the default group's Content-Signal.
# These crawlers stay ALLOWED so Cyclesite is found and cited live
# (ai-input=yes); our data is just not free training material. Google-Extended
# is now ALLOWED too (2026-06-18, for Gemini citation); only Applebot-Extended
# keeps its own Disallow group above.
Content-Signal: search=yes, ai-input=yes, ai-train=no
Allow: /
# /auth/* — disallowed for AI crawlers only (see top-of-file note).
# Search engines keep /auth/ access so they can read the noindex meta and
# drop URLs cleanly: Googlebot through the default `*` group above, Bingbot
# through the search-engine group below (neither contains this rule).
Disallow: /auth/
Disallow: /account/
Disallow: /admin/
Disallow: /seller/
Disallow: /dealer/
Disallow: /business/
# Trailing slash: blocks /ai/... only; /ai.txt stays crawlable.
Disallow: /ai/
Disallow: /dev/
# No trailing slash: prefix match (covers /swipe, /swipe/..., /swipe?...).
Disallow: /swipe
# /compare-bikes pair removed 2026-05-17: the `Disallow: /compare-bikes`
# line blocked Google from crawling the bare hub URL (the
# `Allow: /compare-bikes/` trailing-slash variant only re-allows
# /compare-bikes/<slug>, not the bare /compare-bikes). The hub is a real
# indexable page (`<meta robots content="index, follow">`, in sitemap)
# serving "Compare Bikes Side by Side" content — the rules were a relic
# of an old policy intent. Audit 2026-05-17.
# /buy, /buy?, /bikes-for-sale?, /search? — un-blocked 2026-05-10.
# The original Disallow rules pre-dated the SSR cutover.
# Three problems they caused:
#   1. /buy and /buy?location=<city> 301-redirect to /bikes-for-sale and
#      /locations/<city>. Disallow blocked Google from FETCHING the URL,
#      so it never followed the redirect — historical clicks (e.g. GSC
#      data showed clicks on /buy?location=hereford) became zombie URLs
#      stuck in the index, never passing PageRank to the canonical
#      destination.
#   2. /bikes-for-sale?page=2 etc. emit <link rel="canonical"> pointing
#      at /bikes-for-sale. Crawl + canonical is the standard pagination
#      pattern; Disallow short-circuited Google's ability to consolidate.
#   3. /search?q=… emits noindex,follow + canonical /search. Google needs
#      to crawl to read the noindex; Disallow makes Google guess (or
#      worse, "URL submitted, content unknown" warnings).
# Verified canonical tags + noindex (where appropriate) are emitted on
# every variant before un-blocking. Audit 2026-05-10.
Disallow: /checkout/
Disallow: /billing/
# NOTE(review): prefix match — also blocks shop slugs beginning with "map";
# confirm that only the map view is affected in practice.
Disallow: /bike-shops/map
Disallow: /bike-shops/*/reviews
# Already implied by `Allow: /`; kept to state that these pages are public.
Allow: /bike-history
Allow: /bike-history/check
Allow: /bike-history/*/linked-listings
# Longer (more specific) than `Allow: /bike-history`, so these win for the
# claim / add-event URLs under longest-match precedence.
Disallow: /bike-history/*/claim
Disallow: /bike-history/*/add-event
# /stolen-bikes/report un-blocked 2026-05-17: real indexable page
# (`Report a Stolen Bike UK`, in sitemap at priority 0.75) that the
# team wants Google to find. The previous Disallow was a
# sitemap-vs-robots conflict that surfaced as GSC "Submitted URL
# blocked by robots.txt" warnings.
Disallow: /stolen-bikes/search
Disallow: /contact/ticket
Disallow: /contact-seller/
Disallow: /contact-retailer/
Disallow: /support/report-abuse
# Listing sub-views; `*` spans any characters and each rule is a prefix.
# NOTE(review): a URL like /listing/<id>/specialized-... would also match
# `/listing/*/spec` — verify against the real listing URL scheme.
Disallow: /listing/*/photos
Disallow: /listing/*/spec
Disallow: /listing/*/history
Disallow: /listing/*/seller
Disallow: /listing/*/similar
Disallow: /listing/*/share
# Forum and community surfaces are explicitly open to AI crawlers
# (redundant with `Allow: /`, kept as a statement of intent).
Allow: /forum
Allow: /forum/*
Allow: /community
Allow: /community/*
Allow: /c/*
Allow: /u/*

# =====================================================
# Search engines and social bots
# =====================================================
# Shared group for the named search / link-preview agents.
# /auth/ is deliberately absent from the Disallow list so these agents can
# fetch the page and read its noindex tag (see the top-of-file note).
# NOTE(review): this group carries no Content-Signal line, unlike the `*`
# and AI groups — confirm the omission is intentional.
User-agent: Bingbot
User-agent: facebookexternalhit
User-agent: BraveBot
User-agent: DuckDuckBot
Allow: /
Disallow: /account/
Disallow: /admin/
Disallow: /seller/
Disallow: /dealer/
Disallow: /business/
# Trailing slash: blocks /ai/... only; /ai.txt stays crawlable.
Disallow: /ai/
Disallow: /dev/
# No trailing slash: prefix match (covers /swipe, /swipe/..., /swipe?...).
Disallow: /swipe
# /compare-bikes pair removed 2026-05-17: the `Disallow: /compare-bikes`
# line blocked Google from crawling the bare hub URL (the
# `Allow: /compare-bikes/` trailing-slash variant only re-allows
# /compare-bikes/<slug>, not the bare /compare-bikes). The hub is a real
# indexable page (`<meta robots content="index, follow">`, in sitemap)
# serving "Compare Bikes Side by Side" content — the rules were a relic
# of an old policy intent. Audit 2026-05-17.
# /buy, /buy?, /bikes-for-sale?, /search? — un-blocked 2026-05-10.
# The original Disallow rules pre-dated the SSR cutover.
# Three problems they caused:
#   1. /buy and /buy?location=<city> 301-redirect to /bikes-for-sale and
#      /locations/<city>. Disallow blocked Google from FETCHING the URL,
#      so it never followed the redirect — historical clicks (e.g. GSC
#      data showed clicks on /buy?location=hereford) became zombie URLs
#      stuck in the index, never passing PageRank to the canonical
#      destination.
#   2. /bikes-for-sale?page=2 etc. emit <link rel="canonical"> pointing
#      at /bikes-for-sale. Crawl + canonical is the standard pagination
#      pattern; Disallow short-circuited Google's ability to consolidate.
#   3. /search?q=… emits noindex,follow + canonical /search. Google needs
#      to crawl to read the noindex; Disallow makes Google guess (or
#      worse, "URL submitted, content unknown" warnings).
# Verified canonical tags + noindex (where appropriate) are emitted on
# every variant before un-blocking. Audit 2026-05-10.
Disallow: /checkout/
Disallow: /billing/
# NOTE(review): prefix match — also blocks shop slugs beginning with "map";
# confirm that only the map view is affected in practice.
Disallow: /bike-shops/map
Disallow: /bike-shops/*/reviews
# Already implied by `Allow: /`; kept to state that these pages are public.
Allow: /bike-history
Allow: /bike-history/check
Allow: /bike-history/*/linked-listings
# Longer (more specific) than `Allow: /bike-history`, so these win for the
# claim / add-event URLs under longest-match precedence.
Disallow: /bike-history/*/claim
Disallow: /bike-history/*/add-event
# /stolen-bikes/report un-blocked 2026-05-17: real indexable page
# (`Report a Stolen Bike UK`, in sitemap at priority 0.75) that the
# team wants Google to find. The previous Disallow was a
# sitemap-vs-robots conflict that surfaced as GSC "Submitted URL
# blocked by robots.txt" warnings.
Disallow: /stolen-bikes/search
Disallow: /contact/ticket
Disallow: /contact-seller/
Disallow: /contact-retailer/
Disallow: /support/report-abuse
# Listing sub-views; `*` spans any characters and each rule is a prefix.
# NOTE(review): a URL like /listing/<id>/specialized-... would also match
# `/listing/*/spec` — verify against the real listing URL scheme.
Disallow: /listing/*/photos
Disallow: /listing/*/spec
Disallow: /listing/*/history
Disallow: /listing/*/seller
Disallow: /listing/*/similar
Disallow: /listing/*/share
# Unlike the `*` and AI groups, this group has no explicit /community, /c/
# or /u/ Allow lines; those paths remain crawlable through `Allow: /`.
Allow: /forum
Allow: /forum/*

# =====================================================
# SEO monitoring — allowed with rate limits
# =====================================================
# AhrefsBot and SemrushBot receive exactly the same rules, so they are
# declared as ONE multi-agent group (the pattern already used by the
# search-engine and AI groups) instead of two hand-maintained copies that can
# drift apart. This also removes the spot where the SemrushBot record used to
# start directly after AhrefsBot's Crawl-delay line with no separator.
User-agent: AhrefsBot
User-agent: SemrushBot
Allow: /
Disallow: /account/
Disallow: /admin/
Disallow: /seller/
Disallow: /dealer/
Disallow: /business/
# Trailing slash: blocks /ai/... only; /ai.txt stays crawlable.
Disallow: /ai/
Disallow: /dev/
# No trailing slash: prefix match (covers /swipe, /swipe/..., /swipe?...).
Disallow: /swipe
# /compare-bikes, /buy, /buy?, /bikes-for-sale? and /search? are intentionally
# NOT blocked (audits 2026-05-10 and 2026-05-17). The full rationale is
# recorded once, in the default `User-agent: *` group.
Disallow: /checkout/
Disallow: /billing/
Disallow: /bike-shops/map
Disallow: /bike-shops/*/reviews
# Already implied by `Allow: /`; kept to state that these pages are public.
Allow: /bike-history
Allow: /bike-history/check
Allow: /bike-history/*/linked-listings
# Longer than `Allow: /bike-history`, so these win under longest-match.
Disallow: /bike-history/*/claim
Disallow: /bike-history/*/add-event
# /stolen-bikes/report is intentionally NOT blocked (un-blocked 2026-05-17,
# see the default group); only the search endpoint is.
Disallow: /stolen-bikes/search
Disallow: /contact/ticket
Disallow: /contact-seller/
Disallow: /contact-retailer/
Disallow: /support/report-abuse
Disallow: /listing/*/photos
Disallow: /listing/*/spec
Disallow: /listing/*/history
Disallow: /listing/*/seller
Disallow: /listing/*/similar
Disallow: /listing/*/share
Allow: /forum
Allow: /forum/*
# The "rate limit" for this section: both bots are asked to leave 2 seconds
# between requests. Crawl-delay is a non-standard extension (not part of
# RFC 9309); it only has an effect for crawlers that choose to honour it.
Crawl-delay: 2

# =====================================================
# Blocked scrapers (malicious / aggressive / ignore rate limits)
# =====================================================
# Single shared record: every token listed here is denied the whole site.
# To unblock one crawler, delete its User-agent line.
User-agent: MJ12bot
User-agent: DotBot
User-agent: BLEXBot
User-agent: MegaIndex
User-agent: SEOkicks
User-agent: SerpstatBot
User-agent: Sistrix
User-agent: SiteExplorer
User-agent: Screaming Frog
User-agent: Sogou
User-agent: MauiBot
User-agent: linkdexbot
User-agent: spbot
User-agent: Exabot
User-agent: Gigabot
User-agent: Netcraft
User-agent: BlekkoBot
Disallow: /

# CCBot (Common Crawl) — UNBLOCKED 2026-05-17, now allowed via the AI / LLM
# CRAWLERS group above with a conservative 15 r/s rate limit at the middleware
# layer (apps/api/src/shared/constants/bot-registry.ts).

# Single shared record for Bytespider, generic HTTP-library user agents and
# security-scanner tokens: each one is denied the whole site. robots.txt is
# advisory only — it has an effect solely on clients that choose to read it.
User-agent: Bytespider
User-agent: python-requests
User-agent: python-urllib
User-agent: scrapy
User-agent: curl
User-agent: wget
User-agent: libwww
User-agent: Java
User-agent: Go-http-client
User-agent: axios
User-agent: node-fetch
User-agent: http-client
User-agent: Nikto
User-agent: sqlmap
User-agent: nmap
User-agent: masscan
User-agent: ZAP
User-agent: Burp
User-agent: PhantomJS
Disallow: /

# HeadlessChrome intentionally NOT blocked — Lighthouse, PSI, web.dev
# measure, chrome-devtools MCP and Google's GSC live URL render all
# ship "HeadlessChrome" in their UA. Blocking it broke our own SEO
# diagnostics for zero gain (determined scrapers spoof UA anyway).
# Behavioural detection layer handles lazy puppeteer defaults.
# Matches bot-registry.ts BLOCKED_BOT_PATTERNS comment (2026-05-03).

# Single shared record for offline-mirroring / site-copier tools: each token
# listed here is denied the whole site.
User-agent: HTTrack
User-agent: Offline Explorer
User-agent: WebCopier
User-agent: SiteSucker
User-agent: Teleport Pro
User-agent: WebCapture
Disallow: /

# =====================================================
# Sitemaps
# =====================================================
# Sitemap lines are file-global: they belong to no User-agent group and are
# visible to every crawler that reads this file, including agents whose
# group is `Disallow: /`.
#
# Primary URL index (proxied to backend, splits into per-category children).
Sitemap: https://www.cyclesite.co.uk/sitemap-index.xml
# Image sitemap — declared explicitly so crawlers do not have to discover it
# from the index. Spec: https://developers.google.com/search/docs/crawling-indexing/sitemaps/image-sitemaps
Sitemap: https://www.cyclesite.co.uk/sitemap-images.xml
# News sitemap. Only includes editorial articles published in the last 48
# hours, per the Google News protocol. Discovers /bike-guides news content.
Sitemap: https://www.cyclesite.co.uk/news-sitemap.xml
# Bike-guides section sitemap. Editorial guides, topic-cluster pages,
# author pages and supporting feeds. lastmod reflects actual publication
# dates, not deploy date, so crawlers see a natural editorial cadence.
Sitemap: https://www.cyclesite.co.uk/bike-guides/sitemap.xml

# Community section sitemap (forum hub, about, feedback board, topic seeds).
# Supplemental until the backend sitemapindex enumerates /community natively.
Sitemap: https://www.cyclesite.co.uk/community/sitemap.xml