# Hello friends!
# If you are considering large or automated crawling, you may want to look at
# our catalog API (https://api.fatcat.wiki) or bulk database snapshots instead.

# large-scale bots should not index search pages
User-agent: SemrushBot
User-agent: YandexBot
User-agent: bingbot
User-agent: Googlebot
User-agent: SemanticScholarBot
User-agent: yacybot
User-agent: PetalBot
User-agent: Yeti
User-agent: Riddler
Disallow: /search

# Crawling search result pages is expensive, so we previously set a long crawl
# delay for bots other than the broad search bots listed above.
# UPDATE: as of 2022-10-31 we are overwhelmed, so block all robots from the
# search pages instead (old rules kept below, commented out).
User-agent: *
Disallow: /search
#Allow: /search
#Crawl-delay: 5

# By default, anything else on this domain may be crawled. HTTP 429 ("backoff")
# status codes are used for rate-limiting instead of any crawl delay specified
# here. Up to a handful of concurrent requests should be fine.
User-agent: *
Allow: /

Sitemap: https://scholar.archive.org/sitemap.xml
Sitemap: https://scholar.archive.org/sitemap-index-works.xml

# same info as sitemap-index-works.xml, plus following citation_pdf_url links
#Sitemap: https://scholar.archive.org/sitemap-index-access.xml