misc: simplify user-agent bot check (#68644)

This commit is contained in:
Frédéric Péters 2022-09-01 21:02:33 +02:00
parent 5522a3f7b3
commit f99948d1de
3 changed files with 3 additions and 188 deletions

View File

@@ -1,178 +0,0 @@
AbachoBOT
abcdatos_botlink
http://www.abcdatos.com/botlink/
AESOP_com_SpiderMan
ah-ha.com crawler (crawler@ah-ha.com)
ia_archiver
Scooter
Mercator
Scooter2_Mercator_3-1.0
roach.smo.av.com-1.0
Tv<nn>_Merc_resh_26_1_D-1.0
AltaVista-Intranet
jan.gelin@av.com
FAST-WebCrawler
crawler@fast.no
Acoon Robot
antibot
Atomz
AxmoRobot
Buscaplus Robi
http://www.buscaplus.com/robi/
CanSeek/
support@canseek.ca
ChristCRAWLER
http://www.christcrawler.com/
Clushbot
http://www.clush.com/bot.html
Crawler
admin@crawler.de
DaAdLe.com ROBOT/
RaBot
Agent-admin/ phortse@hanmail.net
contact/jylee@kies.co.kr
RaBot
Agent-admin/ webmaster@kisco.go.kr
DeepIndex
DittoSpyder
Jack
EARTHCOM.info
Speedy Spider
ArchitextSpider
ArchitectSpider
EuripBot
Arachnoidea
arachnoidea@euroseek.net
EZResult
Fast PartnerSite Crawler
FAST Data Search Crawler
FAST Data Search Document Retriever
KIT-Fireball
france.misesajour.com
FyberSearch
GalaxyBot
http://www.galaxy.com/galaxybot.html
geckobot
GenCrawler
GeonaBot
getRAX
Googlebot
googlebot@googlebot.com
http://googlebot.com/
moget/2.0
moget@goo.ne.jp
Aranha
Slurp.so/1.0
slurp@inktomi.com
Slurp/2.0j
slurp@inktomi.com
www.inktomisearch.com
Slurp/2.0-KiteHourly
slurp@inktomi.com;
www.inktomi.com/slurp.html
Slurp/2.0-OwlWeekly
spider@aeneid.com
www.inktomi.com/slurp.html
Slurp/3.0-AU
slurp@inktomi.com
Toutatis 2.5-2
Hubater
http://www.almaden.ibm.com/cs/crawler
IlTrovatore-Setaccio
IncyWincy
UltraSeek
InfoSeek Sidewinder
Mole2/1.0
webmaster@intags.de
MP3Bot
C-PBWF-ip3000.com-crawler
ip3000.com-crawler
http://www.istarthere.com
spider@istarthere.com
Knowledge.com/
kuloko-bot/0.2
LNSpiderguy
Linknzbot
lookbot
MantraAgent
NetResearchServer
www.loopimprovements.com/robot.html
Lycos_Spider_(T-Rex)
JoocerBot
HenryTheMiragoRobot
MojeekBot
mozDex/
MSNBOT/0.1
http://search.msn.com/msnbot.htm)
Navadoo Crawler
Gulliver
ObjectsSearch/0.01
PicoSearch/
PJspider
DIIbot
nttdirectory_robot
super-robot@super.navi.ocn.ne.jp
griffon
griffon@super.navi.ocn.ne.jp
Spider/maxbot.com
admin@maxbot.com
various (fakes agent on each access)
gazz/1.0
gazz@nttrd.com
???
NationalDirectory-SuperSpider
dloader(NaverRobot)/
dumrobo(NaverRobot)/
Openfind piranha,Shark
robot-response@openfind.com.tw
Openbot/
psbot
www.picsearch.org/bot.html
CrawlerBoy Pinpoint.com
user<n>.ip3000.com
QweeryBot
http://qweerybot.qweery.com)
AlkalineBOT
SeznamBot
Search-10
Fluffy the spider
info@searchhippo.com)
Scrubby/
asterias
speedfind ramBot xtreme
Kototoi/0.1
SearchByUsa
Searchspider/
SightQuestBot/
http://www.sightquest.com/bot.htm
Spider_Monkey/
Surfnomore Spider v1.1
Robot@SuperSnooper.Com
teoma_agent1
teoma_admin@hawkholdings.com
Teradex_Mapper
mapper@teradex.com
ESISmartSpider
Spider TraficDublu
Tutorial Crawler
http://www.tutorgig.com/crawler
updated/0.1beta
crawler@updated.com
UK Searcher Spider
Vivante Link Checker
appie
Nazilla
www.WebWombat.com.au
marvin/infoseek
marvin-team@webseek.de
MuscatFerret
WhizBang! Lab
ZyBorg
(info@WISEnut.com)
WIRE WebRefiner:
webrefiner@wire.co.uk
WSCbot
Yandex
Yellopet-Spider
libwww-perl
Iron33

View File

@@ -190,8 +190,7 @@ setup(
data_files=data_tree('share/wcs/web/', 'data/web/')
+ data_tree('share/wcs/themes/', 'data/themes/')
+ data_tree('share/wcs/vendor/', 'data/vendor/')
+ data_tree('share/wcs/qommon/', 'wcs/qommon/static/')
+ [('share/wcs/', ['data/webbots'])],
+ data_tree('share/wcs/qommon/', 'wcs/qommon/static/'),
)
if local_cfg:

View File

@@ -16,7 +16,6 @@
import base64
import copy
import os
import re
import time
@@ -205,13 +204,8 @@ class HTTPRequest(quixote.http_request.HTTPRequest):
return not (self.is_in_backoffice() or self.is_api_url())
def is_from_bot(self):
botfile = os.path.join(get_publisher().data_dir, 'webbots')
user_agent = self.get_environ('HTTP_USER_AGENT', '')
with open(botfile) as fd:
for bot_ua_string in [x.strip() for x in fd.readlines()]:
if bot_ua_string in user_agent:
return True
return False
user_agent = self.get_environ('HTTP_USER_AGENT', '').lower()
return bool('bot' in user_agent or 'crawl' in user_agent)
def is_from_application(self):
# detect calls made from other applications or debug tools