misc: simplify user-agent bot check (#68644)
This commit is contained in:
parent
5522a3f7b3
commit
f99948d1de
178
data/webbots
178
data/webbots
|
@ -1,178 +0,0 @@
|
|||
AbachoBOT
|
||||
abcdatos_botlink
|
||||
http://www.abcdatos.com/botlink/
|
||||
AESOP_com_SpiderMan
|
||||
ah-ha.com crawler (crawler@ah-ha.com)
|
||||
ia_archiver
|
||||
Scooter
|
||||
Mercator
|
||||
Scooter2_Mercator_3-1.0
|
||||
roach.smo.av.com-1.0
|
||||
Tv<nn>_Merc_resh_26_1_D-1.0
|
||||
AltaVista-Intranet
|
||||
jan.gelin@av.com
|
||||
FAST-WebCrawler
|
||||
crawler@fast.no
|
||||
Acoon Robot
|
||||
antibot
|
||||
Atomz
|
||||
AxmoRobot
|
||||
Buscaplus Robi
|
||||
http://www.buscaplus.com/robi/
|
||||
CanSeek/
|
||||
support@canseek.ca
|
||||
ChristCRAWLER
|
||||
http://www.christcrawler.com/
|
||||
Clushbot
|
||||
http://www.clush.com/bot.html
|
||||
Crawler
|
||||
admin@crawler.de
|
||||
DaAdLe.com ROBOT/
|
||||
RaBot
|
||||
Agent-admin/ phortse@hanmail.net
|
||||
contact/jylee@kies.co.kr
|
||||
RaBot
|
||||
Agent-admin/ webmaster@kisco.go.kr
|
||||
DeepIndex
|
||||
DittoSpyder
|
||||
Jack
|
||||
EARTHCOM.info
|
||||
Speedy Spider
|
||||
ArchitextSpider
|
||||
ArchitectSpider
|
||||
EuripBot
|
||||
Arachnoidea
|
||||
arachnoidea@euroseek.net
|
||||
EZResult
|
||||
Fast PartnerSite Crawler
|
||||
FAST Data Search Crawler
|
||||
FAST Data Search Document Retriever
|
||||
KIT-Fireball
|
||||
france.misesajour.com
|
||||
FyberSearch
|
||||
GalaxyBot
|
||||
http://www.galaxy.com/galaxybot.html
|
||||
geckobot
|
||||
GenCrawler
|
||||
GeonaBot
|
||||
getRAX
|
||||
Googlebot
|
||||
googlebot@googlebot.com
|
||||
http://googlebot.com/
|
||||
moget/2.0
|
||||
moget@goo.ne.jp
|
||||
Aranha
|
||||
Slurp.so/1.0
|
||||
slurp@inktomi.com
|
||||
Slurp/2.0j
|
||||
slurp@inktomi.com
|
||||
www.inktomisearch.com
|
||||
Slurp/2.0-KiteHourly
|
||||
slurp@inktomi.com;
|
||||
www.inktomi.com/slurp.html
|
||||
Slurp/2.0-OwlWeekly
|
||||
spider@aeneid.com
|
||||
www.inktomi.com/slurp.html
|
||||
Slurp/3.0-AU
|
||||
slurp@inktomi.com
|
||||
Toutatis 2.5-2
|
||||
Hubater
|
||||
http://www.almaden.ibm.com/cs/crawler
|
||||
IlTrovatore-Setaccio
|
||||
IncyWincy
|
||||
UltraSeek
|
||||
InfoSeek Sidewinder
|
||||
Mole2/1.0
|
||||
webmaster@intags.de
|
||||
MP3Bot
|
||||
C-PBWF-ip3000.com-crawler
|
||||
ip3000.com-crawler
|
||||
http://www.istarthere.com
|
||||
spider@istarthere.com
|
||||
Knowledge.com/
|
||||
kuloko-bot/0.2
|
||||
LNSpiderguy
|
||||
Linknzbot
|
||||
lookbot
|
||||
MantraAgent
|
||||
NetResearchServer
|
||||
www.loopimprovements.com/robot.html
|
||||
Lycos_Spider_(T-Rex)
|
||||
JoocerBot
|
||||
HenryTheMiragoRobot
|
||||
MojeekBot
|
||||
mozDex/
|
||||
MSNBOT/0.1
|
||||
http://search.msn.com/msnbot.htm)
|
||||
Navadoo Crawler
|
||||
Gulliver
|
||||
ObjectsSearch/0.01
|
||||
PicoSearch/
|
||||
PJspider
|
||||
DIIbot
|
||||
nttdirectory_robot
|
||||
super-robot@super.navi.ocn.ne.jp
|
||||
griffon
|
||||
griffon@super.navi.ocn.ne.jp
|
||||
Spider/maxbot.com
|
||||
admin@maxbot.com
|
||||
various (fakes agent on each access)
|
||||
gazz/1.0
|
||||
gazz@nttrd.com
|
||||
???
|
||||
NationalDirectory-SuperSpider
|
||||
dloader(NaverRobot)/
|
||||
dumrobo(NaverRobot)/
|
||||
Openfind piranha,Shark
|
||||
robot-response@openfind.com.tw
|
||||
Openbot/
|
||||
psbot
|
||||
www.picsearch.org/bot.html
|
||||
CrawlerBoy Pinpoint.com
|
||||
user<n>.ip3000.com
|
||||
QweeryBot
|
||||
http://qweerybot.qweery.com)
|
||||
AlkalineBOT
|
||||
SeznamBot
|
||||
Search-10
|
||||
Fluffy the spider
|
||||
info@searchhippo.com)
|
||||
Scrubby/
|
||||
asterias
|
||||
speedfind ramBot xtreme
|
||||
Kototoi/0.1
|
||||
SearchByUsa
|
||||
Searchspider/
|
||||
SightQuestBot/
|
||||
http://www.sightquest.com/bot.htm
|
||||
Spider_Monkey/
|
||||
Surfnomore Spider v1.1
|
||||
Robot@SuperSnooper.Com
|
||||
teoma_agent1
|
||||
teoma_admin@hawkholdings.com
|
||||
Teradex_Mapper
|
||||
mapper@teradex.com
|
||||
ESISmartSpider
|
||||
Spider TraficDublu
|
||||
Tutorial Crawler
|
||||
http://www.tutorgig.com/crawler
|
||||
updated/0.1beta
|
||||
crawler@updated.com
|
||||
UK Searcher Spider
|
||||
Vivante Link Checker
|
||||
appie
|
||||
Nazilla
|
||||
www.WebWombat.com.au
|
||||
marvin/infoseek
|
||||
marvin-team@webseek.de
|
||||
MuscatFerret
|
||||
WhizBang! Lab
|
||||
ZyBorg
|
||||
(info@WISEnut.com)
|
||||
WIRE WebRefiner:
|
||||
webrefiner@wire.co.uk
|
||||
WSCbot
|
||||
Yandex
|
||||
Yellopet-Spider
|
||||
libwww-perl
|
||||
Iron33
|
3
setup.py
3
setup.py
|
@ -190,8 +190,7 @@ setup(
|
|||
data_files=data_tree('share/wcs/web/', 'data/web/')
|
||||
+ data_tree('share/wcs/themes/', 'data/themes/')
|
||||
+ data_tree('share/wcs/vendor/', 'data/vendor/')
|
||||
+ data_tree('share/wcs/qommon/', 'wcs/qommon/static/')
|
||||
+ [('share/wcs/', ['data/webbots'])],
|
||||
+ data_tree('share/wcs/qommon/', 'wcs/qommon/static/'),
|
||||
)
|
||||
|
||||
if local_cfg:
|
||||
|
|
|
@ -16,7 +16,6 @@
|
|||
|
||||
import base64
|
||||
import copy
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
|
||||
|
@ -205,13 +204,8 @@ class HTTPRequest(quixote.http_request.HTTPRequest):
|
|||
return not (self.is_in_backoffice() or self.is_api_url())
|
||||
|
||||
def is_from_bot(self):
|
||||
botfile = os.path.join(get_publisher().data_dir, 'webbots')
|
||||
user_agent = self.get_environ('HTTP_USER_AGENT', '')
|
||||
with open(botfile) as fd:
|
||||
for bot_ua_string in [x.strip() for x in fd.readlines()]:
|
||||
if bot_ua_string in user_agent:
|
||||
return True
|
||||
return False
|
||||
user_agent = self.get_environ('HTTP_USER_AGENT', '').lower()
|
||||
return bool('bot' in user_agent or 'crawl' in user_agent)
|
||||
|
||||
def is_from_application(self):
|
||||
# detect calls made from other applications or debug tools
|
||||
|
|
Loading…
Reference in New Issue