commit 4c7909e023f9639d2e8b981df2183e0ba89be6e1 Author: Benjamin Dauvergne Date: Tue Nov 17 00:09:48 2015 +0100 first commit diff --git a/README b/README new file mode 100644 index 0000000..c92f55c --- /dev/null +++ b/README @@ -0,0 +1 @@ +W.C.S. ElasticSearch integration diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..52fafa2 --- /dev/null +++ b/setup.py @@ -0,0 +1,65 @@ +#! /usr/bin/env python + +''' Setup script for wcs-es +''' + +import os +import subprocess +from setuptools import setup, find_packages +from distutils.command.sdist import sdist as _sdist + + +class eo_sdist(_sdist): + def run(self): + if os.path.exists('VERSION'): + os.remove('VERSION') + version = get_version() + version_file = open('VERSION', 'w') + version_file.write(version) + version_file.close() + _sdist.run(self) + if os.path.exists('VERSION'): + os.remove('VERSION') + + +def get_version(): + '''Use the VERSION, if absent generates a version with git describe, if not + tag exists, take 0.0.0- and add the length of the commit log. + ''' + if os.path.exists('VERSION'): + with open('VERSION', 'r') as v: + return v.read() + if os.path.exists('.git'): + p = subprocess.Popen(['git', 'describe', '--dirty', '--match=v*'], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + result = p.communicate()[0] + if p.returncode == 0: + return result.split()[0][1:].replace('-', '.') + else: + return '0.0.0-%s' % len(subprocess.check_output( + ['git', 'rev-list', 'HEAD']).splitlines()) + return '0.0.0' + +setup(name="wcs-es", + version=get_version(), + license="AGPLv3 or later", + description="W.C.S. ElasticSearch feeder", + long_description=file('README').read(), + url="http://dev.entrouvert.org/projects/wcs-es/", + author="Entr'ouvert", + author_email="info@entrouvert.org", + include_package_data=True, + packages=find_packages(), + install_requires=[ + 'requests', + 'elasticsearch', + ], + setup_requires=[ + ], + tests_require=[ + 'pytest', + ], + dependency_links=[], + cmdclass={ + 'sdist': eo_sdist, + }) diff --git a/wcs_es/__init__.py b/wcs_es/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wcs_es/cmd.py b/wcs_es/cmd.py new file mode 100644 index 0000000..34e3662 --- /dev/null +++ b/wcs_es/cmd.py @@ -0,0 +1,57 @@ +from . import wcs_api + +def provision(api, es, index_name, recreate=False): + if recreate and es.indices.exists(index_name): + es.indices.delete(index_name) + for formdef in api.get_formdefs(): + es.create(index=index_name, doc_type='formdef', body=formdef.json(), id=formdef.slug) + try: + for data in formdef.datas: + es.create(index_name, doc_type='form-'+formdef.slug, body=data.json(), + id=data.display_id) + print formdef.slug, 'populated' + except wcs_api.WcsApiError: + print '!!!', formdef.slug, 'failed' + +if __name__ == '__main__': + import argparse + import ConfigParser + import os + import elasticsearch + import urlparse + + config = ConfigParser.ConfigParser() + config_file = os.path.expanduser('~/.wcs_es.ini') + if os.path.exists(config_file): + config.read(config_file) + urls = [url for url in config.sections() if url.startswith('http://') or + url.startswith('https://')] + parser = argparse.ArgumentParser(description='Provision ES with W.C.S. data') + parser.add_argument('--url', help='url of the w.c.s. instance', required=not urls, + default=(urls or [None])[0]) + args, rest = parser.parse_known_args() + defaults = {} + if getattr(args, 'url') and config.has_section(args.url): + defaults = dict(config.items(args.url)) + parser.add_argument('--orig', help='origin of the request for signatures', + required='orig' not in defaults) + parser.add_argument('--key', help='HMAC key for signatures', required='key' not in defaults) + group = parser.add_mutually_exclusive_group( + required='email' not in defaults and 'name_id' not in defaults) + group.add_argument('--email', help='email for authentication') + group.add_argument('--name-id', help='NameID for authentication') + parser.add_argument('--es-host', help='ElasticSearch hostname', default='localhost') + parser.add_argument('--es-port', help='ElasticSearch hostname', type=int, default=9200) + parser.add_argument('--es-no-recreate', help='ElasticSearch hostname', dest='es_recreate', + default=True, action='store_false') + if defaults: + parser.set_defaults(**defaults) + + args = parser.parse_args() + api = wcs_api.WcsApi(url=args.url, orig=args.orig, key=args.key, email=args.email, + name_id=args.name_id) + index_name = urlparse.urlparse(args.url).netloc.split(':')[0].replace('.', '_') + es_hosts = [{'host': args.es_host, 'port': args.es_port, 'use_ssl': False}] + print es_hosts + es = elasticsearch.Elasticsearch(es_hosts) + provision(api, es, index_name, recreate=args.es_recreate) diff --git a/wcs_es/signature.py b/wcs_es/signature.py new file mode 100644 index 0000000..30124f9 --- /dev/null +++ b/wcs_es/signature.py @@ -0,0 +1,72 @@ +import datetime +import base64 +import hmac +import hashlib +import urllib +import random +import urlparse + +'''Simple signature scheme for query strings''' + + +def sign_url(url, key, algo='sha256', timestamp=None, nonce=None): + parsed = urlparse.urlparse(url) + new_query = sign_query(parsed.query, key, algo, timestamp, nonce) + return urlparse.urlunparse(parsed[:4] + (new_query,) + parsed[5:]) + + +def sign_query(query, key, algo='sha256', timestamp=None, nonce=None): + if timestamp is None: + timestamp = datetime.datetime.utcnow() + timestamp = timestamp.strftime('%Y-%m-%dT%H:%M:%SZ') + if nonce is None: + nonce = hex(random.SystemRandom().getrandbits(128))[2:-1] + new_query = query + if new_query: + new_query += '&' + new_query += urllib.urlencode(( + ('algo', algo), + ('timestamp', timestamp), + ('nonce', nonce))) + signature = base64.b64encode(sign_string(new_query, key, algo=algo)) + new_query += '&signature=' + urllib.quote(signature) + return new_query + + +def sign_string(s, key, algo='sha256', timedelta=30): + digestmod = getattr(hashlib, algo) + if isinstance(key, unicode): + key = key.encode('utf-8') + hash = hmac.HMAC(key, digestmod=digestmod, msg=s) + return hash.digest() + + +def check_url(url, key, known_nonce=None, timedelta=30): + parsed = urlparse.urlparse(url, 'https') + return check_query(parsed.query, key) + + +def check_query(query, key, known_nonce=None, timedelta=30): + parsed = urlparse.parse_qs(query) + signature = base64.b64decode(parsed['signature'][0]) + algo = parsed['algo'][0] + timestamp = parsed['timestamp'][0] + timestamp = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ') + nonce = parsed['nonce'] + unsigned_query = query.split('&signature=')[0] + if known_nonce is not None and known_nonce(nonce): + return False + if abs(datetime.datetime.utcnow() - timestamp) > datetime.timedelta(seconds=timedelta): + return False + return check_string(unsigned_query, signature, key, algo=algo) + + +def check_string(s, signature, key, algo='sha256'): + # constant time compare + signature2 = sign_string(s, key, algo=algo) + if len(signature2) != len(signature): + return False + res = 0 + for a, b in zip(signature, signature2): + res |= ord(a) ^ ord(b) + return res == 0 diff --git a/wcs_es/wcs_api.py b/wcs_es/wcs_api.py new file mode 100644 index 0000000..f14ba1e --- /dev/null +++ b/wcs_es/wcs_api.py @@ -0,0 +1,96 @@ +import requests +import urlparse +import urllib + +from . import signature + +class WcsApiError(Exception): + pass + + +class BaseObject(object): + def __init__(self, wcs_api, **kwargs): + self.__wcs_api = wcs_api + self.__dict__.update(**kwargs) + + def json(self): + d = self.__dict__.copy() + for key in d.keys(): + if key.startswith('_'): + del d[key] + return d + + +class FormData(BaseObject): + def json(self): + d = super(FormData, self).json() + formdef = d.pop('formdef') + d['formdef_slug'] = formdef.slug + return d + + def __repr__(self): + return '<{klass} {display_id!r}>'.format(klass=self.__class__.__name__, + display_id=self.id) + +class FormDef(BaseObject): + def __init__(self, wcs_api, **kwargs): + self.__wcs_api = wcs_api + self.__dict__.update(**kwargs) + + def __unicode__(self): + return self.title + + @property + def datas(self): + datas = self.__wcs_api.get_formdata(self.slug) + for data in datas: + data.formdef = self + return datas + + def __repr__(self): + return '<{klass} {slug!r}>'.format(klass=self.__class__.__name__, slug=self.slug) + + +class WcsApi(object): + def __init__(self, url, orig, key, name_id=None, email=None, verify=False): + self.url = url + self.orig = orig + self.key = key + self.email = email + self.name_id = name_id + self.verify = verify + + @property + def formdefs_url(self): + return urlparse.urljoin(self.url, 'api/formdefs/') + + @property + def forms_url(self): + return urlparse.urljoin(self.url, 'api/forms/') + + def get_json(self, *url_parts): + url = reduce(lambda x, y: urlparse.urljoin(x, y), url_parts) + params = {'orig': self.orig} + if self.email: + params['email'] = self.email + if self.name_id: + params['NameID'] = self.name_id + query_string = urllib.urlencode(params) + presigned_url = url + ('&' if '?' in url else '?') + query_string + signed_url = signature.sign_url(presigned_url, self.key) + try: + response = requests.get(signed_url, verify=False) + except requests.RequestException, e: + raise WcsApiError('GET request failed', signed_url, e) + else: + try: + return response.json() + except ValueError, e: + raise WcsApiError('Invalid JSON content', signed_url, e) + + def get_formdefs(self): + return [FormDef(wcs_api=self, **d) for d in self.get_json(self.formdefs_url)] + + def get_formdata(self, slug): + return [FormData(wcs_api=self, **d) for d in self.get_json(self.forms_url, slug + '/', + 'list?full=on')]