achatpublic parser

2011-05-09 23:53:42 +02:00 · 2011-05-09 23:53:42 +02:00 · 3468ab9b6e
parent 9bdeaaa633
commit 3468ab9b6e
1 changed files with 118 additions and 0 deletions
--- a/achatpublic.py
+++ b/achatpublic.py
@@ -0,0 +1,118 @@
+#!/usr/bin/python
+# -*- encoding: utf-8 -*-
+
+import datetime
+import urllib
+import urllib2
+import re
+import BeautifulSoup
+import time
+
+achatpublic_url = 'https://www.achatpublic.com/sdm/ent/gen/ent_recherche.do'
+# delai entre deux requetes POST
+sleep = 3
+
+today = datetime.date.today()
+params = {
+    'debug':'jg',
+    'objetRecherche':'0',
+    'reference':'',
+    'personnepublique':'',
+    'region':'',
+    'departement':'',
+    'procedure':'-1',
+    'marche':'-1',
+    'intitule':'',
+    'codeCPV':'',
+    'jour':'%.2d' % today.day,
+    'mois':'%.2d' % today.month,
+    'annee':'%.4d' % today.year,
+    'precisionDate':'apres',
+    'orderby':'0',
+    'nbAffiche':'50',
+    'page':'0',
+}
+
+page = 0
+aos = []
+
+while True:
+
+    params['page'] = '%d' % page
+    post_params = urllib.urlencode(params)
+    result = urllib2.urlopen(achatpublic_url, post_params)
+
+    # si l'URL a été redirigée, on arrête le parcours des pages
+    url = result.geturl()
+    if url != achatpublic_url:
+        break
+
+    # info = result.info()
+    # print info
+
+    content = ''.join([l for l in result])
+    div = re.search('<div class="ResultTable">.*?</div>', content, re.DOTALL)
+
+    # pas de div ResultTable => on arrête le parcours des pages
+    if div == None:
+        break
+
+    # on interprète le résultat en HTML
+    soup = BeautifulSoup.BeautifulSoup(div.group(0))
+    div = soup.contents[0]
+
+    table = None
+    # on attrape la première <table> 
+    for tag in div:
+        if isinstance(tag, BeautifulSoup.Tag) and tag.name == 'table':
+            table = tag
+            break
+
+    # pas de table => on arrête le parcours des pages
+    if table == None:
+        break
+
+    # on extrait toutes les lignes <tr> de la table (sauf les 2 premières qui sont les titres)
+    trs=[tr for tr in table.contents
+            if isinstance(tr, BeautifulSoup.Tag) and tr.name == 'tr'][2::]
+
+    # on prend les lignes 2 par deux (2 lignes = un appel d'offre)
+    for n in xrange(len(trs) / 2):
+        # on sort les td des lignes
+        tds = [ td for td in trs[n*2] if isinstance(td, BeautifulSoup.Tag) and td.name == 'td']
+        tds += [ td for td in trs[n*2+1] if isinstance(td, BeautifulSoup.Tag) and td.name == 'td']
+        ao = {}
+        try:
+            ao['organisme'] = unicode(tds[0].string.strip())
+            ao['reference'] = unicode(tds[1].string.strip())
+            ao['procedure'] = unicode(tds[2].string.strip())
+            ao['intitule'] = unicode(tds[3].string.strip())
+            ao['type_marche'] = unicode(tds[4].string.strip())
+            ao['date_limite_str'] = unicode(tds[5].string.strip())
+        except:
+            # si problème pour lire un td : on passe à la ligne suivante
+            continue
+        # extraction de l'id dans "Sablier();detail(document.getElementById('consulId'),'CSL_2011_P_VBdK7hW8',document.getElementById('entMoteurDeRechercheForm'));return(false);"
+        try:
+            onclick = dict(tds[0].attrs)['onclick']
+            re_id = re.search("detail\(document\.getElementById\('consulId'\),'(.*?)',", onclick)
+            ao['id'] = unicode(re_id.group(1))
+        except:
+            ao['id'] = 'inconnu'
+        # essai d'interpretation de la date
+        try:
+            ao['date_limite'] = datetime.datetime.strptime(re.sub('\s*', '', ao['date_limite_str']), '%d/%m/%Y-%H:%M')
+        except:
+            ao['date_limite'] = datetime.datetime(1970,1,1,12,00)
+
+        print
+        print "::: ao %d :" % (len(aos)+1)
+        print ao
+
+        aos.append(ao)
+
+    # et on boucle sur la page suivante
+    page += 1
+    time.sleep(sleep)
+
+