achatpublic parser
This commit is contained in:
parent
9bdeaaa633
commit
3468ab9b6e
|
@ -0,0 +1,118 @@
|
|||
#!/usr/bin/python
|
||||
# -*- encoding: utf-8 -*-
|
||||
|
||||
import datetime
|
||||
import urllib
|
||||
import urllib2
|
||||
import re
|
||||
import BeautifulSoup
|
||||
import time
|
||||
|
||||
# Search endpoint of the achatpublic.com public-procurement portal;
# all paging POST requests below go to this URL.
achatpublic_url = 'https://www.achatpublic.com/sdm/ent/gen/ent_recherche.do'

# Delay in seconds between two consecutive POST requests (crawl throttling).
sleep = 3
||||
# Date of the crawl; with precisionDate='apres' this presumably restricts
# results to tenders dated from today onward — TODO confirm against the form.
today = datetime.date.today()

# POST form fields of the achatpublic.com search request.  Day, month and
# year are sent as zero-padded strings, as the HTML form produces them.
# The 'page' field is overwritten on every iteration of the paging loop.
params = dict(
    debug='jg',
    objetRecherche='0',
    reference='',
    personnepublique='',
    region='',
    departement='',
    procedure='-1',
    marche='-1',
    intitule='',
    codeCPV='',
    jour='%.2d' % today.day,
    mois='%.2d' % today.month,
    annee='%.4d' % today.year,
    precisionDate='apres',
    orderby='0',
    nbAffiche='50',
    page='0',
)
||||
|
||||
page = 0
|
||||
aos = []
|
||||
|
||||
while True:
|
||||
|
||||
params['page'] = '%d' % page
|
||||
post_params = urllib.urlencode(params)
|
||||
result = urllib2.urlopen(achatpublic_url, post_params)
|
||||
|
||||
# si l'URL a été redirigée, on arrête le parcours des pages
|
||||
url = result.geturl()
|
||||
if url != achatpublic_url:
|
||||
break
|
||||
|
||||
# info = result.info()
|
||||
# print info
|
||||
|
||||
content = ''.join([l for l in result])
|
||||
div = re.search('<div class="ResultTable">.*?</div>', content, re.DOTALL)
|
||||
|
||||
# pas de div ResultTable => on arrête le parcours des pages
|
||||
if div == None:
|
||||
break
|
||||
|
||||
# on interprète le résultat en HTML
|
||||
soup = BeautifulSoup.BeautifulSoup(div.group(0))
|
||||
div = soup.contents[0]
|
||||
|
||||
table = None
|
||||
# on attrape la première <table>
|
||||
for tag in div:
|
||||
if isinstance(tag, BeautifulSoup.Tag) and tag.name == 'table':
|
||||
table = tag
|
||||
break
|
||||
|
||||
# pas de table => on arrête le parcours des pages
|
||||
if table == None:
|
||||
break
|
||||
|
||||
# on extrait toutes les lignes <tr> de la table (sauf les 2 premières qui sont les titres)
|
||||
trs=[tr for tr in table.contents
|
||||
if isinstance(tr, BeautifulSoup.Tag) and tr.name == 'tr'][2::]
|
||||
|
||||
# on prend les lignes 2 par deux (2 lignes = un appel d'offre)
|
||||
for n in xrange(len(trs) / 2):
|
||||
# on sort les td des lignes
|
||||
tds = [ td for td in trs[n*2] if isinstance(td, BeautifulSoup.Tag) and td.name == 'td']
|
||||
tds += [ td for td in trs[n*2+1] if isinstance(td, BeautifulSoup.Tag) and td.name == 'td']
|
||||
ao = {}
|
||||
try:
|
||||
ao['organisme'] = unicode(tds[0].string.strip())
|
||||
ao['reference'] = unicode(tds[1].string.strip())
|
||||
ao['procedure'] = unicode(tds[2].string.strip())
|
||||
ao['intitule'] = unicode(tds[3].string.strip())
|
||||
ao['type_marche'] = unicode(tds[4].string.strip())
|
||||
ao['date_limite_str'] = unicode(tds[5].string.strip())
|
||||
except:
|
||||
# si problème pour lire un td : on passe à la ligne suivante
|
||||
continue
|
||||
# extraction de l'id dans "Sablier();detail(document.getElementById('consulId'),'CSL_2011_P_VBdK7hW8',document.getElementById('entMoteurDeRechercheForm'));return(false);"
|
||||
try:
|
||||
onclick = dict(tds[0].attrs)['onclick']
|
||||
re_id = re.search("detail\(document\.getElementById\('consulId'\),'(.*?)',", onclick)
|
||||
ao['id'] = unicode(re_id.group(1))
|
||||
except:
|
||||
ao['id'] = 'inconnu'
|
||||
# essai d'interpretation de la date
|
||||
try:
|
||||
ao['date_limite'] = datetime.datetime.strptime(re.sub('\s*', '', ao['date_limite_str']), '%d/%m/%Y-%H:%M')
|
||||
except:
|
||||
ao['date_limite'] = datetime.datetime(1970,1,1,12,00)
|
||||
|
||||
print
|
||||
print "::: ao %d :" % (len(aos)+1)
|
||||
print ao
|
||||
|
||||
aos.append(ao)
|
||||
|
||||
# et on boucle sur la page suivante
|
||||
page += 1
|
||||
time.sleep(sleep)
|
||||
|
||||
|
Reference in New Issue