This repository has been archived on 2023-02-21. You can view files and clone it, but cannot push or open issues or pull requests.
pfwb-ged-proto/server/search_server.py

213 lines
7.4 KiB
Python
Executable File

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#
# Search Server - SCGI server interfacing with Solr
# Copyright (C) 2007-2012 Parlement de la Communauté française de Belgique
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
import sys
import os
import resource
from optparse import OptionParser
from scgi.scgi_server import SCGIServer, SCGIHandler
import time
import syslog
import socket
from pysolr import Solr
import cgi
import json
import cairo
import poppler
import hashlib
import subprocess
class SearchHandler(SCGIHandler):
    """SCGI request handler offering PDF upload, preview and Solr search.

    Requests are dispatched on the part of REQUEST_URI that follows the
    '/search/' prefix:

    * 'upload' stores the posted file under pdfs-N/ and indexes its text;
    * 'pdfs/...' and 'pdfs-N/...' return a PNG preview of the first page;
    * anything else is handled as a Solr query read from QUERY_STRING.
    """

    # print request traces on stdout when true
    debug = False
    # true once the process has been detached from its terminal
    daemon = False
    # pysolr client, created when the first connection is handled
    solr = None
    # zoom factor applied to the first page when rendering a preview
    preview_scale = 0.8

    def handle_connection(self, conn):
        """Serve one connection, releasing its resources in every case.

        conn is the accepted socket; the SCGI environment is read from it
        and the response is written back to it.
        """
        if not self.solr:
            self.solr = Solr('http://127.0.0.1:8080/solr/')
        request_stream = conn.makefile('r')
        response_stream = conn.makefile('w')
        try:
            # parsing happens inside the try block so that a malformed
            # request cannot leak the socket
            env = self.read_env(request_stream)
            # an absent or empty CONTENT_LENGTH means there is no body
            bodysize = int(env.get('CONTENT_LENGTH') or 0)
            self.produce(env, bodysize, request_stream, response_stream)
        finally:
            response_stream.close()
            request_stream.close()
            conn.close()

    def produce(self, env, bodysize, input, output):
        """Dispatch one request to the upload, preview or search code.

        env is the SCGI environment mapping, bodysize the request body
        length, input the stream positioned at the body and output the
        stream receiving the CGI-style response.
        """
        if self.debug:
            print('Request received at %s' % time.strftime('[%Y-%m-%d %H:%M]'))
            print(' - body size: %s' % bodysize)
        uri = (env.get('REQUEST_URI') or '')[len('/search/'):]
        if self.debug:
            print(' - uri: %s' % uri)
        if uri == 'upload':
            return self._handle_upload(env, bodysize, input, output)
        if uri.startswith(('pdfs/', 'pdfs-N/')):
            return self._handle_preview(uri, output)
        return self._handle_search(env, output)

    def _handle_upload(self, env, bodysize, input, output):
        """Save the first posted file under pdfs-N/ and index it."""
        if bodysize == 0:
            return self.redirect_home(env, output)
        hdr = {'content-type': env.get('CONTENT_TYPE')}
        form = cgi.FieldStorage(input, headers=hdr, environ=env)
        try:
            field = form.value[0]
        except (IndexError, TypeError):
            # empty form, or a body that is not a list of fields
            field = None
        # basename() keeps a client-supplied name from escaping pdfs-N/
        upload_name = os.path.basename(getattr(field, 'filename', None) or '')
        if not upload_name:
            return self.error_page(output, 'missing file upload')
        filename = 'pdfs-N/N-%s' % upload_name
        # the payload is a PDF: write it as binary and always close the file
        with open(filename, 'wb') as fd:
            fd.write(field.value)
        if not self.index(filename=filename, id=os.path.basename(filename)):
            return self.error_page(output, 'failed to index')
        return self.redirect_home(env, output)

    def _handle_preview(self, uri, output):
        """Send a PNG preview of the first page of the PDF named by uri.

        Previews are cached in /tmp under a name derived from the SHA-1 of
        the requested uri and are only rendered when that file is missing.
        """
        import urllib  # only needed here, for the traversal check below

        # uri comes straight from the client; refuse any '..' segment so
        # the request cannot leave the pdfs directories. The check is done
        # on the percent-decoded form because the file:// URI built from it
        # is presumably decoded again by poppler (to be confirmed).
        if '..' in urllib.unquote(uri).split('/'):
            return self.error_page(output, 'invalid document path')
        preview_filename = '/tmp/preview-%s.png' % hashlib.sha1(uri).hexdigest()
        if uri.endswith('.png'):
            uri = uri[:-4]
        if not os.path.exists(preview_filename):
            self._render_preview(uri, preview_filename)
        output.write('Content-type: image/png\n')
        output.write('\n')
        if self.debug:
            print(' - preview: %s' % preview_filename)
        with open(preview_filename, 'rb') as preview:
            output.write(preview.read())

    def _render_preview(self, relative_path, preview_filename):
        """Render page 0 of the PDF at relative_path (from cwd) to a PNG."""
        document_uri = 'file://' + os.getcwd() + '/' + relative_path
        document = poppler.document_new_from_file(document_uri, None)
        current_page = document.get_page(0)
        scale = self.preview_scale
        width, height = current_page.get_size()
        surface = cairo.ImageSurface(cairo.FORMAT_ARGB32,
                                     int(width * scale), int(height * scale))
        cr = cairo.Context(surface)
        # fill the page area with white before drawing the page over it
        cr.set_source_rgb(1, 1, 1)
        if scale != 1:
            cr.scale(scale, scale)
        cr.rectangle(0, 0, width, height)
        cr.fill()
        current_page.render(cr)
        surface.write_to_png(preview_filename)

    def _handle_search(self, env, output):
        """Run the query from QUERY_STRING against Solr and reply in JSON.

        Recognised parameters are q (required), count (default 20) and
        offset (default 0).
        """
        query_string = env.get('QUERY_STRING')
        if not query_string:
            return self.redirect_home(env, output)
        qs = cgi.parse_qs(query_string)
        try:
            query = unicode(qs['q'][0], 'utf-8')
        except KeyError:
            return self.error_page(output, 'missing q parameter')
        except UnicodeDecodeError:
            return self.error_page(output, 'q parameter is not valid UTF-8')
        try:
            count = int(qs.get('count', ['20'])[0])
            offset = int(qs.get('offset', ['0'])[0])
        except ValueError:
            count = offset = -1
        if count < 0 or offset < 0:
            return self.error_page(
                output, 'count and offset must be non-negative integers')
        results = self.solr.search(query, rows=count, start=offset, fl='* score')
        response = {
            'hits': results.hits,
            'qtime': results.qtime,
            'results': [{'score': x['score'], 'id': x['id']} for x in results],
        }
        output.write('Content-type: application/json\n')
        output.write('\n')
        json.dump(response, output)

    def redirect_home(self, env, output):
        """Write a 302 response pointing at the root of the current host."""
        root_url = 'http://%s' % env.get('SERVER_NAME')
        port = env.get('SERVER_PORT')
        # only mention the port when it is known and not the HTTP default
        if port and port != '80':
            root_url = root_url + ':%s' % port
        output.write('Status: 302 Redirect\n')
        output.write('Content-type: text/plain\n')
        output.write('Location: ' + root_url + '\n')
        output.write('\n')
        output.write('Redirection\n')

    def index(self, filename, id):
        """Extract the text of a PDF with pdftotext and add it to Solr.

        filename is the path of the PDF, id the identifier of the Solr
        document. Returns True on success and False when pdftotext cannot
        be started or exits with a non-zero status.
        """
        cmd = ['pdftotext', filename, '-']
        try:
            p = subprocess.Popen(cmd,
                                 close_fds=True,
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
        except OSError:
            if self.debug:
                print('E: error running pdftotext')
            return False
        stdout, stderr = p.communicate()
        if p.returncode != 0:
            if self.debug:
                print('E: error running pdftotext (rc:%d)' % p.returncode)
            return False
        self.solr.add([{'id': id, 'text': unicode(stdout, 'utf-8')}])
        self.solr.commit()
        return True

    def error_page(self, output, message):
        """Write message to output as a plain-text response."""
        output.write('Content-type: text/plain\n')
        output.write('\n')
        output.write('%s\n' % message)
def _daemonize():
    """Detach the process from its terminal with a double fork."""
    if os.fork():
        os._exit(0)
    os.setsid()
    maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1]
    if maxfd == resource.RLIM_INFINITY:
        maxfd = 1024
    for fd in range(maxfd):
        try:
            os.close(fd)
        except OSError:
            # descriptor was not open, nothing to close
            pass
    # every descriptor is closed, so this open() gets descriptor 0; it is
    # then duplicated onto stdout and stderr
    os.open('/dev/null', os.O_RDWR)
    os.dup2(0, 1)
    os.dup2(0, 2)
    if os.fork():
        os._exit(0)


def main():
    """Parse the command line, optionally daemonize, and serve requests.

    Options: -p/--port (default 2152), --debug, -f/--foreground and
    --pid FILE. The process exits with status 1 when the server raises a
    socket error.
    """
    parser = OptionParser()
    parser.add_option('-p', '--port', dest='port', type='int', default=2152)
    parser.add_option('--debug', action='store_true', dest='debug')
    parser.add_option('-f', '--foreground', dest='foreground', action='store_true')
    parser.add_option('--pid', dest='pid')
    options, _ = parser.parse_args()

    if not options.foreground:
        SearchHandler.daemon = True
        _daemonize()

    if options.pid:
        # written after daemonizing so that it holds the final process id;
        # the with block makes sure the file is flushed and closed
        with open(options.pid, 'w') as pid_file:
            pid_file.write(str(os.getpid()))

    syslog.openlog('tabellio-search')
    SearchHandler.debug = options.debug

    try:
        SCGIServer(handler_class=SearchHandler, port=options.port).serve()
    except socket.error:
        if SearchHandler.daemon:
            syslog.syslog(syslog.LOG_CRIT, 'socket error (another instance is running?)')
        sys.stderr.write('E: socket error (another instance is running?)\n')
        sys.exit(1)


if __name__ == '__main__':
    main()