213 lines
7.4 KiB
Python
Executable File
213 lines
7.4 KiB
Python
Executable File
#!/usr/bin/env python
|
|
# -*- coding: UTF-8 -*-
|
|
#
|
|
# Search Server - SCGI server interfacing with Solr
|
|
# Copyright (C) 2007-2012 Parlement de la Communauté française de Belgique
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program; if not, write to the Free Software
|
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
import sys
|
|
import os
|
|
import resource
|
|
from optparse import OptionParser
|
|
from scgi.scgi_server import SCGIServer, SCGIHandler
|
|
import time
|
|
import syslog
|
|
import socket
|
|
from pysolr import Solr
|
|
import cgi
|
|
import json
|
|
import cairo
|
|
import poppler
|
|
import hashlib
|
|
import subprocess
|
|
|
|
class SearchHandler(SCGIHandler):
    """SCGI request handler offering PDF upload, preview and Solr search.

    The first ``len('/search/')`` characters of ``REQUEST_URI`` are dropped
    and the remainder selects the action:

    * ``upload`` stores the posted file under ``pdfs-N/`` and indexes it;
    * ``pdfs/...`` and ``pdfs-N/...`` return a PNG preview of the first page;
    * anything else runs the ``q`` query-string parameter against Solr and
      returns the hits as JSON.
    """

    # Class-level switches; main() assigns them before the server starts.
    debug = False
    daemon = False
    # Solr client, created while handling the first connection.
    solr = None

    # Solr endpoint used for both searching and indexing.
    SOLR_URL = 'http://127.0.0.1:8080/solr/'
    # Zoom factor applied to the first page when rendering a preview.
    PREVIEW_SCALE = 0.8
    # Number of results returned when the request carries no ``count``.
    DEFAULT_COUNT = 20

    def _log(self, message):
        """Write *message* to standard output when debugging is enabled."""
        if self.debug:
            sys.stdout.write(message + '\n')

    def handle_connection(self, conn):
        """Serve one SCGI connection and close it afterwards.

        The request environment is read from *conn*, the response is
        produced by :meth:`produce`, and both file wrappers plus the
        connection itself are closed even when handling fails.
        """
        if not self.solr:
            self.solr = Solr(self.SOLR_URL)
        reader = conn.makefile('r')
        writer = conn.makefile('w')
        try:
            # Parsing happens inside the try block so that a malformed
            # request cannot leak the connection.
            env = self.read_env(reader)
            bodysize = int(env.get('CONTENT_LENGTH') or 0)
            self.produce(env, bodysize, reader, writer)
        finally:
            writer.close()
            reader.close()
            conn.close()

    def produce(self, env, bodysize, input, output):
        """Dispatch the request described by *env* and write the response.

        env      -- dictionary of SCGI/CGI variables for the request
        bodysize -- value of CONTENT_LENGTH as an integer
        input    -- file object positioned at the request body
        output   -- file object the CGI-style response is written to
        """
        self._log('Request received at %s' % time.strftime('[%Y-%m-%d %H:%M]'))
        self._log(' - body size: %s' % bodysize)

        # A missing REQUEST_URI is treated like an empty one.
        uri = (env.get('REQUEST_URI') or '')[len('/search/'):]
        self._log(' - uri: %s' % uri)

        if uri == 'upload':
            return self._handle_upload(env, bodysize, input, output)
        if uri.startswith(('pdfs/', 'pdfs-N/')):
            return self._handle_preview(uri, output)
        return self._handle_search(env, output)

    def _handle_upload(self, env, bodysize, input, output):
        """Store the first uploaded file under ``pdfs-N/`` and index it."""
        if bodysize == 0:
            return self.redirect_home(env, output)
        hdr = {'content-type': env.get('CONTENT_TYPE')}
        form = cgi.FieldStorage(input, headers=hdr, environ=env)
        # NOTE(review): for a multipart body ``form.value`` is expected to
        # be the list of parts; anything else (or a part that carries no
        # filename) is answered with an error page instead of a traceback.
        parts = form.value
        field = parts[0] if parts else None
        base = os.path.basename(getattr(field, 'filename', None) or '')
        if not base:
            return self.error_page(output, 'missing uploaded file')
        filename = 'pdfs-N/N-%s' % base
        # The payload is binary, so write it as such and always close it.
        with open(filename, 'wb') as fd:
            fd.write(field.value)
        if not self.index(filename=filename, id=os.path.basename(filename)):
            return self.error_page(output, 'failed to index')
        return self.redirect_home(env, output)

    def _handle_preview(self, uri, output):
        """Send a PNG preview of the document at *uri*, rendering if needed."""
        # Function-scope import: the module does not import urllib itself.
        import urllib

        # The cache key is the URI as requested, including any '.png'.
        preview_filename = '/tmp/preview-%s.png' % hashlib.sha1(uri).hexdigest()
        if uri.endswith('.png'):
            uri = uri[:-4]
        # The URI comes straight from the client: refuse parent-directory
        # segments so that only files below the working directory can be
        # rendered.  The decoded form is tested because the file:// URI
        # given to poppler may have its percent-escapes decoded
        # (NOTE(review): confirm against the poppler/GLib behaviour).
        if '..' in urllib.unquote(uri).split('/'):
            return self.error_page(output, 'invalid document path')
        if not os.path.exists(preview_filename):
            self._render_preview(uri, preview_filename)
        output.write('Content-type: image/png\n\n')
        self._log(' - preview: %s' % preview_filename)
        with open(preview_filename, 'rb') as fd:
            output.write(fd.read())

    def _render_preview(self, uri, preview_filename):
        """Render the first page of *uri* (relative to cwd) to a PNG file."""
        document = poppler.document_new_from_file(
            'file://' + os.getcwd() + '/' + uri, None)
        page = document.get_page(0)
        scale = self.PREVIEW_SCALE
        width, height = page.get_size()
        surface = cairo.ImageSurface(cairo.FORMAT_ARGB32,
                                     int(width * scale), int(height * scale))
        cr = cairo.Context(surface)
        # Paint a white background before the page is drawn on top of it.
        cr.set_source_rgb(1, 1, 1)
        if scale != 1:
            cr.scale(scale, scale)
        cr.rectangle(0, 0, width, height)
        cr.fill()
        page.render(cr)
        surface.write_to_png(preview_filename)

    def _handle_search(self, env, output):
        """Run the ``q`` query against Solr and write the hits as JSON.

        ``count`` and ``offset`` are optional; malformed values produce an
        error page rather than an unhandled exception.
        """
        query_string = env.get('QUERY_STRING')
        if not query_string:
            return self.redirect_home(env, output)

        qs = cgi.parse_qs(query_string)
        if 'q' not in qs:
            return self.error_page(output, 'missing q parameter')
        try:
            query = unicode(qs['q'][0], 'utf-8')
        except UnicodeDecodeError:
            return self.error_page(output, 'invalid q parameter')
        try:
            count = int(qs['count'][0]) if 'count' in qs else self.DEFAULT_COUNT
            offset = int(qs['offset'][0]) if 'offset' in qs else 0
        except ValueError:
            return self.error_page(output, 'invalid count or offset parameter')

        results = self.solr.search(query, rows=count, start=offset, fl='* score')

        response = {
            'hits': results.hits,
            'qtime': results.qtime,
            'results': [{'score': x['score'], 'id': x['id']} for x in results],
        }

        output.write('Content-type: application/json\n\n')
        json.dump(response, output)

    def redirect_home(self, env, output):
        """Write a 302 response pointing at the root of the current host."""
        root_url = 'http://%s' % env.get('SERVER_NAME')
        port = env.get('SERVER_PORT')
        # Only mention the port when it is known and not the HTTP default.
        if port and port != '80':
            root_url = root_url + ':%s' % port

        output.write('Status: 302 Redirect\n')
        output.write('Content-type: text/plain\n')
        output.write('Location: ' + root_url + '\n')
        output.write('\n')
        output.write('Redirection\n')

    def index(self, filename, id):
        """Extract the text of *filename* with pdftotext and add it to Solr.

        The document is stored under *id* and committed immediately.
        Returns True on success and False when pdftotext cannot be run,
        exits with a non-zero status, or emits text that is not UTF-8.
        """
        cmd = ['pdftotext', filename, '-']
        try:
            p = subprocess.Popen(cmd,
                                 close_fds=True,
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
        except OSError:
            self._log('E: error running pdftotext')
            return False
        stdout, _ = p.communicate()
        if p.returncode != 0:
            self._log('E: error running pdftotext (rc:%d)' % p.returncode)
            return False
        try:
            text = unicode(stdout, 'utf-8')
        except UnicodeDecodeError:
            self._log('E: pdftotext output is not valid UTF-8')
            return False
        self.solr.add([{'id': id, 'text': text}])
        self.solr.commit()
        return True

    def error_page(self, output, message):
        """Write *message* to *output* as a plain-text response."""
        output.write('Content-type: text/plain\n')
        output.write('\n')
        output.write('%s\n' % message)
|
|
|
|
|
|
def _daemonize(pid_path):
    """Detach from the terminal with a double fork and open syslog.

    Every inherited descriptor is closed, descriptors 0-2 are pointed at
    /dev/null, and the final process id is written to *pid_path* when one
    is given.
    """
    if os.fork():
        os._exit(0)
    os.setsid()

    _, fd_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
    if fd_limit == resource.RLIM_INFINITY:
        fd_limit = 1024
    for descriptor in range(fd_limit):
        try:
            os.close(descriptor)
        except OSError:
            pass

    # All descriptors are closed at this point, so this open() is relied
    # upon to return descriptor 0, which is then duplicated onto 1 and 2.
    os.open('/dev/null', os.O_RDWR)
    os.dup2(0, 1)
    os.dup2(0, 2)

    if os.fork():
        os._exit(0)
    if pid_path:
        with open(pid_path, 'w') as pid_file:
            pid_file.write(str(os.getpid()))
    syslog.openlog('tabellio-search')


def main():
    """Parse the command line and run the SCGI search server.

    Unless --foreground is given the process daemonizes first.  A socket
    error while serving is reported on stderr (and to syslog when running
    as a daemon) and terminates the process with exit status 1.
    """
    parser = OptionParser()
    parser.add_option('-p', '--port', dest='port', type='int', default=2152)
    parser.add_option('--debug', action='store_true', dest='debug')
    parser.add_option('-f', '--foreground', dest='foreground',
                      action='store_true')
    parser.add_option('--pid', dest='pid')
    options, _ = parser.parse_args()

    if not options.foreground:
        SearchHandler.daemon = True
        _daemonize(options.pid)

    SearchHandler.debug = options.debug
    server = None
    try:
        server = SCGIServer(handler_class=SearchHandler, port=options.port)
        server.serve()
    except socket.error:
        if SearchHandler.daemon:
            syslog.syslog(syslog.LOG_CRIT,
                          'socket error (another instance is running?)')
        sys.stderr.write('E: socket error (another instance is running?)' + '\n')
        sys.exit(1)
|
|
|
|
# Start the server only when this file is executed as a script; importing
# it as a module must not run main().
if __name__ == '__main__':
    main()
|