path: root/benchmark
author      أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@users.sourceforge.net>  2015-05-07 11:22:20 (GMT)
committer   أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@users.sourceforge.net>  2015-05-07 11:22:20 (GMT)
commit      50b9306f9b4e3cd4b08dcc2f5bcb39ade6d0c32c (patch)
tree        7a1733a984c947f52e7d3a6a19bdfa9f35908ba9 /benchmark
python-whoosh (2.7.0-1) unstable; urgency=medium
  * New upstream release.
  * Update watch file. Thanks to Piotr Ożarowski
  * debian/copyright: Update copyright years.
  * debian/upstream/metadata: Added upstream metadata.

# imported from the archive
Diffstat (limited to 'benchmark')
-rw-r--r--  benchmark/dcvgr10.txt.gz        bin 0 -> 201819 bytes
-rw-r--r--  benchmark/dictionary.py          43
-rw-r--r--  benchmark/enron.py              185
-rw-r--r--  benchmark/marc21.py             297
-rw-r--r--  benchmark/reuters.py             38
-rw-r--r--  benchmark/reuters21578.txt.gz   bin 0 -> 181938 bytes
6 files changed, 563 insertions, 0 deletions
diff --git a/benchmark/dcvgr10.txt.gz b/benchmark/dcvgr10.txt.gz
new file mode 100644
index 0000000..e0e2877
--- /dev/null
+++ b/benchmark/dcvgr10.txt.gz
Binary files differ
diff --git a/benchmark/dictionary.py b/benchmark/dictionary.py
new file mode 100644
index 0000000..77feb48
--- /dev/null
+++ b/benchmark/dictionary.py
@@ -0,0 +1,43 @@
+import os.path, gzip
+
+from whoosh import analysis, fields
+from whoosh.support.bench import Bench, Spec
+
+
+class VulgarTongue(Spec):
+ name = "dictionary"
+ filename = "dcvgr10.txt.gz"
+ headline_field = "head"
+
+ def documents(self):
+ path = os.path.join(self.options.dir, self.filename)
+ f = gzip.GzipFile(path)
+
+ head = body = None
+ for line in f:
+ line = line.decode("latin1")
+ if line[0].isalpha():
+ if head:
+ yield {"head": head, "body": head + body}
+ head, body = line.split(".", 1)
+ else:
+ body += line
+
+ if head:
+ yield {"head": head, "body": head + body}
+
+ def whoosh_schema(self):
+ ana = analysis.StemmingAnalyzer()
+ #ana = analysis.StandardAnalyzer()
+ schema = fields.Schema(head=fields.ID(stored=True),
+ body=fields.TEXT(analyzer=ana, stored=True))
+ return schema
+
+ def zcatalog_setup(self, cat):
+ from zcatalog import indexes #@UnresolvedImport
+ cat["head"] = indexes.FieldIndex(field_name="head")
+ cat["body"] = indexes.TextIndex(field_name="body")
+
+
+if __name__ == "__main__":
+ Bench().run(VulgarTongue)
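
The VulgarTongue spec added above parses a gzipped dictionary file in which each entry starts with a line whose first character is alphabetic ("HEAD. body ...") and continuation lines are folded into the previous entry's body. Below is a minimal, self-contained sketch of that parsing logic, run on an invented in-memory sample; it is not part of the commit.

import io

def entries(lines):
    # Same head/body accumulation as VulgarTongue.documents()
    head = body = None
    for line in lines:
        if line[0].isalpha():
            if head:
                yield {"head": head, "body": head + body}
            head, body = line.split(".", 1)
        else:
            body += line
    if head:
        yield {"head": head, "body": head + body}

sample = ("ABEL WACKETS. Blows on the palm of the hand.\n"
          "ACADEMY. A brothel.\n"
          "  (a continuation line, appended to the previous body)\n")

for doc in entries(io.StringIO(sample)):
    print(doc["head"], "->", doc["body"].strip())
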
diff --git a/benchmark/enron.py b/benchmark/enron.py
new file mode 100644
index 0000000..80650c3
--- /dev/null
+++ b/benchmark/enron.py
@@ -0,0 +1,185 @@
+from __future__ import division
+import os.path, tarfile
+from email import message_from_string
+from marshal import dump, load
+from zlib import compress, decompress
+
+try:
+ import xappy
+except ImportError:
+ pass
+
+from whoosh import analysis, fields
+from whoosh.compat import urlretrieve, next
+from whoosh.support.bench import Bench, Spec
+from whoosh.util import now
+
+
+# Benchmark class
+
+class Enron(Spec):
+ name = "enron"
+
+ enron_archive_url = "http://www.cs.cmu.edu/~enron/enron_mail_082109.tar.gz"
+ enron_archive_filename = "enron_mail_082109.tar.gz"
+ cache_filename = "enron_cache.pickle"
+
+ header_to_field = {"Date": "date", "From": "frm", "To": "to",
+ "Subject": "subject", "Cc": "cc", "Bcc": "bcc"}
+
+ main_field = "body"
+ headline_field = "subject"
+
+ field_order = ("subject", "date", "from", "to", "cc", "bcc", "body")
+
+ cachefile = None
+
+ # Functions for downloading and then reading the email archive and caching
+ # the messages in an easier-to-digest format
+
+ def download_archive(self, archive):
+ print("Downloading Enron email archive to %r..." % archive)
+ t = now()
+ urlretrieve(self.enron_archive_url, archive)
+ print("Downloaded in ", now() - t, "seconds")
+
+ @staticmethod
+ def get_texts(archive):
+ archive = tarfile.open(archive, "r:gz")
+ while True:
+ entry = next(archive)
+ archive.members = []
+ if entry is None:
+ break
+ f = archive.extractfile(entry)
+ if f is not None:
+ text = f.read()
+ yield text
+
+ @staticmethod
+ def get_messages(archive, headers=True):
+ header_to_field = Enron.header_to_field
+ for text in Enron.get_texts(archive):
+ message = message_from_string(text)
+ body = message.as_string().decode("latin_1")
+ blank = body.find("\n\n")
+ if blank > -1:
+ body = body[blank+2:]
+ d = {"body": body}
+ if headers:
+ for k in message.keys():
+ fn = header_to_field.get(k)
+ if not fn: continue
+ v = message.get(k).strip()
+ if v:
+ d[fn] = v.decode("latin_1")
+ yield d
+
+ def cache_messages(self, archive, cache):
+ print("Caching messages in %s..." % cache)
+
+ if not os.path.exists(archive):
+ raise Exception("Archive file %r does not exist" % archive)
+
+ t = now()
+ f = open(cache, "wb")
+ c = 0
+ for d in self.get_messages(archive):
+ c += 1
+ dump(d, f)
+ if not c % 1000: print(c)
+ f.close()
+ print("Cached messages in ", now() - t, "seconds")
+
+ def setup(self):
+ archive = os.path.abspath(os.path.join(self.options.dir, self.enron_archive_filename))
+ cache = os.path.abspath(os.path.join(self.options.dir, self.cache_filename))
+
+ if not os.path.exists(archive):
+ self.download_archive(archive)
+ else:
+ print("Archive is OK")
+
+ if not os.path.exists(cache):
+ self.cache_messages(archive, cache)
+ else:
+ print("Cache is OK")
+
+ def documents(self):
+ if not os.path.exists(self.cache_filename):
+ raise Exception("Message cache does not exist, use --setup")
+
+ f = open(self.cache_filename, "rb")
+ try:
+ while True:
+ self.filepos = f.tell()
+ d = load(f)
+ yield d
+ except EOFError:
+ pass
+ f.close()
+
+ def whoosh_schema(self):
+ ana = analysis.StemmingAnalyzer(maxsize=40, cachesize=None)
+ storebody = self.options.storebody
+ schema = fields.Schema(body=fields.TEXT(analyzer=ana, stored=storebody),
+ filepos=fields.STORED,
+ date=fields.ID(stored=True),
+ frm=fields.ID(stored=True),
+ to=fields.IDLIST(stored=True),
+ subject=fields.TEXT(stored=True),
+ cc=fields.IDLIST,
+ bcc=fields.IDLIST)
+ return schema
+
+ def xappy_indexer_connection(self, path):
+ conn = xappy.IndexerConnection(path)
+ conn.add_field_action('body', xappy.FieldActions.INDEX_FREETEXT, language='en')
+ if self.options.storebody:
+ conn.add_field_action('body', xappy.FieldActions.STORE_CONTENT)
+ conn.add_field_action('date', xappy.FieldActions.INDEX_EXACT)
+ conn.add_field_action('date', xappy.FieldActions.STORE_CONTENT)
+ conn.add_field_action('frm', xappy.FieldActions.INDEX_EXACT)
+ conn.add_field_action('frm', xappy.FieldActions.STORE_CONTENT)
+ conn.add_field_action('to', xappy.FieldActions.INDEX_EXACT)
+ conn.add_field_action('to', xappy.FieldActions.STORE_CONTENT)
+ conn.add_field_action('subject', xappy.FieldActions.INDEX_FREETEXT, language='en')
+ conn.add_field_action('subject', xappy.FieldActions.STORE_CONTENT)
+ conn.add_field_action('cc', xappy.FieldActions.INDEX_EXACT)
+ conn.add_field_action('bcc', xappy.FieldActions.INDEX_EXACT)
+ return conn
+
+ def zcatalog_setup(self, cat):
+ from zcatalog import indexes
+ for name in ("date", "frm"):
+ cat[name] = indexes.FieldIndex(field_name=name)
+ for name in ("to", "subject", "cc", "bcc", "body"):
+ cat[name] = indexes.TextIndex(field_name=name)
+
+ def process_document_whoosh(self, d):
+ d["filepos"] = self.filepos
+ if self.options.storebody:
+ mf = self.main_field
+ d["_stored_%s" % mf] = compress(d[mf], 9)
+
+ def process_result_whoosh(self, d):
+ mf = self.main_field
+ if mf in d:
+ d.fields()[mf] = decompress(d[mf])
+ else:
+ if not self.cachefile:
+ self.cachefile = open(self.cache_filename, "rb")
+ filepos = d["filepos"]
+ self.cachefile.seek(filepos)
+ dd = load(self.cachefile)
+ d.fields()[mf] = dd[mf]
+ return d
+
+ def process_document_xapian(self, d):
+ d[self.main_field] = " ".join([d.get(name, "") for name
+ in self.field_order])
+
+
+
+if __name__=="__main__":
+ Bench().run(Enron)
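
enron.py stores the downloaded mailbox as a flat cache file: cache_messages() marshals one dict per message into the file, and documents() reads them back with load() until EOFError. A small stand-alone sketch of that pattern, not part of the commit and using an illustrative file name:

from marshal import dump, load

docs = [{"subject": "first message", "body": "hello"},
        {"subject": "second message", "body": "world"}]

# Write: one marshalled dict after another, no framing needed.
with open("demo_cache.marshal", "wb") as f:
    for d in docs:
        dump(d, f)

# Read: keep loading until the file runs out.
with open("demo_cache.marshal", "rb") as f:
    try:
        while True:
            print(load(f))
    except EOFError:
        pass
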
diff --git a/benchmark/marc21.py b/benchmark/marc21.py
new file mode 100644
index 0000000..9a2bb9b
--- /dev/null
+++ b/benchmark/marc21.py
@@ -0,0 +1,297 @@
+from __future__ import with_statement, print_function
+import fnmatch, logging, os.path, re
+
+from whoosh import analysis, fields, index, qparser, query, scoring
+from whoosh.compat import xrange
+from whoosh.util import now
+
+
+log = logging.getLogger(__name__)
+
+
+# Functions for reading MARC format
+
+LEADER = (' ' * 10) + '22' + (' ' * 8) + '4500'
+LEADER_LEN = len(LEADER)
+DIRECTORY_ENTRY_LEN = 12
+SUBFIELD_INDICATOR = "\x1F"
+END_OF_FIELD = "\x1E"
+END_OF_RECORD = "\x1D"
+isbn_regex = re.compile(r'[-0-9xX]+')
+
+
+def read_file(dbfile, tags=None):
+ while True:
+ pos = dbfile.tell()
+ first5 = dbfile.read(5)
+ if not first5:
+ return
+ if len(first5) < 5:
+ raise Exception
+ length = int(first5)
+ chunk = dbfile.read(length - 5)
+ yield parse_record(first5 + chunk, tags), pos
+
+
+def read_record(filename, pos, tags=None):
+ f = open(filename, "rb")
+ f.seek(pos)
+ first5 = f.read(5)
+ length = int(first5)
+ chunk = f.read(length - 5)
+ return parse_record(first5 + chunk, tags)
+
+
+def parse_record(data, tags=None):
+ leader = data[:LEADER_LEN]
+ assert len(leader) == LEADER_LEN
+
+ dataoffset = int(data[12:17])
+ assert dataoffset > 0
+ assert dataoffset < len(data)
+
+ # dataoffset - 1 to avoid END-OF-FIELD byte
+ dirstart = LEADER_LEN
+ dirend = dataoffset - 1
+
+ # Number of fields in record
+ assert (dirend - dirstart) % DIRECTORY_ENTRY_LEN == 0
+ field_count = (dirend - dirstart) // DIRECTORY_ENTRY_LEN
+
+ result = {}
+ for i in xrange(field_count):
+ start = dirstart + i * DIRECTORY_ENTRY_LEN
+ end = start + DIRECTORY_ENTRY_LEN
+ tag = data[start:start + 3]
+ if tags and not tag in tags:
+ continue
+
+ entry = data[start:end]
+ elen = int(entry[3:7])
+ offset = dataoffset + int(entry[7:12])
+ edata = data[offset:offset + elen - 1]
+
+ if not (tag < "010" and tag.isdigit()):
+ edata = edata.split(SUBFIELD_INDICATOR)[1:]
+ if tag in result:
+ result[tag].extend(edata)
+ else:
+ result[tag] = edata
+ else:
+ result[tag] = edata
+ return result
+
+
+def subfield(vs, code):
+ for v in vs:
+ if v.startswith(code):
+ return v[1:]
+ return None
+
+
+def joinsubfields(vs):
+ return " ".join(v[1:] for v in vs if v and v[0] != "6")
+
+
+def getfields(d, *tags):
+ return (d[tag] for tag in tags if tag in d)
+
+
+def title(d):
+ title = None
+ if "245" in d:
+ svs = d["245"]
+ title = subfield(svs, "a")
+ if title:
+ t2 = subfield(svs, "b")
+ if t2:
+ title += t2
+ return title
+
+
+def isbn(d):
+ if "020" in d:
+ num = subfield(d["020"], "a")
+ if num:
+ match = isbn_regex.search(num)
+ if match:
+ return match.group(0).replace('-', '')
+
+
+def author(d):
+ if "100" in d:
+ return joinsubfields(d["100"])
+ elif "110" in d:
+ return joinsubfields(d["110"])
+ elif "111" in d:
+ return joinsubfields(d["111"])
+
+
+def uniform_title(d):
+ if "130" in d:
+ return joinsubfields(d["130"])
+ elif "240" in d:
+ return joinsubfields(d["240"])
+
+
+subjectfields = ("600 610 611 630 648 650 651 653 654 655 656 657 658 662 "
+ "690 691 696 697 698 699").split()
+
+
+def subjects(d):
+ return " ".join(joinsubfields(vs) for vs in getfields(d, *subjectfields))
+
+
+def physical(d):
+ return joinsubfields(d["300"])
+
+
+def location(d):
+ return joinsubfields(d["852"])
+
+
+def publisher(d):
+ if "260" in d:
+ return subfield(d["260"], "b")
+
+
+def pubyear(d):
+ if "260" in d:
+ return subfield(d["260"], "c")
+
+
+def uni(v):
+ return u"" if v is None else v.decode("utf-8", "replace")
+
+
+# Indexing and searching
+
+def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True,
+ glob="*.mrc"):
+ if not os.path.exists(ixdir):
+ os.mkdir(ixdir)
+
+ # Multi-lingual stop words
+ stoplist = (analysis.STOP_WORDS
+ | set("de la der und le die et en al no von di du da "
+ "del zur ein".split()))
+ # Schema
+ ana = analysis.StemmingAnalyzer(stoplist=stoplist)
+ schema = fields.Schema(title=fields.TEXT(analyzer=ana),
+ author=fields.TEXT(phrase=False),
+ subject=fields.TEXT(analyzer=ana, phrase=False),
+ file=fields.STORED, pos=fields.STORED,
+ )
+
+ # MARC fields to extract
+ mfields = set(subjectfields) # Subjects
+ mfields.update("100 110 111".split()) # Author
+ mfields.add("245") # Title
+
+ print("Indexing with %d processor(s) and %d MB per processor"
+ % (procs, limitmb))
+ c = 0
+ t = now()
+ ix = index.create_in(ixdir, schema)
+ with ix.writer(procs=procs, limitmb=limitmb,
+ multisegment=multisegment) as w:
+ filenames = [filename for filename in os.listdir(basedir)
+ if fnmatch.fnmatch(filename, glob)]
+ for filename in filenames:
+ path = os.path.join(basedir, filename)
+ print("Indexing", path)
+ f = open(path, 'rb')
+ for x, pos in read_file(f, mfields):
+ w.add_document(title=uni(title(x)), author=uni(author(x)),
+ subject=uni(subjects(x)),
+ file=filename, pos=pos)
+ c += 1
+ f.close()
+ print("Committing...")
+ print("Indexed %d records in %0.02f minutes" % (c, (now() - t) / 60.0))
+
+
+def print_record(no, basedir, filename, pos):
+ path = os.path.join(basedir, filename)
+ record = read_record(path, pos)
+ print("% 5d. %s" % (no + 1, title(record)))
+ print(" ", author(record))
+ print(" ", subjects(record))
+ isbn_num = isbn(record)
+ if isbn_num:
+ print(" ISBN:", isbn_num)
+ print()
+
+
+def search(qstring, ixdir, basedir, limit=None, optimize=True, scores=True):
+ ix = index.open_dir(ixdir)
+ qp = qparser.QueryParser("title", ix.schema)
+ q = qp.parse(qstring)
+
+ with ix.searcher(weighting=scoring.PL2()) as s:
+ if scores:
+ r = s.search(q, limit=limit, optimize=optimize)
+ for hit in r:
+ print_record(hit.rank, basedir, hit["file"], hit["pos"])
+ print("Found %d records in %0.06f seconds" % (len(r), r.runtime))
+ else:
+ t = now()
+ for i, docnum in enumerate(s.docs_for_query(q)):
+ if not limit or i < limit:
+ fields = s.stored_fields(docnum)
+ print_record(i, basedir, fields["file"], fields["pos"])
+ print("Found %d records in %0.06f seconds" % (i, now() - t))
+
+
+if __name__ == "__main__":
+ from optparse import OptionParser
+
+ p = OptionParser(usage="usage: %prog [options] query")
+ # Common options
+ p.add_option("-f", "--filedir", metavar="DIR", dest="basedir",
+ help="Directory containing the .mrc files to index",
+ default="data/HLOM")
+ p.add_option("-d", "--dir", metavar="DIR", dest="ixdir",
+ help="Directory containing the index", default="marc_index")
+
+ # Indexing options
+ p.add_option("-i", "--index", dest="index",
+ help="Index the records", action="store_true", default=False)
+ p.add_option("-p", "--procs", metavar="NPROCS", dest="procs",
+ help="Number of processors to use", default="1")
+ p.add_option("-m", "--mb", metavar="MB", dest="limitmb",
+ help="Limit the indexer to this many MB of memory per writer",
+ default="128")
+ p.add_option("-M", "--merge-segments", dest="multisegment",
+ help="If indexing with multiproc, merge the segments after"
+ " indexing", action="store_false", default=True)
+ p.add_option("-g", "--match", metavar="GLOB", dest="glob",
+ help="Only index file names matching the given pattern",
+ default="*.mrc")
+
+ # Search options
+ p.add_option("-l", "--limit", metavar="NHITS", dest="limit",
+ help="Maximum number of search results to print (0=no limit)",
+ default="10")
+ p.add_option("-O", "--no-optimize", dest="optimize",
+ help="Turn off searcher optimization (for debugging)",
+ action="store_false", default=True)
+ p.add_option("-s", "--scoring", dest="scores",
+ help="Score the results", action="store_true", default=False)
+
+ options, args = p.parse_args()
+
+ if options.index:
+ make_index(options.basedir, options.ixdir,
+ procs=int(options.procs),
+ limitmb=int(options.limitmb),
+ multisegment=options.multisegment,
+ glob=options.glob)
+
+ if args:
+ qstring = " ".join(args).decode("utf-8")
+ limit = int(options.limit)
+ if limit < 1:
+ limit = None
+ search(qstring, options.ixdir, options.basedir, limit=limit,
+ optimize=options.optimize, scores=options.scores)
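
make_index() and search() above wrap the standard Whoosh indexing and query APIs around the MARC reader. The following sketch shows those calls on two made-up records, independent of any .mrc data; it is not part of the commit and assumes only that Whoosh is installed.

import tempfile
from whoosh import fields, index, qparser

schema = fields.Schema(title=fields.TEXT(stored=True),
                       author=fields.TEXT(stored=True))
ixdir = tempfile.mkdtemp()
ix = index.create_in(ixdir, schema)

with ix.writer() as w:  # commits automatically on exit
    w.add_document(title=u"Introduction to cataloguing", author=u"A. Librarian")
    w.add_document(title=u"MARC 21 for everyone", author=u"B. Archivist")

with ix.searcher() as s:
    q = qparser.QueryParser("title", ix.schema).parse(u"marc")
    for hit in s.search(q):
        print(hit["title"], "-", hit["author"])
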
diff --git a/benchmark/reuters.py b/benchmark/reuters.py
new file mode 100644
index 0000000..aa20c74
--- /dev/null
+++ b/benchmark/reuters.py
@@ -0,0 +1,38 @@
+import gzip, os.path
+
+from whoosh import analysis, fields, index, qparser, query
+from whoosh.support.bench import Bench, Spec
+from whoosh.util import now
+
+
+class Reuters(Spec):
+ name = "reuters"
+ filename = "reuters21578.txt.gz"
+ main_field = "text"
+ headline_text = "headline"
+
+ def whoosh_schema(self):
+ #ana = analysis.StemmingAnalyzer()
+ ana = analysis.StandardAnalyzer()
+ schema = fields.Schema(id=fields.ID(stored=True),
+ headline=fields.STORED,
+ text=fields.TEXT(analyzer=ana, stored=True))
+ return schema
+
+ def zcatalog_setup(self, cat):
+ from zcatalog import indexes #@UnresolvedImport
+ cat["id"] = indexes.FieldIndex(field_name="id")
+ cat["headline"] = indexes.TextIndex(field_name="headline")
+ cat["body"] = indexes.TextIndex(field_name="text")
+
+ def documents(self):
+ path = os.path.join(self.options.dir, self.filename)
+ f = gzip.GzipFile(path)
+
+ for line in f:
+ id, text = line.decode("latin1").split("\t")
+ yield {"id": id, "text": text, "headline": text[:70]}
+
+
+if __name__ == "__main__":
+ Bench().run(Reuters)
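
Both reuters.py and dictionary.py leave a commented-out alternative analyzer in whoosh_schema(). The difference between the two is easy to see by calling them directly on a sentence; this snippet is not part of the commit and the example text is invented.

from whoosh import analysis

text = u"Traders were buying and selling futures"
standard = analysis.StandardAnalyzer()   # lowercases and drops stop words
stemming = analysis.StemmingAnalyzer()   # additionally stems each token

print([t.text for t in standard(text)])
print([t.text for t in stemming(text)])
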
diff --git a/benchmark/reuters21578.txt.gz b/benchmark/reuters21578.txt.gz
new file mode 100644
index 0000000..cdf0677
--- /dev/null
+++ b/benchmark/reuters21578.txt.gz
Binary files differ